{ "cells": [ { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "# README\n", "\n", "This Notebook runs requiring the following environent variable:\n", "1. GOOGLE_CLOUD_PROJECT - The google cloud project id.\n", "1. BIGQUERY_LOCATION - can take values as per https://cloud.google.com/bigquery/docs/locations, e.g. `us`, `asia-east1`." ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "## Infer location and set up data in that location if needed" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/usr/local/google/home/shobs/code/bigframes1/venv/lib/python3.10/site-packages/google/auth/_default.py:78: UserWarning: Your application has authenticated using end user credentials from Google Cloud SDK without a quota project. You might receive a \"quota exceeded\" or \"API not enabled\" error. See the following page for troubleshooting: https://cloud.google.com/docs/authentication/adc-troubleshooting/user-creds. \n", " warnings.warn(_CLOUD_SDK_CREDENTIALS_WARNING)\n", "/usr/local/google/home/shobs/code/bigframes1/venv/lib/python3.10/site-packages/google/auth/_default.py:78: UserWarning: Your application has authenticated using end user credentials from Google Cloud SDK without a quota project. You might receive a \"quota exceeded\" or \"API not enabled\" error. See the following page for troubleshooting: https://cloud.google.com/docs/authentication/adc-troubleshooting/user-creds. \n", " warnings.warn(_CLOUD_SDK_CREDENTIALS_WARNING)\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "BigQuery project: bigframes-dev\n", "BigQuery location: us\n", "Penguins Table: bigquery-public-data.ml_datasets.penguins\n", "ML Model Dataset: bigframes-dev.bigframes_testing\n" ] } ], "source": [ "# Take multi-region US as the default BQ location, where most of the BQ data lies including the BQ public datasets\n", "import os\n", "\n", "PROJECT_ID = os.environ.get(\"GOOGLE_CLOUD_PROJECT\")\n", "BQ_LOCATION = os.environ.get(\"BIGQUERY_LOCATION\")\n", "\n", "if not PROJECT_ID:\n", " raise ValueError(\"Project must be set via environment variable GOOGLE_CLOUD_PROJECT\")\n", "if not BQ_LOCATION:\n", " raise ValueError(\"BQ location must be set via environment variable BIGQUERY_LOCATION\")\n", "\n", "DATASET = \"bigframes_testing\"\n", "PENGUINS_TABLE = \"bigquery-public-data.ml_datasets.penguins\"\n", "\n", "\n", "# Check for a location set in the environment and do location-specific setup if needed\n", "\n", "import google.api_core.exceptions\n", "from google.cloud import bigquery\n", "import bigframes\n", "\n", "client = bigquery.Client()\n", "\n", "BQ_LOCATION = BQ_LOCATION.lower()\n", "if BQ_LOCATION != \"us\":\n", " bq_location_normalized = BQ_LOCATION.replace('-', '_')\n", "\n", " # Nominate a local penguins table\n", " penguins_table_ref = bigquery.TableReference.from_string(PENGUINS_TABLE)\n", " penguins_local_dataset_name = f\"{DATASET}_{bq_location_normalized}\"\n", " penguins_local_dataset_ref = bigquery.DatasetReference(project=PROJECT_ID, dataset_id=penguins_local_dataset_name)\n", " penguins_local_dataset = bigquery.Dataset(penguins_local_dataset_ref)\n", " penguins_local_dataset.location = BQ_LOCATION\n", " penguins_local_table_ref= bigquery.TableReference(penguins_local_dataset, penguins_table_ref.table_id)\n", " penguins_local_table = str(penguins_local_table_ref)\n", " try:\n", " client.get_table(penguins_local_table_ref)\n", " except google.api_core.exceptions.NotFound:\n", " client.create_dataset(penguins_local_dataset, exists_ok=True)\n", "\n", " # Read the public table as an in-memory dataframe and then write to the local table\n", " session_us = bigframes.connect()\n", " df = session_us.read_gbq(PENGUINS_TABLE).to_pandas()\n", " df.to_gbq(penguins_local_table)\n", "\n", " # Finally point the penguins table to the local table\n", " PENGUINS_TABLE=penguins_local_table\n", "\n", " # Also update the dataset name used for test artifacts\n", " DATASET = f\"{DATASET}_{bq_location_normalized}\"\n", "\n", "# Create the dataset to store the model if it doesn't exist \n", "model_local_dataset = bigquery.Dataset(bigquery.DatasetReference(project=PROJECT_ID, dataset_id=DATASET))\n", "model_local_dataset.location = BQ_LOCATION\n", "model_dataset = client.create_dataset(model_local_dataset, exists_ok=True)\n", "\n", "# Finally log the variables driving the core notebook execution\n", "log = ('\\n'.join(f\"{name}: {str(value)}\" for name, value in {\n", " \"BigQuery project\" : PROJECT_ID,\n", " \"BigQuery location\" : BQ_LOCATION,\n", " \"Penguins Table\" : PENGUINS_TABLE,\n", " \"ML Model Dataset\" : model_dataset.reference\n", "}.items())) \n", "print(log)" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "# Using the BigQuery DataFrames API" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "## Set BigQuery DataFrames options" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import bigframes.pandas\n", "\n", "# Note: The project option is not required in all environments.\n", "# On BigQuery Studio, the project ID is automatically detected.\n", "bigframes.pandas.options.bigquery.project = PROJECT_ID\n", "\n", "# Note: The location option is not required.\n", "# It defaults to the location of the first table or query\n", "# passed to read_gbq(). For APIs where a location can't be\n", "# auto-detected, the location defaults to the \"US\" location.\n", "bigframes.pandas.options.bigquery.location = BQ_LOCATION" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "### Initialize a dataframe for a BigQuery table" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/usr/local/google/home/shobs/code/bigframes1/venv/lib/python3.10/site-packages/google/auth/_default.py:78: UserWarning: Your application has authenticated using end user credentials from Google Cloud SDK without a quota project. You might receive a \"quota exceeded\" or \"API not enabled\" error. See the following page for troubleshooting: https://cloud.google.com/docs/authentication/adc-troubleshooting/user-creds. \n", " warnings.warn(_CLOUD_SDK_CREDENTIALS_WARNING)\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "02af1103261a4e63a4c15efd26b1bc9a", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HTML(value='Query job e1a62d56-8cab-4bc1-9ad3-457f48b71d9c is RUNNING. \n", "\n", "
| \n", " | species | \n", "island | \n", "culmen_length_mm | \n", "culmen_depth_mm | \n", "flipper_length_mm | \n", "body_mass_g | \n", "sex | \n", "
|---|---|---|---|---|---|---|---|
| 0 | \n", "Adelie Penguin (Pygoscelis adeliae) | \n", "Dream | \n", "36.6 | \n", "18.4 | \n", "184.0 | \n", "3475.0 | \n", "FEMALE | \n", "
| 1 | \n", "Adelie Penguin (Pygoscelis adeliae) | \n", "Dream | \n", "39.8 | \n", "19.1 | \n", "184.0 | \n", "4650.0 | \n", "MALE | \n", "
| 2 | \n", "Adelie Penguin (Pygoscelis adeliae) | \n", "Dream | \n", "40.9 | \n", "18.9 | \n", "184.0 | \n", "3900.0 | \n", "MALE | \n", "
| 3 | \n", "Chinstrap penguin (Pygoscelis antarctica) | \n", "Dream | \n", "46.5 | \n", "17.9 | \n", "192.0 | \n", "3500.0 | \n", "FEMALE | \n", "
| 4 | \n", "Adelie Penguin (Pygoscelis adeliae) | \n", "Dream | \n", "37.3 | \n", "16.8 | \n", "192.0 | \n", "3000.0 | \n", "FEMALE | \n", "
| 5 | \n", "Adelie Penguin (Pygoscelis adeliae) | \n", "Dream | \n", "43.2 | \n", "18.5 | \n", "192.0 | \n", "4100.0 | \n", "MALE | \n", "
| 6 | \n", "Chinstrap penguin (Pygoscelis antarctica) | \n", "Dream | \n", "46.9 | \n", "16.6 | \n", "192.0 | \n", "2700.0 | \n", "FEMALE | \n", "
| 7 | \n", "Chinstrap penguin (Pygoscelis antarctica) | \n", "Dream | \n", "50.5 | \n", "18.4 | \n", "200.0 | \n", "3400.0 | \n", "FEMALE | \n", "
| 8 | \n", "Chinstrap penguin (Pygoscelis antarctica) | \n", "Dream | \n", "49.5 | \n", "19.0 | \n", "200.0 | \n", "3800.0 | \n", "MALE | \n", "
| 9 | \n", "Adelie Penguin (Pygoscelis adeliae) | \n", "Dream | \n", "40.2 | \n", "20.1 | \n", "200.0 | \n", "3975.0 | \n", "MALE | \n", "
| 10 | \n", "Adelie Penguin (Pygoscelis adeliae) | \n", "Dream | \n", "40.8 | \n", "18.9 | \n", "208.0 | \n", "4300.0 | \n", "MALE | \n", "
| 11 | \n", "Adelie Penguin (Pygoscelis adeliae) | \n", "Dream | \n", "39.0 | \n", "18.7 | \n", "185.0 | \n", "3650.0 | \n", "MALE | \n", "
| 12 | \n", "Adelie Penguin (Pygoscelis adeliae) | \n", "Dream | \n", "37.0 | \n", "16.9 | \n", "185.0 | \n", "3000.0 | \n", "FEMALE | \n", "
| 13 | \n", "Chinstrap penguin (Pygoscelis antarctica) | \n", "Dream | \n", "47.0 | \n", "17.3 | \n", "185.0 | \n", "3700.0 | \n", "FEMALE | \n", "
| 14 | \n", "Adelie Penguin (Pygoscelis adeliae) | \n", "Dream | \n", "34.0 | \n", "17.1 | \n", "185.0 | \n", "3400.0 | \n", "FEMALE | \n", "
| 15 | \n", "Adelie Penguin (Pygoscelis adeliae) | \n", "Dream | \n", "37.0 | \n", "16.5 | \n", "185.0 | \n", "3400.0 | \n", "FEMALE | \n", "
| 16 | \n", "Chinstrap penguin (Pygoscelis antarctica) | \n", "Dream | \n", "45.7 | \n", "17.3 | \n", "193.0 | \n", "3600.0 | \n", "FEMALE | \n", "
| 17 | \n", "Chinstrap penguin (Pygoscelis antarctica) | \n", "Dream | \n", "50.6 | \n", "19.4 | \n", "193.0 | \n", "3800.0 | \n", "MALE | \n", "
| 18 | \n", "Adelie Penguin (Pygoscelis adeliae) | \n", "Dream | \n", "39.7 | \n", "17.9 | \n", "193.0 | \n", "4250.0 | \n", "MALE | \n", "
| 19 | \n", "Adelie Penguin (Pygoscelis adeliae) | \n", "Dream | \n", "37.8 | \n", "18.1 | \n", "193.0 | \n", "3750.0 | \n", "MALE | \n", "
| 20 | \n", "Chinstrap penguin (Pygoscelis antarctica) | \n", "Dream | \n", "46.6 | \n", "17.8 | \n", "193.0 | \n", "3800.0 | \n", "FEMALE | \n", "
| 21 | \n", "Chinstrap penguin (Pygoscelis antarctica) | \n", "Dream | \n", "51.3 | \n", "19.2 | \n", "193.0 | \n", "3650.0 | \n", "MALE | \n", "
| 22 | \n", "Adelie Penguin (Pygoscelis adeliae) | \n", "Dream | \n", "40.2 | \n", "17.1 | \n", "193.0 | \n", "3400.0 | \n", "FEMALE | \n", "
| 23 | \n", "Adelie Penguin (Pygoscelis adeliae) | \n", "Dream | \n", "36.8 | \n", "18.5 | \n", "193.0 | \n", "3500.0 | \n", "FEMALE | \n", "
| 24 | \n", "Chinstrap penguin (Pygoscelis antarctica) | \n", "Dream | \n", "49.6 | \n", "18.2 | \n", "193.0 | \n", "3775.0 | \n", "MALE | \n", "
25 rows × 7 columns
\n", "[344 rows x 7 columns in total]" ], "text/plain": [ " species island culmen_length_mm \\\n", "0 Adelie Penguin (Pygoscelis adeliae) Dream 36.6 \n", "1 Adelie Penguin (Pygoscelis adeliae) Dream 39.8 \n", "2 Adelie Penguin (Pygoscelis adeliae) Dream 40.9 \n", "3 Chinstrap penguin (Pygoscelis antarctica) Dream 46.5 \n", "4 Adelie Penguin (Pygoscelis adeliae) Dream 37.3 \n", "5 Adelie Penguin (Pygoscelis adeliae) Dream 43.2 \n", "6 Chinstrap penguin (Pygoscelis antarctica) Dream 46.9 \n", "7 Chinstrap penguin (Pygoscelis antarctica) Dream 50.5 \n", "8 Chinstrap penguin (Pygoscelis antarctica) Dream 49.5 \n", "9 Adelie Penguin (Pygoscelis adeliae) Dream 40.2 \n", "10 Adelie Penguin (Pygoscelis adeliae) Dream 40.8 \n", "11 Adelie Penguin (Pygoscelis adeliae) Dream 39.0 \n", "12 Adelie Penguin (Pygoscelis adeliae) Dream 37.0 \n", "13 Chinstrap penguin (Pygoscelis antarctica) Dream 47.0 \n", "14 Adelie Penguin (Pygoscelis adeliae) Dream 34.0 \n", "15 Adelie Penguin (Pygoscelis adeliae) Dream 37.0 \n", "16 Chinstrap penguin (Pygoscelis antarctica) Dream 45.7 \n", "17 Chinstrap penguin (Pygoscelis antarctica) Dream 50.6 \n", "18 Adelie Penguin (Pygoscelis adeliae) Dream 39.7 \n", "19 Adelie Penguin (Pygoscelis adeliae) Dream 37.8 \n", "20 Chinstrap penguin (Pygoscelis antarctica) Dream 46.6 \n", "21 Chinstrap penguin (Pygoscelis antarctica) Dream 51.3 \n", "22 Adelie Penguin (Pygoscelis adeliae) Dream 40.2 \n", "23 Adelie Penguin (Pygoscelis adeliae) Dream 36.8 \n", "24 Chinstrap penguin (Pygoscelis antarctica) Dream 49.6 \n", "\n", " culmen_depth_mm flipper_length_mm body_mass_g sex \n", "0 18.4 184.0 3475.0 FEMALE \n", "1 19.1 184.0 4650.0 MALE \n", "2 18.9 184.0 3900.0 MALE \n", "3 17.9 192.0 3500.0 FEMALE \n", "4 16.8 192.0 3000.0 FEMALE \n", "5 18.5 192.0 4100.0 MALE \n", "6 16.6 192.0 2700.0 FEMALE \n", "7 18.4 200.0 3400.0 FEMALE \n", "8 19.0 200.0 3800.0 MALE \n", "9 20.1 200.0 3975.0 MALE \n", "10 18.9 208.0 4300.0 MALE \n", "11 18.7 185.0 3650.0 MALE \n", "12 16.9 185.0 3000.0 FEMALE \n", "13 17.3 185.0 3700.0 FEMALE \n", "14 17.1 185.0 3400.0 FEMALE \n", "15 16.5 185.0 3400.0 FEMALE \n", "16 17.3 193.0 3600.0 FEMALE \n", "17 19.4 193.0 3800.0 MALE \n", "18 17.9 193.0 4250.0 MALE \n", "19 18.1 193.0 3750.0 MALE \n", "20 17.8 193.0 3800.0 FEMALE \n", "21 19.2 193.0 3650.0 MALE \n", "22 17.1 193.0 3400.0 FEMALE \n", "23 18.5 193.0 3500.0 FEMALE \n", "24 18.2 193.0 3775.0 MALE \n", "...\n", "\n", "[344 rows x 7 columns]" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "### View the column names in the dataframe (aka columns names in the table)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['species', 'island', 'culmen_length_mm', 'culmen_depth_mm',\n", " 'flipper_length_mm', 'body_mass_g', 'sex'],\n", " dtype='object')" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.columns" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "### View the table schema" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "species string[pyarrow]\n", "island string[pyarrow]\n", "culmen_length_mm Float64\n", "culmen_depth_mm Float64\n", "flipper_length_mm Float64\n", "body_mass_g Float64\n", "sex string[pyarrow]\n", "dtype: object" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.dtypes" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "### Select a subset of columns" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "e8d0aab7b7284b2fb2b5931c8461b7f5", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HTML(value='Query job ee2bae68-7c4b-49f4-b5c2-8818cec33a74 is DONE. 0 Bytes processed. \n", "\n", "| \n", " | species | \n", "island | \n", "body_mass_g | \n", "
|---|---|---|---|
| 0 | \n", "Adelie Penguin (Pygoscelis adeliae) | \n", "Dream | \n", "3475.0 | \n", "
| 1 | \n", "Adelie Penguin (Pygoscelis adeliae) | \n", "Dream | \n", "4650.0 | \n", "
| 2 | \n", "Adelie Penguin (Pygoscelis adeliae) | \n", "Dream | \n", "3900.0 | \n", "
| 3 | \n", "Chinstrap penguin (Pygoscelis antarctica) | \n", "Dream | \n", "3500.0 | \n", "
| 4 | \n", "Adelie Penguin (Pygoscelis adeliae) | \n", "Dream | \n", "3000.0 | \n", "
| 5 | \n", "Adelie Penguin (Pygoscelis adeliae) | \n", "Dream | \n", "4100.0 | \n", "
| 6 | \n", "Chinstrap penguin (Pygoscelis antarctica) | \n", "Dream | \n", "2700.0 | \n", "
| 7 | \n", "Chinstrap penguin (Pygoscelis antarctica) | \n", "Dream | \n", "3400.0 | \n", "
| 8 | \n", "Chinstrap penguin (Pygoscelis antarctica) | \n", "Dream | \n", "3800.0 | \n", "
| 9 | \n", "Adelie Penguin (Pygoscelis adeliae) | \n", "Dream | \n", "3975.0 | \n", "
| 10 | \n", "Adelie Penguin (Pygoscelis adeliae) | \n", "Dream | \n", "4300.0 | \n", "
| 11 | \n", "Adelie Penguin (Pygoscelis adeliae) | \n", "Dream | \n", "3650.0 | \n", "
| 12 | \n", "Adelie Penguin (Pygoscelis adeliae) | \n", "Dream | \n", "3000.0 | \n", "
| 13 | \n", "Chinstrap penguin (Pygoscelis antarctica) | \n", "Dream | \n", "3700.0 | \n", "
| 14 | \n", "Adelie Penguin (Pygoscelis adeliae) | \n", "Dream | \n", "3400.0 | \n", "
| 15 | \n", "Adelie Penguin (Pygoscelis adeliae) | \n", "Dream | \n", "3400.0 | \n", "
| 16 | \n", "Chinstrap penguin (Pygoscelis antarctica) | \n", "Dream | \n", "3600.0 | \n", "
| 17 | \n", "Chinstrap penguin (Pygoscelis antarctica) | \n", "Dream | \n", "3800.0 | \n", "
| 18 | \n", "Adelie Penguin (Pygoscelis adeliae) | \n", "Dream | \n", "4250.0 | \n", "
| 19 | \n", "Adelie Penguin (Pygoscelis adeliae) | \n", "Dream | \n", "3750.0 | \n", "
| 20 | \n", "Chinstrap penguin (Pygoscelis antarctica) | \n", "Dream | \n", "3800.0 | \n", "
| 21 | \n", "Chinstrap penguin (Pygoscelis antarctica) | \n", "Dream | \n", "3650.0 | \n", "
| 22 | \n", "Adelie Penguin (Pygoscelis adeliae) | \n", "Dream | \n", "3400.0 | \n", "
| 23 | \n", "Adelie Penguin (Pygoscelis adeliae) | \n", "Dream | \n", "3500.0 | \n", "
| 24 | \n", "Chinstrap penguin (Pygoscelis antarctica) | \n", "Dream | \n", "3775.0 | \n", "
25 rows × 3 columns
\n", "[344 rows x 3 columns in total]" ], "text/plain": [ " species island body_mass_g\n", "0 Adelie Penguin (Pygoscelis adeliae) Dream 3475.0\n", "1 Adelie Penguin (Pygoscelis adeliae) Dream 4650.0\n", "2 Adelie Penguin (Pygoscelis adeliae) Dream 3900.0\n", "3 Chinstrap penguin (Pygoscelis antarctica) Dream 3500.0\n", "4 Adelie Penguin (Pygoscelis adeliae) Dream 3000.0\n", "5 Adelie Penguin (Pygoscelis adeliae) Dream 4100.0\n", "6 Chinstrap penguin (Pygoscelis antarctica) Dream 2700.0\n", "7 Chinstrap penguin (Pygoscelis antarctica) Dream 3400.0\n", "8 Chinstrap penguin (Pygoscelis antarctica) Dream 3800.0\n", "9 Adelie Penguin (Pygoscelis adeliae) Dream 3975.0\n", "10 Adelie Penguin (Pygoscelis adeliae) Dream 4300.0\n", "11 Adelie Penguin (Pygoscelis adeliae) Dream 3650.0\n", "12 Adelie Penguin (Pygoscelis adeliae) Dream 3000.0\n", "13 Chinstrap penguin (Pygoscelis antarctica) Dream 3700.0\n", "14 Adelie Penguin (Pygoscelis adeliae) Dream 3400.0\n", "15 Adelie Penguin (Pygoscelis adeliae) Dream 3400.0\n", "16 Chinstrap penguin (Pygoscelis antarctica) Dream 3600.0\n", "17 Chinstrap penguin (Pygoscelis antarctica) Dream 3800.0\n", "18 Adelie Penguin (Pygoscelis adeliae) Dream 4250.0\n", "19 Adelie Penguin (Pygoscelis adeliae) Dream 3750.0\n", "20 Chinstrap penguin (Pygoscelis antarctica) Dream 3800.0\n", "21 Chinstrap penguin (Pygoscelis antarctica) Dream 3650.0\n", "22 Adelie Penguin (Pygoscelis adeliae) Dream 3400.0\n", "23 Adelie Penguin (Pygoscelis adeliae) Dream 3500.0\n", "24 Chinstrap penguin (Pygoscelis antarctica) Dream 3775.0\n", "...\n", "\n", "[344 rows x 3 columns]" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = df[[\n", " \"species\",\n", " \"island\",\n", " \"body_mass_g\",\n", "]]\n", "df" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "### View the first ten values of a series" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "38a3a113ceec4bd7a3acd29c60a1af34", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HTML(value='Query job 31a62db3-8550-43e9-8707-ed80de1382d3 is DONE. 2.8 kB processed. \n", "\n", "| \n", " | species | \n", "island | \n", "body_mass_g | \n", "
|---|---|---|---|
| 1 | \n", "Adelie Penguin (Pygoscelis adeliae) | \n", "Dream | \n", "4650.0 | \n", "
| 5 | \n", "Adelie Penguin (Pygoscelis adeliae) | \n", "Dream | \n", "4100.0 | \n", "
| 10 | \n", "Adelie Penguin (Pygoscelis adeliae) | \n", "Dream | \n", "4300.0 | \n", "
| 18 | \n", "Adelie Penguin (Pygoscelis adeliae) | \n", "Dream | \n", "4250.0 | \n", "
| 25 | \n", "Chinstrap penguin (Pygoscelis antarctica) | \n", "Dream | \n", "4050.0 | \n", "
| 26 | \n", "Adelie Penguin (Pygoscelis adeliae) | \n", "Dream | \n", "4000.0 | \n", "
| 27 | \n", "Chinstrap penguin (Pygoscelis antarctica) | \n", "Dream | \n", "4050.0 | \n", "
| 28 | \n", "Chinstrap penguin (Pygoscelis antarctica) | \n", "Dream | \n", "4300.0 | \n", "
| 30 | \n", "Chinstrap penguin (Pygoscelis antarctica) | \n", "Dream | \n", "4450.0 | \n", "
| 36 | \n", "Adelie Penguin (Pygoscelis adeliae) | \n", "Dream | \n", "4450.0 | \n", "
| 44 | \n", "Chinstrap penguin (Pygoscelis antarctica) | \n", "Dream | \n", "4100.0 | \n", "
| 45 | \n", "Chinstrap penguin (Pygoscelis antarctica) | \n", "Dream | \n", "4800.0 | \n", "
| 57 | \n", "Chinstrap penguin (Pygoscelis antarctica) | \n", "Dream | \n", "4400.0 | \n", "
| 61 | \n", "Chinstrap penguin (Pygoscelis antarctica) | \n", "Dream | \n", "4150.0 | \n", "
| 66 | \n", "Chinstrap penguin (Pygoscelis antarctica) | \n", "Dream | \n", "4050.0 | \n", "
| 67 | \n", "Chinstrap penguin (Pygoscelis antarctica) | \n", "Dream | \n", "4050.0 | \n", "
| 68 | \n", "Chinstrap penguin (Pygoscelis antarctica) | \n", "Dream | \n", "4100.0 | \n", "
| 74 | \n", "Adelie Penguin (Pygoscelis adeliae) | \n", "Dream | \n", "4350.0 | \n", "
| 77 | \n", "Adelie Penguin (Pygoscelis adeliae) | \n", "Dream | \n", "4150.0 | \n", "
| 78 | \n", "Adelie Penguin (Pygoscelis adeliae) | \n", "Dream | \n", "4400.0 | \n", "
| 80 | \n", "Chinstrap penguin (Pygoscelis antarctica) | \n", "Dream | \n", "4300.0 | \n", "
| 90 | \n", "Chinstrap penguin (Pygoscelis antarctica) | \n", "Dream | \n", "4150.0 | \n", "
| 92 | \n", "Chinstrap penguin (Pygoscelis antarctica) | \n", "Dream | \n", "4500.0 | \n", "
| 93 | \n", "Adelie Penguin (Pygoscelis adeliae) | \n", "Dream | \n", "4300.0 | \n", "
| 94 | \n", "Chinstrap penguin (Pygoscelis antarctica) | \n", "Dream | \n", "4550.0 | \n", "
25 rows × 3 columns
\n", "[177 rows x 3 columns in total]" ], "text/plain": [ " species island body_mass_g\n", "1 Adelie Penguin (Pygoscelis adeliae) Dream 4650.0\n", "5 Adelie Penguin (Pygoscelis adeliae) Dream 4100.0\n", "10 Adelie Penguin (Pygoscelis adeliae) Dream 4300.0\n", "18 Adelie Penguin (Pygoscelis adeliae) Dream 4250.0\n", "25 Chinstrap penguin (Pygoscelis antarctica) Dream 4050.0\n", "26 Adelie Penguin (Pygoscelis adeliae) Dream 4000.0\n", "27 Chinstrap penguin (Pygoscelis antarctica) Dream 4050.0\n", "28 Chinstrap penguin (Pygoscelis antarctica) Dream 4300.0\n", "30 Chinstrap penguin (Pygoscelis antarctica) Dream 4450.0\n", "36 Adelie Penguin (Pygoscelis adeliae) Dream 4450.0\n", "44 Chinstrap penguin (Pygoscelis antarctica) Dream 4100.0\n", "45 Chinstrap penguin (Pygoscelis antarctica) Dream 4800.0\n", "57 Chinstrap penguin (Pygoscelis antarctica) Dream 4400.0\n", "61 Chinstrap penguin (Pygoscelis antarctica) Dream 4150.0\n", "66 Chinstrap penguin (Pygoscelis antarctica) Dream 4050.0\n", "67 Chinstrap penguin (Pygoscelis antarctica) Dream 4050.0\n", "68 Chinstrap penguin (Pygoscelis antarctica) Dream 4100.0\n", "74 Adelie Penguin (Pygoscelis adeliae) Dream 4350.0\n", "77 Adelie Penguin (Pygoscelis adeliae) Dream 4150.0\n", "78 Adelie Penguin (Pygoscelis adeliae) Dream 4400.0\n", "80 Chinstrap penguin (Pygoscelis antarctica) Dream 4300.0\n", "90 Chinstrap penguin (Pygoscelis antarctica) Dream 4150.0\n", "92 Chinstrap penguin (Pygoscelis antarctica) Dream 4500.0\n", "93 Adelie Penguin (Pygoscelis adeliae) Dream 4300.0\n", "94 Chinstrap penguin (Pygoscelis antarctica) Dream 4550.0\n", "...\n", "\n", "[177 rows x 3 columns]" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[df['body_mass_g'] >= 4000.0]" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "# Using the Remote Functions" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "## BigQuery DataFrames gives you the ability to turn your custom scalar functions into a BigQuery remote function.", "\n", "It requires the GCP project to be set up appropriately and the user having sufficient privileges to use them. One can find more details on it via `help` command." ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Help on function remote_function in module bigframes.pandas:\n", "\n", "remote_function(input_types: 'List[type]', output_type: 'type', dataset: 'Optional[str]' = None, bigquery_connection: 'Optional[str]' = None, reuse: 'bool' = True)\n", " Decorator to turn a user defined function into a BigQuery remote function.\n", " \n", " .. note::\n", " Please make sure following is setup before using this API:\n", " \n", " 1. Have the below APIs enabled for your project:\n", " a. BigQuery Connection API\n", " b. Cloud Functions API\n", " c. Cloud Run API\n", " d. Cloud Build API\n", " e. Artifact Registry API\n", " f. Cloud Resource Manager API\n", " \n", " This can be done from the cloud console (change PROJECT_ID to yours):\n", " https://console.cloud.google.com/apis/enableflow?apiid=bigqueryconnection.googleapis.com,cloudfunctions.googleapis.com,run.googleapis.com,cloudbuild.googleapis.com,artifactregistry.googleapis.com,cloudresourcemanager.googleapis.com&project=PROJECT_ID\n", " Or from the gcloud CLI:\n", " $ gcloud services enable bigqueryconnection.googleapis.com cloudfunctions.googleapis.com run.googleapis.com cloudbuild.googleapis.com artifactregistry.googleapis.com cloudresourcemanager.googleapis.com\n", " \n", " 2. Have following IAM roles enabled for you:\n", " a. BigQuery Data Editor (roles/bigquery.dataEditor)\n", " b. BigQuery Connection Admin (roles/bigquery.connectionAdmin)\n", " c. Cloud Functions Developer (roles/cloudfunctions.developer)\n", " d. Service Account User (roles/iam.serviceAccountUser)\n", " e. Storage Object Viewer (roles/storage.objectViewer)\n", " f. Project IAM Admin (roles/resourcemanager.projectIamAdmin)\n", " (Only required if the bigquery connection being used is not pre-created and is created dynamically with user credentials.)\n", " \n", " 3. Either the user has setIamPolicy privilege on the project, or a BigQuery connection is pre-created with necessary IAM role set:\n", " a. To create a connection, follow https://cloud.google.com/bigquery/docs/reference/standard-sql/remote-functions#create_a_connection\n", " b. To set up IAM, follow https://cloud.google.com/bigquery/docs/reference/standard-sql/remote-functions#grant_permission_on_function\n", " Alternatively, the IAM could also be setup via the gcloud CLI:\n", " $ gcloud projects add-iam-policy-binding PROJECT_ID --member=\"serviceAccount:CONNECTION_SERVICE_ACCOUNT_ID\" --role=\"roles/run.invoker\"\n", " \n", " Args:\n", " input_types (list(type)):\n", " List of input data types in the user defined function.\n", " output_type (type):\n", " Data type of the output in the user defined function.\n", " dataset (str, Optional):\n", " Dataset to use to create a BigQuery function. It should be in\n", " `| \n", " | body_mass_g | \n", "body_mass_bucket | \n", "
|---|---|---|
| 0 | \n", "3475.0 | \n", "below_4000 | \n", "
| 1 | \n", "4650.0 | \n", "at_or_above_4000 | \n", "
| 2 | \n", "3900.0 | \n", "below_4000 | \n", "
| 3 | \n", "3500.0 | \n", "below_4000 | \n", "
| 4 | \n", "3000.0 | \n", "below_4000 | \n", "
| 5 | \n", "4100.0 | \n", "at_or_above_4000 | \n", "
| 6 | \n", "2700.0 | \n", "below_4000 | \n", "
| 7 | \n", "3400.0 | \n", "below_4000 | \n", "
| 8 | \n", "3800.0 | \n", "below_4000 | \n", "
| 9 | \n", "3975.0 | \n", "below_4000 | \n", "
10 rows × 2 columns
\n", "[10 rows x 2 columns in total]" ], "text/plain": [ " body_mass_g body_mass_bucket\n", "0 3475.0 below_4000\n", "1 4650.0 at_or_above_4000\n", "2 3900.0 below_4000\n", "3 3500.0 below_4000\n", "4 3000.0 below_4000\n", "5 4100.0 at_or_above_4000\n", "6 2700.0 below_4000\n", "7 3400.0 below_4000\n", "8 3800.0 below_4000\n", "9 3975.0 below_4000\n", "\n", "[10 rows x 2 columns]" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = df.assign(body_mass_bucket=df['body_mass_g'].apply(get_bucket))\n", "df[['body_mass_g', 'body_mass_bucket']].head(10)" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "# Using the ML API" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "## Initialize a DataFrame from a BigQuery table" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "89bf7ae3fd8641bcbdc5a4614a82f48a", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HTML(value='Query job acd770bb-5ccb-463f-beec-2386132ded6b is RUNNING. \n", "\n", "| \n", " | species | \n", "island | \n", "culmen_length_mm | \n", "culmen_depth_mm | \n", "flipper_length_mm | \n", "body_mass_g | \n", "sex | \n", "
|---|---|---|---|---|---|---|---|
| 0 | \n", "Adelie Penguin (Pygoscelis adeliae) | \n", "Dream | \n", "36.6 | \n", "18.4 | \n", "184.0 | \n", "3475.0 | \n", "FEMALE | \n", "
| 1 | \n", "Adelie Penguin (Pygoscelis adeliae) | \n", "Dream | \n", "39.8 | \n", "19.1 | \n", "184.0 | \n", "4650.0 | \n", "MALE | \n", "
| 2 | \n", "Adelie Penguin (Pygoscelis adeliae) | \n", "Dream | \n", "40.9 | \n", "18.9 | \n", "184.0 | \n", "3900.0 | \n", "MALE | \n", "
| 3 | \n", "Chinstrap penguin (Pygoscelis antarctica) | \n", "Dream | \n", "46.5 | \n", "17.9 | \n", "192.0 | \n", "3500.0 | \n", "FEMALE | \n", "
| 4 | \n", "Adelie Penguin (Pygoscelis adeliae) | \n", "Dream | \n", "37.3 | \n", "16.8 | \n", "192.0 | \n", "3000.0 | \n", "FEMALE | \n", "
| 5 | \n", "Adelie Penguin (Pygoscelis adeliae) | \n", "Dream | \n", "43.2 | \n", "18.5 | \n", "192.0 | \n", "4100.0 | \n", "MALE | \n", "
| 6 | \n", "Chinstrap penguin (Pygoscelis antarctica) | \n", "Dream | \n", "46.9 | \n", "16.6 | \n", "192.0 | \n", "2700.0 | \n", "FEMALE | \n", "
| 7 | \n", "Chinstrap penguin (Pygoscelis antarctica) | \n", "Dream | \n", "50.5 | \n", "18.4 | \n", "200.0 | \n", "3400.0 | \n", "FEMALE | \n", "
| 8 | \n", "Chinstrap penguin (Pygoscelis antarctica) | \n", "Dream | \n", "49.5 | \n", "19.0 | \n", "200.0 | \n", "3800.0 | \n", "MALE | \n", "
| 9 | \n", "Adelie Penguin (Pygoscelis adeliae) | \n", "Dream | \n", "40.2 | \n", "20.1 | \n", "200.0 | \n", "3975.0 | \n", "MALE | \n", "
| 10 | \n", "Adelie Penguin (Pygoscelis adeliae) | \n", "Dream | \n", "40.8 | \n", "18.9 | \n", "208.0 | \n", "4300.0 | \n", "MALE | \n", "
| 11 | \n", "Adelie Penguin (Pygoscelis adeliae) | \n", "Dream | \n", "39.0 | \n", "18.7 | \n", "185.0 | \n", "3650.0 | \n", "MALE | \n", "
| 12 | \n", "Adelie Penguin (Pygoscelis adeliae) | \n", "Dream | \n", "37.0 | \n", "16.9 | \n", "185.0 | \n", "3000.0 | \n", "FEMALE | \n", "
| 13 | \n", "Chinstrap penguin (Pygoscelis antarctica) | \n", "Dream | \n", "47.0 | \n", "17.3 | \n", "185.0 | \n", "3700.0 | \n", "FEMALE | \n", "
| 14 | \n", "Adelie Penguin (Pygoscelis adeliae) | \n", "Dream | \n", "34.0 | \n", "17.1 | \n", "185.0 | \n", "3400.0 | \n", "FEMALE | \n", "
| 15 | \n", "Adelie Penguin (Pygoscelis adeliae) | \n", "Dream | \n", "37.0 | \n", "16.5 | \n", "185.0 | \n", "3400.0 | \n", "FEMALE | \n", "
| 16 | \n", "Chinstrap penguin (Pygoscelis antarctica) | \n", "Dream | \n", "45.7 | \n", "17.3 | \n", "193.0 | \n", "3600.0 | \n", "FEMALE | \n", "
| 17 | \n", "Chinstrap penguin (Pygoscelis antarctica) | \n", "Dream | \n", "50.6 | \n", "19.4 | \n", "193.0 | \n", "3800.0 | \n", "MALE | \n", "
| 18 | \n", "Adelie Penguin (Pygoscelis adeliae) | \n", "Dream | \n", "39.7 | \n", "17.9 | \n", "193.0 | \n", "4250.0 | \n", "MALE | \n", "
| 19 | \n", "Adelie Penguin (Pygoscelis adeliae) | \n", "Dream | \n", "37.8 | \n", "18.1 | \n", "193.0 | \n", "3750.0 | \n", "MALE | \n", "
| 20 | \n", "Chinstrap penguin (Pygoscelis antarctica) | \n", "Dream | \n", "46.6 | \n", "17.8 | \n", "193.0 | \n", "3800.0 | \n", "FEMALE | \n", "
| 21 | \n", "Chinstrap penguin (Pygoscelis antarctica) | \n", "Dream | \n", "51.3 | \n", "19.2 | \n", "193.0 | \n", "3650.0 | \n", "MALE | \n", "
| 22 | \n", "Adelie Penguin (Pygoscelis adeliae) | \n", "Dream | \n", "40.2 | \n", "17.1 | \n", "193.0 | \n", "3400.0 | \n", "FEMALE | \n", "
| 23 | \n", "Adelie Penguin (Pygoscelis adeliae) | \n", "Dream | \n", "36.8 | \n", "18.5 | \n", "193.0 | \n", "3500.0 | \n", "FEMALE | \n", "
| 24 | \n", "Chinstrap penguin (Pygoscelis antarctica) | \n", "Dream | \n", "49.6 | \n", "18.2 | \n", "193.0 | \n", "3775.0 | \n", "MALE | \n", "
25 rows × 7 columns
\n", "[344 rows x 7 columns in total]" ], "text/plain": [ " species island culmen_length_mm \\\n", "0 Adelie Penguin (Pygoscelis adeliae) Dream 36.6 \n", "1 Adelie Penguin (Pygoscelis adeliae) Dream 39.8 \n", "2 Adelie Penguin (Pygoscelis adeliae) Dream 40.9 \n", "3 Chinstrap penguin (Pygoscelis antarctica) Dream 46.5 \n", "4 Adelie Penguin (Pygoscelis adeliae) Dream 37.3 \n", "5 Adelie Penguin (Pygoscelis adeliae) Dream 43.2 \n", "6 Chinstrap penguin (Pygoscelis antarctica) Dream 46.9 \n", "7 Chinstrap penguin (Pygoscelis antarctica) Dream 50.5 \n", "8 Chinstrap penguin (Pygoscelis antarctica) Dream 49.5 \n", "9 Adelie Penguin (Pygoscelis adeliae) Dream 40.2 \n", "10 Adelie Penguin (Pygoscelis adeliae) Dream 40.8 \n", "11 Adelie Penguin (Pygoscelis adeliae) Dream 39.0 \n", "12 Adelie Penguin (Pygoscelis adeliae) Dream 37.0 \n", "13 Chinstrap penguin (Pygoscelis antarctica) Dream 47.0 \n", "14 Adelie Penguin (Pygoscelis adeliae) Dream 34.0 \n", "15 Adelie Penguin (Pygoscelis adeliae) Dream 37.0 \n", "16 Chinstrap penguin (Pygoscelis antarctica) Dream 45.7 \n", "17 Chinstrap penguin (Pygoscelis antarctica) Dream 50.6 \n", "18 Adelie Penguin (Pygoscelis adeliae) Dream 39.7 \n", "19 Adelie Penguin (Pygoscelis adeliae) Dream 37.8 \n", "20 Chinstrap penguin (Pygoscelis antarctica) Dream 46.6 \n", "21 Chinstrap penguin (Pygoscelis antarctica) Dream 51.3 \n", "22 Adelie Penguin (Pygoscelis adeliae) Dream 40.2 \n", "23 Adelie Penguin (Pygoscelis adeliae) Dream 36.8 \n", "24 Chinstrap penguin (Pygoscelis antarctica) Dream 49.6 \n", "\n", " culmen_depth_mm flipper_length_mm body_mass_g sex \n", "0 18.4 184.0 3475.0 FEMALE \n", "1 19.1 184.0 4650.0 MALE \n", "2 18.9 184.0 3900.0 MALE \n", "3 17.9 192.0 3500.0 FEMALE \n", "4 16.8 192.0 3000.0 FEMALE \n", "5 18.5 192.0 4100.0 MALE \n", "6 16.6 192.0 2700.0 FEMALE \n", "7 18.4 200.0 3400.0 FEMALE \n", "8 19.0 200.0 3800.0 MALE \n", "9 20.1 200.0 3975.0 MALE \n", "10 18.9 208.0 4300.0 MALE \n", "11 18.7 185.0 3650.0 MALE \n", "12 16.9 185.0 3000.0 FEMALE \n", "13 17.3 185.0 3700.0 FEMALE \n", "14 17.1 185.0 3400.0 FEMALE \n", "15 16.5 185.0 3400.0 FEMALE \n", "16 17.3 193.0 3600.0 FEMALE \n", "17 19.4 193.0 3800.0 MALE \n", "18 17.9 193.0 4250.0 MALE \n", "19 18.1 193.0 3750.0 MALE \n", "20 17.8 193.0 3800.0 FEMALE \n", "21 19.2 193.0 3650.0 MALE \n", "22 17.1 193.0 3400.0 FEMALE \n", "23 18.5 193.0 3500.0 FEMALE \n", "24 18.2 193.0 3775.0 MALE \n", "...\n", "\n", "[344 rows x 7 columns]" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = bigframes.pandas.read_gbq(PENGUINS_TABLE)\n", "df" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "### Clean and prepare the data" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "3b6cf41c4b2843ee81aa536863f02f75", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HTML(value='Query job 6cb59d7e-091b-4c08-833d-3e189972c28e is DONE. 28.9 kB processed. \n", "\n", "| \n", " | island | \n", "culmen_length_mm | \n", "culmen_depth_mm | \n", "flipper_length_mm | \n", "body_mass_g | \n", "sex | \n", "
|---|---|---|---|---|---|---|
| 0 | \n", "Dream | \n", "36.6 | \n", "18.4 | \n", "184.0 | \n", "3475.0 | \n", "FEMALE | \n", "
| 1 | \n", "Dream | \n", "39.8 | \n", "19.1 | \n", "184.0 | \n", "4650.0 | \n", "MALE | \n", "
| 2 | \n", "Dream | \n", "40.9 | \n", "18.9 | \n", "184.0 | \n", "3900.0 | \n", "MALE | \n", "
| 4 | \n", "Dream | \n", "37.3 | \n", "16.8 | \n", "192.0 | \n", "3000.0 | \n", "FEMALE | \n", "
| 5 | \n", "Dream | \n", "43.2 | \n", "18.5 | \n", "192.0 | \n", "4100.0 | \n", "MALE | \n", "
| 9 | \n", "Dream | \n", "40.2 | \n", "20.1 | \n", "200.0 | \n", "3975.0 | \n", "MALE | \n", "
| 10 | \n", "Dream | \n", "40.8 | \n", "18.9 | \n", "208.0 | \n", "4300.0 | \n", "MALE | \n", "
| 11 | \n", "Dream | \n", "39.0 | \n", "18.7 | \n", "185.0 | \n", "3650.0 | \n", "MALE | \n", "
| 12 | \n", "Dream | \n", "37.0 | \n", "16.9 | \n", "185.0 | \n", "3000.0 | \n", "FEMALE | \n", "
| 14 | \n", "Dream | \n", "34.0 | \n", "17.1 | \n", "185.0 | \n", "3400.0 | \n", "FEMALE | \n", "
| 15 | \n", "Dream | \n", "37.0 | \n", "16.5 | \n", "185.0 | \n", "3400.0 | \n", "FEMALE | \n", "
| 18 | \n", "Dream | \n", "39.7 | \n", "17.9 | \n", "193.0 | \n", "4250.0 | \n", "MALE | \n", "
| 19 | \n", "Dream | \n", "37.8 | \n", "18.1 | \n", "193.0 | \n", "3750.0 | \n", "MALE | \n", "
| 22 | \n", "Dream | \n", "40.2 | \n", "17.1 | \n", "193.0 | \n", "3400.0 | \n", "FEMALE | \n", "
| 23 | \n", "Dream | \n", "36.8 | \n", "18.5 | \n", "193.0 | \n", "3500.0 | \n", "FEMALE | \n", "
| 26 | \n", "Dream | \n", "41.5 | \n", "18.5 | \n", "201.0 | \n", "4000.0 | \n", "MALE | \n", "
| 31 | \n", "Dream | \n", "33.1 | \n", "16.1 | \n", "178.0 | \n", "2900.0 | \n", "FEMALE | \n", "
| 32 | \n", "Dream | \n", "37.2 | \n", "18.1 | \n", "178.0 | \n", "3900.0 | \n", "MALE | \n", "
| 33 | \n", "Dream | \n", "39.5 | \n", "16.7 | \n", "178.0 | \n", "3250.0 | \n", "FEMALE | \n", "
| 35 | \n", "Dream | \n", "36.0 | \n", "18.5 | \n", "186.0 | \n", "3100.0 | \n", "FEMALE | \n", "
| 36 | \n", "Dream | \n", "39.6 | \n", "18.1 | \n", "186.0 | \n", "4450.0 | \n", "MALE | \n", "
| 38 | \n", "Dream | \n", "41.3 | \n", "20.3 | \n", "194.0 | \n", "3550.0 | \n", "MALE | \n", "
| 41 | \n", "Dream | \n", "35.7 | \n", "18.0 | \n", "202.0 | \n", "3550.0 | \n", "FEMALE | \n", "
| 51 | \n", "Dream | \n", "38.1 | \n", "17.6 | \n", "187.0 | \n", "3425.0 | \n", "FEMALE | \n", "
| 53 | \n", "Dream | \n", "36.0 | \n", "17.1 | \n", "187.0 | \n", "3700.0 | \n", "FEMALE | \n", "
25 rows × 6 columns
\n", "[146 rows x 6 columns in total]" ], "text/plain": [ " island culmen_length_mm culmen_depth_mm flipper_length_mm body_mass_g \\\n", "0 Dream 36.6 18.4 184.0 3475.0 \n", "1 Dream 39.8 19.1 184.0 4650.0 \n", "2 Dream 40.9 18.9 184.0 3900.0 \n", "4 Dream 37.3 16.8 192.0 3000.0 \n", "5 Dream 43.2 18.5 192.0 4100.0 \n", "9 Dream 40.2 20.1 200.0 3975.0 \n", "10 Dream 40.8 18.9 208.0 4300.0 \n", "11 Dream 39.0 18.7 185.0 3650.0 \n", "12 Dream 37.0 16.9 185.0 3000.0 \n", "14 Dream 34.0 17.1 185.0 3400.0 \n", "15 Dream 37.0 16.5 185.0 3400.0 \n", "18 Dream 39.7 17.9 193.0 4250.0 \n", "19 Dream 37.8 18.1 193.0 3750.0 \n", "22 Dream 40.2 17.1 193.0 3400.0 \n", "23 Dream 36.8 18.5 193.0 3500.0 \n", "26 Dream 41.5 18.5 201.0 4000.0 \n", "31 Dream 33.1 16.1 178.0 2900.0 \n", "32 Dream 37.2 18.1 178.0 3900.0 \n", "33 Dream 39.5 16.7 178.0 3250.0 \n", "35 Dream 36.0 18.5 186.0 3100.0 \n", "36 Dream 39.6 18.1 186.0 4450.0 \n", "38 Dream 41.3 20.3 194.0 3550.0 \n", "41 Dream 35.7 18.0 202.0 3550.0 \n", "51 Dream 38.1 17.6 187.0 3425.0 \n", "53 Dream 36.0 17.1 187.0 3700.0 \n", "\n", " sex \n", "0 FEMALE \n", "1 MALE \n", "2 MALE \n", "4 FEMALE \n", "5 MALE \n", "9 MALE \n", "10 MALE \n", "11 MALE \n", "12 FEMALE \n", "14 FEMALE \n", "15 FEMALE \n", "18 MALE \n", "19 MALE \n", "22 FEMALE \n", "23 FEMALE \n", "26 MALE \n", "31 FEMALE \n", "32 MALE \n", "33 FEMALE \n", "35 FEMALE \n", "36 MALE \n", "38 MALE \n", "41 FEMALE \n", "51 FEMALE \n", "53 FEMALE \n", "...\n", "\n", "[146 rows x 6 columns]" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# filter down to the data we want to analyze\n", "adelie_data = df[df.species == \"Adelie Penguin (Pygoscelis adeliae)\"]\n", "\n", "# drop the columns we don't care about\n", "adelie_data = adelie_data.drop(columns=[\"species\"])\n", "\n", "# drop rows with nulls to get our training data\n", "training_data = adelie_data.dropna()\n", "\n", "# take a peek at the training data\n", "training_data" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "# pick feature columns and label column\n", "feature_columns = training_data[['island', 'culmen_length_mm', 'culmen_depth_mm', 'flipper_length_mm', 'sex']]\n", "label_columns = training_data[['body_mass_g']]\n", "\n", "# also get the rows that we want to make predictions for (i.e. where the feature column is null)\n", "missing_body_mass = adelie_data[adelie_data.body_mass_g.isnull()]" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "### Train and evaluate a linear regression model using the ML API" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "865f6bb75c5b48e4a52a3183fe3c2582", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HTML(value='Query job 4b0c58e4-4752-4b96-b490-a95e3ae326c0 is DONE. 31.9 kB processed. \n", "\n", "| \n", " | mean_absolute_error | \n", "mean_squared_error | \n", "mean_squared_log_error | \n", "median_absolute_error | \n", "r2_score | \n", "explained_variance | \n", "
|---|---|---|---|---|---|---|
| 0 | \n", "223.878763 | \n", "78553.601634 | \n", "0.005614 | \n", "181.330911 | \n", "0.623951 | \n", "0.623951 | \n", "
1 rows × 6 columns
\n", "[1 rows x 6 columns in total]" ], "text/plain": [ " mean_absolute_error mean_squared_error mean_squared_log_error \\\n", "0 223.878763 78553.601634 0.005614 \n", "\n", " median_absolute_error r2_score explained_variance \n", "0 181.330911 0.623951 0.623951 \n", "\n", "[1 rows x 6 columns]" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from bigframes.ml.linear_model import LinearRegression\n", "\n", "# as in scikit-learn, a newly created model is just a bundle of parameters\n", "# default parameters are fine here\n", "model = LinearRegression()\n", "\n", "# this will train a temporary model in BigQuery Machine Learning\n", "model.fit(feature_columns, label_columns)\n", "\n", "# check how the model performed\n", "model.score(feature_columns, label_columns)" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "### Make predictions using the model" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "caf8cefe06a14da2a02f31aa1e12c23a", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HTML(value='Query job fe2fe252-8433-4d20-861c-681a8dfbf2c4 is RUNNING. \n", "\n", "| \n", " | predicted_body_mass_g | \n", "
|---|---|
| 292 | \n", "3459.735118 | \n", "
1 rows × 1 columns
\n", "[1 rows x 1 columns in total]" ], "text/plain": [ " predicted_body_mass_g\n", "292 3459.735118\n", "\n", "[1 rows x 1 columns]" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model.predict(missing_body_mass)" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "### Save the trained model to BigQuery, so we can load it later" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "LinearRegression()" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model.to_gbq(f\"{DATASET}.penguins_model\", replace=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Clean Up" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "bpd.close_session()" ] } ], "metadata": { "kernelspec": { "display_name": "venv", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.9" }, "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 2 }