{ "cells": [ { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "# README\n", "\n", "This Notebook runs requiring the following environent variable:\n", "1. GOOGLE_CLOUD_PROJECT - The google cloud project id.\n", "1. BIGQUERY_LOCATION - can take values as per https://cloud.google.com/bigquery/docs/locations, e.g. `us`, `asia-east1`." ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "## Infer location and set up data in that location if needed" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/usr/local/google/home/shobs/code/bigframes1/venv/lib/python3.10/site-packages/google/auth/_default.py:78: UserWarning: Your application has authenticated using end user credentials from Google Cloud SDK without a quota project. You might receive a \"quota exceeded\" or \"API not enabled\" error. See the following page for troubleshooting: https://cloud.google.com/docs/authentication/adc-troubleshooting/user-creds. \n", " warnings.warn(_CLOUD_SDK_CREDENTIALS_WARNING)\n", "/usr/local/google/home/shobs/code/bigframes1/venv/lib/python3.10/site-packages/google/auth/_default.py:78: UserWarning: Your application has authenticated using end user credentials from Google Cloud SDK without a quota project. You might receive a \"quota exceeded\" or \"API not enabled\" error. See the following page for troubleshooting: https://cloud.google.com/docs/authentication/adc-troubleshooting/user-creds. \n", " warnings.warn(_CLOUD_SDK_CREDENTIALS_WARNING)\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "BigQuery project: bigframes-dev\n", "BigQuery location: us\n", "Penguins Table: bigquery-public-data.ml_datasets.penguins\n", "ML Model Dataset: bigframes-dev.bigframes_testing\n" ] } ], "source": [ "# Take multi-region US as the default BQ location, where most of the BQ data lies including the BQ public datasets\n", "import os\n", "\n", "PROJECT_ID = os.environ.get(\"GOOGLE_CLOUD_PROJECT\")\n", "BQ_LOCATION = os.environ.get(\"BIGQUERY_LOCATION\")\n", "\n", "if not PROJECT_ID:\n", " raise ValueError(\"Project must be set via environment variable GOOGLE_CLOUD_PROJECT\")\n", "if not BQ_LOCATION:\n", " raise ValueError(\"BQ location must be set via environment variable BIGQUERY_LOCATION\")\n", "\n", "DATASET = \"bigframes_testing\"\n", "PENGUINS_TABLE = \"bigquery-public-data.ml_datasets.penguins\"\n", "\n", "\n", "# Check for a location set in the environment and do location-specific setup if needed\n", "\n", "import google.api_core.exceptions\n", "from google.cloud import bigquery\n", "import bigframes\n", "\n", "client = bigquery.Client()\n", "\n", "BQ_LOCATION = BQ_LOCATION.lower()\n", "if BQ_LOCATION != \"us\":\n", " bq_location_normalized = BQ_LOCATION.replace('-', '_')\n", "\n", " # Nominate a local penguins table\n", " penguins_table_ref = bigquery.TableReference.from_string(PENGUINS_TABLE)\n", " penguins_local_dataset_name = f\"{DATASET}_{bq_location_normalized}\"\n", " penguins_local_dataset_ref = bigquery.DatasetReference(project=PROJECT_ID, dataset_id=penguins_local_dataset_name)\n", " penguins_local_dataset = bigquery.Dataset(penguins_local_dataset_ref)\n", " penguins_local_dataset.location = BQ_LOCATION\n", " penguins_local_table_ref= bigquery.TableReference(penguins_local_dataset, penguins_table_ref.table_id)\n", " penguins_local_table = str(penguins_local_table_ref)\n", " try:\n", " client.get_table(penguins_local_table_ref)\n", " except google.api_core.exceptions.NotFound:\n", " client.create_dataset(penguins_local_dataset, exists_ok=True)\n", "\n", " # Read the public table as an in-memory dataframe and then write to the local table\n", " session_us = bigframes.connect()\n", " df = session_us.read_gbq(PENGUINS_TABLE).to_pandas()\n", " df.to_gbq(penguins_local_table)\n", "\n", " # Finally point the penguins table to the local table\n", " PENGUINS_TABLE=penguins_local_table\n", "\n", " # Also update the dataset name used for test artifacts\n", " DATASET = f\"{DATASET}_{bq_location_normalized}\"\n", "\n", "# Create the dataset to store the model if it doesn't exist \n", "model_local_dataset = bigquery.Dataset(bigquery.DatasetReference(project=PROJECT_ID, dataset_id=DATASET))\n", "model_local_dataset.location = BQ_LOCATION\n", "model_dataset = client.create_dataset(model_local_dataset, exists_ok=True)\n", "\n", "# Finally log the variables driving the core notebook execution\n", "log = ('\\n'.join(f\"{name}: {str(value)}\" for name, value in {\n", " \"BigQuery project\" : PROJECT_ID,\n", " \"BigQuery location\" : BQ_LOCATION,\n", " \"Penguins Table\" : PENGUINS_TABLE,\n", " \"ML Model Dataset\" : model_dataset.reference\n", "}.items())) \n", "print(log)" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "# Using the BigQuery DataFrames API" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "## Set BigQuery DataFrames options" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import bigframes.pandas\n", "\n", "# Note: The project option is not required in all environments.\n", "# On BigQuery Studio, the project ID is automatically detected.\n", "bigframes.pandas.options.bigquery.project = PROJECT_ID\n", "\n", "# Note: The location option is not required.\n", "# It defaults to the location of the first table or query\n", "# passed to read_gbq(). For APIs where a location can't be\n", "# auto-detected, the location defaults to the \"US\" location.\n", "bigframes.pandas.options.bigquery.location = BQ_LOCATION" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "### Initialize a dataframe for a BigQuery table" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/usr/local/google/home/shobs/code/bigframes1/venv/lib/python3.10/site-packages/google/auth/_default.py:78: UserWarning: Your application has authenticated using end user credentials from Google Cloud SDK without a quota project. You might receive a \"quota exceeded\" or \"API not enabled\" error. See the following page for troubleshooting: https://cloud.google.com/docs/authentication/adc-troubleshooting/user-creds. \n", " warnings.warn(_CLOUD_SDK_CREDENTIALS_WARNING)\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "02af1103261a4e63a4c15efd26b1bc9a", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HTML(value='Query job e1a62d56-8cab-4bc1-9ad3-457f48b71d9c is RUNNING. = 4000.0]" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "# Using the Remote Functions" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "## BigQuery DataFrames gives you the ability to turn your custom scalar functions into a BigQuery remote function.", "\n", "It requires the GCP project to be set up appropriately and the user having sufficient privileges to use them. One can find more details on it via `help` command." ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Help on function remote_function in module bigframes.pandas:\n", "\n", "remote_function(input_types: 'List[type]', output_type: 'type', dataset: 'Optional[str]' = None, bigquery_connection: 'Optional[str]' = None, reuse: 'bool' = True)\n", " Decorator to turn a user defined function into a BigQuery remote function.\n", " \n", " .. note::\n", " Please make sure following is setup before using this API:\n", " \n", " 1. Have the below APIs enabled for your project:\n", " a. BigQuery Connection API\n", " b. Cloud Functions API\n", " c. Cloud Run API\n", " d. Cloud Build API\n", " e. Artifact Registry API\n", " f. Cloud Resource Manager API\n", " \n", " This can be done from the cloud console (change PROJECT_ID to yours):\n", " https://console.cloud.google.com/apis/enableflow?apiid=bigqueryconnection.googleapis.com,cloudfunctions.googleapis.com,run.googleapis.com,cloudbuild.googleapis.com,artifactregistry.googleapis.com,cloudresourcemanager.googleapis.com&project=PROJECT_ID\n", " Or from the gcloud CLI:\n", " $ gcloud services enable bigqueryconnection.googleapis.com cloudfunctions.googleapis.com run.googleapis.com cloudbuild.googleapis.com artifactregistry.googleapis.com cloudresourcemanager.googleapis.com\n", " \n", " 2. Have following IAM roles enabled for you:\n", " a. BigQuery Data Editor (roles/bigquery.dataEditor)\n", " b. BigQuery Connection Admin (roles/bigquery.connectionAdmin)\n", " c. Cloud Functions Developer (roles/cloudfunctions.developer)\n", " d. Service Account User (roles/iam.serviceAccountUser)\n", " e. Storage Object Viewer (roles/storage.objectViewer)\n", " f. Project IAM Admin (roles/resourcemanager.projectIamAdmin)\n", " (Only required if the bigquery connection being used is not pre-created and is created dynamically with user credentials.)\n", " \n", " 3. Either the user has setIamPolicy privilege on the project, or a BigQuery connection is pre-created with necessary IAM role set:\n", " a. To create a connection, follow https://cloud.google.com/bigquery/docs/reference/standard-sql/remote-functions#create_a_connection\n", " b. To set up IAM, follow https://cloud.google.com/bigquery/docs/reference/standard-sql/remote-functions#grant_permission_on_function\n", " Alternatively, the IAM could also be setup via the gcloud CLI:\n", " $ gcloud projects add-iam-policy-binding PROJECT_ID --member=\"serviceAccount:CONNECTION_SERVICE_ACCOUNT_ID\" --role=\"roles/run.invoker\"\n", " \n", " Args:\n", " input_types (list(type)):\n", " List of input data types in the user defined function.\n", " output_type (type):\n", " Data type of the output in the user defined function.\n", " dataset (str, Optional):\n", " Dataset to use to create a BigQuery function. It should be in\n", " `.` or `` format. If this\n", " param is not provided then session dataset id would be used.\n", " bigquery_connection (str, Optional):\n", " Name of the BigQuery connection. If it is pre created in the same\n", " location as the `bigquery_client.location` then it would be used,\n", " otherwise it would be created dynamically assuming the user has\n", " necessary priviliges. If this param is not provided then the\n", " bigquery connection from the session would be used.\n", " reuse (bool, Optional):\n", " Reuse the remote function if already exists.\n", " `True` by default, which will result in reusing an existing remote\n", " function (if any) that was previously created for the same udf.\n", " Setting it to false would force creating a unique remote function.\n", " If the required remote function does not exist then it would be\n", " created irrespective of this param.\n", "\n" ] } ], "source": [ "import bigframes.pandas as bpd\n", "help(bpd.remote_function)" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "### Define a custom function, and specify the intent to turn it into a remote function.\n", "\n", "It requires a BigQuery connection. If the connection is not already created,\n", "the BigQuery DataFrames package attempts to create one assuming the necessary\n", "APIs and IAM permissions are setup in the project." ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "[INFO][2023-08-05 23:12:12,870][bigframes.remote_function] Creating new cloud function: gcloud functions deploy bigframes-f9320ad496b5aeca2d7f343cbab03e2f --gen2 --runtime=python310 --project=bigframes-dev --region=us-central1 --source=/tmp/tmps5m0qu4z --entry-point=udf_http --trigger-http --no-allow-unauthenticated\n", "[INFO][2023-08-05 23:13:20,660][bigframes.remote_function] Successfully created cloud function bigframes-f9320ad496b5aeca2d7f343cbab03e2f with uri (https://bigframes-f9320ad496b5aeca2d7f343cbab03e2f-7krlje3eoq-uc.a.run.app)\n", "[INFO][2023-08-05 23:13:32,717][bigframes.remote_function] Connector bigframes-rf-conn already exists\n", "[INFO][2023-08-05 23:13:32,719][bigframes.remote_function] Creating BQ remote function: \n", " CREATE OR REPLACE FUNCTION `bigframes-dev.bigframes_temp_us`.bigframes_f9320ad496b5aeca2d7f343cbab03e2f(num FLOAT64)\n", " RETURNS STRING\n", " REMOTE WITH CONNECTION `bigframes-dev.us.bigframes-rf-conn`\n", " OPTIONS (\n", " endpoint = \"https://bigframes-f9320ad496b5aeca2d7f343cbab03e2f-7krlje3eoq-uc.a.run.app\"\n", " )\n", "[INFO][2023-08-05 23:13:33,697][bigframes.remote_function] Created remote function bigframes-dev.bigframes_temp_us.bigframes_f9320ad496b5aeca2d7f343cbab03e2f\n" ] } ], "source": [ "@bpd.remote_function(bigquery_connection='bigframes-rf-conn', cloud_function_service_account=\"default\")\n", "def get_bucket(num: float) -> str:\n", " if not num: return \"NA\"\n", " boundary = 4000\n", " return \"at_or_above_4000\" if num >= boundary else \"below_4000\"" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "### Run the custom function on the BigQuery-backed dataframe" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "0dba87f5bcb74dca9efebe8f522beeff", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HTML(value='Query job 5f30816f-f4d0-4063-bb9e-2905b89f717d is DONE. 2.8 kB processed. \n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
body_mass_gbody_mass_bucket
03475.0below_4000
14650.0at_or_above_4000
23900.0below_4000
33500.0below_4000
43000.0below_4000
54100.0at_or_above_4000
62700.0below_4000
73400.0below_4000
83800.0below_4000
93975.0below_4000
\n", "

10 rows × 2 columns

\n", "[10 rows x 2 columns in total]" ], "text/plain": [ " body_mass_g body_mass_bucket\n", "0 3475.0 below_4000\n", "1 4650.0 at_or_above_4000\n", "2 3900.0 below_4000\n", "3 3500.0 below_4000\n", "4 3000.0 below_4000\n", "5 4100.0 at_or_above_4000\n", "6 2700.0 below_4000\n", "7 3400.0 below_4000\n", "8 3800.0 below_4000\n", "9 3975.0 below_4000\n", "\n", "[10 rows x 2 columns]" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = df.assign(body_mass_bucket=df['body_mass_g'].apply(get_bucket))\n", "df[['body_mass_g', 'body_mass_bucket']].head(10)" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "# Using the ML API" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "## Initialize a DataFrame from a BigQuery table" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "89bf7ae3fd8641bcbdc5a4614a82f48a", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HTML(value='Query job acd770bb-5ccb-463f-beec-2386132ded6b is RUNNING.
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
islandculmen_length_mmculmen_depth_mmflipper_length_mmbody_mass_gsex
0Dream36.618.4184.03475.0FEMALE
1Dream39.819.1184.04650.0MALE
2Dream40.918.9184.03900.0MALE
4Dream37.316.8192.03000.0FEMALE
5Dream43.218.5192.04100.0MALE
9Dream40.220.1200.03975.0MALE
10Dream40.818.9208.04300.0MALE
11Dream39.018.7185.03650.0MALE
12Dream37.016.9185.03000.0FEMALE
14Dream34.017.1185.03400.0FEMALE
15Dream37.016.5185.03400.0FEMALE
18Dream39.717.9193.04250.0MALE
19Dream37.818.1193.03750.0MALE
22Dream40.217.1193.03400.0FEMALE
23Dream36.818.5193.03500.0FEMALE
26Dream41.518.5201.04000.0MALE
31Dream33.116.1178.02900.0FEMALE
32Dream37.218.1178.03900.0MALE
33Dream39.516.7178.03250.0FEMALE
35Dream36.018.5186.03100.0FEMALE
36Dream39.618.1186.04450.0MALE
38Dream41.320.3194.03550.0MALE
41Dream35.718.0202.03550.0FEMALE
51Dream38.117.6187.03425.0FEMALE
53Dream36.017.1187.03700.0FEMALE
\n", "

25 rows × 6 columns

\n", "[146 rows x 6 columns in total]" ], "text/plain": [ " island culmen_length_mm culmen_depth_mm flipper_length_mm body_mass_g \\\n", "0 Dream 36.6 18.4 184.0 3475.0 \n", "1 Dream 39.8 19.1 184.0 4650.0 \n", "2 Dream 40.9 18.9 184.0 3900.0 \n", "4 Dream 37.3 16.8 192.0 3000.0 \n", "5 Dream 43.2 18.5 192.0 4100.0 \n", "9 Dream 40.2 20.1 200.0 3975.0 \n", "10 Dream 40.8 18.9 208.0 4300.0 \n", "11 Dream 39.0 18.7 185.0 3650.0 \n", "12 Dream 37.0 16.9 185.0 3000.0 \n", "14 Dream 34.0 17.1 185.0 3400.0 \n", "15 Dream 37.0 16.5 185.0 3400.0 \n", "18 Dream 39.7 17.9 193.0 4250.0 \n", "19 Dream 37.8 18.1 193.0 3750.0 \n", "22 Dream 40.2 17.1 193.0 3400.0 \n", "23 Dream 36.8 18.5 193.0 3500.0 \n", "26 Dream 41.5 18.5 201.0 4000.0 \n", "31 Dream 33.1 16.1 178.0 2900.0 \n", "32 Dream 37.2 18.1 178.0 3900.0 \n", "33 Dream 39.5 16.7 178.0 3250.0 \n", "35 Dream 36.0 18.5 186.0 3100.0 \n", "36 Dream 39.6 18.1 186.0 4450.0 \n", "38 Dream 41.3 20.3 194.0 3550.0 \n", "41 Dream 35.7 18.0 202.0 3550.0 \n", "51 Dream 38.1 17.6 187.0 3425.0 \n", "53 Dream 36.0 17.1 187.0 3700.0 \n", "\n", " sex \n", "0 FEMALE \n", "1 MALE \n", "2 MALE \n", "4 FEMALE \n", "5 MALE \n", "9 MALE \n", "10 MALE \n", "11 MALE \n", "12 FEMALE \n", "14 FEMALE \n", "15 FEMALE \n", "18 MALE \n", "19 MALE \n", "22 FEMALE \n", "23 FEMALE \n", "26 MALE \n", "31 FEMALE \n", "32 MALE \n", "33 FEMALE \n", "35 FEMALE \n", "36 MALE \n", "38 MALE \n", "41 FEMALE \n", "51 FEMALE \n", "53 FEMALE \n", "...\n", "\n", "[146 rows x 6 columns]" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# filter down to the data we want to analyze\n", "adelie_data = df[df.species == \"Adelie Penguin (Pygoscelis adeliae)\"]\n", "\n", "# drop the columns we don't care about\n", "adelie_data = adelie_data.drop(columns=[\"species\"])\n", "\n", "# drop rows with nulls to get our training data\n", "training_data = adelie_data.dropna()\n", "\n", "# take a peek at the training data\n", "training_data" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "# pick feature columns and label column\n", "feature_columns = training_data[['island', 'culmen_length_mm', 'culmen_depth_mm', 'flipper_length_mm', 'sex']]\n", "label_columns = training_data[['body_mass_g']]\n", "\n", "# also get the rows that we want to make predictions for (i.e. where the feature column is null)\n", "missing_body_mass = adelie_data[adelie_data.body_mass_g.isnull()]" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "### Train and evaluate a linear regression model using the ML API" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "865f6bb75c5b48e4a52a3183fe3c2582", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HTML(value='Query job 4b0c58e4-4752-4b96-b490-a95e3ae326c0 is DONE. 31.9 kB processed.
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
mean_absolute_errormean_squared_errormean_squared_log_errormedian_absolute_errorr2_scoreexplained_variance
0223.87876378553.6016340.005614181.3309110.6239510.623951
\n", "

1 rows × 6 columns

\n", "[1 rows x 6 columns in total]" ], "text/plain": [ " mean_absolute_error mean_squared_error mean_squared_log_error \\\n", "0 223.878763 78553.601634 0.005614 \n", "\n", " median_absolute_error r2_score explained_variance \n", "0 181.330911 0.623951 0.623951 \n", "\n", "[1 rows x 6 columns]" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from bigframes.ml.linear_model import LinearRegression\n", "\n", "# as in scikit-learn, a newly created model is just a bundle of parameters\n", "# default parameters are fine here\n", "model = LinearRegression()\n", "\n", "# this will train a temporary model in BigQuery Machine Learning\n", "model.fit(feature_columns, label_columns)\n", "\n", "# check how the model performed\n", "model.score(feature_columns, label_columns)" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "### Make predictions using the model" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "caf8cefe06a14da2a02f31aa1e12c23a", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HTML(value='Query job fe2fe252-8433-4d20-861c-681a8dfbf2c4 is RUNNING.
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
predicted_body_mass_g
2923459.735118
\n", "

1 rows × 1 columns

\n", "[1 rows x 1 columns in total]" ], "text/plain": [ " predicted_body_mass_g\n", "292 3459.735118\n", "\n", "[1 rows x 1 columns]" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model.predict(missing_body_mass)" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "### Save the trained model to BigQuery, so we can load it later" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "LinearRegression()" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model.to_gbq(f\"{DATASET}.penguins_model\", replace=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Clean Up" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "bpd.close_session()" ] } ], "metadata": { "kernelspec": { "display_name": "venv", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.9" }, "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 2 }