README#
This notebook requires the following environment variables:
GOOGLE_CLOUD_PROJECT - The google cloud project id.
BIGQUERY_LOCATION - Any supported BigQuery location (see https://cloud.google.com/bigquery/docs/locations), e.g.
us, asia-east1.
Infer location and set up data in that location if needed#
# Multi-region US is treated as the default BQ location downstream, since most
# BQ data (including the BQ public datasets) lives there.
import os

# Required configuration, read from the environment; fail fast when missing.
PROJECT_ID = os.environ.get("GOOGLE_CLOUD_PROJECT")
BQ_LOCATION = os.environ.get("BIGQUERY_LOCATION")

if not PROJECT_ID:
    raise ValueError("Project must be set via environment variable GOOGLE_CLOUD_PROJECT")
if not BQ_LOCATION:
    raise ValueError("BQ location must be set via environment variable BIGQUERY_LOCATION")

# Base dataset name for test artifacts, and the public penguins table used throughout.
DATASET = "bigframes_testing"
PENGUINS_TABLE = "bigquery-public-data.ml_datasets.penguins"
# Check for a location set in the environment and do location-specific setup if needed.
# When the location is not the multi-region "us", the public penguins table is copied
# into a location-local dataset so all queries run in a single location.
# NOTE: indentation below is reconstructed — the notebook export flattened it.
import google.api_core.exceptions
from google.cloud import bigquery
import bigframes

client = bigquery.Client()
# BigQuery locations are compared case-insensitively; normalize once.
BQ_LOCATION = BQ_LOCATION.lower()
if BQ_LOCATION != "us":
    # Dataset IDs cannot contain '-', so derive a safe suffix from the location.
    bq_location_normalized = BQ_LOCATION.replace('-', '_')
    # Nominate a local penguins table
    penguins_table_ref = bigquery.TableReference.from_string(PENGUINS_TABLE)
    penguins_local_dataset_name = f"{DATASET}_{bq_location_normalized}"
    penguins_local_dataset_ref = bigquery.DatasetReference(project=PROJECT_ID, dataset_id=penguins_local_dataset_name)
    penguins_local_dataset = bigquery.Dataset(penguins_local_dataset_ref)
    penguins_local_dataset.location = BQ_LOCATION
    # NOTE(review): TableReference normally takes a DatasetReference; passing the
    # Dataset object appears to rely on it exposing .project/.dataset_id —
    # penguins_local_dataset_ref would be the cleaner argument. Confirm.
    penguins_local_table_ref= bigquery.TableReference(penguins_local_dataset, penguins_table_ref.table_id)
    penguins_local_table = str(penguins_local_table_ref)
    try:
        # Probe for an existing copy; NotFound triggers the one-time data copy below.
        client.get_table(penguins_local_table_ref)
    except google.api_core.exceptions.NotFound:
        client.create_dataset(penguins_local_dataset, exists_ok=True)
        # Read the public table as an in-memory dataframe and then write to the local table
        session_us = bigframes.connect()
        df = session_us.read_gbq(PENGUINS_TABLE).to_pandas()
        df.to_gbq(penguins_local_table)
    # Finally point the penguins table to the local table
    PENGUINS_TABLE=penguins_local_table
    # Also update the dataset name used for test artifacts
    DATASET = f"{DATASET}_{bq_location_normalized}"

# Create the dataset to store the model if it doesn't exist
model_local_dataset = bigquery.Dataset(bigquery.DatasetReference(project=PROJECT_ID, dataset_id=DATASET))
model_local_dataset.location = BQ_LOCATION
model_dataset = client.create_dataset(model_local_dataset, exists_ok=True)
# Finally log the variables driving the core notebook execution.
settings = {
    "BigQuery project": PROJECT_ID,
    "BigQuery location": BQ_LOCATION,
    "Penguins Table": PENGUINS_TABLE,
    "ML Model Dataset": model_dataset.reference,
}
log = '\n'.join(f"{name}: {value}" for name, value in settings.items())
print(log)
/usr/local/google/home/shobs/code/bigframes1/venv/lib/python3.10/site-packages/google/auth/_default.py:78: UserWarning: Your application has authenticated using end user credentials from Google Cloud SDK without a quota project. You might receive a "quota exceeded" or "API not enabled" error. See the following page for troubleshooting: https://cloud.google.com/docs/authentication/adc-troubleshooting/user-creds.
warnings.warn(_CLOUD_SDK_CREDENTIALS_WARNING)
/usr/local/google/home/shobs/code/bigframes1/venv/lib/python3.10/site-packages/google/auth/_default.py:78: UserWarning: Your application has authenticated using end user credentials from Google Cloud SDK without a quota project. You might receive a "quota exceeded" or "API not enabled" error. See the following page for troubleshooting: https://cloud.google.com/docs/authentication/adc-troubleshooting/user-creds.
warnings.warn(_CLOUD_SDK_CREDENTIALS_WARNING)
BigQuery project: bigframes-dev
BigQuery location: us
Penguins Table: bigquery-public-data.ml_datasets.penguins
ML Model Dataset: bigframes-dev.bigframes_testing
Using the BigQuery DataFrames API#
Set BigQuery DataFrames options#
import bigframes.pandas

# Configure the BigQuery DataFrames session before any data is read.
# Note: the project option is not required in all environments — on BigQuery
# Studio it is auto-detected. The location option is also optional: it defaults
# to the location of the first table or query passed to read_gbq(), and to "US"
# where it cannot be auto-detected. We set both explicitly here.
bq_options = bigframes.pandas.options.bigquery
bq_options.project = PROJECT_ID
bq_options.location = BQ_LOCATION
Initialize a dataframe for a BigQuery table#
# Initialize a DataFrame backed by the (possibly relocated) penguins table.
df = bigframes.pandas.read_gbq(PENGUINS_TABLE)
/usr/local/google/home/shobs/code/bigframes1/venv/lib/python3.10/site-packages/google/auth/_default.py:78: UserWarning: Your application has authenticated using end user credentials from Google Cloud SDK without a quota project. You might receive a "quota exceeded" or "API not enabled" error. See the following page for troubleshooting: https://cloud.google.com/docs/authentication/adc-troubleshooting/user-creds.
warnings.warn(_CLOUD_SDK_CREDENTIALS_WARNING)
View the DataFrame#
# Echo the DataFrame so the notebook renders a preview of its rows.
df
| species | island | culmen_length_mm | culmen_depth_mm | flipper_length_mm | body_mass_g | sex | |
|---|---|---|---|---|---|---|---|
| 0 | Adelie Penguin (Pygoscelis adeliae) | Dream | 36.6 | 18.4 | 184.0 | 3475.0 | FEMALE |
| 1 | Adelie Penguin (Pygoscelis adeliae) | Dream | 39.8 | 19.1 | 184.0 | 4650.0 | MALE |
| 2 | Adelie Penguin (Pygoscelis adeliae) | Dream | 40.9 | 18.9 | 184.0 | 3900.0 | MALE |
| 3 | Chinstrap penguin (Pygoscelis antarctica) | Dream | 46.5 | 17.9 | 192.0 | 3500.0 | FEMALE |
| 4 | Adelie Penguin (Pygoscelis adeliae) | Dream | 37.3 | 16.8 | 192.0 | 3000.0 | FEMALE |
| 5 | Adelie Penguin (Pygoscelis adeliae) | Dream | 43.2 | 18.5 | 192.0 | 4100.0 | MALE |
| 6 | Chinstrap penguin (Pygoscelis antarctica) | Dream | 46.9 | 16.6 | 192.0 | 2700.0 | FEMALE |
| 7 | Chinstrap penguin (Pygoscelis antarctica) | Dream | 50.5 | 18.4 | 200.0 | 3400.0 | FEMALE |
| 8 | Chinstrap penguin (Pygoscelis antarctica) | Dream | 49.5 | 19.0 | 200.0 | 3800.0 | MALE |
| 9 | Adelie Penguin (Pygoscelis adeliae) | Dream | 40.2 | 20.1 | 200.0 | 3975.0 | MALE |
| 10 | Adelie Penguin (Pygoscelis adeliae) | Dream | 40.8 | 18.9 | 208.0 | 4300.0 | MALE |
| 11 | Adelie Penguin (Pygoscelis adeliae) | Dream | 39.0 | 18.7 | 185.0 | 3650.0 | MALE |
| 12 | Adelie Penguin (Pygoscelis adeliae) | Dream | 37.0 | 16.9 | 185.0 | 3000.0 | FEMALE |
| 13 | Chinstrap penguin (Pygoscelis antarctica) | Dream | 47.0 | 17.3 | 185.0 | 3700.0 | FEMALE |
| 14 | Adelie Penguin (Pygoscelis adeliae) | Dream | 34.0 | 17.1 | 185.0 | 3400.0 | FEMALE |
| 15 | Adelie Penguin (Pygoscelis adeliae) | Dream | 37.0 | 16.5 | 185.0 | 3400.0 | FEMALE |
| 16 | Chinstrap penguin (Pygoscelis antarctica) | Dream | 45.7 | 17.3 | 193.0 | 3600.0 | FEMALE |
| 17 | Chinstrap penguin (Pygoscelis antarctica) | Dream | 50.6 | 19.4 | 193.0 | 3800.0 | MALE |
| 18 | Adelie Penguin (Pygoscelis adeliae) | Dream | 39.7 | 17.9 | 193.0 | 4250.0 | MALE |
| 19 | Adelie Penguin (Pygoscelis adeliae) | Dream | 37.8 | 18.1 | 193.0 | 3750.0 | MALE |
| 20 | Chinstrap penguin (Pygoscelis antarctica) | Dream | 46.6 | 17.8 | 193.0 | 3800.0 | FEMALE |
| 21 | Chinstrap penguin (Pygoscelis antarctica) | Dream | 51.3 | 19.2 | 193.0 | 3650.0 | MALE |
| 22 | Adelie Penguin (Pygoscelis adeliae) | Dream | 40.2 | 17.1 | 193.0 | 3400.0 | FEMALE |
| 23 | Adelie Penguin (Pygoscelis adeliae) | Dream | 36.8 | 18.5 | 193.0 | 3500.0 | FEMALE |
| 24 | Chinstrap penguin (Pygoscelis antarctica) | Dream | 49.6 | 18.2 | 193.0 | 3775.0 | MALE |
25 rows × 7 columns
View the column names in the dataframe (aka columns names in the table)#
# Column names of the DataFrame (i.e. the table's column names).
df.columns
Index(['species', 'island', 'culmen_length_mm', 'culmen_depth_mm',
'flipper_length_mm', 'body_mass_g', 'sex'],
dtype='object')
View the table schema#
# Per-column dtypes, mirroring the BigQuery table schema.
df.dtypes
species string[pyarrow]
island string[pyarrow]
culmen_length_mm Float64
culmen_depth_mm Float64
flipper_length_mm Float64
body_mass_g Float64
sex string[pyarrow]
dtype: object
Select a subset of columns#
# Narrow the DataFrame to the columns needed for the mass analysis.
df = df[["species", "island", "body_mass_g"]]
df
| species | island | body_mass_g | |
|---|---|---|---|
| 0 | Adelie Penguin (Pygoscelis adeliae) | Dream | 3475.0 |
| 1 | Adelie Penguin (Pygoscelis adeliae) | Dream | 4650.0 |
| 2 | Adelie Penguin (Pygoscelis adeliae) | Dream | 3900.0 |
| 3 | Chinstrap penguin (Pygoscelis antarctica) | Dream | 3500.0 |
| 4 | Adelie Penguin (Pygoscelis adeliae) | Dream | 3000.0 |
| 5 | Adelie Penguin (Pygoscelis adeliae) | Dream | 4100.0 |
| 6 | Chinstrap penguin (Pygoscelis antarctica) | Dream | 2700.0 |
| 7 | Chinstrap penguin (Pygoscelis antarctica) | Dream | 3400.0 |
| 8 | Chinstrap penguin (Pygoscelis antarctica) | Dream | 3800.0 |
| 9 | Adelie Penguin (Pygoscelis adeliae) | Dream | 3975.0 |
| 10 | Adelie Penguin (Pygoscelis adeliae) | Dream | 4300.0 |
| 11 | Adelie Penguin (Pygoscelis adeliae) | Dream | 3650.0 |
| 12 | Adelie Penguin (Pygoscelis adeliae) | Dream | 3000.0 |
| 13 | Chinstrap penguin (Pygoscelis antarctica) | Dream | 3700.0 |
| 14 | Adelie Penguin (Pygoscelis adeliae) | Dream | 3400.0 |
| 15 | Adelie Penguin (Pygoscelis adeliae) | Dream | 3400.0 |
| 16 | Chinstrap penguin (Pygoscelis antarctica) | Dream | 3600.0 |
| 17 | Chinstrap penguin (Pygoscelis antarctica) | Dream | 3800.0 |
| 18 | Adelie Penguin (Pygoscelis adeliae) | Dream | 4250.0 |
| 19 | Adelie Penguin (Pygoscelis adeliae) | Dream | 3750.0 |
| 20 | Chinstrap penguin (Pygoscelis antarctica) | Dream | 3800.0 |
| 21 | Chinstrap penguin (Pygoscelis antarctica) | Dream | 3650.0 |
| 22 | Adelie Penguin (Pygoscelis adeliae) | Dream | 3400.0 |
| 23 | Adelie Penguin (Pygoscelis adeliae) | Dream | 3500.0 |
| 24 | Chinstrap penguin (Pygoscelis antarctica) | Dream | 3775.0 |
25 rows × 3 columns
View the first ten values of a series#
# First ten values of the body-mass series.
df['body_mass_g'].head(10)
0 3475.0
1 4650.0
2 3900.0
3 3500.0
4 3000.0
5 4100.0
6 2700.0
7 3400.0
8 3800.0
9 3975.0
Name: body_mass_g, dtype: Float64
Compute the mean of a series#
# Mean body mass across all rows (computed in BigQuery).
df['body_mass_g'].mean()
4201.754385964912
Filter the DataFrame#
# Boolean-mask filter: keep only penguins weighing at least 4000 g.
df[df['body_mass_g'] >= 4000.0]
| species | island | body_mass_g | |
|---|---|---|---|
| 1 | Adelie Penguin (Pygoscelis adeliae) | Dream | 4650.0 |
| 5 | Adelie Penguin (Pygoscelis adeliae) | Dream | 4100.0 |
| 10 | Adelie Penguin (Pygoscelis adeliae) | Dream | 4300.0 |
| 18 | Adelie Penguin (Pygoscelis adeliae) | Dream | 4250.0 |
| 25 | Chinstrap penguin (Pygoscelis antarctica) | Dream | 4050.0 |
| 26 | Adelie Penguin (Pygoscelis adeliae) | Dream | 4000.0 |
| 27 | Chinstrap penguin (Pygoscelis antarctica) | Dream | 4050.0 |
| 28 | Chinstrap penguin (Pygoscelis antarctica) | Dream | 4300.0 |
| 30 | Chinstrap penguin (Pygoscelis antarctica) | Dream | 4450.0 |
| 36 | Adelie Penguin (Pygoscelis adeliae) | Dream | 4450.0 |
| 44 | Chinstrap penguin (Pygoscelis antarctica) | Dream | 4100.0 |
| 45 | Chinstrap penguin (Pygoscelis antarctica) | Dream | 4800.0 |
| 57 | Chinstrap penguin (Pygoscelis antarctica) | Dream | 4400.0 |
| 61 | Chinstrap penguin (Pygoscelis antarctica) | Dream | 4150.0 |
| 66 | Chinstrap penguin (Pygoscelis antarctica) | Dream | 4050.0 |
| 67 | Chinstrap penguin (Pygoscelis antarctica) | Dream | 4050.0 |
| 68 | Chinstrap penguin (Pygoscelis antarctica) | Dream | 4100.0 |
| 74 | Adelie Penguin (Pygoscelis adeliae) | Dream | 4350.0 |
| 77 | Adelie Penguin (Pygoscelis adeliae) | Dream | 4150.0 |
| 78 | Adelie Penguin (Pygoscelis adeliae) | Dream | 4400.0 |
| 80 | Chinstrap penguin (Pygoscelis antarctica) | Dream | 4300.0 |
| 90 | Chinstrap penguin (Pygoscelis antarctica) | Dream | 4150.0 |
| 92 | Chinstrap penguin (Pygoscelis antarctica) | Dream | 4500.0 |
| 93 | Adelie Penguin (Pygoscelis adeliae) | Dream | 4300.0 |
| 94 | Chinstrap penguin (Pygoscelis antarctica) | Dream | 4550.0 |
25 rows × 3 columns
Using the Remote Functions#
BigQuery DataFrames gives you the ability to turn your custom scalar functions into a BigQuery remote function.#
It requires the GCP project to be set up appropriately and the user to have sufficient privileges to use them. More details are available via the help command.
import bigframes.pandas as bpd

# The help text lists the APIs, IAM roles, and BigQuery connection setup
# required before remote functions can be used in this project.
help(bpd.remote_function)
Help on function remote_function in module bigframes.pandas:
remote_function(input_types: 'List[type]', output_type: 'type', dataset: 'Optional[str]' = None, bigquery_connection: 'Optional[str]' = None, reuse: 'bool' = True)
Decorator to turn a user defined function into a BigQuery remote function.
.. note::
Please make sure following is setup before using this API:
1. Have the below APIs enabled for your project:
a. BigQuery Connection API
b. Cloud Functions API
c. Cloud Run API
d. Cloud Build API
e. Artifact Registry API
f. Cloud Resource Manager API
This can be done from the cloud console (change PROJECT_ID to yours):
https://console.cloud.google.com/apis/enableflow?apiid=bigqueryconnection.googleapis.com,cloudfunctions.googleapis.com,run.googleapis.com,cloudbuild.googleapis.com,artifactregistry.googleapis.com,cloudresourcemanager.googleapis.com&project=PROJECT_ID
Or from the gcloud CLI:
$ gcloud services enable bigqueryconnection.googleapis.com cloudfunctions.googleapis.com run.googleapis.com cloudbuild.googleapis.com artifactregistry.googleapis.com cloudresourcemanager.googleapis.com
2. Have following IAM roles enabled for you:
a. BigQuery Data Editor (roles/bigquery.dataEditor)
b. BigQuery Connection Admin (roles/bigquery.connectionAdmin)
c. Cloud Functions Developer (roles/cloudfunctions.developer)
d. Service Account User (roles/iam.serviceAccountUser)
e. Storage Object Viewer (roles/storage.objectViewer)
f. Project IAM Admin (roles/resourcemanager.projectIamAdmin)
(Only required if the bigquery connection being used is not pre-created and is created dynamically with user credentials.)
3. Either the user has setIamPolicy privilege on the project, or a BigQuery connection is pre-created with necessary IAM role set:
a. To create a connection, follow https://cloud.google.com/bigquery/docs/reference/standard-sql/remote-functions#create_a_connection
b. To set up IAM, follow https://cloud.google.com/bigquery/docs/reference/standard-sql/remote-functions#grant_permission_on_function
Alternatively, the IAM could also be setup via the gcloud CLI:
$ gcloud projects add-iam-policy-binding PROJECT_ID --member="serviceAccount:CONNECTION_SERVICE_ACCOUNT_ID" --role="roles/run.invoker"
Args:
input_types (list(type)):
List of input data types in the user defined function.
output_type (type):
Data type of the output in the user defined function.
dataset (str, Optional):
Dataset to use to create a BigQuery function. It should be in
`<project_id>.<dataset_name>` or `<dataset_name>` format. If this
param is not provided then session dataset id would be used.
bigquery_connection (str, Optional):
Name of the BigQuery connection. If it is pre created in the same
location as the `bigquery_client.location` then it would be used,
otherwise it would be created dynamically assuming the user has
necessary priviliges. If this param is not provided then the
bigquery connection from the session would be used.
reuse (bool, Optional):
Reuse the remote function if already exists.
`True` by default, which will result in reusing an existing remote
function (if any) that was previously created for the same udf.
Setting it to false would force creating a unique remote function.
If the required remote function does not exist then it would be
created irrespective of this param.
Define a custom function, and specify the intent to turn it into a remote function.#
It requires a BigQuery connection. If the connection is not already created, the BigQuery DataFrames package attempts to create one, assuming the necessary APIs and IAM permissions are set up in the project.
@bpd.remote_function(bigquery_connection='bigframes-rf-conn', cloud_function_service_account="default")
def get_bucket(num: float) -> str:
    """Bucket a body-mass value relative to the 4000 g boundary.

    Returns "NA" when the input is missing (a SQL NULL arrives here as
    None), "at_or_above_4000" for values >= 4000, and "below_4000"
    otherwise.
    """
    # Bug fix: the original used `if not num`, which also mapped a
    # legitimate 0.0 to "NA". Test explicitly for missing values instead;
    # `num != num` catches NaN, which the falsy check missed entirely.
    if num is None or num != num:
        return "NA"
    boundary = 4000
    return "at_or_above_4000" if num >= boundary else "below_4000"
[INFO][2023-08-05 23:12:12,870][bigframes.remote_function] Creating new cloud function: gcloud functions deploy bigframes-f9320ad496b5aeca2d7f343cbab03e2f --gen2 --runtime=python310 --project=bigframes-dev --region=us-central1 --source=/tmp/tmps5m0qu4z --entry-point=udf_http --trigger-http --no-allow-unauthenticated
[INFO][2023-08-05 23:13:20,660][bigframes.remote_function] Successfully created cloud function bigframes-f9320ad496b5aeca2d7f343cbab03e2f with uri (https://bigframes-f9320ad496b5aeca2d7f343cbab03e2f-7krlje3eoq-uc.a.run.app)
[INFO][2023-08-05 23:13:32,717][bigframes.remote_function] Connector bigframes-rf-conn already exists
[INFO][2023-08-05 23:13:32,719][bigframes.remote_function] Creating BQ remote function:
CREATE OR REPLACE FUNCTION `bigframes-dev.bigframes_temp_us`.bigframes_f9320ad496b5aeca2d7f343cbab03e2f(num FLOAT64)
RETURNS STRING
REMOTE WITH CONNECTION `bigframes-dev.us.bigframes-rf-conn`
OPTIONS (
endpoint = "https://bigframes-f9320ad496b5aeca2d7f343cbab03e2f-7krlje3eoq-uc.a.run.app"
)
[INFO][2023-08-05 23:13:33,697][bigframes.remote_function] Created remote function bigframes-dev.bigframes_temp_us.bigframes_f9320ad496b5aeca2d7f343cbab03e2f
Run the custom function on the BigQuery-backed dataframe#
# Apply the remote function element-wise; the work runs in BigQuery via the
# deployed cloud function, not locally.
df = df.assign(body_mass_bucket=df['body_mass_g'].apply(get_bucket))
df[['body_mass_g', 'body_mass_bucket']].head(10)
| body_mass_g | body_mass_bucket | |
|---|---|---|
| 0 | 3475.0 | below_4000 |
| 1 | 4650.0 | at_or_above_4000 |
| 2 | 3900.0 | below_4000 |
| 3 | 3500.0 | below_4000 |
| 4 | 3000.0 | below_4000 |
| 5 | 4100.0 | at_or_above_4000 |
| 6 | 2700.0 | below_4000 |
| 7 | 3400.0 | below_4000 |
| 8 | 3800.0 | below_4000 |
| 9 | 3975.0 | below_4000 |
10 rows × 2 columns
Using the ML API#
Initialize a DataFrame from a BigQuery table#
# Re-initialize df with the full table (the earlier cells narrowed it).
df = bigframes.pandas.read_gbq(PENGUINS_TABLE)
df
| species | island | culmen_length_mm | culmen_depth_mm | flipper_length_mm | body_mass_g | sex | |
|---|---|---|---|---|---|---|---|
| 0 | Adelie Penguin (Pygoscelis adeliae) | Dream | 36.6 | 18.4 | 184.0 | 3475.0 | FEMALE |
| 1 | Adelie Penguin (Pygoscelis adeliae) | Dream | 39.8 | 19.1 | 184.0 | 4650.0 | MALE |
| 2 | Adelie Penguin (Pygoscelis adeliae) | Dream | 40.9 | 18.9 | 184.0 | 3900.0 | MALE |
| 3 | Chinstrap penguin (Pygoscelis antarctica) | Dream | 46.5 | 17.9 | 192.0 | 3500.0 | FEMALE |
| 4 | Adelie Penguin (Pygoscelis adeliae) | Dream | 37.3 | 16.8 | 192.0 | 3000.0 | FEMALE |
| 5 | Adelie Penguin (Pygoscelis adeliae) | Dream | 43.2 | 18.5 | 192.0 | 4100.0 | MALE |
| 6 | Chinstrap penguin (Pygoscelis antarctica) | Dream | 46.9 | 16.6 | 192.0 | 2700.0 | FEMALE |
| 7 | Chinstrap penguin (Pygoscelis antarctica) | Dream | 50.5 | 18.4 | 200.0 | 3400.0 | FEMALE |
| 8 | Chinstrap penguin (Pygoscelis antarctica) | Dream | 49.5 | 19.0 | 200.0 | 3800.0 | MALE |
| 9 | Adelie Penguin (Pygoscelis adeliae) | Dream | 40.2 | 20.1 | 200.0 | 3975.0 | MALE |
| 10 | Adelie Penguin (Pygoscelis adeliae) | Dream | 40.8 | 18.9 | 208.0 | 4300.0 | MALE |
| 11 | Adelie Penguin (Pygoscelis adeliae) | Dream | 39.0 | 18.7 | 185.0 | 3650.0 | MALE |
| 12 | Adelie Penguin (Pygoscelis adeliae) | Dream | 37.0 | 16.9 | 185.0 | 3000.0 | FEMALE |
| 13 | Chinstrap penguin (Pygoscelis antarctica) | Dream | 47.0 | 17.3 | 185.0 | 3700.0 | FEMALE |
| 14 | Adelie Penguin (Pygoscelis adeliae) | Dream | 34.0 | 17.1 | 185.0 | 3400.0 | FEMALE |
| 15 | Adelie Penguin (Pygoscelis adeliae) | Dream | 37.0 | 16.5 | 185.0 | 3400.0 | FEMALE |
| 16 | Chinstrap penguin (Pygoscelis antarctica) | Dream | 45.7 | 17.3 | 193.0 | 3600.0 | FEMALE |
| 17 | Chinstrap penguin (Pygoscelis antarctica) | Dream | 50.6 | 19.4 | 193.0 | 3800.0 | MALE |
| 18 | Adelie Penguin (Pygoscelis adeliae) | Dream | 39.7 | 17.9 | 193.0 | 4250.0 | MALE |
| 19 | Adelie Penguin (Pygoscelis adeliae) | Dream | 37.8 | 18.1 | 193.0 | 3750.0 | MALE |
| 20 | Chinstrap penguin (Pygoscelis antarctica) | Dream | 46.6 | 17.8 | 193.0 | 3800.0 | FEMALE |
| 21 | Chinstrap penguin (Pygoscelis antarctica) | Dream | 51.3 | 19.2 | 193.0 | 3650.0 | MALE |
| 22 | Adelie Penguin (Pygoscelis adeliae) | Dream | 40.2 | 17.1 | 193.0 | 3400.0 | FEMALE |
| 23 | Adelie Penguin (Pygoscelis adeliae) | Dream | 36.8 | 18.5 | 193.0 | 3500.0 | FEMALE |
| 24 | Chinstrap penguin (Pygoscelis antarctica) | Dream | 49.6 | 18.2 | 193.0 | 3775.0 | MALE |
25 rows × 7 columns
Clean and prepare the data#
# Restrict the data to Adelie penguins and discard the now-constant
# species column in a single chained expression.
adelie_data = df[df["species"] == "Adelie Penguin (Pygoscelis adeliae)"].drop(columns=["species"])

# Rows with nulls can't be used for training; drop them to form the training set.
training_data = adelie_data.dropna()

# Take a peek at the training data.
training_data
| island | culmen_length_mm | culmen_depth_mm | flipper_length_mm | body_mass_g | sex | |
|---|---|---|---|---|---|---|
| 0 | Dream | 36.6 | 18.4 | 184.0 | 3475.0 | FEMALE |
| 1 | Dream | 39.8 | 19.1 | 184.0 | 4650.0 | MALE |
| 2 | Dream | 40.9 | 18.9 | 184.0 | 3900.0 | MALE |
| 4 | Dream | 37.3 | 16.8 | 192.0 | 3000.0 | FEMALE |
| 5 | Dream | 43.2 | 18.5 | 192.0 | 4100.0 | MALE |
| 9 | Dream | 40.2 | 20.1 | 200.0 | 3975.0 | MALE |
| 10 | Dream | 40.8 | 18.9 | 208.0 | 4300.0 | MALE |
| 11 | Dream | 39.0 | 18.7 | 185.0 | 3650.0 | MALE |
| 12 | Dream | 37.0 | 16.9 | 185.0 | 3000.0 | FEMALE |
| 14 | Dream | 34.0 | 17.1 | 185.0 | 3400.0 | FEMALE |
| 15 | Dream | 37.0 | 16.5 | 185.0 | 3400.0 | FEMALE |
| 18 | Dream | 39.7 | 17.9 | 193.0 | 4250.0 | MALE |
| 19 | Dream | 37.8 | 18.1 | 193.0 | 3750.0 | MALE |
| 22 | Dream | 40.2 | 17.1 | 193.0 | 3400.0 | FEMALE |
| 23 | Dream | 36.8 | 18.5 | 193.0 | 3500.0 | FEMALE |
| 26 | Dream | 41.5 | 18.5 | 201.0 | 4000.0 | MALE |
| 31 | Dream | 33.1 | 16.1 | 178.0 | 2900.0 | FEMALE |
| 32 | Dream | 37.2 | 18.1 | 178.0 | 3900.0 | MALE |
| 33 | Dream | 39.5 | 16.7 | 178.0 | 3250.0 | FEMALE |
| 35 | Dream | 36.0 | 18.5 | 186.0 | 3100.0 | FEMALE |
| 36 | Dream | 39.6 | 18.1 | 186.0 | 4450.0 | MALE |
| 38 | Dream | 41.3 | 20.3 | 194.0 | 3550.0 | MALE |
| 41 | Dream | 35.7 | 18.0 | 202.0 | 3550.0 | FEMALE |
| 51 | Dream | 38.1 | 17.6 | 187.0 | 3425.0 | FEMALE |
| 53 | Dream | 36.0 | 17.1 | 187.0 | 3700.0 | FEMALE |
25 rows × 6 columns
# Split into model inputs and the value we want to predict.
feature_columns = training_data[
    ["island", "culmen_length_mm", "culmen_depth_mm", "flipper_length_mm", "sex"]
]
label_columns = training_data[["body_mass_g"]]

# Rows where the label is null are the ones we will generate predictions for.
missing_body_mass = adelie_data[adelie_data["body_mass_g"].isnull()]
Train and evaluate a linear regression model using the ML API#
from bigframes.ml.linear_model import LinearRegression

# As in scikit-learn, a newly created model is just a bundle of parameters;
# the defaults are fine here.
model = LinearRegression()

# fit() trains a temporary model in BigQuery Machine Learning.
model.fit(feature_columns, label_columns)

# score() evaluates on the given data and returns regression metrics.
model.score(feature_columns, label_columns)
| mean_absolute_error | mean_squared_error | mean_squared_log_error | median_absolute_error | r2_score | explained_variance | |
|---|---|---|---|---|---|---|
| 0 | 223.878763 | 78553.601634 | 0.005614 | 181.330911 | 0.623951 | 0.623951 |
1 rows × 6 columns
Make predictions using the model#
# Predict body mass for the rows whose label was null.
model.predict(missing_body_mass)
| predicted_body_mass_g | |
|---|---|
| 292 | 3459.735118 |
1 rows × 1 columns
Save the trained model to BigQuery, so we can load it later#
# Persist the trained model to BigQuery so it can be reloaded later;
# replace=True overwrites any previous version.
model.to_gbq(f"{DATASET}.penguins_model", replace=True)
LinearRegression()
Clean Up#
# Release session resources (temporary tables, cloud artifacts) created above.
bpd.close_session()