README#

This Notebook requires the following environment variables:

  1. GOOGLE_CLOUD_PROJECT - The google cloud project id.

  2. BIGQUERY_LOCATION - can take values as per https://cloud.google.com/bigquery/docs/locations, e.g. us, asia-east1.

Infer location and set up data in that location if needed#

# Take multi-region US as the default BQ location, where most of the BQ data lies including the BQ public datasets
import os

# Read the required configuration from the environment, failing fast with a
# clear message when either variable is missing.
PROJECT_ID = os.environ.get("GOOGLE_CLOUD_PROJECT")
if not PROJECT_ID:
    raise ValueError("Project must be set via environment variable GOOGLE_CLOUD_PROJECT")

BQ_LOCATION = os.environ.get("BIGQUERY_LOCATION")
if not BQ_LOCATION:
    raise ValueError("BQ location must be set via environment variable BIGQUERY_LOCATION")

# Base dataset for notebook artifacts, and the public source table to analyze.
DATASET = "bigframes_testing"
PENGUINS_TABLE = "bigquery-public-data.ml_datasets.penguins"

# Check for a location set in the environment and do location-specific setup if needed

import google.api_core.exceptions
from google.cloud import bigquery
import bigframes

client = bigquery.Client()

# Locations are case-insensitive; normalize once for all comparisons below.
BQ_LOCATION = BQ_LOCATION.lower()
if BQ_LOCATION != "us":
    # Dataset IDs cannot contain '-', so derive a safe suffix from the location
    # (e.g. "asia-east1" -> "asia_east1").
    bq_location_normalized = BQ_LOCATION.replace('-', '_')

    # Nominate a local penguins table in the requested location
    penguins_table_ref = bigquery.TableReference.from_string(PENGUINS_TABLE)
    penguins_local_dataset_name = f"{DATASET}_{bq_location_normalized}"
    penguins_local_dataset_ref = bigquery.DatasetReference(project=PROJECT_ID, dataset_id=penguins_local_dataset_name)
    penguins_local_dataset = bigquery.Dataset(penguins_local_dataset_ref)
    penguins_local_dataset.location = BQ_LOCATION
    # Build the table reference from the DatasetReference — TableReference
    # expects a reference, not the Dataset resource object.
    penguins_local_table_ref = bigquery.TableReference(penguins_local_dataset_ref, penguins_table_ref.table_id)
    penguins_local_table = str(penguins_local_table_ref)
    try:
        client.get_table(penguins_local_table_ref)
    except google.api_core.exceptions.NotFound:
        # Table is absent in this location: create the dataset and copy the
        # public data over via an in-memory pandas dataframe.
        client.create_dataset(penguins_local_dataset, exists_ok=True)

        # Read the public table (which lives in the US multi-region) and then
        # write it to the local table.
        session_us = bigframes.connect()
        df = session_us.read_gbq(PENGUINS_TABLE).to_pandas()
        df.to_gbq(penguins_local_table)

    # Finally point the penguins table to the local table
    PENGUINS_TABLE = penguins_local_table

    # Also update the dataset name used for test artifacts
    DATASET = f"{DATASET}_{bq_location_normalized}"

# Create the dataset to store the model if it doesn't exist
model_local_dataset = bigquery.Dataset(bigquery.DatasetReference(project=PROJECT_ID, dataset_id=DATASET))
model_local_dataset.location = BQ_LOCATION
model_dataset = client.create_dataset(model_local_dataset, exists_ok=True)

# Finally log the variables driving the core notebook execution
log = '\n'.join(f"{name}: {value}" for name, value in {
    "BigQuery project": PROJECT_ID,
    "BigQuery location": BQ_LOCATION,
    "Penguins Table": PENGUINS_TABLE,
    "ML Model Dataset": model_dataset.reference,
}.items())
print(log)
/usr/local/google/home/shobs/code/bigframes1/venv/lib/python3.10/site-packages/google/auth/_default.py:78: UserWarning: Your application has authenticated using end user credentials from Google Cloud SDK without a quota project. You might receive a "quota exceeded" or "API not enabled" error. See the following page for troubleshooting: https://cloud.google.com/docs/authentication/adc-troubleshooting/user-creds. 
  warnings.warn(_CLOUD_SDK_CREDENTIALS_WARNING)
/usr/local/google/home/shobs/code/bigframes1/venv/lib/python3.10/site-packages/google/auth/_default.py:78: UserWarning: Your application has authenticated using end user credentials from Google Cloud SDK without a quota project. You might receive a "quota exceeded" or "API not enabled" error. See the following page for troubleshooting: https://cloud.google.com/docs/authentication/adc-troubleshooting/user-creds. 
  warnings.warn(_CLOUD_SDK_CREDENTIALS_WARNING)
BigQuery project: bigframes-dev
BigQuery location: us
Penguins Table: bigquery-public-data.ml_datasets.penguins
ML Model Dataset: bigframes-dev.bigframes_testing

Using the BigQuery DataFrames API#

Set BigQuery DataFrames options#

import bigframes.pandas

# Configure global BigQuery DataFrames options before creating any DataFrames.

# Note: The project option is not required in all environments.
# On BigQuery Studio, the project ID is automatically detected.
bigframes.pandas.options.bigquery.project = PROJECT_ID

# Note: The location option is not required.
# It defaults to the location of the first table or query
# passed to read_gbq(). For APIs where a location can't be
# auto-detected, the location defaults to the "US" location.
bigframes.pandas.options.bigquery.location = BQ_LOCATION

Initialize a dataframe for a BigQuery table#

# Create a DataFrame backed by the BigQuery penguins table
df = bigframes.pandas.read_gbq(PENGUINS_TABLE)
/usr/local/google/home/shobs/code/bigframes1/venv/lib/python3.10/site-packages/google/auth/_default.py:78: UserWarning: Your application has authenticated using end user credentials from Google Cloud SDK without a quota project. You might receive a "quota exceeded" or "API not enabled" error. See the following page for troubleshooting: https://cloud.google.com/docs/authentication/adc-troubleshooting/user-creds. 
  warnings.warn(_CLOUD_SDK_CREDENTIALS_WARNING)

View the DataFrame#

# Display the DataFrame (the output below shows a 25-row preview of 344 rows)
df
species island culmen_length_mm culmen_depth_mm flipper_length_mm body_mass_g sex
0 Adelie Penguin (Pygoscelis adeliae) Dream 36.6 18.4 184.0 3475.0 FEMALE
1 Adelie Penguin (Pygoscelis adeliae) Dream 39.8 19.1 184.0 4650.0 MALE
2 Adelie Penguin (Pygoscelis adeliae) Dream 40.9 18.9 184.0 3900.0 MALE
3 Chinstrap penguin (Pygoscelis antarctica) Dream 46.5 17.9 192.0 3500.0 FEMALE
4 Adelie Penguin (Pygoscelis adeliae) Dream 37.3 16.8 192.0 3000.0 FEMALE
5 Adelie Penguin (Pygoscelis adeliae) Dream 43.2 18.5 192.0 4100.0 MALE
6 Chinstrap penguin (Pygoscelis antarctica) Dream 46.9 16.6 192.0 2700.0 FEMALE
7 Chinstrap penguin (Pygoscelis antarctica) Dream 50.5 18.4 200.0 3400.0 FEMALE
8 Chinstrap penguin (Pygoscelis antarctica) Dream 49.5 19.0 200.0 3800.0 MALE
9 Adelie Penguin (Pygoscelis adeliae) Dream 40.2 20.1 200.0 3975.0 MALE
10 Adelie Penguin (Pygoscelis adeliae) Dream 40.8 18.9 208.0 4300.0 MALE
11 Adelie Penguin (Pygoscelis adeliae) Dream 39.0 18.7 185.0 3650.0 MALE
12 Adelie Penguin (Pygoscelis adeliae) Dream 37.0 16.9 185.0 3000.0 FEMALE
13 Chinstrap penguin (Pygoscelis antarctica) Dream 47.0 17.3 185.0 3700.0 FEMALE
14 Adelie Penguin (Pygoscelis adeliae) Dream 34.0 17.1 185.0 3400.0 FEMALE
15 Adelie Penguin (Pygoscelis adeliae) Dream 37.0 16.5 185.0 3400.0 FEMALE
16 Chinstrap penguin (Pygoscelis antarctica) Dream 45.7 17.3 193.0 3600.0 FEMALE
17 Chinstrap penguin (Pygoscelis antarctica) Dream 50.6 19.4 193.0 3800.0 MALE
18 Adelie Penguin (Pygoscelis adeliae) Dream 39.7 17.9 193.0 4250.0 MALE
19 Adelie Penguin (Pygoscelis adeliae) Dream 37.8 18.1 193.0 3750.0 MALE
20 Chinstrap penguin (Pygoscelis antarctica) Dream 46.6 17.8 193.0 3800.0 FEMALE
21 Chinstrap penguin (Pygoscelis antarctica) Dream 51.3 19.2 193.0 3650.0 MALE
22 Adelie Penguin (Pygoscelis adeliae) Dream 40.2 17.1 193.0 3400.0 FEMALE
23 Adelie Penguin (Pygoscelis adeliae) Dream 36.8 18.5 193.0 3500.0 FEMALE
24 Chinstrap penguin (Pygoscelis antarctica) Dream 49.6 18.2 193.0 3775.0 MALE

25 rows × 7 columns

[344 rows x 7 columns in total]

View the column names in the dataframe (aka columns names in the table)#

# Column names, returned as a pandas-style Index
df.columns
Index(['species', 'island', 'culmen_length_mm', 'culmen_depth_mm',
       'flipper_length_mm', 'body_mass_g', 'sex'],
      dtype='object')

View the table schema#

# Column dtypes; per the output, text columns map to string[pyarrow]
# and numeric columns to Float64
df.dtypes
species              string[pyarrow]
island               string[pyarrow]
culmen_length_mm             Float64
culmen_depth_mm              Float64
flipper_length_mm            Float64
body_mass_g                  Float64
sex                  string[pyarrow]
dtype: object

Select a subset of columns#

# Keep only the columns needed for the body-mass analysis
columns_of_interest = [
    "species",
    "island",
    "body_mass_g",
]
df = df[columns_of_interest]
df
species island body_mass_g
0 Adelie Penguin (Pygoscelis adeliae) Dream 3475.0
1 Adelie Penguin (Pygoscelis adeliae) Dream 4650.0
2 Adelie Penguin (Pygoscelis adeliae) Dream 3900.0
3 Chinstrap penguin (Pygoscelis antarctica) Dream 3500.0
4 Adelie Penguin (Pygoscelis adeliae) Dream 3000.0
5 Adelie Penguin (Pygoscelis adeliae) Dream 4100.0
6 Chinstrap penguin (Pygoscelis antarctica) Dream 2700.0
7 Chinstrap penguin (Pygoscelis antarctica) Dream 3400.0
8 Chinstrap penguin (Pygoscelis antarctica) Dream 3800.0
9 Adelie Penguin (Pygoscelis adeliae) Dream 3975.0
10 Adelie Penguin (Pygoscelis adeliae) Dream 4300.0
11 Adelie Penguin (Pygoscelis adeliae) Dream 3650.0
12 Adelie Penguin (Pygoscelis adeliae) Dream 3000.0
13 Chinstrap penguin (Pygoscelis antarctica) Dream 3700.0
14 Adelie Penguin (Pygoscelis adeliae) Dream 3400.0
15 Adelie Penguin (Pygoscelis adeliae) Dream 3400.0
16 Chinstrap penguin (Pygoscelis antarctica) Dream 3600.0
17 Chinstrap penguin (Pygoscelis antarctica) Dream 3800.0
18 Adelie Penguin (Pygoscelis adeliae) Dream 4250.0
19 Adelie Penguin (Pygoscelis adeliae) Dream 3750.0
20 Chinstrap penguin (Pygoscelis antarctica) Dream 3800.0
21 Chinstrap penguin (Pygoscelis antarctica) Dream 3650.0
22 Adelie Penguin (Pygoscelis adeliae) Dream 3400.0
23 Adelie Penguin (Pygoscelis adeliae) Dream 3500.0
24 Chinstrap penguin (Pygoscelis antarctica) Dream 3775.0

25 rows × 3 columns

[344 rows x 3 columns in total]

View the first ten values of a series#

# Show the first ten values of the body_mass_g series
df['body_mass_g'].head(10)
0    3475.0
1    4650.0
2    3900.0
3    3500.0
4    3000.0
5    4100.0
6    2700.0
7    3400.0
8    3800.0
9    3975.0
Name: body_mass_g, dtype: Float64

Compute the mean of a series#

# Aggregate: mean body mass over all rows
df['body_mass_g'].mean()
4201.754385964912

Filter the DataFrame#

# Boolean-mask filter: rows with body mass at or above 4000 g
df[df['body_mass_g'] >= 4000.0]
species island body_mass_g
1 Adelie Penguin (Pygoscelis adeliae) Dream 4650.0
5 Adelie Penguin (Pygoscelis adeliae) Dream 4100.0
10 Adelie Penguin (Pygoscelis adeliae) Dream 4300.0
18 Adelie Penguin (Pygoscelis adeliae) Dream 4250.0
25 Chinstrap penguin (Pygoscelis antarctica) Dream 4050.0
26 Adelie Penguin (Pygoscelis adeliae) Dream 4000.0
27 Chinstrap penguin (Pygoscelis antarctica) Dream 4050.0
28 Chinstrap penguin (Pygoscelis antarctica) Dream 4300.0
30 Chinstrap penguin (Pygoscelis antarctica) Dream 4450.0
36 Adelie Penguin (Pygoscelis adeliae) Dream 4450.0
44 Chinstrap penguin (Pygoscelis antarctica) Dream 4100.0
45 Chinstrap penguin (Pygoscelis antarctica) Dream 4800.0
57 Chinstrap penguin (Pygoscelis antarctica) Dream 4400.0
61 Chinstrap penguin (Pygoscelis antarctica) Dream 4150.0
66 Chinstrap penguin (Pygoscelis antarctica) Dream 4050.0
67 Chinstrap penguin (Pygoscelis antarctica) Dream 4050.0
68 Chinstrap penguin (Pygoscelis antarctica) Dream 4100.0
74 Adelie Penguin (Pygoscelis adeliae) Dream 4350.0
77 Adelie Penguin (Pygoscelis adeliae) Dream 4150.0
78 Adelie Penguin (Pygoscelis adeliae) Dream 4400.0
80 Chinstrap penguin (Pygoscelis antarctica) Dream 4300.0
90 Chinstrap penguin (Pygoscelis antarctica) Dream 4150.0
92 Chinstrap penguin (Pygoscelis antarctica) Dream 4500.0
93 Adelie Penguin (Pygoscelis adeliae) Dream 4300.0
94 Chinstrap penguin (Pygoscelis antarctica) Dream 4550.0

25 rows × 3 columns

[177 rows x 3 columns in total]

Using the Remote Functions#

BigQuery DataFrames gives you the ability to turn your custom scalar functions into a BigQuery remote function.#

It requires the GCP project to be set up appropriately and the user having sufficient privileges to use them. One can find more details on it via help command.

import bigframes.pandas as bpd

# Print the remote_function docstring, which lists the GCP APIs, IAM roles,
# and BigQuery connection setup required before using remote functions.
help(bpd.remote_function)
Help on function remote_function in module bigframes.pandas:

remote_function(input_types: 'List[type]', output_type: 'type', dataset: 'Optional[str]' = None, bigquery_connection: 'Optional[str]' = None, reuse: 'bool' = True)
    Decorator to turn a user defined function into a BigQuery remote function.
    
    .. note::
        Please make sure following is setup before using this API:
    
        1. Have the below APIs enabled for your project:
              a. BigQuery Connection API
              b. Cloud Functions API
              c. Cloud Run API
              d. Cloud Build API
              e. Artifact Registry API
              f. Cloud Resource Manager API
    
          This can be done from the cloud console (change PROJECT_ID to yours):
              https://console.cloud.google.com/apis/enableflow?apiid=bigqueryconnection.googleapis.com,cloudfunctions.googleapis.com,run.googleapis.com,cloudbuild.googleapis.com,artifactregistry.googleapis.com,cloudresourcemanager.googleapis.com&project=PROJECT_ID
          Or from the gcloud CLI:
              $ gcloud services enable bigqueryconnection.googleapis.com cloudfunctions.googleapis.com run.googleapis.com cloudbuild.googleapis.com artifactregistry.googleapis.com cloudresourcemanager.googleapis.com
    
        2. Have following IAM roles enabled for you:
              a. BigQuery Data Editor (roles/bigquery.dataEditor)
              b. BigQuery Connection Admin (roles/bigquery.connectionAdmin)
              c. Cloud Functions Developer (roles/cloudfunctions.developer)
              d. Service Account User (roles/iam.serviceAccountUser)
              e. Storage Object Viewer (roles/storage.objectViewer)
              f. Project IAM Admin (roles/resourcemanager.projectIamAdmin)
                 (Only required if the bigquery connection being used is not pre-created and is created dynamically with user credentials.)
    
        3. Either the user has setIamPolicy privilege on the project, or a BigQuery connection is pre-created with necessary IAM role set:
              a. To create a connection, follow https://cloud.google.com/bigquery/docs/reference/standard-sql/remote-functions#create_a_connection
              b. To set up IAM, follow https://cloud.google.com/bigquery/docs/reference/standard-sql/remote-functions#grant_permission_on_function
           Alternatively, the IAM could also be setup via the gcloud CLI:
              $ gcloud projects add-iam-policy-binding PROJECT_ID --member="serviceAccount:CONNECTION_SERVICE_ACCOUNT_ID" --role="roles/run.invoker"
    
    Args:
        input_types (list(type)):
            List of input data types in the user defined function.
        output_type (type):
            Data type of the output in the user defined function.
        dataset (str, Optional):
            Dataset to use to create a BigQuery function. It should be in
            `<project_id>.<dataset_name>` or `<dataset_name>` format. If this
            param is not provided then session dataset id would be used.
        bigquery_connection (str, Optional):
            Name of the BigQuery connection. If it is pre created in the same
            location as the `bigquery_client.location` then it would be used,
            otherwise it would be created dynamically assuming the user has
            necessary priviliges. If this param is not provided then the
            bigquery connection from the session would be used.
        reuse (bool, Optional):
            Reuse the remote function if already exists.
            `True` by default, which will result in reusing an existing remote
            function (if any) that was previously created for the same udf.
            Setting it to false would force creating a unique remote function.
            If the required remote function does not exist then it would be
            created irrespective of this param.

Define a custom function, and specify the intent to turn it into a remote function.#

It requires a BigQuery connection. If the connection is not already created, the BigQuery DataFrames package attempts to create one assuming the necessary APIs and IAM permissions are setup in the project.

@bpd.remote_function(bigquery_connection='bigframes-rf-conn', cloud_function_service_account="default")
def get_bucket(num: float) -> str:
    """Bucket a body-mass value relative to the 4000 g boundary.

    Returns:
        "NA" when the input is missing (BigQuery NULL arrives as None),
        otherwise "at_or_above_4000" or "below_4000".
    """
    # Use an explicit None check: `if not num` would also match a valid 0.0
    # and mislabel it as missing.
    if num is None:
        return "NA"
    boundary = 4000
    return "at_or_above_4000" if num >= boundary else "below_4000"
[INFO][2023-08-05 23:12:12,870][bigframes.remote_function] Creating new cloud function: gcloud functions deploy bigframes-f9320ad496b5aeca2d7f343cbab03e2f --gen2 --runtime=python310 --project=bigframes-dev --region=us-central1 --source=/tmp/tmps5m0qu4z --entry-point=udf_http --trigger-http --no-allow-unauthenticated
[INFO][2023-08-05 23:13:20,660][bigframes.remote_function] Successfully created cloud function bigframes-f9320ad496b5aeca2d7f343cbab03e2f with uri (https://bigframes-f9320ad496b5aeca2d7f343cbab03e2f-7krlje3eoq-uc.a.run.app)
[INFO][2023-08-05 23:13:32,717][bigframes.remote_function] Connector bigframes-rf-conn already exists
[INFO][2023-08-05 23:13:32,719][bigframes.remote_function] Creating BQ remote function: 
    CREATE OR REPLACE FUNCTION `bigframes-dev.bigframes_temp_us`.bigframes_f9320ad496b5aeca2d7f343cbab03e2f(num FLOAT64)
    RETURNS STRING
    REMOTE WITH CONNECTION `bigframes-dev.us.bigframes-rf-conn`
    OPTIONS (
      endpoint = "https://bigframes-f9320ad496b5aeca2d7f343cbab03e2f-7krlje3eoq-uc.a.run.app"
    )
[INFO][2023-08-05 23:13:33,697][bigframes.remote_function] Created remote function bigframes-dev.bigframes_temp_us.bigframes_f9320ad496b5aeca2d7f343cbab03e2f

Run the custom function on the BigQuery-backed dataframe#

# Apply the remote function element-wise to the body_mass_g series; per the
# logs above, the work is executed in BigQuery via the deployed cloud function.
df = df.assign(body_mass_bucket=df['body_mass_g'].apply(get_bucket))
df[['body_mass_g', 'body_mass_bucket']].head(10)
body_mass_g body_mass_bucket
0 3475.0 below_4000
1 4650.0 at_or_above_4000
2 3900.0 below_4000
3 3500.0 below_4000
4 3000.0 below_4000
5 4100.0 at_or_above_4000
6 2700.0 below_4000
7 3400.0 below_4000
8 3800.0 below_4000
9 3975.0 below_4000

10 rows × 2 columns

[10 rows x 2 columns in total]

Using the ML API#

Initialize a DataFrame from a BigQuery table#

# Reload the full penguins table (all 7 columns) to start the ML workflow
df = bigframes.pandas.read_gbq(PENGUINS_TABLE)
df
species island culmen_length_mm culmen_depth_mm flipper_length_mm body_mass_g sex
0 Adelie Penguin (Pygoscelis adeliae) Dream 36.6 18.4 184.0 3475.0 FEMALE
1 Adelie Penguin (Pygoscelis adeliae) Dream 39.8 19.1 184.0 4650.0 MALE
2 Adelie Penguin (Pygoscelis adeliae) Dream 40.9 18.9 184.0 3900.0 MALE
3 Chinstrap penguin (Pygoscelis antarctica) Dream 46.5 17.9 192.0 3500.0 FEMALE
4 Adelie Penguin (Pygoscelis adeliae) Dream 37.3 16.8 192.0 3000.0 FEMALE
5 Adelie Penguin (Pygoscelis adeliae) Dream 43.2 18.5 192.0 4100.0 MALE
6 Chinstrap penguin (Pygoscelis antarctica) Dream 46.9 16.6 192.0 2700.0 FEMALE
7 Chinstrap penguin (Pygoscelis antarctica) Dream 50.5 18.4 200.0 3400.0 FEMALE
8 Chinstrap penguin (Pygoscelis antarctica) Dream 49.5 19.0 200.0 3800.0 MALE
9 Adelie Penguin (Pygoscelis adeliae) Dream 40.2 20.1 200.0 3975.0 MALE
10 Adelie Penguin (Pygoscelis adeliae) Dream 40.8 18.9 208.0 4300.0 MALE
11 Adelie Penguin (Pygoscelis adeliae) Dream 39.0 18.7 185.0 3650.0 MALE
12 Adelie Penguin (Pygoscelis adeliae) Dream 37.0 16.9 185.0 3000.0 FEMALE
13 Chinstrap penguin (Pygoscelis antarctica) Dream 47.0 17.3 185.0 3700.0 FEMALE
14 Adelie Penguin (Pygoscelis adeliae) Dream 34.0 17.1 185.0 3400.0 FEMALE
15 Adelie Penguin (Pygoscelis adeliae) Dream 37.0 16.5 185.0 3400.0 FEMALE
16 Chinstrap penguin (Pygoscelis antarctica) Dream 45.7 17.3 193.0 3600.0 FEMALE
17 Chinstrap penguin (Pygoscelis antarctica) Dream 50.6 19.4 193.0 3800.0 MALE
18 Adelie Penguin (Pygoscelis adeliae) Dream 39.7 17.9 193.0 4250.0 MALE
19 Adelie Penguin (Pygoscelis adeliae) Dream 37.8 18.1 193.0 3750.0 MALE
20 Chinstrap penguin (Pygoscelis antarctica) Dream 46.6 17.8 193.0 3800.0 FEMALE
21 Chinstrap penguin (Pygoscelis antarctica) Dream 51.3 19.2 193.0 3650.0 MALE
22 Adelie Penguin (Pygoscelis adeliae) Dream 40.2 17.1 193.0 3400.0 FEMALE
23 Adelie Penguin (Pygoscelis adeliae) Dream 36.8 18.5 193.0 3500.0 FEMALE
24 Chinstrap penguin (Pygoscelis antarctica) Dream 49.6 18.2 193.0 3775.0 MALE

25 rows × 7 columns

[344 rows x 7 columns in total]

Clean and prepare the data#

# Restrict the analysis to Adelie penguins, then drop the now-constant
# species column.
adelie_data = df[df["species"] == "Adelie Penguin (Pygoscelis adeliae)"].drop(columns=["species"])

# Rows with missing values cannot be used for training; drop them to
# form the training set.
training_data = adelie_data.dropna()

# take a peek at the training data
training_data
island culmen_length_mm culmen_depth_mm flipper_length_mm body_mass_g sex
0 Dream 36.6 18.4 184.0 3475.0 FEMALE
1 Dream 39.8 19.1 184.0 4650.0 MALE
2 Dream 40.9 18.9 184.0 3900.0 MALE
4 Dream 37.3 16.8 192.0 3000.0 FEMALE
5 Dream 43.2 18.5 192.0 4100.0 MALE
9 Dream 40.2 20.1 200.0 3975.0 MALE
10 Dream 40.8 18.9 208.0 4300.0 MALE
11 Dream 39.0 18.7 185.0 3650.0 MALE
12 Dream 37.0 16.9 185.0 3000.0 FEMALE
14 Dream 34.0 17.1 185.0 3400.0 FEMALE
15 Dream 37.0 16.5 185.0 3400.0 FEMALE
18 Dream 39.7 17.9 193.0 4250.0 MALE
19 Dream 37.8 18.1 193.0 3750.0 MALE
22 Dream 40.2 17.1 193.0 3400.0 FEMALE
23 Dream 36.8 18.5 193.0 3500.0 FEMALE
26 Dream 41.5 18.5 201.0 4000.0 MALE
31 Dream 33.1 16.1 178.0 2900.0 FEMALE
32 Dream 37.2 18.1 178.0 3900.0 MALE
33 Dream 39.5 16.7 178.0 3250.0 FEMALE
35 Dream 36.0 18.5 186.0 3100.0 FEMALE
36 Dream 39.6 18.1 186.0 4450.0 MALE
38 Dream 41.3 20.3 194.0 3550.0 MALE
41 Dream 35.7 18.0 202.0 3550.0 FEMALE
51 Dream 38.1 17.6 187.0 3425.0 FEMALE
53 Dream 36.0 17.1 187.0 3700.0 FEMALE

25 rows × 6 columns

[146 rows x 6 columns in total]
# pick feature columns and label column
feature_names = ['island', 'culmen_length_mm', 'culmen_depth_mm', 'flipper_length_mm', 'sex']
feature_columns = training_data[feature_names]
label_columns = training_data[['body_mass_g']]

# also get the rows that we want to make predictions for (i.e. where the feature column is null)
missing_body_mass = adelie_data[adelie_data['body_mass_g'].isnull()]

Train and evaluate a linear regression model using the ML API#

from bigframes.ml.linear_model import LinearRegression

# as in scikit-learn, a newly created model is just a bundle of parameters
# default parameters are fine here
model = LinearRegression()

# this will train a temporary model in BigQuery Machine Learning
model.fit(feature_columns, label_columns)

# check how the model performed; per the output below this returns
# regression metrics (MAE, MSE, r2_score, etc.) computed on the given data
model.score(feature_columns, label_columns)
mean_absolute_error mean_squared_error mean_squared_log_error median_absolute_error r2_score explained_variance
0 223.878763 78553.601634 0.005614 181.330911 0.623951 0.623951

1 rows × 6 columns

[1 rows x 6 columns in total]

Make predictions using the model#

# Predict body mass for the rows where it was missing.
# NOTE(review): missing_body_mass still contains all remaining columns;
# presumably predict() uses only the trained feature columns — verify.
model.predict(missing_body_mass)
predicted_body_mass_g
292 3459.735118

1 rows × 1 columns

[1 rows x 1 columns in total]

Save the trained model to BigQuery, so we can load it later#

# Save the trained model as a permanent BQML model in our dataset,
# replacing any previous version
model.to_gbq(f"{DATASET}.penguins_model", replace=True)
LinearRegression()

Clean Up#

# Close the BigQuery DataFrames session, releasing session-scoped resources
bpd.close_session()