{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "id": "ur8xi4C7S06n" }, "outputs": [], "source": [ "# Copyright 2023 Google LLC\n", "#\n", "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", "# you may not use this file except in compliance with the License.\n", "# You may obtain a copy of the License at\n", "#\n", "# https://www.apache.org/licenses/LICENSE-2.0\n", "#\n", "# Unless required by applicable law or agreed to in writing, software\n", "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", "# See the License for the specific language governing permissions and\n", "# limitations under the License." ] }, { "cell_type": "markdown", "metadata": { "id": "JAPoU8Sm5E6e" }, "source": [ "# Train a linear regression model with BigQuery DataFrames ML", "\n", "\n", "
\n",
" \n",
" Run in Colab\n",
" \n",
" | \n",
" \n",
" \n",
" \n",
" View on GitHub\n",
" \n",
" | \n",
" \n",
" \n",
" | \n",
" \n",
" \n",
" | \n",
"
| \n", " | species | \n", "island | \n", "culmen_length_mm | \n", "culmen_depth_mm | \n", "flipper_length_mm | \n", "body_mass_g | \n", "sex | \n", "
|---|---|---|---|---|---|---|---|
| 0 | \n", "Adelie Penguin (Pygoscelis adeliae) | \n", "Dream | \n", "36.6 | \n", "18.4 | \n", "184.0 | \n", "3475.0 | \n", "FEMALE | \n", "
| 1 | \n", "Adelie Penguin (Pygoscelis adeliae) | \n", "Dream | \n", "39.8 | \n", "19.1 | \n", "184.0 | \n", "4650.0 | \n", "MALE | \n", "
| 2 | \n", "Adelie Penguin (Pygoscelis adeliae) | \n", "Dream | \n", "40.9 | \n", "18.9 | \n", "184.0 | \n", "3900.0 | \n", "MALE | \n", "
| 3 | \n", "Chinstrap penguin (Pygoscelis antarctica) | \n", "Dream | \n", "46.5 | \n", "17.9 | \n", "192.0 | \n", "3500.0 | \n", "FEMALE | \n", "
| 4 | \n", "Adelie Penguin (Pygoscelis adeliae) | \n", "Dream | \n", "37.3 | \n", "16.8 | \n", "192.0 | \n", "3000.0 | \n", "FEMALE | \n", "
| \n", " | island | \n", "culmen_length_mm | \n", "culmen_depth_mm | \n", "flipper_length_mm | \n", "body_mass_g | \n", "sex | \n", "
|---|---|---|---|---|---|---|
| 0 | \n", "Dream | \n", "36.6 | \n", "18.4 | \n", "184.0 | \n", "3475.0 | \n", "FEMALE | \n", "
| 1 | \n", "Dream | \n", "39.8 | \n", "19.1 | \n", "184.0 | \n", "4650.0 | \n", "MALE | \n", "
| 2 | \n", "Dream | \n", "40.9 | \n", "18.9 | \n", "184.0 | \n", "3900.0 | \n", "MALE | \n", "
| 3 | \n", "Dream | \n", "37.3 | \n", "16.8 | \n", "192.0 | \n", "3000.0 | \n", "FEMALE | \n", "
| 4 | \n", "Dream | \n", "43.2 | \n", "18.5 | \n", "192.0 | \n", "4100.0 | \n", "MALE | \n", "
| 5 | \n", "Dream | \n", "40.2 | \n", "20.1 | \n", "200.0 | \n", "3975.0 | \n", "MALE | \n", "
| 6 | \n", "Dream | \n", "40.8 | \n", "18.9 | \n", "208.0 | \n", "4300.0 | \n", "MALE | \n", "
| 7 | \n", "Dream | \n", "39.0 | \n", "18.7 | \n", "185.0 | \n", "3650.0 | \n", "MALE | \n", "
| 8 | \n", "Dream | \n", "37.0 | \n", "16.9 | \n", "185.0 | \n", "3000.0 | \n", "FEMALE | \n", "
| 9 | \n", "Dream | \n", "34.0 | \n", "17.1 | \n", "185.0 | \n", "3400.0 | \n", "FEMALE | \n", "
10 rows × 6 columns
\n", "| \n", " | island | \n", "culmen_length_mm | \n", "culmen_depth_mm | \n", "flipper_length_mm | \n", "body_mass_g | \n", "sex | \n", "
|---|---|---|---|---|---|---|
| 0 | \n", "Dream | \n", "36.6 | \n", "18.4 | \n", "184.0 | \n", "3475.0 | \n", "FEMALE | \n", "
| 1 | \n", "Dream | \n", "39.8 | \n", "19.1 | \n", "184.0 | \n", "4650.0 | \n", "MALE | \n", "
| 2 | \n", "Dream | \n", "40.9 | \n", "18.9 | \n", "184.0 | \n", "3900.0 | \n", "MALE | \n", "
| 3 | \n", "Dream | \n", "37.3 | \n", "16.8 | \n", "192.0 | \n", "3000.0 | \n", "FEMALE | \n", "
| 4 | \n", "Dream | \n", "43.2 | \n", "18.5 | \n", "192.0 | \n", "4100.0 | \n", "MALE | \n", "
| 5 | \n", "Dream | \n", "40.2 | \n", "20.1 | \n", "200.0 | \n", "3975.0 | \n", "MALE | \n", "
| 6 | \n", "Dream | \n", "40.8 | \n", "18.9 | \n", "208.0 | \n", "4300.0 | \n", "MALE | \n", "
| 7 | \n", "Dream | \n", "39.0 | \n", "18.7 | \n", "185.0 | \n", "3650.0 | \n", "MALE | \n", "
| 8 | \n", "Dream | \n", "37.0 | \n", "16.9 | \n", "185.0 | \n", "3000.0 | \n", "FEMALE | \n", "
| 9 | \n", "Dream | \n", "34.0 | \n", "17.1 | \n", "185.0 | \n", "3400.0 | \n", "FEMALE | \n", "
10 rows × 6 columns
\n", "CREATE OR REPLACE MODEL `bigframes-dev.bqml_tutorial.penguin_weight`\n",
"OPTIONS(model_type = 'LINEAR_REG')\n",
"AS SELECT\n",
"`bfuid_col_3` AS `island`,\n",
"`bfuid_col_4` AS `culmen_length_mm`,\n",
"`bfuid_col_5` AS `culmen_depth_mm`,\n",
"`bfuid_col_6` AS `flipper_length_mm`,\n",
"`bfuid_col_7` AS `label`,\n",
"`bfuid_col_8` AS `sex`\n",
"FROM\n",
"(SELECT\n",
" `t0`.`bfuid_col_3`,\n",
" `t0`.`bfuid_col_4`,\n",
" `t0`.`bfuid_col_5`,\n",
" `t0`.`bfuid_col_6`,\n",
" `t0`.`bfuid_col_7`,\n",
" `t0`.`bfuid_col_8`\n",
"FROM `bigframes-dev._63cfa399614a54153cc386c27d6c0c6fdb249f9e._e154f0aa_5b29_492a_b464_a77c5f5a3dbd_bqdf_60fa3196-5a3e-45ae-898e-c2b473bfa1e9` AS `t0`)\n",
"| \n", " | mean_absolute_error | \n", "mean_squared_error | \n", "mean_squared_log_error | \n", "median_absolute_error | \n", "r2_score | \n", "explained_variance | \n", "
|---|---|---|---|---|---|---|
| 0 | \n", "223.878763 | \n", "78553.601634 | \n", "0.005614 | \n", "181.330911 | \n", "0.623951 | \n", "0.623951 | \n", "
1 rows × 6 columns
\n", "| \n", " | predicted_label | \n", "species | \n", "island | \n", "culmen_length_mm | \n", "culmen_depth_mm | \n", "flipper_length_mm | \n", "body_mass_g | \n", "sex | \n", "
|---|---|---|---|---|---|---|---|---|
| 0 | \n", "3945.010052 | \n", "Gentoo penguin (Pygoscelis papua) | \n", "Biscoe | \n", "<NA> | \n", "<NA> | \n", "<NA> | \n", "<NA> | \n", "<NA> | \n", "
| 1 | \n", "3914.916297 | \n", "Adelie Penguin (Pygoscelis adeliae) | \n", "Biscoe | \n", "39.7 | \n", "18.9 | \n", "184.0 | \n", "3550.0 | \n", "MALE | \n", "
| 2 | \n", "3278.611224 | \n", "Adelie Penguin (Pygoscelis adeliae) | \n", "Biscoe | \n", "36.4 | \n", "17.1 | \n", "184.0 | \n", "2850.0 | \n", "FEMALE | \n", "
| 3 | \n", "4006.367355 | \n", "Adelie Penguin (Pygoscelis adeliae) | \n", "Biscoe | \n", "41.6 | \n", "18.0 | \n", "192.0 | \n", "3950.0 | \n", "MALE | \n", "
| 4 | \n", "3417.610478 | \n", "Adelie Penguin (Pygoscelis adeliae) | \n", "Biscoe | \n", "35.0 | \n", "17.9 | \n", "192.0 | \n", "3725.0 | \n", "FEMALE | \n", "
| 5 | \n", "4009.612421 | \n", "Adelie Penguin (Pygoscelis adeliae) | \n", "Biscoe | \n", "41.1 | \n", "18.2 | \n", "192.0 | \n", "4050.0 | \n", "MALE | \n", "
| 6 | \n", "4231.330911 | \n", "Adelie Penguin (Pygoscelis adeliae) | \n", "Biscoe | \n", "42.0 | \n", "19.5 | \n", "200.0 | \n", "4050.0 | \n", "MALE | \n", "
| 7 | \n", "3554.308906 | \n", "Gentoo penguin (Pygoscelis papua) | \n", "Biscoe | \n", "43.8 | \n", "13.9 | \n", "208.0 | \n", "4300.0 | \n", "FEMALE | \n", "
| 8 | \n", "3550.677455 | \n", "Gentoo penguin (Pygoscelis papua) | \n", "Biscoe | \n", "43.3 | \n", "14.0 | \n", "208.0 | \n", "4575.0 | \n", "FEMALE | \n", "
| 9 | \n", "3537.882543 | \n", "Gentoo penguin (Pygoscelis papua) | \n", "Biscoe | \n", "44.0 | \n", "13.6 | \n", "208.0 | \n", "4350.0 | \n", "FEMALE | \n", "
10 rows × 8 columns
\n", "SELECT * FROM ML.EXPLAIN_PREDICT(MODEL `bigframes-dev.bqml_tutorial.penguin_weight`, (SELECT\n",
"`bfuid_col_22` AS `species`,\n",
"`bfuid_col_23` AS `island`,\n",
"`bfuid_col_24` AS `culmen_length_mm`,\n",
"`bfuid_col_25` AS `culmen_depth_mm`,\n",
"`bfuid_col_26` AS `flipper_length_mm`,\n",
"`bfuid_col_27` AS `body_mass_g`,\n",
"`bfuid_col_28` AS `sex`\n",
"FROM\n",
"(SELECT\n",
" `t0`.`species`,\n",
" `t0`.`island`,\n",
" `t0`.`culmen_length_mm`,\n",
" `t0`.`culmen_depth_mm`,\n",
" `t0`.`flipper_length_mm`,\n",
" `t0`.`body_mass_g`,\n",
" `t0`.`sex`,\n",
" `t0`.`species` AS `bfuid_col_22`,\n",
" `t0`.`island` AS `bfuid_col_23`,\n",
" `t0`.`culmen_length_mm` AS `bfuid_col_24`,\n",
" `t0`.`culmen_depth_mm` AS `bfuid_col_25`,\n",
" `t0`.`flipper_length_mm` AS `bfuid_col_26`,\n",
" `t0`.`body_mass_g` AS `bfuid_col_27`,\n",
" `t0`.`sex` AS `bfuid_col_28`,\n",
" regexp_contains(`t0`.`island`, 'Biscoe') AS `bfuid_col_29`\n",
"FROM (\n",
" SELECT\n",
" `species`,\n",
" `island`,\n",
" `culmen_length_mm`,\n",
" `culmen_depth_mm`,\n",
" `flipper_length_mm`,\n",
" `body_mass_g`,\n",
" `sex`\n",
" FROM `bigquery-public-data.ml_datasets.penguins` FOR SYSTEM_TIME AS OF TIMESTAMP('2025-12-03T16:30:18.272882+00:00')\n",
") AS `t0`\n",
"WHERE\n",
" regexp_contains(`t0`.`island`, 'Biscoe'))), STRUCT(3 AS top_k_features))\n",
"| \n", " | predicted_label | \n", "top_feature_attributions | \n", "baseline_prediction_value | \n", "prediction_value | \n", "approximation_error | \n", "species | \n", "island | \n", "culmen_length_mm | \n", "culmen_depth_mm | \n", "flipper_length_mm | \n", "body_mass_g | \n", "sex | \n", "
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | \n", "3945.010052 | \n", "[{'feature': 'island', 'attribution': 0.0}\n", " {'... | \n", "3945.010052 | \n", "3945.010052 | \n", "0.0 | \n", "Gentoo penguin (Pygoscelis papua) | \n", "Biscoe | \n", "<NA> | \n", "<NA> | \n", "<NA> | \n", "<NA> | \n", "<NA> | \n", "
| 1 | \n", "3914.916297 | \n", "[{'feature': 'flipper_length_mm', 'attribution... | \n", "3945.010052 | \n", "3914.916297 | \n", "0.0 | \n", "Adelie Penguin (Pygoscelis adeliae) | \n", "Biscoe | \n", "39.7 | \n", "18.9 | \n", "184.0 | \n", "3550.0 | \n", "MALE | \n", "
| 2 | \n", "3278.611224 | \n", "[{'feature': 'sex', 'attribution': -443.175184... | \n", "3945.010052 | \n", "3278.611224 | \n", "0.0 | \n", "Adelie Penguin (Pygoscelis adeliae) | \n", "Biscoe | \n", "36.4 | \n", "17.1 | \n", "184.0 | \n", "2850.0 | \n", "FEMALE | \n", "
| 3 | \n", "4006.367355 | \n", "[{'feature': 'culmen_length_mm', 'attribution'... | \n", "3945.010052 | \n", "4006.367355 | \n", "0.0 | \n", "Adelie Penguin (Pygoscelis adeliae) | \n", "Biscoe | \n", "41.6 | \n", "18.0 | \n", "192.0 | \n", "3950.0 | \n", "MALE | \n", "
| 4 | \n", "3417.610478 | \n", "[{'feature': 'sex', 'attribution': -443.175184... | \n", "3945.010052 | \n", "3417.610478 | \n", "0.0 | \n", "Adelie Penguin (Pygoscelis adeliae) | \n", "Biscoe | \n", "35.0 | \n", "17.9 | \n", "192.0 | \n", "3725.0 | \n", "FEMALE | \n", "
| 5 | \n", "4009.612421 | \n", "[{'feature': 'culmen_length_mm', 'attribution'... | \n", "3945.010052 | \n", "4009.612421 | \n", "0.0 | \n", "Adelie Penguin (Pygoscelis adeliae) | \n", "Biscoe | \n", "41.1 | \n", "18.2 | \n", "192.0 | \n", "4050.0 | \n", "MALE | \n", "
| 6 | \n", "4231.330911 | \n", "[{'feature': 'flipper_length_mm', 'attribution... | \n", "3945.010052 | \n", "4231.330911 | \n", "0.0 | \n", "Adelie Penguin (Pygoscelis adeliae) | \n", "Biscoe | \n", "42.0 | \n", "19.5 | \n", "200.0 | \n", "4050.0 | \n", "MALE | \n", "
| 7 | \n", "3554.308906 | \n", "[{'feature': 'sex', 'attribution': -443.175184... | \n", "3945.010052 | \n", "3554.308906 | \n", "0.0 | \n", "Gentoo penguin (Pygoscelis papua) | \n", "Biscoe | \n", "43.8 | \n", "13.9 | \n", "208.0 | \n", "4300.0 | \n", "FEMALE | \n", "
| 8 | \n", "3550.677455 | \n", "[{'feature': 'sex', 'attribution': -443.175184... | \n", "3945.010052 | \n", "3550.677455 | \n", "0.0 | \n", "Gentoo penguin (Pygoscelis papua) | \n", "Biscoe | \n", "43.3 | \n", "14.0 | \n", "208.0 | \n", "4575.0 | \n", "FEMALE | \n", "
| 9 | \n", "3537.882543 | \n", "[{'feature': 'sex', 'attribution': -443.175184... | \n", "3945.010052 | \n", "3537.882543 | \n", "0.0 | \n", "Gentoo penguin (Pygoscelis papua) | \n", "Biscoe | \n", "44.0 | \n", "13.6 | \n", "208.0 | \n", "4350.0 | \n", "FEMALE | \n", "
10 rows × 12 columns
\n", "| \n", " | feature | \n", "attribution | \n", "
|---|---|---|
| 0 | \n", "sex | \n", "221.587592 | \n", "
| 1 | \n", "flipper_length_mm | \n", "71.311846 | \n", "
| 2 | \n", "culmen_depth_mm | \n", "66.17986 | \n", "
| 3 | \n", "culmen_length_mm | \n", "45.443363 | \n", "
| 4 | \n", "island | \n", "17.258076 | \n", "
5 rows × 2 columns
\n", "SELECT * FROM ML.PREDICT(MODEL `bigframes-dev.bqml_tutorial.penguin_weight_with_global_explain`, (SELECT\n",
"`column_0` AS `sex`,\n",
"`column_1` AS `flipper_length_mm`,\n",
"`column_2` AS `culmen_depth_mm`,\n",
"`column_3` AS `culmen_length_mm`,\n",
"`column_4` AS `island`\n",
"FROM\n",
"(SELECT\n",
" *\n",
"FROM (\n",
" SELECT\n",
" *\n",
" FROM UNNEST(ARRAY<STRUCT<`column_0` STRING, `column_1` INT64, `column_2` INT64, `column_3` INT64, `column_4` STRING>>[STRUCT('MALE', 180, 15, 40, 'Biscoe'), STRUCT('FEMALE', 190, 16, 41, 'Biscoe'), STRUCT('MALE', 200, 17, 42, 'Dream'), STRUCT('FEMALE', 210, 18, 43, 'Dream')]) AS `column_0`\n",
") AS `t0`)))\n",
"| \n", " | predicted_body_mass_g | \n", "sex | \n", "flipper_length_mm | \n", "culmen_depth_mm | \n", "culmen_length_mm | \n", "island | \n", "
|---|---|---|---|---|---|---|
| 0 | \n", "3596.332211 | \n", "MALE | \n", "180 | \n", "15 | \n", "40 | \n", "Biscoe | \n", "
| 1 | \n", "3384.699918 | \n", "FEMALE | \n", "190 | \n", "16 | \n", "41 | \n", "Biscoe | \n", "
| 2 | \n", "4049.581796 | \n", "MALE | \n", "200 | \n", "17 | \n", "42 | \n", "Dream | \n", "
| 3 | \n", "3837.949503 | \n", "FEMALE | \n", "210 | \n", "18 | \n", "43 | \n", "Dream | \n", "
| \n", " | mean_absolute_error | \n", "mean_squared_error | \n", "mean_squared_log_error | \n", "median_absolute_error | \n", "r2_score | \n", "explained_variance | \n", "
|---|---|---|---|---|---|---|
| 0 | \n", "223.878763 | \n", "78553.601634 | \n", "0.005614 | \n", "181.330911 | \n", "0.623951 | \n", "0.623951 | \n", "
1 rows × 6 columns
\n", "