# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Set Up#
import bigframes.pandas as bpd
# Load the public baseball schedule table and keep only the three columns
# used by the examples below, then preview a few rows.
df = bpd.read_gbq(
    "bigquery-public-data.baseball.schedules"
)[["homeTeamName", "awayTeamName", "duration_minutes"]]
df.peek()
| | homeTeamName | awayTeamName | duration_minutes |
|---|---|---|---|
| 88 | Royals | Athletics | 176 |
| 106 | Dodgers | Giants | 216 |
| 166 | Phillies | Royals | 162 |
| 247 | Rangers | Royals | 161 |
| 374 | Athletics | Astros | 161 |
Notes#
The API reference documentation for the
remote_function can be found at https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.session.Session#bigframes_session_Session_remote_function
More code samples for
remote_function can be found in the BigQuery DataFrames API reference documentation, e.g.
https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.series.Series#bigframes_series_Series_apply
https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.dataframe.DataFrame#bigframes_dataframe_DataFrame_map
https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.dataframe.DataFrame#bigframes_dataframe_DataFrame_apply
The following examples are only for the purpose of demonstrating
remote_function usage. They are not necessarily the best way to achieve the end result.
In the examples in this notebook we are using
reuse=False just as a caution to avoid concurrent runs of this notebook in the same Google Cloud project stepping over each other’s remote function deployment. It may not be necessary in a simple use case.
Self-contained function#
Let’s consider a scenario where we want to categorize the matches as short,
medium or long duration based on the duration_minutes column.
@bpd.remote_function(reuse=False, cloud_function_service_account="default")
def duration_category(duration_minutes: int) -> str:
    """Classify a match duration as "short", "medium" or "long".

    Args:
        duration_minutes: Length of the match in minutes.

    Returns:
        "short" for durations under 90 minutes, "medium" for durations
        under 180 minutes, and "long" otherwise.
    """
    if duration_minutes < 90:
        return "short"
    elif duration_minutes < 180:
        return "medium"
    else:
        return "long"


# Report the names of the deployed cloud function and BQ remote function.
print(f"Created cloud function '{duration_category.bigframes_cloud_function}' and BQ remote function '{duration_category.bigframes_remote_function}'.")
/usr/local/google/home/arwas/src1/python-bigquery-dataframes/bigframes/functions/_function_session.py:335: UserWarning: You have not explicitly set a user-managed cloud_function_service_account. Using the default compute service account, {cloud_function_service_account}. To use Bigframes 2.0, please set an explicit user-managed cloud_function_service_account or set cloud_function_service_account explicitly to `default`.See, https://cloud.google.com/functions/docs/securing/function-identity.
warnings.warn(msg, category=UserWarning)
Created cloud function 'projects/bigframes-dev/locations/us-central1/functions/bigframes-sessionca6012-ca541a90249f8b62951f38b7aba6a711-49to' and BQ remote function 'bigframes-dev._ed1e4d0f7d41174ba506d34d15dccf040d13f69e.bigframes_sessionca6012_ca541a90249f8b62951f38b7aba6a711_49to'.
# Apply the remote function to every duration and attach the result as a new
# `duration_cat` column, then preview a few rows.
df1 = df.assign(
    duration_cat=df["duration_minutes"].apply(duration_category),
)
df1.peek()
| | homeTeamName | awayTeamName | duration_minutes | duration_cat |
|---|---|---|---|---|
| 1911 | Dodgers | Angels | 132 | medium |
| 2365 | Athletics | Angels | 134 | medium |
| 1977 | Athletics | Angels | 139 | medium |
| 554 | Cubs | Angels | 142 | medium |
| 654 | Astros | Angels | 143 | medium |
Function referring to variables outside the function body#
Let’s consider a slight variation of the earlier example where the labels for
the short, medium and long duration matches are defined outside the function
body. They would be captured at the time of remote_function deployment and
any change in their values in the notebook after the deployment will not
automatically propagate to the remote_function.
# Labels defined outside the function body. Their values are captured when
# the remote function is deployed; later changes in the notebook do not
# propagate to the deployed function automatically.
DURATION_CATEGORY_SHORT = "S"
DURATION_CATEGORY_MEDIUM = "M"
DURATION_CATEGORY_LONG = "L"


@bpd.remote_function(reuse=False, cloud_function_service_account="default")
def duration_category(duration_minutes: int) -> str:
    """Classify a match duration using the module-level category labels.

    Args:
        duration_minutes: Length of the match in minutes.

    Returns:
        DURATION_CATEGORY_SHORT for durations under 90 minutes,
        DURATION_CATEGORY_MEDIUM for durations under 180 minutes, and
        DURATION_CATEGORY_LONG otherwise.
    """
    if duration_minutes < 90:
        return DURATION_CATEGORY_SHORT
    elif duration_minutes < 180:
        return DURATION_CATEGORY_MEDIUM
    else:
        return DURATION_CATEGORY_LONG


# Report the names of the deployed cloud function and BQ remote function.
print(f"Created cloud function '{duration_category.bigframes_cloud_function}' and BQ remote function '{duration_category.bigframes_remote_function}'.")
Created cloud function 'projects/bigframes-dev/locations/us-central1/functions/bigframes-session54c8b0-4191f0fce98d46cc09359de47e203236-e009' and BQ remote function 'bigframes-dev._1b6c31ff1bcd5d2f6d86833cf8268317f1b12d57.bigframes_session54c8b0_4191f0fce98d46cc09359de47e203236_e009'.
# Label every match with the single-letter category produced by the remote
# function, then preview a few rows.
df1 = df.assign(
    duration_cat=df["duration_minutes"].apply(duration_category),
)
df1.peek()
| | homeTeamName | awayTeamName | duration_minutes | duration_cat |
|---|---|---|---|---|
| 1911 | Dodgers | Angels | 132 | M |
| 2365 | Athletics | Angels | 134 | M |
| 1977 | Athletics | Angels | 139 | M |
| 554 | Cubs | Angels | 142 | M |
| 654 | Astros | Angels | 143 | M |
Function referring to imports (built-in) outside the function body#
Let’s consider a scenario in which we want to categorize the matches in terms of
hour buckets. E.g. a match finishing in 0-60 minutes would be in 1h category,
61-120 minutes in 2h category and so on. The function itself makes use of the
math module (a built-in module in a standard python installation) which
happens to be imported outside the function body, let’s say in one of the
previous cells. For the demo purpose we have aliased the import to mymath, but
it is not necessary.
Later in the notebook we will see another example with a third-party module.
import math as mymath
@bpd.remote_function(reuse=False, cloud_function_service_account="default")
def duration_category(duration_minutes: int) -> str:
    """Bucket a match duration into whole hours, rounded up.

    Uses the `mymath` alias of the built-in `math` module, which is imported
    outside the function body.

    Args:
        duration_minutes: Length of the match in minutes.

    Returns:
        The hour bucket as a string such as "3h" (e.g. 132 minutes -> "3h").
    """
    duration_hours = mymath.ceil(duration_minutes / 60)
    return f"{duration_hours}h"


# Report the names of the deployed cloud function and BQ remote function.
print(f"Created cloud function '{duration_category.bigframes_cloud_function}' and BQ remote function '{duration_category.bigframes_remote_function}'.")
Created cloud function 'projects/bigframes-dev/locations/us-central1/functions/bigframes-session54c8b0-cf31fc2d2c7fe111afa5526f5a9cdf06-gmmo' and BQ remote function 'bigframes-dev._1b6c31ff1bcd5d2f6d86833cf8268317f1b12d57.bigframes_session54c8b0_cf31fc2d2c7fe111afa5526f5a9cdf06_gmmo'.
# Attach the hour bucket computed by the remote function as `duration_cat`,
# then preview a few rows.
df1 = df.assign(
    duration_cat=df["duration_minutes"].apply(duration_category),
)
df1.peek()
| | homeTeamName | awayTeamName | duration_minutes | duration_cat |
|---|---|---|---|---|
| 1911 | Dodgers | Angels | 132 | 3h |
| 2365 | Athletics | Angels | 134 | 3h |
| 1977 | Athletics | Angels | 139 | 3h |
| 554 | Cubs | Angels | 142 | 3h |
| 654 | Astros | Angels | 143 | 3h |
Function referring to another function outside the function body#
In this example let’s create a remote_function from a function
duration_category which depends upon another function get_hour_ceiling,
which further depends on another function get_minutes_in_hour. This dependency
chain could be even longer in a real world example. The behaviors of the
dependencies would be captured at the time of the remote function
deployment.
Please note that any changes in those functions in the notebook after the deployment would not automatically propagate to the remote function.
import math
def get_minutes_in_hour():
    """Return the number of minutes in one hour (60)."""
    return 60


def get_hour_ceiling(minutes):
    """Return the number of hours needed to cover `minutes`, rounded up.

    Args:
        minutes: A duration in minutes.

    Returns:
        `minutes` divided by `get_minutes_in_hour()`, rounded up to the next
        whole number with `math.ceil`.
    """
    return math.ceil(minutes / get_minutes_in_hour())
@bpd.remote_function(reuse=False, cloud_function_service_account="default")
def duration_category(duration_minutes: int) -> str:
    """Bucket a match duration into whole hours via `get_hour_ceiling`.

    The helper `get_hour_ceiling` is defined outside this function body.

    Args:
        duration_minutes: Length of the match in minutes.

    Returns:
        The hour bucket as a string such as "3 hrs".
    """
    duration_hours = get_hour_ceiling(duration_minutes)
    return f"{duration_hours} hrs"


# Report the names of the deployed cloud function and BQ remote function.
print(f"Created cloud function '{duration_category.bigframes_cloud_function}' and BQ remote function '{duration_category.bigframes_remote_function}'.")
Created cloud function 'projects/bigframes-dev/locations/us-central1/functions/bigframes-session54c8b0-3c03836c2044bf625d02e25ccdbfe101-k1m4' and BQ remote function 'bigframes-dev._1b6c31ff1bcd5d2f6d86833cf8268317f1b12d57.bigframes_session54c8b0_3c03836c2044bf625d02e25ccdbfe101_k1m4'.
# Attach the "<n> hrs" bucket computed by the remote function as
# `duration_cat`, then preview a few rows.
df1 = df.assign(
    duration_cat=df["duration_minutes"].apply(duration_category),
)
df1.peek()
| | homeTeamName | awayTeamName | duration_minutes | duration_cat |
|---|---|---|---|---|
| 1911 | Dodgers | Angels | 132 | 3 hrs |
| 2365 | Athletics | Angels | 134 | 3 hrs |
| 1977 | Athletics | Angels | 139 | 3 hrs |
| 554 | Cubs | Angels | 142 | 3 hrs |
| 654 | Astros | Angels | 143 | 3 hrs |
Function requiring external packages#
In this example let’s say we want to redact the homeTeamName values, and we
choose to use a third party library cryptography. Any third party dependencies
can be specified in pip format
(with or without version number) as a list via the packages parameter.
@bpd.remote_function(reuse=False, packages=["cryptography"], cloud_function_service_account="default")
def get_hash(input: str) -> str:
    """Redact a string by encrypting it with a freshly generated Fernet key.

    Args:
        input: The value to redact. `None` is treated as the empty string.

    Returns:
        The Fernet token for the input, decoded to `str`.
    """
    # Imported inside the function body; the package itself is requested via
    # `packages=["cryptography"]` on the decorator.
    from cryptography.fernet import Fernet

    # handle missing value
    if input is None:
        input = ""

    # A new key is generated on every call and is neither returned nor stored,
    # so the same input produces a different token each time.
    key = Fernet.generate_key()
    f = Fernet(key)
    return f.encrypt(input.encode()).decode()
# Add a redacted copy of the home team name produced by the remote function,
# then preview a few rows.
df1 = df.assign(
    homeTeamNameRedacted=df["homeTeamName"].apply(get_hash),
)
df1.peek()
| | homeTeamName | awayTeamName | duration_minutes | homeTeamNameRedacted |
|---|---|---|---|---|
| 641 | American League | National League | 185 | gAAAAABmo0n2I391cbYwIYeg8lyJq1MSFZatrtpvuUD5v-... |
| 349 | Angels | Astros | 187 | gAAAAABmo0n2pX-siRwl2tIZA4m--swndC_b7vgGXrqSNM... |
| 2349 | Angels | Astros | 160 | gAAAAABmo0n28Q9RwH62HvYRhTDpQ9lo8c6G8F5bnn7wgF... |
| 557 | Angels | Astros | 166 | gAAAAABmo0n2YlwHlSGQ0_XvXd-QVBtB_Lq2zUifu7vKhg... |
| 220 | Angels | Astros | 162 | gAAAAABmo0n2l8HMSGKYizxfEmRvGQy96mrjwx734-Rl_Z... |
Function referring to imports (third-party) outside the function body#
In this scenario the function depends on a third party library and the module
from the third party library used in the function is imported outside the
function body in a previous cell. Below is such an example where the third-party
dependency is humanize and its module of the same name is imported outside the
function body.
import datetime as dt
import humanize
@bpd.remote_function(reuse=False, packages=["humanize"], cloud_function_service_account="default")
def duration_category(duration_minutes: int) -> str:
    """Describe a match duration in natural language using `humanize`.

    Both `dt` (the `datetime` module) and `humanize` are imported outside
    the function body; `humanize` is additionally requested via `packages`.

    Args:
        duration_minutes: Length of the match in minutes.

    Returns:
        The result of `humanize.naturaldelta` for the duration, e.g.
        "2 hours".
    """
    timedelta = dt.timedelta(minutes=duration_minutes)
    return humanize.naturaldelta(timedelta)


# Report the names of the deployed cloud function and BQ remote function.
print(f"Created cloud function '{duration_category.bigframes_cloud_function}' and BQ remote function '{duration_category.bigframes_remote_function}'.")
Created cloud function 'projects/bigframes-dev/locations/us-central1/functions/bigframes-session54c8b0-a5e21a4ad488ce8b90de19c3c8cd33b6-0ab2' and BQ remote function 'bigframes-dev._1b6c31ff1bcd5d2f6d86833cf8268317f1b12d57.bigframes_session54c8b0_a5e21a4ad488ce8b90de19c3c8cd33b6_0ab2'.
# Attach the humanized duration computed by the remote function as
# `duration_cat`, then preview a few rows.
df1 = df.assign(
    duration_cat=df["duration_minutes"].apply(duration_category),
)
df1.peek()
| | homeTeamName | awayTeamName | duration_minutes | duration_cat |
|---|---|---|---|---|
| 1911 | Dodgers | Angels | 132 | 2 hours |
| 2365 | Athletics | Angels | 134 | 2 hours |
| 1977 | Athletics | Angels | 139 | 2 hours |
| 554 | Cubs | Angels | 142 | 2 hours |
| 654 | Astros | Angels | 143 | 2 hours |
Clean Up#
# Clean up: close the BigQuery DataFrames session used by this notebook.
# NOTE(review): presumably this also releases session-scoped resources — confirm against the API docs.
bpd.close_session()