# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

Set Up#

import bigframes.pandas as bpd
# Load the public baseball schedules table, keeping only the columns the
# examples below work with, then preview a few rows.
schedule_columns = ["homeTeamName", "awayTeamName", "duration_minutes"]
df = bpd.read_gbq("bigquery-public-data.baseball.schedules")[schedule_columns]
df.peek()
Query job 1f6094e9-1942-477c-9ce3-87a614d71294 is DONE. 0 Bytes processed. Open Job
Query job ba19f29c-33d3-4f12-9605-ddeafb74918e is DONE. 582.8 kB processed. Open Job
Query job dd1ff8be-700a-4ce5-91a0-31413f70cfad is DONE. 82.0 kB processed. Open Job
homeTeamName awayTeamName duration_minutes
88 Royals Athletics 176
106 Dodgers Giants 216
166 Phillies Royals 162
247 Rangers Royals 161
374 Athletics Astros 161

Notes#

  • The API reference documentation for the remote_function can be found at https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.session.Session#bigframes_session_Session_remote_function

  • More code samples for remote_function can be found in the BigQuery DataFrames API reference documentation, e.g.

    • https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.series.Series#bigframes_series_Series_apply

    • https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.dataframe.DataFrame#bigframes_dataframe_DataFrame_map

    • https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.dataframe.DataFrame#bigframes_dataframe_DataFrame_apply

  • The following examples are only for the purpose of demonstrating remote_function usage. They are not necessarily the best way to achieve the end result.

  • In the examples in this notebook we use reuse=False purely as a precaution, so that concurrent runs of this notebook in the same Google Cloud project do not step on each other’s remote function deployments. It may not be necessary in a simple use case.

Self-contained function#

Let’s consider a scenario where we want to categorize the matches as short, medium or long duration based on the duration_minutes column.

@bpd.remote_function(reuse=False, cloud_function_service_account="default")
def duration_category(duration_minutes: int) -> str:
    """Bucket a match duration in minutes as "short", "medium" or "long".

    Durations under 90 minutes are short, durations from 90 up to (but not
    including) 180 minutes are medium, and everything else is long.
    """
    if duration_minutes < 90:
        return "short"
    if duration_minutes < 180:
        return "medium"
    return "long"

print(f"Created cloud function '{duration_category.bigframes_cloud_function}' and BQ remote function '{duration_category.bigframes_remote_function}'.")
/usr/local/google/home/arwas/src1/python-bigquery-dataframes/bigframes/functions/_function_session.py:335: UserWarning: You have not explicitly set a user-managed cloud_function_service_account. Using the default compute service account, {cloud_function_service_account}. To use Bigframes 2.0, please set an explicit user-managed cloud_function_service_account or set cloud_function_service_account explicitly to `default`.See, https://cloud.google.com/functions/docs/securing/function-identity.
  warnings.warn(msg, category=UserWarning)
Query job 7c021760-59c4-4f3a-846c-9693a4d16eef is DONE. 0 Bytes processed. Open Job
Created cloud function 'projects/bigframes-dev/locations/us-central1/functions/bigframes-sessionca6012-ca541a90249f8b62951f38b7aba6a711-49to' and BQ remote function 'bigframes-dev._ed1e4d0f7d41174ba506d34d15dccf040d13f69e.bigframes_sessionca6012_ca541a90249f8b62951f38b7aba6a711_49to'.
# Run the remote function over every duration and attach the labels as a
# new column, then preview the result.
duration_labels = df["duration_minutes"].apply(duration_category)
df1 = df.assign(duration_cat=duration_labels)
df1.peek()
Query job 4b116e3e-d4d3-4eb6-9764-0a29a7c5d036 is DONE. 58.3 kB processed. Open Job
Query job d62ac4f0-47c9-47ae-8611-c9ecf78f20c9 is DONE. 157.2 kB processed. Open Job
Query job 5f876ebb-2d95-4c68-9d84-947e02b37bad is DONE. 98.8 kB processed. Open Job
homeTeamName awayTeamName duration_minutes duration_cat
1911 Dodgers Angels 132 medium
2365 Athletics Angels 134 medium
1977 Athletics Angels 139 medium
554 Cubs Angels 142 medium
654 Astros Angels 143 medium

Function referring to variables outside the function body#

Let’s consider a slight variation of the earlier example where the labels for the short, medium and long duration matches are defined outside the function body. They would be captured at the time of remote_function deployment and any change in their values in the notebook after the deployment will not automatically propagate to the remote_function.

# Labels defined outside the function body; the function below refers to
# them by name.
DURATION_CATEGORY_SHORT = "S"
DURATION_CATEGORY_MEDIUM = "M"
DURATION_CATEGORY_LONG = "L"
@bpd.remote_function(reuse=False, cloud_function_service_account="default")
def duration_category(duration_minutes: int) -> str:
    """Return the module-level label for a match duration in minutes.

    Under 90 minutes maps to DURATION_CATEGORY_SHORT, under 180 minutes to
    DURATION_CATEGORY_MEDIUM, and anything else to DURATION_CATEGORY_LONG.
    """
    label = DURATION_CATEGORY_LONG
    if duration_minutes < 90:
        label = DURATION_CATEGORY_SHORT
    elif duration_minutes < 180:
        label = DURATION_CATEGORY_MEDIUM
    return label

print(f"Created cloud function '{duration_category.bigframes_cloud_function}' and BQ remote function '{duration_category.bigframes_remote_function}'.")
Query job 1909a652-5735-401b-8a77-674d8539ded0 is DONE. 0 Bytes processed. Open Job
Created cloud function 'projects/bigframes-dev/locations/us-central1/functions/bigframes-session54c8b0-4191f0fce98d46cc09359de47e203236-e009' and BQ remote function 'bigframes-dev._1b6c31ff1bcd5d2f6d86833cf8268317f1b12d57.bigframes_session54c8b0_4191f0fce98d46cc09359de47e203236_e009'.
# Apply the redeployed remote function to the durations and preview the
# labelled rows.
duration_labels = df["duration_minutes"].apply(duration_category)
df1 = df.assign(duration_cat=duration_labels)
df1.peek()
Query job a942bdc5-6a6d-4db8-b2aa-a556197377b3 is DONE. 58.3 kB processed. Open Job
Query job 175ae9d3-604f-495b-a167-8b06c0283bd2 is DONE. 147.7 kB processed. Open Job
Query job d331a785-e574-45c9-86c8-d29ddd79a4d1 is DONE. 89.3 kB processed. Open Job
homeTeamName awayTeamName duration_minutes duration_cat
1911 Dodgers Angels 132 M
2365 Athletics Angels 134 M
1977 Athletics Angels 139 M
554 Cubs Angels 142 M
654 Astros Angels 143 M

Function referring to imports (built-in) outside the function body#

Let’s consider a scenario in which we want to categorize the matches in terms of hour buckets. E.g. a match finishing in 0-60 minutes would be in the 1h category, 61-120 minutes in the 2h category, and so on. The function itself makes use of the math module (a built-in module in a standard Python installation), which happens to be imported outside the function body, let’s say in one of the previous cells. For demo purposes we have aliased the import to mymath, but that is not necessary.

Later in the notebook we will see another example with a third-party module.

import math as mymath
@bpd.remote_function(reuse=False, cloud_function_service_account="default")
def duration_category(duration_minutes: int) -> str:
    """Return the hour bucket for a duration in minutes, e.g. 132 -> "3h".

    The number of hours is rounded up using the ``mymath`` alias that is
    imported outside the function body.
    """
    return f"{mymath.ceil(duration_minutes / 60)}h"

print(f"Created cloud function '{duration_category.bigframes_cloud_function}' and BQ remote function '{duration_category.bigframes_remote_function}'.")
Query job bbc0b78f-bc04-4bd5-b711-399786a51519 is DONE. 0 Bytes processed. Open Job
Created cloud function 'projects/bigframes-dev/locations/us-central1/functions/bigframes-session54c8b0-cf31fc2d2c7fe111afa5526f5a9cdf06-gmmo' and BQ remote function 'bigframes-dev._1b6c31ff1bcd5d2f6d86833cf8268317f1b12d57.bigframes_session54c8b0_cf31fc2d2c7fe111afa5526f5a9cdf06_gmmo'.
# Compute the hour bucket for every match and preview the result.
hour_buckets = df["duration_minutes"].apply(duration_category)
df1 = df.assign(duration_cat=hour_buckets)
df1.peek()
Query job 991b54ed-9eaa-450f-9208-3e73404bb112 is DONE. 58.3 kB processed. Open Job
Query job 4e464a58-ac5b-42fd-91e3-92c115bdd273 is DONE. 150.1 kB processed. Open Job
Query job d340f55d-1511-431a-970d-a70ed4356935 is DONE. 91.7 kB processed. Open Job
homeTeamName awayTeamName duration_minutes duration_cat
1911 Dodgers Angels 132 3h
2365 Athletics Angels 134 3h
1977 Athletics Angels 139 3h
554 Cubs Angels 142 3h
654 Astros Angels 143 3h

Function referring to another function outside the function body#

In this example let’s create a remote_function from a function duration_category which depends upon another function get_hour_ceiling, which further depends on another function get_minutes_in_hour. This dependency chain could be even longer in a real world example. The behaviors of the dependencies would be captured at the time of the remote function deployment.

Please note that any changes in those functions in the notebook after the deployment would not automatically propagate to the remote function.

import math

def get_minutes_in_hour():
    """Return the number of minutes in one hour."""
    minutes_per_hour = 60
    return minutes_per_hour

def get_hour_ceiling(minutes):
    """Return ``minutes`` converted to hours, rounded up to a whole hour.

    The minutes-per-hour factor comes from ``get_minutes_in_hour``.
    """
    hours = minutes / get_minutes_in_hour()
    return math.ceil(hours)
@bpd.remote_function(reuse=False, cloud_function_service_account="default")
def duration_category(duration_minutes: int) -> str:
    """Return the duration as a rounded-up hour count, e.g. 132 -> "3 hrs".

    The rounding is delegated to ``get_hour_ceiling``, which is defined
    outside this function body.
    """
    return f"{get_hour_ceiling(duration_minutes)} hrs"

print(f"Created cloud function '{duration_category.bigframes_cloud_function}' and BQ remote function '{duration_category.bigframes_remote_function}'.")
Query job 10d1afa3-349b-49a8-adbd-79a8309ce77c is DONE. 0 Bytes processed. Open Job
Created cloud function 'projects/bigframes-dev/locations/us-central1/functions/bigframes-session54c8b0-3c03836c2044bf625d02e25ccdbfe101-k1m4' and BQ remote function 'bigframes-dev._1b6c31ff1bcd5d2f6d86833cf8268317f1b12d57.bigframes_session54c8b0_3c03836c2044bf625d02e25ccdbfe101_k1m4'.
# Label each match with its rounded-up hour count and preview the result.
hour_labels = df["duration_minutes"].apply(duration_category)
df1 = df.assign(duration_cat=hour_labels)
df1.peek()
Query job 33aff336-48d6-4caa-8cae-f459d21b180e is DONE. 58.3 kB processed. Open Job
Query job 561e0aa7-3962-4ef3-b308-a117a0ac3a7d is DONE. 157.4 kB processed. Open Job
Query job 759dccf8-3d88-40e1-a38a-2a2064e1d269 is DONE. 99.0 kB processed. Open Job
homeTeamName awayTeamName duration_minutes duration_cat
1911 Dodgers Angels 132 3 hrs
2365 Athletics Angels 134 3 hrs
1977 Athletics Angels 139 3 hrs
554 Cubs Angels 142 3 hrs
654 Astros Angels 143 3 hrs

Function requiring external packages#

In this example let’s say we want to redact the homeTeamName values, and we choose to use a third party library cryptography. Any third party dependencies can be specified in pip format (with or without version number) as a list via the packages parameter.

@bpd.remote_function(reuse=False, packages=["cryptography"], cloud_function_service_account="default")
def get_hash(input: str) -> str:
    """Return ``input`` encrypted as a Fernet token, decoded to text.

    A missing value (``None``) is treated as the empty string. A new key is
    generated on every call and is neither returned nor stored here, so the
    returned token acts as a redacted stand-in for the original value.
    """
    from cryptography.fernet import Fernet

    # handle missing value
    plaintext = "" if input is None else input

    cipher = Fernet(Fernet.generate_key())
    token = cipher.encrypt(plaintext.encode())
    return token.decode()
Query job e2a44878-2564-44a5-8dec-b7ea2f42afd4 is DONE. 0 Bytes processed. Open Job
# Redact every home team name through the remote function and preview the
# result next to the original columns.
redacted_names = df["homeTeamName"].apply(get_hash)
df1 = df.assign(homeTeamNameRedacted=redacted_names)
df1.peek()
Query job bcfab000-ca19-4633-bf0e-45e7d053f3eb is DONE. 60.5 kB processed. Open Job
Query job 139a6449-c07e-41ff-9aed-c6fdd633740a is DONE. 388.3 kB processed. Open Job
Query job 035fa2fb-0a55-4358-bb50-3ef915f5bf54 is DONE. 330.0 kB processed. Open Job
homeTeamName awayTeamName duration_minutes homeTeamNameRedacted
641 American League National League 185 gAAAAABmo0n2I391cbYwIYeg8lyJq1MSFZatrtpvuUD5v-...
349 Angels Astros 187 gAAAAABmo0n2pX-siRwl2tIZA4m--swndC_b7vgGXrqSNM...
2349 Angels Astros 160 gAAAAABmo0n28Q9RwH62HvYRhTDpQ9lo8c6G8F5bnn7wgF...
557 Angels Astros 166 gAAAAABmo0n2YlwHlSGQ0_XvXd-QVBtB_Lq2zUifu7vKhg...
220 Angels Astros 162 gAAAAABmo0n2l8HMSGKYizxfEmRvGQy96mrjwx734-Rl_Z...

Function referring to imports (third-party) outside the function body#

In this scenario the function depends on a third party library and the module from the third party library used in the function is imported outside the function body in a previous cell. Below is such an example where the third-party dependency is humanize and its module of the same name is imported outside the function body.

import datetime as dt
import humanize
@bpd.remote_function(reuse=False, packages=["humanize"], cloud_function_service_account="default")
def duration_category(duration_minutes: int) -> str:
    """Return a natural-language description of a duration given in minutes.

    Uses the ``dt`` and ``humanize`` modules imported outside the function
    body.
    """
    return humanize.naturaldelta(dt.timedelta(minutes=duration_minutes))

print(f"Created cloud function '{duration_category.bigframes_cloud_function}' and BQ remote function '{duration_category.bigframes_remote_function}'.")
Query job af73ab2d-8d88-4cbe-863f-d35e48af84e1 is DONE. 0 Bytes processed. Open Job
Created cloud function 'projects/bigframes-dev/locations/us-central1/functions/bigframes-session54c8b0-a5e21a4ad488ce8b90de19c3c8cd33b6-0ab2' and BQ remote function 'bigframes-dev._1b6c31ff1bcd5d2f6d86833cf8268317f1b12d57.bigframes_session54c8b0_a5e21a4ad488ce8b90de19c3c8cd33b6_0ab2'.
# Describe every match duration in natural language and preview the result.
duration_descriptions = df["duration_minutes"].apply(duration_category)
df1 = df.assign(duration_cat=duration_descriptions)
df1.peek()
Query job 0a9ac329-619d-4303-8dbd-176a576d4ce8 is DONE. 58.3 kB processed. Open Job
Query job 456bb9b4-0576-4c04-b707-4a04496aa538 is DONE. 162.2 kB processed. Open Job
Query job 37f59939-5d2c-4fb1-839b-282ae3702d3d is DONE. 103.9 kB processed. Open Job
homeTeamName awayTeamName duration_minutes duration_cat
1911 Dodgers Angels 132 2 hours
2365 Athletics Angels 134 2 hours
1977 Athletics Angels 139 2 hours
554 Cubs Angels 142 2 hours
654 Astros Angels 143 2 hours

Clean Up#

# Clean up: close the BigQuery DataFrames session used by this notebook.
bpd.close_session()