# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

Integrating with BigQuery DataFrames#

This notebook demonstrates operations for building applications that integrate with BigQuery DataFrames. Follow these samples to build an integration that accepts a BigQuery DataFrames object or returns one.

Attributing requests initiated by BigQuery DataFrames#

Partners are required to attribute API calls to BigQuery and other Google APIs. Where possible, this should be done via the User-Agent string, but can also be done via job labels if your integration doesn’t initialize the BigQuery DataFrames session.

Setting the User-Agent#

Set bpd.options.bigquery.application_name to a compliant string. Reach out to your Google Partner Engineering team contact for further instructions.

import bigframes.pandas as bpd

# Identify your integration to Google by setting the application name to the
# compliant string provided by your Google Partner Engineering team contact.
# Note: this option can only be set once per session, so it is most
# appropriate for partners who provide a Python + BigQuery DataFrames
# environment to their customers.
bpd.options.bigquery.application_name = "notebook-samples/1.0.0 (GPN:notebook-samples)"
import bigframes.pandas as bpd

# Sample data: a tiny DataFrame keyed by an explicit "index" column.
sample_data = {
    "index": [0, 1, 2, 3, 4],
    "int_col": [1, 2, 3, 4, 5],
    "float_col": [1.0, -0.5, 0.25, -0.125, 0.0625],
    "string_col": ["a", "b", "c", "d", "e"],
}
df = bpd.DataFrame(sample_data).set_index("index")
/home/swast/src/github.com/googleapis/python-bigquery-dataframes/bigframes/core/global_session.py:103: DefaultLocationWarning: No explicit location is set, so using location US for the session.
  _global_session = bigframes.session.connect(
Query job 1772ca28-2ef5-425c-87fe-8227aeb9318c is DONE. 0 Bytes processed. Open Job

Setting the job label#

If your application works with customer-created BigQuery DataFrames objects, you might not be able to set the user-agent header because the session has already started (watch https://github.com/googleapis/python-bigquery-dataframes/issues/833 for updates on this limitation). Instead, attach a label to the jobs your application initiates, such as when you are performing to_gbq() on an existing DataFrame, as described below.

Use bpd.option_context() so that the labels are only set during the operations your application performs.

# Scope the extra query label to just this operation so that jobs issued
# outside the context manager are not attributed to this application.
with bpd.option_context("compute.extra_query_labels", {"application-name": "notebook-samples"}):
    table_id = df.to_gbq()
Query job 33bd5814-b594-4ec4-baba-8f6b6e285e48 is DONE. 0 Bytes processed. Open Job

Accepting a BigQuery DataFrames (bigframes) DataFrame#

The recommended serialization format for a BigQuery DataFrames (bigframes) DataFrame is a BigQuery table. To write a DataFrame to a BigQuery table, use the DataFrame.to_gbq() method. With no destination_table, BigQuery DataFrames creates a table in the anonymous dataset corresponding to the BigQuery user & location and returns the corresponding table ID.

# With no destination table argument, to_gbq() writes to a table in the
# anonymous dataset for the current BigQuery user & location and returns
# the generated table ID.
table_id = df.to_gbq()
table_id
Query job 1594d97a-1203-4c28-8730-caffb3ac4e9e is DONE. 0 Bytes processed. Open Job
'bigframes-dev._63cfa399614a54153cc386c27d6c0c6fdb249f9e.bqdf20250530_session9fdc39_7578d5bd9949422599ccb9e4fe6451be'

Sharing the table with your application’s backend#

Tables created in the user’s anonymous dataset are only queryable by the user who created them. Many applications authenticate with a service account, which may be different from the end-user running BigQuery DataFrames (bigframes).

Grant your application access to this table by using an IAM policy to grant the roles/bigquery.dataViewer role on the BigQuery table to the service account your application associates with the customer.

# This sample assumes the client code knows which service account to share with.
your_service_account_email = "your-service-account@bigframes-samples.iam.gserviceaccount.com"


def df_to_gbq_plus_workload(df):
    """Write ``df`` to BigQuery, share the table, and start the workload.

    The DataFrame is written to the user's anonymous dataset with
    ``to_gbq()``, the resulting table is shared with the application's
    service account via an IAM policy binding, and the table ID is then
    handed off to the application workload.
    """
    table_id = df.to_gbq()

    # Reuse the already-authenticated BigQuery client from the DataFrame's
    # session rather than constructing a new one.
    bqclient = df.bqclient
    policy = bqclient.get_iam_policy(table_id)
    binding = {
        "role": "roles/bigquery.dataViewer",
        "members": {f"serviceAccount:{your_service_account_email}"},
    }
    policy.bindings.append(binding)
    bqclient.set_iam_policy(table_id, policy)

    # TODO(developer): Pass table_id to your application and start your workload.
    example_workload(table_id)


def example_workload(table_id):
    """Example application workload: read the shared table into pandas."""
    # For example, for one node workloads, use the client library to read the table
    # as a pandas DataFrame.
    from google.cloud import bigquery

    # This sample assumes this client is authenticated as the user
    # your_service_account_email.
    client = bigquery.Client()
    pandas_df = client.list_rows(table_id).to_dataframe()
    print(pandas_df)


df_to_gbq_plus_workload(df)
Query job 8afc1538-9779-487a-a063-def5f438ee11 is DONE. 0 Bytes processed. Open Job
   index  int_col  float_col string_col
0      1        2    -0.5000          b
1      2        3     0.2500          c
2      0        1     1.0000          a
3      3        4    -0.1250          d
4      4        5     0.0625          e
# This sample assumes the client code doesn't know which service account to share with.


def df_to_gbq_plus_workload(df):
    """Write ``df`` to BigQuery and hand off to the application backend.

    The end-user's short-lived access token is forwarded so that the backend
    can share the table with its own (backend-only) service account before
    starting the workload.
    """
    table_id = df.to_gbq()

    bqclient = df.bqclient
    # NOTE(review): ``_http`` is a private attribute of the BigQuery client;
    # confirm it is still available in the installed google-cloud-bigquery
    # version before relying on it.
    token = bqclient._http.credentials.token
    project_id = bqclient.project

    share_table_and_start_workload(table_id, token, project_id)


def share_table_and_start_workload(table_id, token, project_id):
    """Share ``table_id`` with the backend's service account, then run the workload."""
    # This code runs in the backend for your application.
    from google.cloud import bigquery
    import google.oauth2.credentials

    # Note: these credentials don't have any way to be refreshed,
    # so only use them long enough to share the table with the
    # service account.
    credentials = google.oauth2.credentials.Credentials(token)
    bqclient = bigquery.Client(
        project=project_id,
        credentials=credentials,
    )

    # This is assumed to only be available on the backend.
    your_service_account_email = "your-service-account@bigframes-samples.iam.gserviceaccount.com"
    policy = bqclient.get_iam_policy(table_id)
    binding = {
        "role": "roles/bigquery.dataViewer",
        "members": {f"serviceAccount:{your_service_account_email}"},
    }
    policy.bindings.append(binding)
    bqclient.set_iam_policy(table_id, policy)

    # Now that the table has been shared, bqclient with the temporary token
    # is no longer needed.
    example_workload(table_id)


def example_workload(table_id):
    """Example application workload: read the shared table into pandas."""
    # For example, for one node workloads, use the client library to read the table
    # as a pandas DataFrame.
    from google.cloud import bigquery

    # This sample assumes this client is authenticated as the user
    # your_service_account_email.
    client = bigquery.Client()
    pandas_df = client.list_rows(table_id).to_dataframe()
    print(pandas_df)


df_to_gbq_plus_workload(df)
Query job b6f68a49-5129-448d-bca3-62a23dced10d is DONE. 0 Bytes processed. Open Job
   index  int_col  float_col string_col
0      3        4    -0.1250          d
1      1        2    -0.5000          b
2      4        5     0.0625          e
3      2        3     0.2500          c
4      0        1     1.0000          a

Preserving order#

Depending on your use case, you may want to include the ordering so that it can be restored within your application.

# Write an extra column recording each row's position so that the ordering
# can be restored by the application; use a name unlikely to collide with
# existing columns.
ordering_column = "ordering_id_maybe_with_some_random_text_to_avoid_collisions"
table_id = df.to_gbq(ordering_id=ordering_column)
table_id
Query job 0f205180-cf26-46e5-950d-109947b7f5a1 is DONE. 0 Bytes processed. Open Job
'bigframes-dev._63cfa399614a54153cc386c27d6c0c6fdb249f9e.bqdf20250530_session9fdc39_240520e0723548f18fd3bd5d24cbbf82'

Creating clustered tables#

Large tables can be optimized by passing in clustering_columns to create a clustered table.

# Pass clustering_columns to create a clustered table, which can make
# queries over large tables more efficient.
table_id = df.to_gbq(clustering_columns=("index", "int_col"))
table_id
Query job 80177f9a-4f6e-4a4e-97db-f119ea686c62 is DONE. 0 Bytes processed. Open Job
'bigframes-dev._63cfa399614a54153cc386c27d6c0c6fdb249f9e.bqdf20250530_session9fdc39_4ca41d2f28f84feca1bbafe9304fd89f'

Returning a BigQuery DataFrames (bigframes) DataFrame#

The recommended way to construct a DataFrame is from a BigQuery table which has a unique primary key. By default a primary key is used as the index, which allows for more efficient queries than the default index generation.

This sample assumes there is a shared dataset that

  1. The application can write to and

  2. the bigframes user can read from.

There are many ways an application can write to a BigQuery table, including BigQuery load jobs, DML, streaming REST API, and the BigQuery Write API. Each has different costs, performance, and limitations. Choose the one that best suits your application’s needs.

# The assumption is that there is a shared dataset to work with.
from google.cloud import bigquery

# exists_ok=True makes this idempotent, so re-running the notebook doesn't
# fail if the dataset was already created.
bqclient = bigquery.Client()
bqclient.create_dataset("my_dataset", exists_ok=True)
Dataset(DatasetReference('bigframes-dev', 'my_dataset'))
# For simplicity, this sample assumes your application uses
# a load job with the CSV file format.
# See: https://cloud.google.com/bigquery/docs/loading-data-cloud-storage-csv#python
import datetime
import io
import random


def create_table_for_bigframes():
    """Create and populate a table well-suited to ``bpd.read_gbq_table``.

    Creates a clustered table with a (non-enforced) primary key via DDL,
    then loads a handful of sample rows from an in-memory CSV file.

    Returns:
        The fully-qualified ID of the newly created table.
    """
    # This code is assumed to run on the application's backend.
    from google.cloud import bigquery

    bqclient = bigquery.Client()

    # The end-user is expected to have read access to this table.
    table_suffix = f"{datetime.datetime.now().strftime('%Y%m%d_%H%M%S_%f')}_{random.randrange(1_000_000)}"
    table_id = f"{bqclient.project}.my_dataset.integrations_ipynb_{table_suffix}"

    # Best practice: set the primary key to a unique column to use as the
    # index and default ordering in a BigQuery DataFrames (bigframes) DataFrame.
    # Having a unique identity column allows the DataFrame to be constructed
    # more efficiently.
    #
    # Note 1: Even a random UUID would be helpful for efficiency.
    #
    # Note 2: Don't do this if you can't guarantee uniqueness, as the BigQuery
    # query engine uses this property to optimize queries. Non-unique primary
    # keys result in undefined behavior.
    #
    # Note 3: client.create_table doesn't support primary key, so instead
    # use DDL to create the table.
    ddl = f"""
    CREATE OR REPLACE TABLE `{table_id}`
    (
        unique_index INT64,
        state STRING,
        postal_code STRING,
        pop INT64,
        PRIMARY KEY (unique_index) NOT ENFORCED
    )
    -- Clustering by the index column can make joins and loc operations more efficient.
    -- Also cluster by columns which are expected to be used as common filters.
    CLUSTER BY unique_index, state
    """
    bqclient.query_and_wait(ddl)

    csv_buffer = io.BytesIO(
b"""unique_index,state,postal_code,pop
2,MI,48105,669
3,GA,30309,2581
5,TX,78701,5373
7,CO,80301,2087
11,MA,02142,2592
13,IL,60607,2630
17,MI,48201,2
19,NC,27701,801
23,CA,92612,1115
29,WA,98033,4952
"""
    )
    load_config = bigquery.LoadJobConfig(
        skip_leading_rows=1,
        source_format=bigquery.SourceFormat.CSV,
    )
    job = bqclient.load_table_from_file(csv_buffer, table_id, job_config=load_config)
    job.result()  # Waits for the job to complete.

    return table_id


table_id = create_table_for_bigframes()


# This is assumed to run on the client.
import bigframes.pandas as bpd

# Use the table's primary-key column as the index so the DataFrame can be
# constructed more efficiently than with a generated default index.
df = bpd.read_gbq_table(table_id, index_col=["unique_index"])
df
state postal_code pop
unique_index
2 MI 48105 669
3 GA 30309 2581
5 TX 78701 5373
7 CO 80301 2087
11 MA 02142 2592
13 IL 60607 2630
17 MI 48201 2
19 NC 27701 801
23 CA 92612 1115
29 WA 98033 4952

10 rows × 3 columns

[10 rows x 3 columns in total]