# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Integrating with BigQuery DataFrames#
This notebook demonstrates operations for building applications that integrate with BigQuery DataFrames. Follow these samples to build an integration that accepts a BigQuery DataFrames object or returns one.
Attributing requests initiated by BigQuery DataFrames#
Partners are required to attribute API calls to BigQuery and other Google APIs. Where possible, this should be done via the User-Agent string, but can also be done via job labels if your integration doesn’t initialize the BigQuery DataFrames session.
Setting the User-Agent#
Set bpd.options.bigquery.application_name to a compliant string. Reach out to your Google Partner Engineering team contact for further instructions.
import bigframes.pandas as bpd

# Set this to the string informed by your Google Partner Engineering team contact.
# The value is appended to the User-Agent header on BigQuery API requests so that
# usage can be attributed to the partner integration.
# Note: This can only be set once per session, so is most appropriate for partners
# who provide a Python + BigQuery DataFrames environment to their customers.
bpd.options.bigquery.application_name = "notebook-samples/1.0.0 (GPN:notebook-samples)"
import bigframes.pandas as bpd

# Build a small demonstration frame: one column per common dtype, plus an
# explicit "index" column that is promoted to the DataFrame index.
data = {
    "index": [0, 1, 2, 3, 4],
    "int_col": [1, 2, 3, 4, 5],
    "float_col": [1.0, -0.5, 0.25, -0.125, 0.0625],
    "string_col": ["a", "b", "c", "d", "e"],
}
df = bpd.DataFrame(data).set_index("index")
/home/swast/src/github.com/googleapis/python-bigquery-dataframes/bigframes/core/global_session.py:103: DefaultLocationWarning: No explicit location is set, so using location US for the session.
_global_session = bigframes.session.connect(
Setting the job label#
If your application works with customer-created BigQuery DataFrames objects, you might not be able to set the user-agent header because the session has already started (watch https://github.com/googleapis/python-bigquery-dataframes/issues/833 for updates on this limitation). Instead, attach a label to the jobs your application initiates, such as when you are performing to_gbq() on an existing DataFrame, as described below.
Use bpd.option_context() so that the labels are only set during the operations your application performs.
# Attach the partner attribution label only for the duration of this export;
# option_context restores the previous label configuration on exit.
labels = {"application-name": "notebook-samples"}
with bpd.option_context("compute.extra_query_labels", labels):
    table_id = df.to_gbq()
Accepting a BigQuery DataFrames (bigframes) DataFrame#
The recommended serialization format for a BigQuery DataFrames (bigframes) DataFrame is a BigQuery table. To write a DataFrame to a BigQuery table, use the DataFrame.to_gbq() method. With no destination_table, BigQuery DataFrames creates a table in the anonymous dataset corresponding to the BigQuery user & location and returns the corresponding table ID.
# With no destination table argument, to_gbq() writes to a table in the
# anonymous dataset for this user and location and returns its table ID.
table_id = df.to_gbq()
table_id
'bigframes-dev._63cfa399614a54153cc386c27d6c0c6fdb249f9e.bqdf20250530_session9fdc39_7578d5bd9949422599ccb9e4fe6451be'
Preserving order#
Depending on your use case, you may want to include the ordering so that it can be restored within your application.
# Persist the frame's row order in an extra column so the consumer can
# restore it; the unusual name guards against clashing with real columns.
order_col = "ordering_id_maybe_with_some_random_text_to_avoid_collisions"
table_id = df.to_gbq(ordering_id=order_col)
table_id
'bigframes-dev._63cfa399614a54153cc386c27d6c0c6fdb249f9e.bqdf20250530_session9fdc39_240520e0723548f18fd3bd5d24cbbf82'
Creating clustered tables#
Large tables can be optimized by passing in clustering_columns to create a clustered table.
# Write the frame to a clustered table; clustering on the index and a common
# filter column speeds up lookups on large tables.
cluster_cols = ("index", "int_col")
table_id = df.to_gbq(clustering_columns=cluster_cols)
table_id
'bigframes-dev._63cfa399614a54153cc386c27d6c0c6fdb249f9e.bqdf20250530_session9fdc39_4ca41d2f28f84feca1bbafe9304fd89f'
Returning a BigQuery DataFrames (bigframes) DataFrame#
The recommended way to construct a DataFrame is from a BigQuery table which has a unique primary key. By default a primary key is used as the index, which allows for more efficient queries than the default index generation.
This sample assumes there is a shared dataset that:
the application can write to, and
the bigframes user can read from.
There are many ways an application can write to a BigQuery table, including BigQuery load jobs, DML, streaming REST API, and the BigQuery Write API. Each has different costs, performance, and limitations. Choose the one that best suits your application’s needs.
# The assumption is that there is a shared dataset to work with.
from google.cloud import bigquery

bqclient = bigquery.Client()

# exists_ok makes this idempotent across notebook re-runs.
dataset_id = "my_dataset"
bqclient.create_dataset(dataset_id, exists_ok=True)
Dataset(DatasetReference('bigframes-dev', 'my_dataset'))
# For simplicity, this sample assumes your application uses
# a load job with the CSV file format.
# See: https://cloud.google.com/bigquery/docs/loading-data-cloud-storage-csv#python
def create_table_for_bigframes():
    """Create and populate a table for BigQuery DataFrames to read.

    This code is assumed to run on the application's backend. The end-user
    is expected to have read access to the resulting table.

    Returns:
        str: The fully-qualified ID of the newly created table.
    """
    from google.cloud import bigquery

    bq = bigquery.Client()

    # A timestamp plus a random component keeps concurrent runs from
    # colliding on the same table name.
    table_suffix = f"{datetime.datetime.now().strftime('%Y%m%d_%H%M%S_%f')}_{random.randrange(1_000_000)}"
    table_id = f"{bq.project}.my_dataset.integrations_ipynb_{table_suffix}"

    # Best practice: set the primary key to a unique column to use as the
    # index and default ordering in a BigQuery DataFrames (bigframes) DataFrame.
    # Having a unique identity column allows the DataFrame to be constructed
    # more efficiently.
    #
    # Note 1: Even a random UUID would be helpful for efficiency.
    #
    # Note 2: Don't do this if you can't guarantee uniqueness, as the BigQuery
    # query engine uses this property to optimize queries. Non-unique primary
    # keys result in undefined behavior.
    #
    # Note 3: client.create_table doesn't support primary key, so instead
    # use DDL to create the table.
    create_table_ddl = f"""
    CREATE OR REPLACE TABLE `{table_id}`
    (
        unique_index INT64,
        state STRING,
        postal_code STRING,
        pop INT64,
        PRIMARY KEY (unique_index) NOT ENFORCED
    )
    -- Clustering by the index column can make joins and loc operations more efficient.
    -- Also cluster by columns which are expected to be used as common filters.
    CLUSTER BY unique_index, state
    """
    bq.query_and_wait(create_table_ddl)

    # In-memory CSV standing in for the data your application would produce.
    # Rows inside the bytes literal are intentionally not indented: the
    # literal's content is the exact payload sent to the load job.
    csv_payload = io.BytesIO(
        b"""unique_index,state,postal_code,pop
2,MI,48105,669
3,GA,30309,2581
5,TX,78701,5373
7,CO,80301,2087
11,MA,02142,2592
13,IL,60607,2630
17,MI,48201,2
19,NC,27701,801
23,CA,92612,1115
29,WA,98033,4952
"""
    )
    job_config = bigquery.LoadJobConfig(
        skip_leading_rows=1,
        source_format=bigquery.SourceFormat.CSV,
    )
    load_job = bq.load_table_from_file(csv_payload, table_id, job_config=job_config)
    load_job.result()  # Waits for the job to complete.
    return table_id


table_id = create_table_for_bigframes()
# This is assumed to run on the client.
import bigframes.pandas as bpd

# Using the table's primary key column as the index lets bigframes build the
# DataFrame without generating a default ordering.
index_cols = ["unique_index"]
df = bpd.read_gbq_table(table_id, index_col=index_cols)
df
| state | postal_code | pop | |
|---|---|---|---|
| unique_index | |||
| 2 | MI | 48105 | 669 |
| 3 | GA | 30309 | 2581 |
| 5 | TX | 78701 | 5373 |
| 7 | CO | 80301 | 2087 |
| 11 | MA | 02142 | 2592 |
| 13 | IL | 60607 | 2630 |
| 17 | MI | 48201 | 2 |
| 19 | NC | 27701 | 801 |
| 23 | CA | 92612 | 1115 |
| 29 | WA | 98033 | 4952 |
10 rows × 3 columns