# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
BigQuery DataFrames: Synthetic Data Generation
In addition to BigQuery DataFrames (which installs pandas as a dependency), we will use
the faker library as a building block for synthetic data generation.
!pip install faker
Collecting faker
Downloading faker-37.1.0-py3-none-any.whl.metadata (15 kB)
Requirement already satisfied: tzdata in /usr/local/google/home/shuowei/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages (from faker) (2024.2)
Downloading faker-37.1.0-py3-none-any.whl (1.9 MB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.9/1.9 MB 55.1 MB/s eta 0:00:00
?25hInstalling collected packages: faker
Successfully installed faker-37.1.0
import bigframes.pandas as bpd

# TODO(developer): replace with your Google Cloud project id. The original
# notebook referenced PROJECT_ID without defining it, which raised a
# NameError (see the captured traceback below).
PROJECT_ID = "your-project-id"
bpd.options.bigquery.project = PROJECT_ID
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[3], line 2
1 import bigframes.pandas as bpd
----> 2 bpd.options.bigquery.project = PROJECT_ID
NameError: name 'PROJECT_ID' is not defined
Let’s use GeminiTextGenerator for our purpose, which is BigQuery DataFrame’s state-of-the-art LLM integration at the time of writing this notebook (Apr 16 2024).
from bigframes.ml.llm import GeminiTextGenerator
model = GeminiTextGenerator(model_name="gemini-2.0-flash-001")
Craft a prompt for the LLM to indicate the schema of the desired data and hints for the code that could generate such data.
# Prompt spelling out the schema of the desired synthetic data plus hints for
# the code the LLM should generate (library to use, row count, and the name of
# the variable the generated dataframe must be bound to).
prompt = """\
Write python code to generate a pandas dataframe based on the requirements:
Column name: Name, type: string, Description: Latin American Names
Column name: Age, type: int
Column name: Gender, type: string, Description: Inclusive
Note:
- Return the code only, no additional texts or comments
- Use faker library
- Generate 100 rows
- The final dataframe should be named 'result_df'.
"""
# model.predict expects a DataFrame input; wrap the prompt in a one-row frame.
df_prompt = bpd.DataFrame({"prompt" : [prompt]})
df_prompt
Be aware that the LLM may not produce runnable code on the first try and may need some nudging. We will retry, adding the failing code and the exception it throws as additional context in the prompt.
# Retry loop: the LLM may return code that does not run on the first attempt.
# On failure we feed the failing code and its exception back into the prompt
# so the model can correct itself on the next iteration.
max_tries = 5
for attempt in range(max_tries):
    # Get LLM generated code.
    df_result = model.predict(df_prompt)
    llm_result = df_result['ml_generate_text_llm_result'].iloc[0]

    # Python code comes back as a markdown code block. Strip the fence with
    # removeprefix/removesuffix instead of fixed character offsets
    # (llm_result[9:-3]), which silently corrupts the code if the model
    # varies its formatting.
    code = llm_result.strip()
    code = code.removeprefix("```python").removesuffix("```").strip()
    print(code)

    # Check if the generated code is runnable.
    # NOTE(review): exec() on LLM output is inherently unsafe; acceptable in a
    # trusted notebook, never on untrusted input.
    try:
        exec(code)
        break
    except Exception as ex:
        print(ex)
        # Append the failing code and the error as extra prompt context so
        # the model can correct the mistake.
        error_context = f"""
Previous code:
{code}
Had this exception:
{ex}"""
        df_prompt["prompt"] += error_context
else:
    # The loop never hit `break`: every attempt failed.
    raise RuntimeError("Failed to generate runnable code")
Run the generated code and verify that it produced the desired data.
# Run the generated code in a fresh, isolated namespace so it does not
# pollute (or depend on) the notebook's globals, then pull out the
# dataframe it is contracted to produce.
generated_ns = {}
exec(code, generated_ns)
generated_ns.get("result_df")
We want to run this code at scale to generate a large amount of data. Let's deploy a remote_function for this purpose.
# Deploy the generation code as a BigQuery DataFrames remote function so it
# can be fanned out across many rows server-side.
@bpd.remote_function(packages=['faker', 'pandas'], cloud_function_service_account="default")
def data_generator(id: int) -> str:
    """Execute the LLM-generated snippet and return its rows as a JSON string.

    The `id` argument is only an indicator-row key; the output does not
    depend on its value.
    """
    # Run the generated code in an isolated namespace and serialize the
    # dataframe it produces.
    local_ns = {}
    exec(code, local_ns)
    return local_ns.get("result_df").to_json(orient="records")

# Handle to the Cloud Function backing the deployed remote function.
data_generator.bigframes_cloud_function
Let’s say we want to generate 1 million rows of synthetic data. Since our generated code produces 100 rows in one run, we can initialize an indicator dataframe with 1M/100 = 10K indicator rows. Then we can apply the remote function to produce 100 synthetic data rows for each indicator row.
# Each remote-function call produces `batch_size` rows (fixed in the prompt),
# so reaching `desired_num_rows` takes desired_num_rows / batch_size
# indicator rows, each mapped through the remote function.
desired_num_rows = 1_000_000  # 1 million rows
batch_size = 100  # used in the prompt
# Floor division keeps this an int end-to-end, instead of int(float_division).
num_batches = desired_num_rows // batch_size
df = bpd.DataFrame({"row_id": range(num_batches)})
df["json_data"] = df["row_id"].apply(data_generator)
At this point each item in df["json_data"] is a JSON-serialized array of 100 records. Let's flatten that into one record per row using SQL directly.
# Each json_data cell holds a JSON array of 100 records; UNNEST the array
# into one row per record and project typed columns out of the JSON.
flatten_sql = f"""
WITH T0 AS ({df.sql}),
T1 AS (
SELECT PARSE_JSON(json_row) AS json_row
FROM T0, UNNEST(JSON_EXTRACT_ARRAY(json_data)) AS json_row
)
SELECT STRING(json_row.Name) AS Name,
INT64(json_row.Age) AS Age,
STRING(json_row.Gender) AS Gender
FROM T1
"""
# Materialize the flattened result back into a BigQuery DataFrame.
df_result = bpd.read_gbq(flatten_sql)
df_result
There you have it, 1 million synthetic data rows ready to use, or save them in a BigQuery table for future use.