# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

JSON Data Types#

When using BigQuery DataFrames, columns containing data in BigQuery’s JSON format (a lightweight standard) are represented as pandas.ArrowDtype. The exact underlying Arrow type depends on your library versions. Older environments typically use db_dtypes.JSONArrowType() for compatibility, which is an Arrow extension type acting as a light wrapper around pa.string(). In contrast, newer setups (pandas 3.0+ and pyarrow 19.0+) utilize the more recent pa.json_(pa.string()) representation.

import bigframes.pandas as bpd
import bigframes.bigquery as bbq
import db_dtypes
import pandas as pd
import pyarrow as pa
REGION = "US"  # @param {type: "string"}

bpd.options.display.progress_bar = None
bpd.options.bigquery.location = REGION

Create Series with JSON columns#

Example 1: Create a Series with a JSON dtype from local data

This example demonstrates creating a JSON Series from a list of JSON strings. Note that BigQuery standardizes these strings, for instance, by removing extra spaces and ordering dictionary keys. Specifying the dtype is essential; if omitted, a string-type Series will be generated.

json_data = [
    "1",
    '"str"',
    "false",
    '["a", {"b": 1}, null]',
    '{"a": {"b": [1, 2, 3], "c": true}}',
    None,
]
bpd.Series(json_data, dtype=pd.ArrowDtype(db_dtypes.JSONArrowType()))
0                               1
1                           "str"
2                           false
3              ["a",{"b":1},null]
4    {"a":{"b":[1,2,3],"c":true}}
5                            <NA>
dtype: extension<dbjson<JSONArrowType>>[pyarrow]

Example 2: Create a Series with a Nested JSON dtype from local data

To create a BigQuery DataFrame Series containing JSON data nested within a STRUCT or LIST type, you must represent the JSON data in a pa.array defined with the pa.string type. This workaround is necessary because Pyarrow lacks support for creating structs or lists that directly contain extension types (see issue).

list_data = [
    [{"key": "1"}],
    [{"key": None}],
    [{"key": '["1","3","5"]'}],
    [{"key": '{"a":1,"b":["x","y"],"c":{"x":[],"z":false}}'}],
]
pa_array = pa.array(list_data, type=pa.list_(pa.struct([("key", pa.string())])))
bpd.Series(
    pd.arrays.ArrowExtensionArray(pa_array),
    dtype=pd.ArrowDtype(
        pa.list_(pa.struct([("key", db_dtypes.JSONArrowType())])),
    ),
)
0                                       [{'key': '1'}]
1                                      [{'key': None}]
2                           [{'key': '["1","3","5"]'}]
3    [{'key': '{"a":1,"b":["x","y"],"c":{"x":[],"z"...
dtype: list<item: struct<key: extension<dbjson<JSONArrowType>>>>[pyarrow]

Example 3: Create a Series with a Nested JSON dtype using BigQuery SQLs

sql = """
SELECT 0 AS id, STRUCT(JSON_OBJECT('a', True) AS data, 1 AS number) AS struct_col
UNION ALL
SELECT 1, STRUCT(JSON_OBJECT('b', 100), 2),
"""
df = bpd.read_gbq(sql)
df
id struct_col
0 1 {'data': '{"b":100}', 'number': 2}
1 0 {'data': '{"a":true}', 'number': 1}

2 rows × 2 columns

[2 rows x 2 columns in total]
df.dtypes
id                                                        Int64
struct_col    struct<data: extension<dbjson<JSONArrowType>>,...
dtype: object

Operate on JSON data#

The bigframes.bigquery module (often abbreviated as bbq) provides access within BigQuery DataFrames to various BigQuery built-in functions. Examples relevant for JSON data include json_extract and parse_json.

Extract JSON data via specific JSON path#

Example 1: When JSON data is represented as strings

fruits = [
  '{"fruits": [{"name": "apple"}, {"name": "cherry"}]}',
  '{"fruits": [{"name": "guava"}, {"name": "grapes"}]}',
]
str_s = bpd.Series(fruits, dtype="string")
str_s
0    {"fruits": [{"name": "apple"}, {"name": "cherr...
1    {"fruits": [{"name": "guava"}, {"name": "grape...
dtype: string
bbq.json_extract(str_s, "$.fruits[0]")
0    {"name":"apple"}
1    {"name":"guava"}
dtype: string

Example 2: When JSON data is stored as JSON type

json_s = bpd.Series(fruits, dtype=pd.ArrowDtype(db_dtypes.JSONArrowType()))
json_s
0    {"fruits":[{"name":"apple"},{"name":"cherry"}]}
1    {"fruits":[{"name":"guava"},{"name":"grapes"}]}
dtype: extension<dbjson<JSONArrowType>>[pyarrow]
bbq.json_extract(json_s, "$.fruits[0]")
0    {"name":"apple"}
1    {"name":"guava"}
dtype: extension<dbjson<JSONArrowType>>[pyarrow]

Extract an array from JSON data#

bbq.json_extract_array(json_s, "$.fruits")
0    ['{"name":"apple"}' '{"name":"cherry"}']
1    ['{"name":"guava"}' '{"name":"grapes"}']
dtype: list<item: extension<dbjson<JSONArrowType>>>[pyarrow]
bbq.json_extract_array(str_s, "$.fruits")
0    ['{"name":"apple"}' '{"name":"cherry"}']
1    ['{"name":"guava"}' '{"name":"grapes"}']
dtype: list<item: string>[pyarrow]