{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Copyright 2025 Google LLC\n",
    "#\n",
    "# Licensed under the Apache License, Version 2.0 (the \"License\");\n",
    "# you may not use this file except in compliance with the License.\n",
    "# You may obtain a copy of the License at\n",
    "#\n",
    "#     https://www.apache.org/licenses/LICENSE-2.0\n",
    "#\n",
    "# Unless required by applicable law or agreed to in writing, software\n",
    "# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
    "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
    "# See the License for the specific language governing permissions and\n",
    "# limitations under the License."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Struct Data Types\n",
    "\n",
    "In BigQuery, a [STRUCT](https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#struct_type) (also known as a `record`) is a collection of ordered fields, each with a defined data type (required) and an optional field name. BigQuery DataFrames maps BigQuery `STRUCT` types to the pandas equivalent, `pandas.ArrowDtype(pa.struct())`. \n",
    "\n",
    "This notebook illustrates how to work with `STRUCT` columns in BigQuery DataFrames. First, let's import the required packages and perform the necessary setup below."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import bigframes.pandas as bpd\n",
    "import bigframes.bigquery as bbq\n",
    "import pandas as pd\n",
    "import pyarrow as pa"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "REGION = \"US\"  # @param {type: \"string\"}\n",
    "\n",
    "bpd.options.display.progress_bar = None\n",
    "bpd.options.bigquery.location = REGION"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Create DataFrames with struct columns\n",
    "\n",
    "**Example 1: Creating from a list of objects**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Name</th>\n",
       "      <th>Address</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Alice</td>\n",
       "      <td>{'City': 'New York', 'State': 'NY'}</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Bob</td>\n",
       "      <td>{'City': 'San Francisco', 'State': 'CA'}</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Charlie</td>\n",
       "      <td>{'City': 'Seattle', 'State': 'WA'}</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>3 rows × 2 columns</p>\n",
       "</div>[3 rows x 2 columns in total]"
      ],
      "text/plain": [
       "      Name                                   Address\n",
       "0    Alice       {'City': 'New York', 'State': 'NY'}\n",
       "1      Bob  {'City': 'San Francisco', 'State': 'CA'}\n",
       "2  Charlie        {'City': 'Seattle', 'State': 'WA'}\n",
       "\n",
       "[3 rows x 2 columns]"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "names = [\"Alice\", \"Bob\", \"Charlie\"]\n",
    "addresses = [\n",
    "    {'City': 'New York', 'State': 'NY'},\n",
    "    {'City': 'San Francisco', 'State': 'CA'},\n",
    "    {'City': 'Seattle', 'State': 'WA'}\n",
    "]\n",
    "df = bpd.DataFrame({'Name': names, 'Address': addresses})\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Name                                    string[pyarrow]\n",
       "Address    struct<City: string, State: string>[pyarrow]\n",
       "dtype: object"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.dtypes"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Example 2: Defining schema explicitly**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0         {'City': 'New York', 'State': 'NY'}\n",
       "1    {'City': 'San Francisco', 'State': 'CA'}\n",
       "2          {'City': 'Seattle', 'State': 'WA'}\n",
       "dtype: struct<City: string, State: string>[pyarrow]"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "bpd.Series(\n",
    "    data=addresses, \n",
    "    dtype=bpd.ArrowDtype(pa.struct([('City', pa.string()), ('State', pa.string())]))\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Example 3: Reading from a source**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    [{'tables': {'score': 0.8667634129524231, 'val...\n",
       "1    [{'tables': {'score': 0.9351968765258789, 'val...\n",
       "2    [{'tables': {'score': 0.8572560548782349, 'val...\n",
       "3    [{'tables': {'score': 0.9690881371498108, 'val...\n",
       "4    [{'tables': {'score': 0.9349926710128784, 'val...\n",
       "Name: predicted_default_payment_next_month, dtype: list<item: struct<tables: struct<score: double, value: string>>>[pyarrow]"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "bpd.read_gbq(\"bigquery-public-data.ml_datasets.credit_card_default\", max_results=5)[\"predicted_default_payment_next_month\"]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Operate on `STRUCT` data\n",
    "\n",
    "BigQuery DataFrames provides two main approaches for operating on `STRUCT` data:\n",
    "\n",
    "1. **[The `Series.struct` accessor](https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.operations.structs.StructAccessor)**: Provides Pandas-like methods for STRUCT column manipulation.\n",
    "2. **The `DataFrame.struct` accessor**: Provides Pandas-like methods for all child STRUCT columns manipulation.\n",
    "3. **[BigQuery built-in functions](https://cloud.google.com/bigquery/docs/reference/standard-sql/array_functions)**: Allows you to use functions mirroring BigQuery SQL operations, available through the `bigframes.bigquery` module (abbreviated as `bbq` below), such as [`struct`](https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.bigquery#bigframes_bigquery_struct)."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### View Data Types of Struct Fields"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "City     string[pyarrow]\n",
       "State    string[pyarrow]\n",
       "dtype: object"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df['Address'].struct.dtypes"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Access a Struct Field by Name"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0         New York\n",
       "1    San Francisco\n",
       "2          Seattle\n",
       "Name: City, dtype: string"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df['Address'].struct.field(\"City\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Extract Struct Fields into a DataFrame\n",
    "\n",
    "**Example 1: Using Series `.struct` accessor**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>City</th>\n",
       "      <th>State</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>New York</td>\n",
       "      <td>NY</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>San Francisco</td>\n",
       "      <td>CA</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Seattle</td>\n",
       "      <td>WA</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>3 rows × 2 columns</p>\n",
       "</div>[3 rows x 2 columns in total]"
      ],
      "text/plain": [
       "            City State\n",
       "0       New York    NY\n",
       "1  San Francisco    CA\n",
       "2        Seattle    WA\n",
       "\n",
       "[3 rows x 2 columns]"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df['Address'].struct.explode()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Example 2: Using DataFrame `.struct` accessor while keeping other columns**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Name</th>\n",
       "      <th>Address.City</th>\n",
       "      <th>Address.State</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Alice</td>\n",
       "      <td>New York</td>\n",
       "      <td>NY</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Bob</td>\n",
       "      <td>San Francisco</td>\n",
       "      <td>CA</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Charlie</td>\n",
       "      <td>Seattle</td>\n",
       "      <td>WA</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>3 rows × 3 columns</p>\n",
       "</div>[3 rows x 3 columns in total]"
      ],
      "text/plain": [
       "      Name   Address.City Address.State\n",
       "0    Alice       New York            NY\n",
       "1      Bob  San Francisco            CA\n",
       "2  Charlie        Seattle            WA\n",
       "\n",
       "[3 rows x 3 columns]"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.struct.explode(\"Address\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}