Source code for bigframes.bigquery._operations.array
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Array functions defined from
https://cloud.google.com/bigquery/docs/reference/standard-sql/array_functions
"""
from __future__ import annotations
import typing
import bigframes_vendored.constants as constants
import bigframes.core.groupby as groupby
import bigframes.operations as ops
import bigframes.operations.aggregations as agg_ops
import bigframes.series as series
if typing.TYPE_CHECKING:
import bigframes.dataframe as dataframe
[docs]
def array_agg(
obj: groupby.SeriesGroupBy | groupby.DataFrameGroupBy,
) -> series.Series | dataframe.DataFrame:
"""Group data and create arrays from selected columns, omitting NULLs to avoid
BigQuery errors (NULLs not allowed in arrays).
**Examples:**
>>> import bigframes.pandas as bpd
>>> import bigframes.bigquery as bbq
For a SeriesGroupBy object:
>>> lst = ['a', 'a', 'b', 'b', 'a']
>>> s = bpd.Series([1, 2, 3, 4, np.nan], index=lst)
>>> bbq.array_agg(s.groupby(level=0))
a [1. 2.]
b [3. 4.]
dtype: list<item: double>[pyarrow]
For a DataFrameGroupBy object:
>>> l = [[1, 2, 3], [1, None, 4], [2, 1, 3], [1, 2, 2]]
>>> df = bpd.DataFrame(l, columns=["a", "b", "c"])
>>> bbq.array_agg(df.groupby(by=["b"]))
a c
b
1.0 [2] [3]
2.0 [1 1] [3 2]
<BLANKLINE>
[2 rows x 2 columns]
Args:
obj (groupby.SeriesGroupBy | groupby.DataFrameGroupBy):
A GroupBy object to be applied the function.
Returns:
bigframes.series.Series | bigframes.dataframe.DataFrame: A Series or
DataFrame containing aggregated array columns, and indexed by the
original group columns.
"""
if isinstance(obj, groupby.SeriesGroupBy):
return obj._aggregate(agg_ops.ArrayAggOp())
elif isinstance(obj, groupby.DataFrameGroupBy):
return obj._aggregate_all(agg_ops.ArrayAggOp(), numeric_only=False)
else:
raise ValueError(
f"Unsupported type {type(obj)} to apply `array_agg` function. {constants.FEEDBACK_LINK}"
)