# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Operations with an “NULL index” DataFrame#
Note: This notebook describes a feature that is currently in preview. There may be breaking changes to the functionality when using “NULL index” objects.
Use the “NULL” index for more efficient query generation, but some pandas-compatible methods may not be possible without an index.
import bigframes.enums
import bigframes.exceptions
import bigframes.pandas as bpd
df = bpd.read_gbq(
"bigquery-public-data.baseball.schedules",
index_col=bigframes.enums.DefaultIndexKind.NULL,
)
Use peek() to view an arbitrary selection of rows from the DataFrame. This is much more efficient than head(), which requires a total ordering for determinism.
df.peek()
| gameId | gameNumber | seasonId | year | type | dayNight | duration | duration_minutes | homeTeamId | homeTeamName | awayTeamId | awayTeamName | startTime | attendance | status | created | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | e14b6493-9e7f-404f-840a-8a680cc364bf | 1 | 565de4be-dc80-4849-a7e1-54bc79156cc8 | 2016 | REG | D | 3:07 | 187 | 03556285-bdbb-4576-a06d-42f71f46ddc5 | Marlins | 55714da8-fcaf-4574-8443-59bfb511a524 | Cubs | 2016-06-26 17:10:00+00:00 | 27318 | closed | 2016-10-06 06:25:15+00:00 |
| 1 | 1f32b347-cbcb-4c31-a145-0e685306d168 | 1 | 565de4be-dc80-4849-a7e1-54bc79156cc8 | 2016 | REG | D | 3:09 | 189 | 03556285-bdbb-4576-a06d-42f71f46ddc5 | Marlins | 55714da8-fcaf-4574-8443-59bfb511a524 | Cubs | 2016-06-25 20:10:00+00:00 | 29457 | closed | 2016-10-06 06:25:15+00:00 |
| 2 | 0c2292d1-7398-48be-bf8e-b41dad5e1a43 | 1 | 565de4be-dc80-4849-a7e1-54bc79156cc8 | 2016 | REG | D | 2:45 | 165 | 12079497-e414-450a-8bf2-29f91de646bf | Braves | 55714da8-fcaf-4574-8443-59bfb511a524 | Cubs | 2016-06-11 20:10:00+00:00 | 43114 | closed | 2016-10-06 06:25:15+00:00 |
| 3 | 8fbec734-a15a-42ab-8d51-60790de7750b | 1 | 565de4be-dc80-4849-a7e1-54bc79156cc8 | 2016 | REG | D | 3:42 | 222 | 12079497-e414-450a-8bf2-29f91de646bf | Braves | 55714da8-fcaf-4574-8443-59bfb511a524 | Cubs | 2016-06-12 17:35:00+00:00 | 31625 | closed | 2016-10-06 06:25:15+00:00 |
| 4 | 89e514d5-fbf5-4b9d-bdac-6ca45bfd18dd | 1 | 565de4be-dc80-4849-a7e1-54bc79156cc8 | 2016 | REG | D | 2:44 | 164 | 2142e1ba-3b40-445c-b8bb-f1f8b1054220 | Phillies | 55714da8-fcaf-4574-8443-59bfb511a524 | Cubs | 2016-06-08 17:05:00+00:00 | 28650 | closed | 2016-10-06 06:25:15+00:00 |
Inspect the properties of the DataFrame#
Some properties, such as dtypes, can be retrieved without executing a query job.
df.dtypes
gameId string[pyarrow]
gameNumber Int64
seasonId string[pyarrow]
year Int64
type string[pyarrow]
dayNight string[pyarrow]
duration string[pyarrow]
duration_minutes Int64
homeTeamId string[pyarrow]
homeTeamName string[pyarrow]
awayTeamId string[pyarrow]
awayTeamName string[pyarrow]
startTime timestamp[us, tz=UTC][pyarrow]
attendance Int64
status string[pyarrow]
created timestamp[us, tz=UTC][pyarrow]
dtype: object
Other properties, such as shape require a query. In this case, shape runs a COUNT(1) query.
df.shape
(2431, 16)
Select a subset of the DataFrame#
Filter columns by selecting a list of columns from the DataFrame.
Note: Even with index_col=bigframes.enums.DefaultIndexKind.NULL, it is more efficient to do this selection in read_gbq / read_gbq_table except in cases where the total ordering ID columns can be pruned.
column_filtered = df[["gameId", "year", "homeTeamName", "awayTeamName", "duration_minutes"]]
column_filtered.peek()
| gameId | year | homeTeamName | awayTeamName | duration_minutes | |
|---|---|---|---|---|---|
| 0 | e14b6493-9e7f-404f-840a-8a680cc364bf | 2016 | Marlins | Cubs | 187 |
| 1 | 1f32b347-cbcb-4c31-a145-0e685306d168 | 2016 | Marlins | Cubs | 189 |
| 2 | 0c2292d1-7398-48be-bf8e-b41dad5e1a43 | 2016 | Braves | Cubs | 165 |
| 3 | 8fbec734-a15a-42ab-8d51-60790de7750b | 2016 | Braves | Cubs | 222 |
| 4 | 89e514d5-fbf5-4b9d-bdac-6ca45bfd18dd | 2016 | Phillies | Cubs | 164 |
Filter by rows using a boolean Series. This Series must be derived from the DataFrame being filtered so that the NULL index can still align correctly.
night_games = df[df['dayNight'] == 'N']
night_games.peek()
| gameId | gameNumber | seasonId | year | type | dayNight | duration | duration_minutes | homeTeamId | homeTeamName | awayTeamId | awayTeamName | startTime | attendance | status | created | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 63f14670-c28e-432b-84ee-1a2c6ac29527 | 1 | 565de4be-dc80-4849-a7e1-54bc79156cc8 | 2016 | REG | N | 2:43 | 163 | 03556285-bdbb-4576-a06d-42f71f46ddc5 | Marlins | 55714da8-fcaf-4574-8443-59bfb511a524 | Cubs | 2016-06-23 23:10:00+00:00 | 25291 | closed | 2016-10-06 06:25:15+00:00 |
| 1 | bf4e80d1-3125-44fa-8a89-de93d039d465 | 1 | 565de4be-dc80-4849-a7e1-54bc79156cc8 | 2016 | REG | N | 3:24 | 204 | 03556285-bdbb-4576-a06d-42f71f46ddc5 | Marlins | 55714da8-fcaf-4574-8443-59bfb511a524 | Cubs | 2016-06-24 23:10:00+00:00 | 24385 | closed | 2016-10-06 06:25:15+00:00 |
| 2 | e8af534c-36ed-4ff9-8511-780825fdd041 | 1 | 565de4be-dc80-4849-a7e1-54bc79156cc8 | 2016 | REG | N | 2:51 | 171 | 12079497-e414-450a-8bf2-29f91de646bf | Braves | 55714da8-fcaf-4574-8443-59bfb511a524 | Cubs | 2016-06-10 23:35:00+00:00 | 30547 | closed | 2016-10-06 06:25:15+00:00 |
| 3 | e599c525-ac42-4b54-928d-7ee5fbe67dd9 | 1 | 565de4be-dc80-4849-a7e1-54bc79156cc8 | 2016 | REG | N | 2:45 | 165 | 2142e1ba-3b40-445c-b8bb-f1f8b1054220 | Phillies | 55714da8-fcaf-4574-8443-59bfb511a524 | Cubs | 2016-06-07 23:05:00+00:00 | 27381 | closed | 2016-10-06 06:25:15+00:00 |
| 4 | d80ffb65-57a4-42c9-ae1c-2c51d0650336 | 1 | 565de4be-dc80-4849-a7e1-54bc79156cc8 | 2016 | REG | N | 3:05 | 185 | 2142e1ba-3b40-445c-b8bb-f1f8b1054220 | Phillies | 55714da8-fcaf-4574-8443-59bfb511a524 | Cubs | 2016-06-06 23:05:00+00:00 | 22162 | closed | 2016-10-06 06:25:15+00:00 |
Join two DataFrames#
Even though pandas usually joins by the index, NULL index objects can still be manually joined by a column using the on parameter in merge.
df1 = df[["gameId", "homeTeamName"]]
df1.peek()
| gameId | homeTeamName | |
|---|---|---|
| 0 | e14b6493-9e7f-404f-840a-8a680cc364bf | Marlins |
| 1 | 1f32b347-cbcb-4c31-a145-0e685306d168 | Marlins |
| 2 | 0c2292d1-7398-48be-bf8e-b41dad5e1a43 | Braves |
| 3 | 8fbec734-a15a-42ab-8d51-60790de7750b | Braves |
| 4 | 89e514d5-fbf5-4b9d-bdac-6ca45bfd18dd | Phillies |
df2 = df[["gameId", "awayTeamName"]].head(2)
df2.peek()
merged = df1.merge(df2, on="gameId", how="inner")
merged.peek()
| gameId | homeTeamName | awayTeamName | |
|---|---|---|---|
| 0 | af72a0b9-65f7-49fb-9b30-d505068bdf6d | Reds | Brewers |
| 1 | d60c6036-0ce1-4c90-8dd9-de3b403c92a8 | Nationals | Brewers |
merged = df1.merge(df2, on="gameId", how="outer")
merged.peek()
| gameId | homeTeamName | awayTeamName | |
|---|---|---|---|
| 0 | 039bb40e-7613-4674-a653-584b93e9b21b | American League | <NA> |
| 1 | 78000e12-2ef3-4246-adc1-c8a4d157631c | Angels | <NA> |
| 2 | de5555dc-9228-4f7c-88ae-4451e3ffb980 | Angels | <NA> |
| 3 | f29a2754-004b-436c-91fe-3d86c0bb17a8 | Angels | <NA> |
| 4 | 8e5af008-8a07-4f9a-90cb-336ca4c84c71 | Angels | <NA> |
merged = df1.merge(df2, on="gameId", how="left")
merged.peek()
| gameId | homeTeamName | awayTeamName | |
|---|---|---|---|
| 0 | 039bb40e-7613-4674-a653-584b93e9b21b | American League | <NA> |
| 1 | f6fcd83c-e130-487c-a0cc-d00b2712d08b | Angels | <NA> |
| 2 | fe401dd2-089c-4822-8657-4d510d460f38 | Angels | <NA> |
| 3 | c894bdee-5dda-49f4-87c8-53b9b9bfcd3b | Angels | <NA> |
| 4 | bbda59d9-fd52-4bed-bcfb-2ceed4be997c | Angels | <NA> |
merged = df1.merge(df2, on="gameId", how="right")
merged.peek()
| gameId | homeTeamName | awayTeamName | |
|---|---|---|---|
| 0 | af72a0b9-65f7-49fb-9b30-d505068bdf6d | Reds | Brewers |
| 1 | d60c6036-0ce1-4c90-8dd9-de3b403c92a8 | Nationals | Brewers |
Download the result as (in-memory) pandas DataFrame#
Use the ordered=False argument for more efficient query execution.
dfp = merged.to_pandas(ordered=False)
dfp
| gameId | homeTeamName | awayTeamName | |
|---|---|---|---|
| 0 | d60c6036-0ce1-4c90-8dd9-de3b403c92a8 | Nationals | Brewers |
| 1 | af72a0b9-65f7-49fb-9b30-d505068bdf6d | Reds | Brewers |