# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

BigFrames Timedelta#

Colab logo Run in Colab GitHub logo View on GitHub BQ logo Open in BQ Studio

In this notebook, you will use timedeltas to analyze the taxi trips in NYC.

Setup#

import bigframes.pandas as bpd

PROJECT = "bigframes-dev" # replace this with your project
LOCATION = "us" # replace this with your location

bpd.options.bigquery.project = PROJECT
bpd.options.bigquery.location = LOCATION
bpd.options.display.progress_bar = None

bpd.options.bigquery.ordering_mode = "partial"

Timedelta arithmetics and comparisons#

First, you load the taxi data from the BigQuery public dataset bigquery-public-data.new_york_taxi_trips.tlc_yellow_trips_2021. The size of this table is about 6.3 GB.

taxi_trips = bpd.read_gbq("bigquery-public-data.new_york_taxi_trips.tlc_yellow_trips_2021").dropna()
taxi_trips = taxi_trips[taxi_trips['pickup_datetime'].dt.year == 2021]
taxi_trips.peek(5)
vendor_id pickup_datetime dropoff_datetime passenger_count trip_distance rate_code store_and_fwd_flag payment_type fare_amount extra mta_tax tip_amount tolls_amount imp_surcharge airport_fee total_amount pickup_location_id dropoff_location_id data_file_year data_file_month
0 1 2021-06-09 07:44:46+00:00 2021-06-09 07:45:24+00:00 1 2.200000000 1.0 N 4 0E-9 0E-9 0E-9 0E-9 0E-9 0E-9 0E-9 0E-9 263 263 2021 6
1 2 2021-06-07 11:59:46+00:00 2021-06-07 12:00:00+00:00 2 0.010000000 3.0 N 2 0E-9 0E-9 0E-9 0E-9 0E-9 0E-9 0E-9 0E-9 263 263 2021 6
2 2 2021-06-23 15:03:58+00:00 2021-06-23 15:04:34+00:00 1 0E-9 1.0 N 1 0E-9 0E-9 0E-9 0E-9 0E-9 0E-9 0E-9 0E-9 193 193 2021 6
3 1 2021-06-12 14:26:55+00:00 2021-06-12 14:27:08+00:00 0 1.000000000 1.0 N 3 0E-9 0E-9 0E-9 0E-9 0E-9 0E-9 0E-9 0E-9 143 143 2021 6
4 2 2021-06-15 08:39:01+00:00 2021-06-15 08:40:36+00:00 1 0E-9 1.0 N 1 0E-9 0E-9 0E-9 0E-9 0E-9 0E-9 0E-9 0E-9 193 193 2021 6

Based on the dataframe content, you calculate the trip durations and store them under the column “trip_duration”. You can see that the values under “trip_duartion” are timedeltas.

taxi_trips['trip_duration'] = taxi_trips['dropoff_datetime'] - taxi_trips['pickup_datetime']
taxi_trips['trip_duration'].dtype
duration[us][pyarrow]

To remove data outliers, you filter the taxi_trips to keep only the trips that were less than 2 hours.

import pandas as pd

taxi_trips = taxi_trips[taxi_trips['trip_duration'] <= pd.Timedelta("2h")]

Finally, you calculate the average speed of each trip, and find the median speed of all trips.

average_speed = taxi_trips["trip_distance"] / (taxi_trips['trip_duration'] / pd.Timedelta("1h"))
print(f"The median speed of an average taxi trip is: {average_speed.median():.2f} mph.")
The median speed of an average taxi trip is: 10.58 mph.

Given how packed NYC is, a median taxi speed of 10.58 mph totally makes sense.

Use timedelta for rolling aggregation#

Using your existing dataset, you can now calculate the taxi trip count over a period of two days, and find out when NYC is at its busiest and when it is fast asleep.

First, you pick two workdays (a Thursday and a Friday) as your target dates:

import datetime

target_dates = [
    datetime.date(2021, 12, 2), 
    datetime.date(2021, 12, 3)
]

two_day_taxi_trips = taxi_trips[taxi_trips['pickup_datetime'].dt.date.isin(target_dates)]
print(f"Number of records: {len(two_day_taxi_trips)}")
# Number of records: 255434
Number of records: 255434

Your next step involves aggregating the number of records associated with each unique “pickup_datetime” value. Additionally, the data undergo upsampling to account for any absent timestamps, which are populated with a count of 0.

import pandas as pd

two_day_trip_count = two_day_taxi_trips['pickup_datetime'].value_counts()

full_index = pd.date_range(
    start='2021-12-02 00:00:00',
    end='2021-12-04 00:00:00',
    freq='s',
    tz='UTC'
)
two_day_trip_count = two_day_trip_count.reindex(full_index).fillna(0)

You’ll then calculate the sum of trip counts within a 5-minute rolling window. This involves using the rolling() method, which can accept a time window in the form of either a string or a timedelta object.

two_day_trip_rolling_count = two_day_trip_count.sort_index().rolling(window="5m").sum()

Finally, you visualize the trip counts throughout the target dates.

from matplotlib import pyplot as plt
import matplotlib.dates as mdates

plt.figure(figsize=(16,7))
ax = plt.gca()
formatter = mdates.DateFormatter("%D %H:%M:%S")
ax.xaxis.set_major_formatter(formatter)

ax.tick_params(axis='both', labelsize=10)

two_day_trip_rolling_count.plot(ax=ax, legend=False)
plt.xlabel("Pickup Time", fontsize=12)
plt.ylabel("Trip count in last 5 minutes", fontsize=12)
plt.grid()
plt.show()
../../_images/40fd7888aae2b4e761d4998ac59af091e88635b817a967f3c9a68204f2260474.png

The taxi ride count reached its lowest point around 5:00 a.m., and peaked around 7:00 p.m. on a workday. Such is the rhythm of NYC.