library(arrow)
library(dplyr)
Hello Arrow Exercises
# Open the dataset stored under "data/nyc-taxi" (path built by here::here())
# and keep the resulting dataset object as `nyc_taxi` for the queries below.
nyc_taxi <- arrow::open_dataset(sources = here::here("data/nyc-taxi"))
First dplyr pipeline with Arrow
Calculate the longest trip distance for every month in 2019
How long did this query take to run?
Longest trip distance for every month in 2019:
# Longest trip distance per month for 2019.
# Keep only the 2019 rows, take the maximum trip_distance within each month
# (missing distances are ignored via na.rm = TRUE), order the result by month,
# and collect() it into R, where it prints as the tibble shown below.
nyc_taxi |>
  filter(year == 2019) |>
  group_by(month) |>
  summarize(longest_trip = max(trip_distance, na.rm = TRUE)) |>
  arrange(month) |>
  collect()
# A tibble: 12 × 2
month longest_trip
<int> <dbl>
1 1 832.
2 2 702.
3 3 237.
4 4 831.
5 5 401.
6 6 45977.
7 7 312.
8 8 602.
9 9 604.
10 10 308.
11 11 701.
12 12 19130.
Compute time:
library(tictoc)

# Time the query with tictoc: tic() starts the timer here and the toc() call
# that follows the printed result reports the elapsed time.
# The result is printed between tic() and toc(), so the reported time also
# includes printing the collected tibble.
tic()
nyc_taxi |>
  filter(year == 2019) |>
  group_by(month) |>
  summarize(longest_trip = max(trip_distance, na.rm = TRUE)) |>
  arrange(month) |>
  collect()
# A tibble: 12 × 2
month longest_trip
<int> <dbl>
1 1 832.
2 2 702.
3 3 237.
4 4 831.
5 5 401.
6 6 45977.
7 7 312.
8 8 602.
9 9 604.
10 10 308.
11 11 701.
12 12 19130.
toc()
0.461 sec elapsed
Or, using system.time() instead of tictoc:
# Time the same query with base R's system.time().
# The native pipe `|>` is a syntactic rewrite: `x |> system.time()` is parsed
# as `system.time(x)`, so the entire pipeline (including collect()) becomes the
# expression that system.time() evaluates and times.
# Only the timing is printed (see below); the collected tibble is not shown.
nyc_taxi |>
  filter(year == 2019) |>
  group_by(month) |>
  summarize(longest_trip = max(trip_distance, na.rm = TRUE)) |>
  arrange(month) |>
  collect() |>
  system.time()
user system elapsed
3.887 0.225 0.435