library(arrow)
library(dplyr)
Hello Arrow Exercises
# Open the NYC taxi dataset stored under data/nyc-taxi (path resolved relative
# to the project root by here::here()) and bind it to `nyc_taxi`.
nyc_taxi <- open_dataset(here::here("data/nyc-taxi"))
First dplyr pipeline with Arrow
Calculate the longest trip distance for every month in 2019
How long did this query take to run?
Longest trip distance for every month in 2019:
# Longest trip distance per month in 2019: keep the 2019 rows, take the
# maximum trip_distance within each month (ignoring NAs), order by month,
# and collect() the result into an R tibble.
nyc_taxi |>
  filter(year == 2019) |>
  group_by(month) |>
  summarize(longest_trip = max(trip_distance, na.rm = TRUE)) |>
  arrange(month) |>
  collect()
# A tibble: 12 × 2
month longest_trip
<int> <dbl>
1 1 832.
2 2 702.
3 3 237.
4 4 831.
5 5 401.
6 6 45977.
7 7 312.
8 8 602.
9 9 604.
10 10 308.
11 11 701.
12 12 19130.
Compute time:
library(tictoc)
# Start the tictoc timer; the matching toc() call after the query reports
# the elapsed time.
tic()
# Same query as above: longest trip distance per month in 2019.
nyc_taxi |>
  filter(year == 2019) |>
  group_by(month) |>
  summarize(longest_trip = max(trip_distance, na.rm = TRUE)) |>
  arrange(month) |>
  collect()
# A tibble: 12 × 2
month longest_trip
<int> <dbl>
1 1 832.
2 2 702.
3 3 237.
4 4 831.
5 5 401.
6 6 45977.
7 7 312.
8 8 602.
9 9 604.
10 10 308.
11 11 701.
12 12 19130.
toc()
0.461 sec elapsed
or, using base R's system.time():
# Same query, timed with system.time(). The native pipe rewrites
# `x |> system.time()` to `system.time(x)`, so the whole pipeline is the
# expression being timed.
nyc_taxi |>
  filter(year == 2019) |>
  group_by(month) |>
  summarize(longest_trip = max(trip_distance, na.rm = TRUE)) |>
  arrange(month) |>
  collect() |>
  system.time()
user system elapsed
3.887 0.225 0.435