library(arrow)
library(dplyr)
library(duckdb)Data Manipulation Part 2 - Exercises
nyc_taxi <- open_dataset(here::here("data/nyc-taxi"))
User-defined functions
- Write a user-defined function which wraps the
stringrfunctionstr_replace_na(), and use it to replace anyNAvalues in thevendor_namecolumn with the string “No vendor” instead. (Test it on the data from 2019 so you’re not pulling everything into memory)
# Preview the distinct vendor names before we start
nyc_taxi |>
filter(year == 2019) |> # smaller subset of the data
distinct(vendor_name) |>
collect()# A tibble: 3 Ă— 1
vendor_name
<chr>
1 VTS
2 CMT
3 <NA>
register_scalar_function(
name = "replace_vendor_na",
function(context, string) {
stringr::str_replace_na(string, "No vendor")
},
in_type = schema(string = string()),
out_type = string(),
auto_convert = TRUE
)
vendor_names_fixed <- nyc_taxi |>
mutate(vendor_name = replace_vendor_na(vendor_name))
# Preview the distinct vendor names to check it's worked
vendor_names_fixed |>
filter(year == 2019) |> # smaller subset of the data
distinct(vendor_name) |>
collect()# A tibble: 3 Ă— 1
vendor_name
<chr>
1 CMT
2 VTS
3 No vendor
Joins
- How many taxi pickups were recorded in 2019 from the three major airports covered by the NYC Taxis data set (JFK, LaGuardia, Newark)? (Hint: you can use
stringr::str_detect()to help you find pickup zones with the word “Airport” in them)
pickup_location <- read_csv_arrow(here::here("data/taxi_zone_lookup.csv"))
pickup_location <- pickup_location |>
select(
pickup_location_id = LocationID,
borough = Borough,
pickup_zone = Zone
)
pickup_location_arrow <- arrow_table(
pickup_location,
schema = schema(
pickup_location_id = int64(),
borough = utf8(),
pickup_zone = utf8()
))
nyc_taxi |>
filter(year == 2019) |>
left_join(pickup_location_arrow) |>
filter(str_detect(pickup_zone, "Airport")) |>
count(pickup_zone) |>
collect()# A tibble: 3 Ă— 2
pickup_zone n
<chr> <int>
1 JFK Airport 2729336
2 LaGuardia Airport 2159224
3 Newark Airport 8643
Window functions
- How many trips in September 2019 had a longer than average distance for that month?
Option 1 - via DuckDB
nyc_taxi |>
filter(year == 2019, month == 9) |>
to_duckdb() |>
mutate(mean_distance = mean(trip_distance)) |>
to_arrow() |>
filter(trip_distance < mean_distance) |>
count() |>
collect()# A tibble: 1 Ă— 1
n
<int>
1 4881580
Option 2 - via a join
nyc_taxi |>
filter(year == 2019, month == 9) |>
left_join(
nyc_taxi |>
filter(year == 2019, month == 9) |>
group_by(year) |>
summarise(mean_distance = mean(trip_distance))
) |>
filter(trip_distance < mean_distance) |>
count() |>
collect()# A tibble: 1 Ă— 1
n
<int>
1 4881580