library(arrow)
library(dplyr)
library(duckdb)
Data Manipulation Part 2 - Exercises
<- open_dataset(here::here("data/nyc-taxi")) nyc_taxi
User-defined functions
- Write a user-defined function which wraps the
stringr
functionstr_replace_na()
, and use it to replace anyNA
values in thevendor_name
column with the string “No vendor” instead. (Test it on the data from 2019 so you’re not pulling everything into memory)
# Preview the distinct vendor names before we start
|>
nyc_taxi filter(year == 2019) |> # smaller subset of the data
distinct(vendor_name) |>
collect()
# A tibble: 3 Ă— 1
vendor_name
<chr>
1 VTS
2 CMT
3 <NA>
register_scalar_function(
name = "replace_vendor_na",
function(context, string) {
::str_replace_na(string, "No vendor")
stringr
},in_type = schema(string = string()),
out_type = string(),
auto_convert = TRUE
)
<- nyc_taxi |>
vendor_names_fixed mutate(vendor_name = replace_vendor_na(vendor_name))
# Preview the distinct vendor names to check it's worked
|>
vendor_names_fixed filter(year == 2019) |> # smaller subset of the data
distinct(vendor_name) |>
collect()
# A tibble: 3 Ă— 1
vendor_name
<chr>
1 CMT
2 VTS
3 No vendor
Joins
- How many taxi pickups were recorded in 2019 from the three major airports covered by the NYC Taxis data set (JFK, LaGuardia, Newark)? (Hint: you can use
stringr::str_detect()
to help you find pickup zones with the word “Airport” in them)
<- read_csv_arrow(here::here("data/taxi_zone_lookup.csv"))
pickup_location
<- pickup_location |>
pickup_location select(
pickup_location_id = LocationID,
borough = Borough,
pickup_zone = Zone
)
<- arrow_table(
pickup_location_arrow
pickup_location, schema = schema(
pickup_location_id = int64(),
borough = utf8(),
pickup_zone = utf8()
))
|>
nyc_taxi filter(year == 2019) |>
left_join(pickup_location_arrow) |>
filter(str_detect(pickup_zone, "Airport")) |>
count(pickup_zone) |>
collect()
# A tibble: 3 Ă— 2
pickup_zone n
<chr> <int>
1 JFK Airport 2729336
2 LaGuardia Airport 2159224
3 Newark Airport 8643
Window functions
- How many trips in September 2019 had a longer than average distance for that month?
Option 1 - via DuckDB
|>
nyc_taxi filter(year == 2019, month == 9) |>
to_duckdb() |>
mutate(mean_distance = mean(trip_distance)) |>
to_arrow() |>
filter(trip_distance < mean_distance) |>
count() |>
collect()
# A tibble: 1 Ă— 1
n
<int>
1 4881580
Option 2 - via a join
|>
nyc_taxi filter(year == 2019, month == 9) |>
left_join(
|>
nyc_taxi filter(year == 2019, month == 9) |>
group_by(year) |>
summarise(mean_distance = mean(trip_distance))
|>
) filter(trip_distance < mean_distance) |>
count() |>
collect()
# A tibble: 1 Ă— 1
n
<int>
1 4881580