Data Manipulation Part 2 - Exercises

library(arrow)
library(dplyr)
library(duckdb)

nyc_taxi <- open_dataset(here::here("data/nyc-taxi"))

User-defined functions

Problem
Solution 1

Write a user-defined function which wraps the stringr function str_replace_na(), and use it to replace any NA values in the vendor_name column with the string “No vendor” instead. (Test it on the data from 2019 so you’re not pulling everything into memory)

# Preview the distinct vendor names before we start
nyc_taxi |>
  filter(year == 2019) |> # smaller subset of the data
  distinct(vendor_name) |>
  collect()

# A tibble: 3 × 1
  vendor_name
  <chr>      
1 VTS        
2 CMT        
3 <NA>

register_scalar_function(
  name = "replace_vendor_na",
  function(context, string) {
    stringr::str_replace_na(string, "No vendor")
  },
  in_type = schema(string = string()),
  out_type = string(),
  auto_convert = TRUE
)

vendor_names_fixed <- nyc_taxi |>
  mutate(vendor_name = replace_vendor_na(vendor_name)) 

# Preview the distinct vendor names to check it's worked
vendor_names_fixed |>
  filter(year == 2019) |> # smaller subset of the data
  distinct(vendor_name) |>
  collect()

# A tibble: 3 × 1
  vendor_name
  <chr>      
1 CMT        
2 VTS        
3 No vendor

Joins

Problem
Solution 1

How many taxi pickups were recorded in 2019 from the three major airports covered by the NYC Taxis data set (JFK, LaGuardia, Newark)? (Hint: you can use stringr::str_detect() to help you find pickup zones with the word “Airport” in them)

pickup_location <- read_csv_arrow(here::here("data/taxi_zone_lookup.csv"))

pickup_location <- pickup_location |>
  select(
    pickup_location_id = LocationID,
    borough = Borough,
    pickup_zone = Zone
  ) 


pickup_location_arrow <- arrow_table(
  pickup_location, 
  schema = schema(
    pickup_location_id = int64(),
    borough = utf8(),
    pickup_zone = utf8()
  ))

nyc_taxi |>
  filter(year == 2019) |>
  left_join(pickup_location_arrow) |>
  filter(str_detect(pickup_zone, "Airport")) |>
  count(pickup_zone) |>
  collect()

# A tibble: 3 × 2
  pickup_zone             n
  <chr>               <int>
1 JFK Airport       2729336
2 LaGuardia Airport 2159224
3 Newark Airport       8643

Window functions

Problem
Solution 1

How many trips in September 2019 had a longer than average distance for that month?

Option 1 - via DuckDB

nyc_taxi |>
  filter(year == 2019, month == 9) |>
  to_duckdb() |>
  mutate(mean_distance = mean(trip_distance)) |>
  to_arrow() |>
  filter(trip_distance < mean_distance) |>
  count() |>
  collect()

# A tibble: 1 × 1
        n
    <int>
1 4881580

Option 2 - via a join

nyc_taxi |>
  filter(year == 2019, month == 9) |>
  left_join(
    nyc_taxi |>
      filter(year == 2019, month == 9) |>
      group_by(year) |>
      summarise(mean_distance = mean(trip_distance))
    ) |>
  filter(trip_distance < mean_distance) |>
  count() |>
  collect()

# A tibble: 1 × 1
        n
    <int>
1 4881580