
For most projects, data collection can be done manually, attaching the resulting file to a folder afterwards, but that isn't an option when we are working with big data.

To solve this problem, we created the following script to automate the data collection process, so the project can be reproduced easily just by running the code below.

Web Scraping

To always have an updated list of the 2022 and 2023 links to the High Volume For-Hire Vehicle files, let's scrape the TLC Trip Record Data page using the rvest package.

Downloading the source page

# Parse the TLC page once and reuse the result
SourcePage <-
  rvest::read_html("https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page")

# Keep only the links to the 2022-2023 High Volume FHV parquet files
TripLinks <-
  SourcePage |>
  rvest::html_elements(xpath = '//div[@class="faq-answers"]//li/a') |>
  rvest::html_attr("href") |>
  grep(pattern = "fhvhv_[a-z]+_202[23]-\\d{2}\\.parquet", value = TRUE) |>
  trimws() |>
  sort()

FileNames <- basename(TripLinks)

FileNames
#>  [1] "fhvhv_tripdata_2022-01.parquet" "fhvhv_tripdata_2022-02.parquet"
#>  [3] "fhvhv_tripdata_2022-03.parquet" "fhvhv_tripdata_2022-04.parquet"
#>  [5] "fhvhv_tripdata_2022-05.parquet" "fhvhv_tripdata_2022-06.parquet"
#>  [7] "fhvhv_tripdata_2022-07.parquet" "fhvhv_tripdata_2022-08.parquet"
#>  [9] "fhvhv_tripdata_2022-09.parquet" "fhvhv_tripdata_2022-10.parquet"
#> [11] "fhvhv_tripdata_2022-11.parquet" "fhvhv_tripdata_2022-12.parquet"
#> [13] "fhvhv_tripdata_2023-01.parquet" "fhvhv_tripdata_2023-02.parquet"
#> [15] "fhvhv_tripdata_2023-03.parquet" "fhvhv_tripdata_2023-04.parquet"
#> [17] "fhvhv_tripdata_2023-05.parquet" "fhvhv_tripdata_2023-06.parquet"
#> [19] "fhvhv_tripdata_2023-07.parquet" "fhvhv_tripdata_2023-08.parquet"
#> [21] "fhvhv_tripdata_2023-09.parquet" "fhvhv_tripdata_2023-10.parquet"
#> [23] "fhvhv_tripdata_2023-11.parquet" "fhvhv_tripdata_2023-12.parquet"
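
Since the page layout could change at any time, it is worth checking that we matched exactly the 24 monthly files we expect. A minimal sanity check, not part of the original pipeline:

# 2 years x 12 months: stop early if the page layout changed
stopifnot(length(TripLinks) == 24)
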
# Find the link whose text is exactly "Taxi Zone Lookup Table"
TaxiZoneLink <-
  SourcePage |>
  rvest::html_elements(xpath = '//ul/li/a[text()="Taxi Zone Lookup Table"]') |>
  rvest::html_attr("href") |>
  trimws()

TaxiZoneLink
#> [1] "https://d37ci6vzurychx.cloudfront.net/misc/taxi_zone_lookup.csv"

Saving trip data

To take advantage of DuckDB's capabilities, we need to save each Parquet file in a folder that encodes useful information for filtering later. That is why we will have one folder level for the year and, inside it, one sub-folder per month (Hive-style partitioning), with each Parquet file named part-0.parquet, following the process below.
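
As a preview of why this layout pays off: engines that understand Hive-style folders turn year= and month= into regular columns and skip whole directories when filtering. A minimal sketch, assuming the arrow and dplyr packages are installed (neither is otherwise used in this script) and the files are already in place:

# Hive-style folders (year=2022/month=01/...) become dataset columns,
# so this filter only scans the sub-folders for 2022
trips <- arrow::open_dataset(ParquetFolderPath)
trips |>
  dplyr::filter(year == 2022) |>
  dplyr::count() |>
  dplyr::collect()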

Defining folder paths

SourcePath <- here::here()

RawDataPath <- file.path(SourcePath, "raw-data")

ParquetFolderPath <- file.path(RawDataPath, "trip-data")

# One "year=YYYY" folder per distinct year. The placeholder `_` must be
# a named argument when piping with |>; paste0() and file.path() simply
# concatenate it after their other arguments.
YearFoldersPath <-
  gsub(x = FileNames,
       pattern = "^fhvhv_tripdata_|-\\d{2}\\.parquet$",
       replacement = "") |>
  paste0("year=", a = _) |>
  unique() |>
  file.path(ParquetFolderPath, a = _)

Creating the required folders

for (path_i in c(RawDataPath, ParquetFolderPath, YearFoldersPath)) {

  if (!dir.exists(path_i)) {
    dir.create(path_i)
  }

}


Creating a folder for each month

# Per-file "year=YYYY" labels, to pair each month with its year folder
YearFolders <- gsub(
  x = FileNames,
  pattern = "^fhvhv_tripdata_|-\\d{2}\\.parquet$",
  replacement = ""
) |>
  paste0("year=", a = _)

MonthFolders <- gsub(
  x = FileNames,
  pattern = "^fhvhv_tripdata_\\d{4}-|\\.parquet$",
  replacement = ""
) |>
  paste0("month=", a = _)

MonthFoldersPath <- file.path(ParquetFolderPath, YearFolders, MonthFolders)

for(month_i in MonthFoldersPath) dir.create(month_i, showWarnings = FALSE)
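
Before downloading anything, we can confirm the tree looks right by listing the directories we just created (a quick optional check):

# Should list year=2022/month=01 through year=2023/month=12
list.dirs(ParquetFolderPath, recursive = TRUE, full.names = FALSE)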

Downloading each file into its folder

# Parquet files might take a long time to download
options(timeout = 1800)


# Parquet trip data
for(link_i in seq_along(TripLinks)){
  
  download.file(TripLinks[link_i],
                destfile = file.path(MonthFoldersPath[link_i],"part-0.parquet"),
                mode = "wb")
  
}


# Taxi Zone CSV
download.file(TaxiZoneLink,
              destfile = file.path(RawDataPath,"taxi_zone_lookup.csv"),
              mode = "wb")
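
If the script stops halfway (for example, on a network error), a small guard makes it safe to re-run without downloading everything again. A hypothetical variant of the trip-data loop above (DestFile is just an illustrative name):

# Skip any month whose part-0.parquet is already on disk
for (link_i in seq_along(TripLinks)) {
  DestFile <- file.path(MonthFoldersPath[link_i], "part-0.parquet")
  if (!file.exists(DestFile)) {
    download.file(TripLinks[link_i], destfile = DestFile, mode = "wb")
  }
}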

Saving results in a database

Although Parquet files are great for sharing big data while preserving each column's format, they present problems when we need to apply sampling operations.

To work with data larger than RAM, we need to create a simple database with DuckDB by following the next steps.

  1. Listing all files to read.
# Quote every parquet path and splice the list into read_parquet()
ParquetSource <-
  list.files(ParquetFolderPath,
             recursive = TRUE,
             full.names = TRUE) |>
  paste0("'", a = _, "'") |>
  paste0(collapse = ", ") |>
  paste0("read_parquet([", a = _, "])")

ParquetSource
#> [1] "read_parquet(['/home/rstudio/project.nyc.taxi/raw-data/trip-data/year=2022/month=01/part-0.parquet', '/home/rstudio/project.nyc.taxi/raw-data/trip-data/year=2022/month=02/part-0.parquet', '/home/rstudio/project.nyc.taxi/raw-data/trip-data/year=2022/month=03/part-0.parquet', '/home/rstudio/project.nyc.taxi/raw-data/trip-data/year=2022/month=04/part-0.parquet', '/home/rstudio/project.nyc.taxi/raw-data/trip-data/year=2022/month=05/part-0.parquet', '/home/rstudio/project.nyc.taxi/raw-data/trip-data/year=2022/month=06/part-0.parquet', '/home/rstudio/project.nyc.taxi/raw-data/trip-data/year=2022/month=07/part-0.parquet', '/home/rstudio/project.nyc.taxi/raw-data/trip-data/year=2022/month=08/part-0.parquet', '/home/rstudio/project.nyc.taxi/raw-data/trip-data/year=2022/month=09/part-0.parquet', '/home/rstudio/project.nyc.taxi/raw-data/trip-data/year=2022/month=10/part-0.parquet', '/home/rstudio/project.nyc.taxi/raw-data/trip-data/year=2022/month=11/part-0.parquet', '/home/rstudio/project.nyc.taxi/raw-data/trip-data/year=2022/month=12/part-0.parquet', '/home/rstudio/project.nyc.taxi/raw-data/trip-data/year=2023/month=01/part-0.parquet', '/home/rstudio/project.nyc.taxi/raw-data/trip-data/year=2023/month=02/part-0.parquet', '/home/rstudio/project.nyc.taxi/raw-data/trip-data/year=2023/month=03/part-0.parquet', '/home/rstudio/project.nyc.taxi/raw-data/trip-data/year=2023/month=04/part-0.parquet', '/home/rstudio/project.nyc.taxi/raw-data/trip-data/year=2023/month=05/part-0.parquet', '/home/rstudio/project.nyc.taxi/raw-data/trip-data/year=2023/month=06/part-0.parquet', '/home/rstudio/project.nyc.taxi/raw-data/trip-data/year=2023/month=07/part-0.parquet', '/home/rstudio/project.nyc.taxi/raw-data/trip-data/year=2023/month=08/part-0.parquet', '/home/rstudio/project.nyc.taxi/raw-data/trip-data/year=2023/month=09/part-0.parquet', '/home/rstudio/project.nyc.taxi/raw-data/trip-data/year=2023/month=10/part-0.parquet', '/home/rstudio/project.nyc.taxi/raw-data/trip-data/year=2023/month=11/part-0.parquet', '/home/rstudio/project.nyc.taxi/raw-data/trip-data/year=2023/month=12/part-0.parquet'])"
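
As an alternative sketch (assuming a DuckDB version whose read_parquet supports globs and the hive_partitioning flag), we could let DuckDB find the files itself and derive year and month columns from the folder names; ParquetGlob and ParquetSourceAlt are just illustrative names:

# Hypothetical alternative: glob the partitioned tree directly
ParquetGlob <- file.path(ParquetFolderPath, "*", "*", "*.parquet")
ParquetSourceAlt <- glue::glue_safe(
  "read_parquet('{ParquetGlob}', hive_partitioning = true)"
)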
  2. Creating a connection.
con <- DBI::dbConnect(duckdb::duckdb(), 
                      dbdir = here::here("my-db.duckdb"))
  3. Saving all the files in the database, after adding an id to distinguish between trips and computing the performance per hour of each trip.
NycTripsCreateTable <- glue::glue_safe("
CREATE TABLE NycTrips AS
    SELECT 
        ROW_NUMBER() OVER () AS trip_id,
        *,
        (driver_pay + tips) / (EXTRACT(EPOCH FROM (dropoff_datetime - request_datetime)) / 3600) AS performance_per_hour
    FROM {ParquetSource}
")

DBI::dbExecute(con, NycTripsCreateTable)
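
Before disconnecting, it doesn't hurt to confirm the table was created (a minimal optional check; n_trips is just an illustrative alias):

# Count the rows that landed in the new table
DBI::dbGetQuery(con, "SELECT COUNT(*) AS n_trips FROM NycTrips")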
  4. Disconnecting from the database.
DBI::dbDisconnect(con, shutdown = TRUE)

rm(con)
  5. After saving the data in the database, we can confirm its final size.
here::here("my-db.duckdb") |>
  file.size() |>
  structure(class = "object_size") |>
  format(units = "auto")
#> [1] "36.4 Gb"