Initial Exploration

After completing the business understanding phase, we are ready to perform the data understanding phase by performing an EDA with the following steps:

Take a sample of 2023 and 2024 trips.
Exploring the individual distribution of variables in 2023.
Exploring correlations between predictors and the target variable in 2023.
Exploring correlations between predictors in 2023.

This will help to:

Ensure data quality.
Identify key predictors.
Guide model and policy creation.

Setting up the environment

Loading packages

## To manage relative paths
library(here)

## Publish data sets, models, and other R objects
library(pins)
library(qs2)

## To transform data that fits in RAM
library(data.table)
library(lubridate)

## To create plots
library(ggplot2)
library(scales)
library(patchwork)

## To sample the data
library(rsample)

## Defining the print params to use in the report
options(datatable.print.nrows = 15, digits = 4)

## Custom functions
devtools::load_all()

# Defining the pin boards to use
BoardLocal <- board_folder(here("../NycTaxiPins/Board"))

# Loading params
Params <- yaml::read_yaml(here("params.yml"))
Params$BoroughColors <- unlist(Params$BoroughColors)

Importing geospatial information

ZoneCodesRef <- pin_read(BoardLocal, "ZoneCodesRef")

Sampling data

As most of the trips take place between Manhattan, Brooklyn and Queens, let’s list all possible combinations of related locations to use it as filter later.

ZoneCodesFilter <- ZoneCodesRef[
    c("Manhattan", "Brooklyn", "Queens"),
    on = "Borough",
    nomatch = NULL,
    CJ(PULocationID = as.integer(LocationID),
       DOLocationID = as.integer(LocationID))
  ]

con <- dbConnect(duckdb(), dbdir = here("../NycTaxiBigFiles/my-db.duckdb"))

# Addig ZoneCodesFilters to db
dbWriteTable(con, "ZoneCodesFilter", ZoneCodesFilter)

dbDisconnect(con, shutdown = TRUE)

Selecting the 0.20% of the data, aiming for around 500K samples for training and testing the model and saving the data in a binary file.

## Establishing a connection with the database
con <- DBI::dbConnect(
  duckdb::duckdb(),
  dbdir = here("../NycTaxiBigFiles/my-db.duckdb")
)

## Defining query to sample
ValidZoneSampleQuery <- glue::glue("
SELECT t1.*
FROM NycTrips t1
INNER JOIN ZoneCodesFilter t2
  ON t1.PULocationID = t2.PULocationID AND
     t1.DOLocationID = t2.DOLocationID
USING SAMPLE 0.20% (system, 547548);
")

## Sampling data from db
ValidZoneSample <- DBI::dbGetQuery(con, ValidZoneSampleQuery)

## Closing the database connection
DBI::dbDisconnect(con, shutdown = TRUE)

## Saving results to disk
pin_write(
  BoardLocal,
  ValidZoneSample, 
  "ValidZoneSample",
  type = "qs2"
 )

Importing the data to use

Let’s imagine we don’t have 2024 data, to use it to evaluate how our predictions would perform after deployment.

ValidZoneSample2023 <- pin_read(
  BoardLocal,
  "ValidZoneSample"
)[year == 2023]

Adding features for exploration

ValidZoneSample2023[,`:=`(company = fcase(hvfhs_license_num == "HV0002", "Juno",
                                     hvfhs_license_num == "HV0003", "Uber",
                                     hvfhs_license_num == "HV0004", "Via",
                                     hvfhs_license_num == "HV0005", "Lyft"),
                     
                     weekday = wday(request_datetime, label = TRUE, abbr = FALSE, week_start = 1),
                     month = month(request_datetime, label = TRUE, abbr = FALSE),
                     hour = hour(request_datetime),
                     
                     trip_time_min = trip_time / 60,
                     
                     trip_value = tips + driver_pay,
                     
                     PULocationID = as.character(PULocationID),
                     DOLocationID = as.character(DOLocationID))]

# To explore the distribution provided by the `PULocationID` (Pick Up Zone ID) 
# and `DOLocationID` (Drop Off Zone ID), we need to add extra information.
# If we check the taxi zone look up table, we can find the `Borough` and
# `service_zone` of each table.

ValidZoneSample2023[ZoneCodesRef,
               on = c("PULocationID" = "LocationID"),
               `:=`(PULocationID = as.character(PULocationID),
                    PUBorough = Borough,
                    PUServiceZone = service_zone)]

ValidZoneSample2023[ZoneCodesRef,
               on = c("DOLocationID" = "LocationID"),
               `:=`(DOLocationID = as.character(DOLocationID),
                    DOBorough = Borough,
                    DOServiceZone = service_zone)]

Exploring individual distributions

Categorical variables

In company, Uber accounts for 72% of trips. In consequence, the results of this project will be more related to Uber than any other company.
dispatching_base_num, originating_base_num, and access_a_ride_flag all seem to be highly correlated with the company and are unlikely to be useful for prediction.
Based on shared_request_flag and shared_match_flag 99% of trips don’t request to share the trip and don’t end sharing the the trip.
Based on wav_request_flag and wav_match_flag 94% of trips don’t request and don’t take place in wheelchair-accessible vehicles.

Show the code

ValidZoneSample2023[, .N,
               by = "company"
][, prop := N / sum(N)] |>
  ggplot(aes(N, company)) +
  geom_col(fill = Params$ColorGray,
           color = "black",
           width = 0.5) +
  geom_text(aes(label = percent(prop)),
            nudge_x = 6500) +
  scale_x_continuous(labels = comma_format()) +
  labs(title = "Most of the trips come from Uber",
       x = "",
       y = "Company Name") +
  theme_minimal(base_family = Params$BaseFontFamily) +
  theme(panel.grid.major.y = element_blank(),
        plot.title = element_text(face = "bold", size = 15))

Show the code

Plot_dispatching_base_num <-
  plot_heap_map(ValidZoneSample2023,
                cat_vars = c("company", "dispatching_base_num"),
                color_high = Params$ColorHighlight,
                color_low = Params$ColorGray)


Plot_originating_base_num <-
  plot_heap_map(ValidZoneSample2023,
                cat_vars = c("company", "originating_base_num"),
                color_high = Params$ColorHighlight,
                color_low = Params$ColorGray)

Plot_access_a_ride_flag <-
  plot_heap_map(ValidZoneSample2023,
                cat_vars = c("company", "access_a_ride_flag"),
                color_high = Params$ColorHighlight,
                color_low = Params$ColorGray,
                sep = "\n")

Plot_access_a_ride_flag

Show the code


(Plot_dispatching_base_num + Plot_originating_base_num)

Show the code

plot_heap_map(ValidZoneSample2023,
              cat_vars = c("shared_request_flag", "shared_match_flag"),
              color_high = Params$ColorHighlight,
              color_low = Params$ColorGray,
              sep = "\n") +
  labs(title = "",
       x = "shared_request_flag",
       y = "shared_match_flag")

Show the code


plot_heap_map(ValidZoneSample2023,
              cat_vars = c("wav_request_flag", "wav_match_flag"),
              color_high = Params$ColorHighlight,
              color_low = Params$ColorGray,
              sep = "\n") +
  labs(title = "",
       x = "wav_request_flag",
       y = "wav_match_flag")

Datetime variables

Based on the reports, we can confirm that we have a good number of example from each month, weekday and hour. The only exception is December from which only have 30,701 of the total sample of 290,277.

Show the code

plot_bar(ValidZoneSample2023,
         "month",
         color_highlight = Params$ColorHighlight,
         color_gray = Params$ColorGray)

Show the code

plot_bar(ValidZoneSample2023,
         "weekday",
         color_highlight = Params$ColorHighlight,
         color_gray = Params$ColorGray,
         n_top = 1L)

Show the code

plot_bar(ValidZoneSample2023,
         "hour",
         color_highlight = Params$ColorHighlight,
         color_gray = Params$ColorGray)

Geospatial data

Based on the next results, we can conclude that:

Based on the borough, must of the trips start and end in Manhattan.
service_zone is correlated with the borough and don’t provide much difference information.
We have examples for all LocationID, but the number of examples for each zone change a lot from zone to zone, as expected.

Show the code

PlotPUBorough <-
  plot_bar(ValidZoneSample2023,
           var_name = "PUBorough",
           n_top = 1L) +
  labs(title = "PUBorough")

PlotDOBorough <-
  plot_bar(ValidZoneSample2023,
           var_name = "DOBorough",
           n_top = 1L) +
  labs(title = "DOBorough")

PlotPUBorough / PlotDOBorough

Show the code

PlotPUServiceZone <-
  plot_heap_map(ValidZoneSample2023,
                cat_vars = c("PUBorough", "PUServiceZone"),
                sep = "\n") +
  labs(title = "PUBorough vs PUServiceZone")

PlotDOServiceZone <-
  plot_heap_map(ValidZoneSample2023,
                cat_vars = c("DOBorough", "DOServiceZone"),
                sep = "\n") +
  labs(title = "DOBorough vs DOServiceZone")

PlotPUServiceZone / PlotDOServiceZone

Show the code

LocationIdTextSize <- 7

setkeyv(ValidZoneSample2023, "PUBorough")

PlotPuLocations1 <-
  plot_bar(ValidZoneSample2023["Manhattan"],
           var_name = "PULocationID") +
  labs(subtitle = "Manhattan") +
  theme(axis.text.y = element_text(size = LocationIdTextSize))

PlotPuLocations2 <-
  plot_bar(ValidZoneSample2023["Brooklyn"],
           var_name = "PULocationID") +
  labs(subtitle = "Brooklyn")+
  theme(axis.text.y = element_text(size = LocationIdTextSize))

PlotPuLocations3 <-
  plot_bar(ValidZoneSample2023["Queens"],
           var_name = "PULocationID") +
  labs(subtitle = "Queens")+
  theme(axis.text.y = element_text(size = LocationIdTextSize))

PlotPuLocations1 / PlotPuLocations2 / PlotPuLocations3

Show the code

setkeyv(ValidZoneSample2023, "DOBorough")

PlotDoLocations1 <-
  plot_bar(ValidZoneSample2023["Manhattan"],
           var_name = "DOLocationID") +
  labs(subtitle = "Manhattan") +
  theme(axis.text.y = element_text(size = LocationIdTextSize))

PlotDoLocations2 <-
  plot_bar(ValidZoneSample2023["Brooklyn"],
           var_name = "DOLocationID") +
  labs(subtitle = "Brooklyn")+
  theme(axis.text.y = element_text(size = LocationIdTextSize))

PlotDoLocations3 <-
  plot_bar(ValidZoneSample2023["Queens"],
           var_name = "DOLocationID") +
  labs(subtitle = "Queens")+
  theme(axis.text.y = element_text(size = LocationIdTextSize))

PlotDoLocations1 / PlotDoLocations2 / PlotDoLocations3

Numerical variables

Based on the below graphs, we can say that:

trip_miles is right skewed with very few trips close to 100 miles, so logarithmic transformation of based 2 makes easier to see that 75% of trips take from 1 to 8 miles and based on the ECDF plot we can also see that really few trips take more than 24 miles, but 10% takes less than 1 mile.
trip_time_min is right skewed with very few trips close to 400 min, so logarithmic transformation of based 2 makes easier to see that 89% of trips take from 2 to 35 min and based on the ECDF plot we can also see that really few trips take more than 70 min or less than 2 min.
tips is strongly right-skewed, as approximately 79% of trips recorded no tip. Among the remaining 21% of trips, about 10% have tips in the range (0, 4] dollars and approximately 9% fall within (4, 10] dollars. It is uncommon to observe trips with tips exceeding 10 dollars, and trips with tips greater than 20 dollars are exceedingly rare.
driver_pay is right skewed as only 6% of trips of the trips present pays higher than $40.
performance_per_hour present some over inflated values of around 30K dollars per hour, which make no sense based on what we saw in when defining the based line to improve of 50 dollars per hour. After applying a logarithmic scale of base 2, we can see the real distribution, we can find that some of the trips have a negative performance, but thanks to the ECDF Plot we know that only 5% of the points present less than 40 dollars per hour, 7% of the trips present values greater than 100 dollars per hour and only 1% of the example present a performance grater than 160 dollars per hour.
performance_per_hour after removing trips with less 2 min, we can see a huge difference. Now the max performance it’s around 900 dollars per hour but the general distribution was not affected.

Show the code

plot_num_distribution(ValidZoneSample2023,
                      trip_miles,
                      title = "trip_miles",
                      hist_binwidth = 5,
                      hist_n_break = 12,
                      log_binwidth = 0.5,
                      curve_important_points = c(1, 8, 24),
                      curve_limits = c(0, 25),
                      curve_nudge_y = 0.08,
                      curve_breaks_x = breaks_width(3))

Show the code

plot_num_distribution(ValidZoneSample2023,
                      trip_time_min,
                      title = "trip_time_min",
                      hist_binwidth = 5,
                      hist_n_break = 12,
                      log_binwidth = 0.5,
                      curve_important_points = c(2, 35, 70),
                      curve_nudge_y = 0.1,
                      curve_breaks_x = scales::breaks_width(10),
                      curve_limits = c(0, 90))

Show the code

plot_num_distribution(ValidZoneSample2023,
                      tips,
                      title = "tips",
                      hist_n_break = 12,
                      hist_binwidth = 2,
                      log_binwidth = 0.2,
                      curve_important_points = c(0, 4, 10, 20),
                      curve_nudge_y = 0.02,
                      curve_breaks_x = scales::breaks_width(2),
                      curve_limits = c(0, 20))

Show the code

plot_num_distribution(ValidZoneSample2023,
                      driver_pay,
                      title = "driver_pay",
                      hist_n_break = 12,
                      hist_binwidth = 10,
                      log_binwidth = 0.2,
                      curve_important_points = c(7, 40, 80),
                      curve_nudge_y = 0.1,
                      curve_breaks_x = scales::breaks_width(10),
                      curve_limits = c(-50, 100))

Show the code

plot_num_distribution(ValidZoneSample2023,
                      performance_per_hour,
                      title = "performance_per_hour",
                      hist_n_break = 12,
                      hist_binwidth = 10,
                      log_binwidth = 0.2,
                      log_breaks = c(-2^(0:10), 0, 2^(0:20)),
                      curve_important_points = c(0,40, 70, 100, 160),
                      curve_nudge_y = 0.1,
                      curve_breaks_x = scales::breaks_width(10),
                      curve_limits = c(-30, 200))

Show the code

plot_num_distribution(ValidZoneSample2023[trip_time_min >= 2],
                      performance_per_hour,
                      title = "performance_per_hour",
                      hist_n_break = 12,
                      hist_binwidth = 10,
                      log_binwidth = 0.2,
                      log_breaks = c(-2^(0:10), 0, 2^(0:20)),
                      curve_important_points = c(0,40, 70, 100, 160),
                      curve_nudge_y = 0.1,
                      curve_breaks_x = scales::breaks_width(10),
                      curve_limits = c(-30, 200))

Exploring correlations between predictors

There is not a surprise that trip_time_min and trip_miles present a 79% of correlation.

But on the other hand, by checking the Cramer’s V correlation between categorical predictors we can see that there is low correlation between predictors, with the exception of features linked to locations.

Show the code

ValidZoneSample2023[trip_time_min >= 2,
                    lapply(.SD, as.character),
                    .SDcols = \(x) is.character(x) | is.factor(x) | is.integer(x)] |>
  corrcat::cramerV_df(unique = FALSE)|>
  ggplot(aes(V1, V2, fill = `Cramer.V`))+
  geom_tile()+
  scale_fill_gradient(low = "white",
                      high = Params$ColorHighlight,
                      space = "Lab",
                      na.value = "white") +
  geom_text(aes(label = percent(`Cramer.V`, accuracy = 1)),
            size = 2.5) + 
  labs(title = "Cramer's V correlation")+
  theme_minimal(base_family = Params$BaseFontFamily)+
  theme(legend.position = "none",
        plot.title = element_text(face = "bold", size = 15),
        panel.grid = element_blank(),
        axis.text.x = element_text(angle = 90),
        axis.title = element_blank())