Querying a massive-scale Google BigQuery database

More advanced queries

Now that we’re familiar with Google BigQuery, let’s play with a massively large dataset – a table that contains all trips completed in Yellow and Green taxis in New York City from 2009 to present. You can find more information here.

This is one of the many publicly-available Google BigQuery tables; one of them is also the GDELT project, and you can see some examples of queries More. For a complete list of public datasets, see here. Note that when you use these datasets, storage is free but you still need to pay for the queries.

Let’s connect with this database and see how big it is:

library(bigrquery)

## Warning: package 'bigrquery' was built under R version 3.4.4

project <- "usc-barbera"
set_service_token("~/credentials/bigquery-token.json")

get_table(project="nyc-tlc",
          dataset="yellow",
          table="trips")

## $kind
## [1] "bigquery#table"
## 
## $etag
## [1] "PDZE7dM+XULVXRtbAobeGQ=="
## 
## $id
## [1] "nyc-tlc:yellow.trips"
## 
## $selfLink
## [1] "https://www.googleapis.com/bigquery/v2/projects/nyc-tlc/datasets/yellow/tables/trips"
## 
## $tableReference
## $tableReference$projectId
## [1] "nyc-tlc"
## 
## $tableReference$datasetId
## [1] "yellow"
## 
## $tableReference$tableId
## [1] "trips"
## 
## 
## $description
## [1] "This dataset includes trip records from all trips completed in yellow taxis in NYC since 2009.  Records include fields capturing pick-up and drop-off dates/times, pick-up and drop-off locations, trip distances, itemized fares, rate types, payment types, and driver-reported passenger counts. The data used in the attached datasets were collected and provided to the NYC Taxi and Limousine Commission (TLC) by technology providers authorized under the Taxicab Passenger Enhancement Program (TPEP).  The  trip data was not created by the TLC, and TLC makes no representations as to the accuracy of these data."
## 
## $schema
## $schema$fields
## $schema$fields[[1]]
## $schema$fields[[1]]$name
## [1] "vendor_id"
## 
## $schema$fields[[1]]$type
## [1] "STRING"
## 
## $schema$fields[[1]]$mode
## [1] "NULLABLE"
## 
## $schema$fields[[1]]$description
## [1] "A designation for the technology vendor that provided the record.\nCMT=Creative Mobile Technologies\nVTS= VeriFone, Inc.\nDDS=Digital Dispatch Systems"
## 
## 
## $schema$fields[[2]]
## $schema$fields[[2]]$name
## [1] "pickup_datetime"
## 
## $schema$fields[[2]]$type
## [1] "TIMESTAMP"
## 
## $schema$fields[[2]]$mode
## [1] "NULLABLE"
## 
## $schema$fields[[2]]$description
## [1] "The date and time when the meter was engaged."
## 
## 
## $schema$fields[[3]]
## $schema$fields[[3]]$name
## [1] "dropoff_datetime"
## 
## $schema$fields[[3]]$type
## [1] "TIMESTAMP"
## 
## $schema$fields[[3]]$mode
## [1] "NULLABLE"
## 
## $schema$fields[[3]]$description
## [1] "The date and time when the meter was disengaged."
## 
## 
## $schema$fields[[4]]
## $schema$fields[[4]]$name
## [1] "pickup_longitude"
## 
## $schema$fields[[4]]$type
## [1] "FLOAT"
## 
## $schema$fields[[4]]$mode
## [1] "NULLABLE"
## 
## $schema$fields[[4]]$description
## [1] "Longitude where the meter was engaged."
## 
## 
## $schema$fields[[5]]
## $schema$fields[[5]]$name
## [1] "pickup_latitude"
## 
## $schema$fields[[5]]$type
## [1] "FLOAT"
## 
## $schema$fields[[5]]$mode
## [1] "NULLABLE"
## 
## $schema$fields[[5]]$description
## [1] "Latitude where the meter was engaged."
## 
## 
## $schema$fields[[6]]
## $schema$fields[[6]]$name
## [1] "dropoff_longitude"
## 
## $schema$fields[[6]]$type
## [1] "FLOAT"
## 
## $schema$fields[[6]]$mode
## [1] "NULLABLE"
## 
## $schema$fields[[6]]$description
## [1] "Longitude where the meter was disengaged."
## 
## 
## $schema$fields[[7]]
## $schema$fields[[7]]$name
## [1] "dropoff_latitude"
## 
## $schema$fields[[7]]$type
## [1] "FLOAT"
## 
## $schema$fields[[7]]$mode
## [1] "NULLABLE"
## 
## $schema$fields[[7]]$description
## [1] "Latitude where the meter was disengaged."
## 
## 
## $schema$fields[[8]]
## $schema$fields[[8]]$name
## [1] "rate_code"
## 
## $schema$fields[[8]]$type
## [1] "STRING"
## 
## $schema$fields[[8]]$mode
## [1] "NULLABLE"
## 
## $schema$fields[[8]]$description
## [1] "The final rate code in effect at the end of the trip.\n1= Standard rate\n2=JFK\n3=Newark\n4=Nassau or Westchester\n5=Negotiated fare\n6=Group ride"
## 
## 
## $schema$fields[[9]]
## $schema$fields[[9]]$name
## [1] "passenger_count"
## 
## $schema$fields[[9]]$type
## [1] "INTEGER"
## 
## $schema$fields[[9]]$mode
## [1] "NULLABLE"
## 
## $schema$fields[[9]]$description
## [1] "The number of passengers in the vehicle.  \n\nThis is a driver-entered value."
## 
## 
## $schema$fields[[10]]
## $schema$fields[[10]]$name
## [1] "trip_distance"
## 
## $schema$fields[[10]]$type
## [1] "FLOAT"
## 
## $schema$fields[[10]]$mode
## [1] "NULLABLE"
## 
## $schema$fields[[10]]$description
## [1] "The elapsed trip distance in miles reported by the taximeter."
## 
## 
## $schema$fields[[11]]
## $schema$fields[[11]]$name
## [1] "payment_type"
## 
## $schema$fields[[11]]$type
## [1] "STRING"
## 
## $schema$fields[[11]]$mode
## [1] "NULLABLE"
## 
## $schema$fields[[11]]$description
## [1] "A numeric code signifying how the passenger paid for the trip. \nCRD= Credit card\nCSH= Cash\nNOC= No charge\nDIS= Dispute\nUNK= Unknown"
## 
## 
## $schema$fields[[12]]
## $schema$fields[[12]]$name
## [1] "fare_amount"
## 
## $schema$fields[[12]]$type
## [1] "FLOAT"
## 
## $schema$fields[[12]]$mode
## [1] "NULLABLE"
## 
## $schema$fields[[12]]$description
## [1] "The time-and-distance fare calculated by the meter."
## 
## 
## $schema$fields[[13]]
## $schema$fields[[13]]$name
## [1] "extra"
## 
## $schema$fields[[13]]$type
## [1] "FLOAT"
## 
## $schema$fields[[13]]$mode
## [1] "NULLABLE"
## 
## $schema$fields[[13]]$description
## [1] "Miscellaneous extras and surcharges.  Currently, this only includes the $0.50 and $1 rush hour and overnight charges."
## 
## 
## $schema$fields[[14]]
## $schema$fields[[14]]$name
## [1] "mta_tax"
## 
## $schema$fields[[14]]$type
## [1] "FLOAT"
## 
## $schema$fields[[14]]$mode
## [1] "NULLABLE"
## 
## $schema$fields[[14]]$description
## [1] "$0.50 MTA tax that is automatically triggered based on the metered rate in use."
## 
## 
## $schema$fields[[15]]
## $schema$fields[[15]]$name
## [1] "imp_surcharge"
## 
## $schema$fields[[15]]$type
## [1] "FLOAT"
## 
## $schema$fields[[15]]$mode
## [1] "NULLABLE"
## 
## $schema$fields[[15]]$description
## [1] "$0.30 improvement surcharge assessed on trips at the flag drop. The improvement surcharge began being levied in 2015."
## 
## 
## $schema$fields[[16]]
## $schema$fields[[16]]$name
## [1] "tip_amount"
## 
## $schema$fields[[16]]$type
## [1] "FLOAT"
## 
## $schema$fields[[16]]$mode
## [1] "NULLABLE"
## 
## $schema$fields[[16]]$description
## [1] "Tip amount – This field is automatically populated for credit card tips. Cash tips are not included."
## 
## 
## $schema$fields[[17]]
## $schema$fields[[17]]$name
## [1] "tolls_amount"
## 
## $schema$fields[[17]]$type
## [1] "FLOAT"
## 
## $schema$fields[[17]]$mode
## [1] "NULLABLE"
## 
## $schema$fields[[17]]$description
## [1] "Total amount of all tolls paid in trip."
## 
## 
## $schema$fields[[18]]
## $schema$fields[[18]]$name
## [1] "total_amount"
## 
## $schema$fields[[18]]$type
## [1] "FLOAT"
## 
## $schema$fields[[18]]$mode
## [1] "NULLABLE"
## 
## $schema$fields[[18]]$description
## [1] "The total amount charged to passengers. Does not include cash tips."
## 
## 
## $schema$fields[[19]]
## $schema$fields[[19]]$name
## [1] "store_and_fwd_flag"
## 
## $schema$fields[[19]]$type
## [1] "STRING"
## 
## $schema$fields[[19]]$mode
## [1] "NULLABLE"
## 
## $schema$fields[[19]]$description
## [1] "This flag indicates whether the trip record was held in vehicle memory before sending to the vendor, aka “store and forward,” because the vehicle did not have a connection to the server. \nY= store and forward trip\nN= not a store and forward trip"
## 
## 
## 
## 
## $numBytes
## [1] "139287145197"
## 
## $numLongTermBytes
## [1] "139287145197"
## 
## $numRows
## [1] "1108779463"
## 
## $creationTime
## [1] "1443205741687"
## 
## $lastModifiedTime
## [1] "1513632420966"
## 
## $type
## [1] "TABLE"
## 
## $location
## [1] "US"

# how many taxi trips in this table?
query_exec(
  "SELECT COUNT(*) AS count
  FROM [nyc-tlc:yellow.trips]",
    project = project, useLegacySql = FALSE)

## 0 bytes processed

##        count
## 1 1108779463

Not bad! What is the distribution of trips by year and by month?

# number of trips per year?
query_exec(
  "SELECT YEAR(pickup_datetime) AS year, 
    COUNT(*) AS trips
  FROM [nyc-tlc:yellow.trips]
  GROUP BY year
  ORDER BY year",
  project=project, use_legacy_sql = TRUE)

## 0 bytes processed

##   year     trips
## 1 2009 170896055
## 2 2010 167041663
## 3 2011 176897199
## 4 2012 178544324
## 5 2013 173179759
## 6 2014 165114361
## 7 2015  77106102

# number of trips per month?
query_exec(
  "SELECT MONTH(pickup_datetime) AS month, 
    COUNT(*) AS trips
  FROM [nyc-tlc:yellow.trips]
  GROUP BY month
  ORDER BY month",
  project=project, use_legacy_sql = TRUE)

## 0 bytes processed

##    month     trips
## 1      1  98698396
## 2      2  93216283
## 3      3 104013930
## 4      4 102427116
## 5      5 104617346
## 6      6  99727074
## 7      7  84103439
## 8      8  79144785
## 9      9  84453388
## 10    10  89271060
## 11    11  84096180
## 12    12  85010466

How would we compute the average speed?

# First, let's compute distance and duration separately
query_exec(
  "SELECT AVG(trip_distance) AS avg_distance_miles,
  AVG( (dropoff_datetime-pickup_datetime)/1000000/60/60) 
      AS avg_duration_hours
  FROM [nyc-tlc:yellow.trips]",
   project=project)

## 0 bytes processed

##   avg_distance_miles avg_duration_hours
## 1           5.108387          0.2211204

# Now we can compute it in a single step
query_exec(
  "SELECT AVG(
      trip_distance / 
      ( (dropoff_datetime-pickup_datetime)/1000000/60/60 ) ) 
      AS avg_speed
  FROM [nyc-tlc:yellow.trips]",
   project=project)

## 0 bytes processed

##   avg_speed
## 1 -26.71703

# but it sounds like there might be some outliers, let's try to fix it:
query_exec(
  "SELECT AVG(
      trip_distance / 
      ( (dropoff_datetime-pickup_datetime)/1000000/60/60 ) ) 
      AS avg_speed
  FROM [nyc-tlc:yellow.trips]
  WHERE 
    trip_distance > 0
    AND fare_amount/trip_distance BETWEEN 2 AND 10
    AND dropoff_datetime > pickup_datetime",
   project=project)

## 0 bytes processed

##   avg_speed
## 1  13.49558

And just like with our SQL queries earlier, we can compute averages over groups.

# average number of passengers depending of hour of day?
query_exec(
  "SELECT HOUR(pickup_datetime) AS hour, 
    AVG(passenger_count) AS passengers_avg
  FROM [nyc-tlc:yellow.trips]
  GROUP BY hour
  ORDER BY hour",
  project=project, use_legacy_sql = TRUE)

## 0 bytes processed

##    hour passengers_avg
## 1     0       1.756186
## 2     1       1.750427
## 3     2       1.747928
## 4     3       1.747067
## 5     4       1.714046
## 6     5       1.584932
## 7     6       1.525714
## 8     7       1.552058
## 9     8       1.581057
## 10    9       1.594233
## 11   10       1.632721
## 12   11       1.655091
## 13   12       1.664857
## 14   13       1.674628
## 15   14       1.682929
## 16   15       1.701093
## 17   16       1.708493
## 18   17       1.701728
## 19   18       1.697272
## 20   19       1.709265
## 21   20       1.712115
## 22   21       1.731423
## 23   22       1.747434
## 24   23       1.755766

# average duration per hour of day?
(res <- query_exec(
  "SELECT 
    HOUR(pickup_datetime) AS hour,
    COUNT(*) AS count,
    AVG( (dropoff_datetime-pickup_datetime)/1000000/60 ) AS duration_minutes
  FROM [nyc-tlc:yellow.trips]
  WHERE 
    trip_distance > 0
    AND fare_amount/trip_distance BETWEEN 2 AND 10
    AND dropoff_datetime > pickup_datetime
  GROUP BY hour
  ORDER BY hour",
  project=project, use_legacy_sql = TRUE))

## 0 bytes processed

##    hour    count duration_minutes
## 1     0 42546907         12.14390
## 2     1 31415909         11.60763
## 3     2 23427486         11.31475
## 4     3 17091252         11.45392
## 5     4 12402430         11.87711
## 6     5 10428182         11.61021
## 7     6 22128571         10.49440
## 8     7 38867542         11.48338
## 9     8 48025204         12.94172
## 10    9 48857823         13.09494
## 11   10 47261516         12.93926
## 12   11 48478246         13.23328
## 13   12 50898168         13.35922
## 14   13 50585022         13.65667
## 15   14 52466242         14.26633
## 16   15 50253862         30.15111
## 17   16 43317962         14.11920
## 18   17 52231297         13.99992
## 19   18 63391190         13.31486
## 20   19 66507240         12.35082
## 21   20 63045694         11.90350
## 22   21 61700052         11.86167
## 23   22 59847519         12.07524
## 24   23 53287312         12.24089

plot(res$hour, res$duration_minutes, type="l")

# average length by day of the week?
(res <- query_exec(
  "SELECT 
    DAYOFWEEK(pickup_datetime) AS day,
    COUNT(*) AS count,
    AVG( (dropoff_datetime-pickup_datetime)/1000000/60 ) AS duration_minutes
  FROM [nyc-tlc:yellow.trips]
  WHERE 
    trip_distance > 0
    AND fare_amount/trip_distance BETWEEN 2 AND 10
    AND dropoff_datetime > pickup_datetime
  GROUP BY day
  ORDER BY day",
  project=project, use_legacy_sql = TRUE))

## 0 bytes processed

##   day     count duration_minutes
## 1   1 140145096         12.13359
## 2   2 136117334         12.31949
## 3   3 147969290         12.84937
## 4   4 152542148         18.07250
## 5   5 157669883         13.45166
## 6   6 162442676         13.28350
## 7   7 161576201         12.31228

plot(res$day, res$duration_minutes, type="l")

# average speed by day of week?
query_exec(
  "SELECT 
    DAYOFWEEK(pickup_datetime) AS day,
    COUNT(*) AS count,
    AVG(
      trip_distance / 
      ( (dropoff_datetime-pickup_datetime)/1000000/60/60 ) ) 
      AS avg_speed
  FROM [nyc-tlc:yellow.trips]
  WHERE 
    trip_distance > 0
    AND fare_amount/trip_distance BETWEEN 2 AND 10
    AND dropoff_datetime > pickup_datetime
  GROUP BY day
  ORDER BY day",
  project=project, use_legacy_sql = TRUE)

## 0 bytes processed

##   day     count avg_speed
## 1   1 140145096  15.26487
## 2   2 136117334  14.00163
## 3   3 147969290  12.99939
## 4   4 152542148  12.84165
## 5   5 157669883  12.81780
## 6   6 162442676  12.91596
## 7   7 161576201  13.85056

Querying a massive-scale Google BigQuery database

Pablo Barbera

August 7, 2018

More advanced queries

Uploading data to BigQuery