Now that we’re familiar with Google BigQuery, let’s play with a massively large dataset – a table that contains all trips completed in Yellow and Green taxis in New York City from 2009 to present. You can find more information here.
This is one of the many publicly-available Google BigQuery tables; one of them is also the GDELT project, and you can see some examples of queries More. For a complete list of public datasets, see here. Note that when you use these datasets, storage is free but you still need to pay for the queries.
Let’s connect with this database and see how big it is:
library(bigrquery)
## Warning: package 'bigrquery' was built under R version 3.4.4
project <- "usc-barbera"
set_service_token("~/credentials/bigquery-token.json")
get_table(project="nyc-tlc",
dataset="yellow",
table="trips")
## $kind
## [1] "bigquery#table"
##
## $etag
## [1] "PDZE7dM+XULVXRtbAobeGQ=="
##
## $id
## [1] "nyc-tlc:yellow.trips"
##
## $selfLink
## [1] "https://www.googleapis.com/bigquery/v2/projects/nyc-tlc/datasets/yellow/tables/trips"
##
## $tableReference
## $tableReference$projectId
## [1] "nyc-tlc"
##
## $tableReference$datasetId
## [1] "yellow"
##
## $tableReference$tableId
## [1] "trips"
##
##
## $description
## [1] "This dataset includes trip records from all trips completed in yellow taxis in NYC since 2009. Records include fields capturing pick-up and drop-off dates/times, pick-up and drop-off locations, trip distances, itemized fares, rate types, payment types, and driver-reported passenger counts. The data used in the attached datasets were collected and provided to the NYC Taxi and Limousine Commission (TLC) by technology providers authorized under the Taxicab Passenger Enhancement Program (TPEP). The trip data was not created by the TLC, and TLC makes no representations as to the accuracy of these data."
##
## $schema
## $schema$fields
## $schema$fields[[1]]
## $schema$fields[[1]]$name
## [1] "vendor_id"
##
## $schema$fields[[1]]$type
## [1] "STRING"
##
## $schema$fields[[1]]$mode
## [1] "NULLABLE"
##
## $schema$fields[[1]]$description
## [1] "A designation for the technology vendor that provided the record.\nCMT=Creative Mobile Technologies\nVTS= VeriFone, Inc.\nDDS=Digital Dispatch Systems"
##
##
## $schema$fields[[2]]
## $schema$fields[[2]]$name
## [1] "pickup_datetime"
##
## $schema$fields[[2]]$type
## [1] "TIMESTAMP"
##
## $schema$fields[[2]]$mode
## [1] "NULLABLE"
##
## $schema$fields[[2]]$description
## [1] "The date and time when the meter was engaged."
##
##
## $schema$fields[[3]]
## $schema$fields[[3]]$name
## [1] "dropoff_datetime"
##
## $schema$fields[[3]]$type
## [1] "TIMESTAMP"
##
## $schema$fields[[3]]$mode
## [1] "NULLABLE"
##
## $schema$fields[[3]]$description
## [1] "The date and time when the meter was disengaged."
##
##
## $schema$fields[[4]]
## $schema$fields[[4]]$name
## [1] "pickup_longitude"
##
## $schema$fields[[4]]$type
## [1] "FLOAT"
##
## $schema$fields[[4]]$mode
## [1] "NULLABLE"
##
## $schema$fields[[4]]$description
## [1] "Longitude where the meter was engaged."
##
##
## $schema$fields[[5]]
## $schema$fields[[5]]$name
## [1] "pickup_latitude"
##
## $schema$fields[[5]]$type
## [1] "FLOAT"
##
## $schema$fields[[5]]$mode
## [1] "NULLABLE"
##
## $schema$fields[[5]]$description
## [1] "Latitude where the meter was engaged."
##
##
## $schema$fields[[6]]
## $schema$fields[[6]]$name
## [1] "dropoff_longitude"
##
## $schema$fields[[6]]$type
## [1] "FLOAT"
##
## $schema$fields[[6]]$mode
## [1] "NULLABLE"
##
## $schema$fields[[6]]$description
## [1] "Longitude where the meter was disengaged."
##
##
## $schema$fields[[7]]
## $schema$fields[[7]]$name
## [1] "dropoff_latitude"
##
## $schema$fields[[7]]$type
## [1] "FLOAT"
##
## $schema$fields[[7]]$mode
## [1] "NULLABLE"
##
## $schema$fields[[7]]$description
## [1] "Latitude where the meter was disengaged."
##
##
## $schema$fields[[8]]
## $schema$fields[[8]]$name
## [1] "rate_code"
##
## $schema$fields[[8]]$type
## [1] "STRING"
##
## $schema$fields[[8]]$mode
## [1] "NULLABLE"
##
## $schema$fields[[8]]$description
## [1] "The final rate code in effect at the end of the trip.\n1= Standard rate\n2=JFK\n3=Newark\n4=Nassau or Westchester\n5=Negotiated fare\n6=Group ride"
##
##
## $schema$fields[[9]]
## $schema$fields[[9]]$name
## [1] "passenger_count"
##
## $schema$fields[[9]]$type
## [1] "INTEGER"
##
## $schema$fields[[9]]$mode
## [1] "NULLABLE"
##
## $schema$fields[[9]]$description
## [1] "The number of passengers in the vehicle. \n\nThis is a driver-entered value."
##
##
## $schema$fields[[10]]
## $schema$fields[[10]]$name
## [1] "trip_distance"
##
## $schema$fields[[10]]$type
## [1] "FLOAT"
##
## $schema$fields[[10]]$mode
## [1] "NULLABLE"
##
## $schema$fields[[10]]$description
## [1] "The elapsed trip distance in miles reported by the taximeter."
##
##
## $schema$fields[[11]]
## $schema$fields[[11]]$name
## [1] "payment_type"
##
## $schema$fields[[11]]$type
## [1] "STRING"
##
## $schema$fields[[11]]$mode
## [1] "NULLABLE"
##
## $schema$fields[[11]]$description
## [1] "A numeric code signifying how the passenger paid for the trip. \nCRD= Credit card\nCSH= Cash\nNOC= No charge\nDIS= Dispute\nUNK= Unknown"
##
##
## $schema$fields[[12]]
## $schema$fields[[12]]$name
## [1] "fare_amount"
##
## $schema$fields[[12]]$type
## [1] "FLOAT"
##
## $schema$fields[[12]]$mode
## [1] "NULLABLE"
##
## $schema$fields[[12]]$description
## [1] "The time-and-distance fare calculated by the meter."
##
##
## $schema$fields[[13]]
## $schema$fields[[13]]$name
## [1] "extra"
##
## $schema$fields[[13]]$type
## [1] "FLOAT"
##
## $schema$fields[[13]]$mode
## [1] "NULLABLE"
##
## $schema$fields[[13]]$description
## [1] "Miscellaneous extras and surcharges. Currently, this only includes the $0.50 and $1 rush hour and overnight charges."
##
##
## $schema$fields[[14]]
## $schema$fields[[14]]$name
## [1] "mta_tax"
##
## $schema$fields[[14]]$type
## [1] "FLOAT"
##
## $schema$fields[[14]]$mode
## [1] "NULLABLE"
##
## $schema$fields[[14]]$description
## [1] "$0.50 MTA tax that is automatically triggered based on the metered rate in use."
##
##
## $schema$fields[[15]]
## $schema$fields[[15]]$name
## [1] "imp_surcharge"
##
## $schema$fields[[15]]$type
## [1] "FLOAT"
##
## $schema$fields[[15]]$mode
## [1] "NULLABLE"
##
## $schema$fields[[15]]$description
## [1] "$0.30 improvement surcharge assessed on trips at the flag drop. The improvement surcharge began being levied in 2015."
##
##
## $schema$fields[[16]]
## $schema$fields[[16]]$name
## [1] "tip_amount"
##
## $schema$fields[[16]]$type
## [1] "FLOAT"
##
## $schema$fields[[16]]$mode
## [1] "NULLABLE"
##
## $schema$fields[[16]]$description
## [1] "Tip amount – This field is automatically populated for credit card tips. Cash tips are not included."
##
##
## $schema$fields[[17]]
## $schema$fields[[17]]$name
## [1] "tolls_amount"
##
## $schema$fields[[17]]$type
## [1] "FLOAT"
##
## $schema$fields[[17]]$mode
## [1] "NULLABLE"
##
## $schema$fields[[17]]$description
## [1] "Total amount of all tolls paid in trip."
##
##
## $schema$fields[[18]]
## $schema$fields[[18]]$name
## [1] "total_amount"
##
## $schema$fields[[18]]$type
## [1] "FLOAT"
##
## $schema$fields[[18]]$mode
## [1] "NULLABLE"
##
## $schema$fields[[18]]$description
## [1] "The total amount charged to passengers. Does not include cash tips."
##
##
## $schema$fields[[19]]
## $schema$fields[[19]]$name
## [1] "store_and_fwd_flag"
##
## $schema$fields[[19]]$type
## [1] "STRING"
##
## $schema$fields[[19]]$mode
## [1] "NULLABLE"
##
## $schema$fields[[19]]$description
## [1] "This flag indicates whether the trip record was held in vehicle memory before sending to the vendor, aka “store and forward,” because the vehicle did not have a connection to the server. \nY= store and forward trip\nN= not a store and forward trip"
##
##
##
##
## $numBytes
## [1] "139287145197"
##
## $numLongTermBytes
## [1] "139287145197"
##
## $numRows
## [1] "1108779463"
##
## $creationTime
## [1] "1443205741687"
##
## $lastModifiedTime
## [1] "1513632420966"
##
## $type
## [1] "TABLE"
##
## $location
## [1] "US"
# how many taxi trips in this table?
query_exec(
"SELECT COUNT(*) AS count
FROM [nyc-tlc:yellow.trips]",
project = project, useLegacySql = FALSE)
## 0 bytes processed
## count
## 1 1108779463
Not bad! What is the distribution of trips by year and by month?
# number of trips per year?
query_exec(
"SELECT YEAR(pickup_datetime) AS year,
COUNT(*) AS trips
FROM [nyc-tlc:yellow.trips]
GROUP BY year
ORDER BY year",
project=project, use_legacy_sql = TRUE)
## 0 bytes processed
## year trips
## 1 2009 170896055
## 2 2010 167041663
## 3 2011 176897199
## 4 2012 178544324
## 5 2013 173179759
## 6 2014 165114361
## 7 2015 77106102
# number of trips per month?
query_exec(
"SELECT MONTH(pickup_datetime) AS month,
COUNT(*) AS trips
FROM [nyc-tlc:yellow.trips]
GROUP BY month
ORDER BY month",
project=project, use_legacy_sql = TRUE)
## 0 bytes processed
## month trips
## 1 1 98698396
## 2 2 93216283
## 3 3 104013930
## 4 4 102427116
## 5 5 104617346
## 6 6 99727074
## 7 7 84103439
## 8 8 79144785
## 9 9 84453388
## 10 10 89271060
## 11 11 84096180
## 12 12 85010466
How would we compute the average speed?
# First, let's compute distance and duration separately
query_exec(
"SELECT AVG(trip_distance) AS avg_distance_miles,
AVG( (dropoff_datetime-pickup_datetime)/1000000/60/60)
AS avg_duration_hours
FROM [nyc-tlc:yellow.trips]",
project=project)
## 0 bytes processed
## avg_distance_miles avg_duration_hours
## 1 5.108387 0.2211204
# Now we can compute it in a single step
query_exec(
"SELECT AVG(
trip_distance /
( (dropoff_datetime-pickup_datetime)/1000000/60/60 ) )
AS avg_speed
FROM [nyc-tlc:yellow.trips]",
project=project)
## 0 bytes processed
## avg_speed
## 1 -26.71703
# but it sounds like there might be some outliers, let's try to fix it:
query_exec(
"SELECT AVG(
trip_distance /
( (dropoff_datetime-pickup_datetime)/1000000/60/60 ) )
AS avg_speed
FROM [nyc-tlc:yellow.trips]
WHERE
trip_distance > 0
AND fare_amount/trip_distance BETWEEN 2 AND 10
AND dropoff_datetime > pickup_datetime",
project=project)
## 0 bytes processed
## avg_speed
## 1 13.49558
And just like with our SQL queries earlier, we can compute averages over groups.
# average number of passengers depending of hour of day?
query_exec(
"SELECT HOUR(pickup_datetime) AS hour,
AVG(passenger_count) AS passengers_avg
FROM [nyc-tlc:yellow.trips]
GROUP BY hour
ORDER BY hour",
project=project, use_legacy_sql = TRUE)
## 0 bytes processed
## hour passengers_avg
## 1 0 1.756186
## 2 1 1.750427
## 3 2 1.747928
## 4 3 1.747067
## 5 4 1.714046
## 6 5 1.584932
## 7 6 1.525714
## 8 7 1.552058
## 9 8 1.581057
## 10 9 1.594233
## 11 10 1.632721
## 12 11 1.655091
## 13 12 1.664857
## 14 13 1.674628
## 15 14 1.682929
## 16 15 1.701093
## 17 16 1.708493
## 18 17 1.701728
## 19 18 1.697272
## 20 19 1.709265
## 21 20 1.712115
## 22 21 1.731423
## 23 22 1.747434
## 24 23 1.755766
# average duration per hour of day?
(res <- query_exec(
"SELECT
HOUR(pickup_datetime) AS hour,
COUNT(*) AS count,
AVG( (dropoff_datetime-pickup_datetime)/1000000/60 ) AS duration_minutes
FROM [nyc-tlc:yellow.trips]
WHERE
trip_distance > 0
AND fare_amount/trip_distance BETWEEN 2 AND 10
AND dropoff_datetime > pickup_datetime
GROUP BY hour
ORDER BY hour",
project=project, use_legacy_sql = TRUE))
## 0 bytes processed
## hour count duration_minutes
## 1 0 42546907 12.14390
## 2 1 31415909 11.60763
## 3 2 23427486 11.31475
## 4 3 17091252 11.45392
## 5 4 12402430 11.87711
## 6 5 10428182 11.61021
## 7 6 22128571 10.49440
## 8 7 38867542 11.48338
## 9 8 48025204 12.94172
## 10 9 48857823 13.09494
## 11 10 47261516 12.93926
## 12 11 48478246 13.23328
## 13 12 50898168 13.35922
## 14 13 50585022 13.65667
## 15 14 52466242 14.26633
## 16 15 50253862 30.15111
## 17 16 43317962 14.11920
## 18 17 52231297 13.99992
## 19 18 63391190 13.31486
## 20 19 66507240 12.35082
## 21 20 63045694 11.90350
## 22 21 61700052 11.86167
## 23 22 59847519 12.07524
## 24 23 53287312 12.24089
plot(res$hour, res$duration_minutes, type="l")
# average length by day of the week?
(res <- query_exec(
"SELECT
DAYOFWEEK(pickup_datetime) AS day,
COUNT(*) AS count,
AVG( (dropoff_datetime-pickup_datetime)/1000000/60 ) AS duration_minutes
FROM [nyc-tlc:yellow.trips]
WHERE
trip_distance > 0
AND fare_amount/trip_distance BETWEEN 2 AND 10
AND dropoff_datetime > pickup_datetime
GROUP BY day
ORDER BY day",
project=project, use_legacy_sql = TRUE))
## 0 bytes processed
## day count duration_minutes
## 1 1 140145096 12.13359
## 2 2 136117334 12.31949
## 3 3 147969290 12.84937
## 4 4 152542148 18.07250
## 5 5 157669883 13.45166
## 6 6 162442676 13.28350
## 7 7 161576201 12.31228
plot(res$day, res$duration_minutes, type="l")
# average speed by day of week?
query_exec(
"SELECT
DAYOFWEEK(pickup_datetime) AS day,
COUNT(*) AS count,
AVG(
trip_distance /
( (dropoff_datetime-pickup_datetime)/1000000/60/60 ) )
AS avg_speed
FROM [nyc-tlc:yellow.trips]
WHERE
trip_distance > 0
AND fare_amount/trip_distance BETWEEN 2 AND 10
AND dropoff_datetime > pickup_datetime
GROUP BY day
ORDER BY day",
project=project, use_legacy_sql = TRUE)
## 0 bytes processed
## day count avg_speed
## 1 1 140145096 15.26487
## 2 2 136117334 14.00163
## 3 3 147969290 12.99939
## 4 4 152542148 12.84165
## 5 5 157669883 12.81780
## 6 6 162442676 12.91596
## 7 7 161576201 13.85056
How can we add our own data to a BigQuery table? We can also do it with R using the bq_table_upload
function.
congress <- read.csv("~/data/congress-facebook-2017.csv",
stringsAsFactors=F)
tab <- bq_table("usc-barbera", "twitter_panel", "congress")
bq_table_upload(tab, congress)
query_exec(
"SELECT
party, COUNT(1) AS count
FROM [usc-barbera:twitter_panel.congress]
GROUP BY party",
project="usc-barbera", use_legacy_sql = TRUE)
## 10.0 megabytes processed
## party count
## 1 Democrat 235
## 2 Republican 281
## 3 Independent 2
# delete table
bq_table_delete(tab)