Introduction to Google BigQuery databases

Connecting to Google BigQuery

To illustrate the potential of SQL queries with large-scale databases, we’ll now work with tables hosted in BigQuery – Google’s service to enable interactive analysis of massively large datasets.

You can read more about how to setup a Google BigQuery database here and about their SQL syntax here.

I have authorized this server to connect to my database and we’ll first work with one of my large datasets: all tweets posted by a random sample of 200,000 users in the US, labeled with user-level information predicted from the voter file (so two tables, one at the tweet level and another one at the user level).

library(bigrquery)

## Warning: package 'bigrquery' was built under R version 3.4.4

project <- "usc-barbera"
set_service_token("~/credentials/bigquery-token.json")

The tweet-label table currently contains 388 million tweets and five variables: id_str (tweet ID), user_id_str (user ID), datetime (time of the tweet in BigQuery datetime format), date (date of tweet in string format), and text of the tweet

## tweet-level table
query_exec(
  "SELECT COUNT(*) AS tweet_count
  FROM [usc-barbera:twitter_panel.tweets_20170721]",
    project = project)

## 0 bytes processed

##   tweet_count
## 1   388766024

query_exec(
  "SELECT *
  FROM [usc-barbera:twitter_panel.tweets_20170721]
  LIMIT 2",
    project = project)

## 55.1 gigabytes processed

##               id_str user_id_str            datetime       date
## 1 550717339956936704   434483963 2015-01-01 18:17:00 2015-01-01
## 2 550686581515112450   400939165 2015-01-01 16:14:46 2015-01-01
##                                                                text
## 1                                                         ROLL TIDE
## 2 RT @MarcoUTLR: What have a done \U0001f633 http://t.co/qWuuB0PN1j

get_table(project=project, dataset="twitter_panel", table="tweets_20170721")

## $kind
## [1] "bigquery#table"
## 
## $etag
## [1] "0W99QKrwhf6Veo/x8DewHQ=="
## 
## $id
## [1] "usc-barbera:twitter_panel.tweets_20170721"
## 
## $selfLink
## [1] "https://www.googleapis.com/bigquery/v2/projects/usc-barbera/datasets/twitter_panel/tables/tweets_20170721"
## 
## $tableReference
## $tableReference$projectId
## [1] "usc-barbera"
## 
## $tableReference$datasetId
## [1] "twitter_panel"
## 
## $tableReference$tableId
## [1] "tweets_20170721"
## 
## 
## $schema
## $schema$fields
## $schema$fields[[1]]
## $schema$fields[[1]]$name
## [1] "id_str"
## 
## $schema$fields[[1]]$type
## [1] "STRING"
## 
## $schema$fields[[1]]$mode
## [1] "NULLABLE"
## 
## 
## $schema$fields[[2]]
## $schema$fields[[2]]$name
## [1] "user_id_str"
## 
## $schema$fields[[2]]$type
## [1] "STRING"
## 
## $schema$fields[[2]]$mode
## [1] "NULLABLE"
## 
## 
## $schema$fields[[3]]
## $schema$fields[[3]]$name
## [1] "datetime"
## 
## $schema$fields[[3]]$type
## [1] "TIMESTAMP"
## 
## $schema$fields[[3]]$mode
## [1] "NULLABLE"
## 
## 
## $schema$fields[[4]]
## $schema$fields[[4]]$name
## [1] "date"
## 
## $schema$fields[[4]]$type
## [1] "STRING"
## 
## $schema$fields[[4]]$mode
## [1] "NULLABLE"
## 
## 
## $schema$fields[[5]]
## $schema$fields[[5]]$name
## [1] "text"
## 
## $schema$fields[[5]]$type
## [1] "STRING"
## 
## $schema$fields[[5]]$mode
## [1] "NULLABLE"
## 
## 
## 
## 
## $numBytes
## [1] "59119933022"
## 
## $numLongTermBytes
## [1] "59119933022"
## 
## $numRows
## [1] "388766024"
## 
## $creationTime
## [1] "1518813373917"
## 
## $lastModifiedTime
## [1] "1518813373917"
## 
## $type
## [1] "TABLE"
## 
## $location
## [1] "US"

The user-level table has information about 200,000 users. For each of them, the following variables are available: user_id_str (user ID), male (TRUE if user is predicted to be male), predicted age (with values 1 = 18-25, 2 = 26-40, 3 = 40+), predicted party (with values 1 = None/Others, 2 = Democrat, 3 = Republican), predicted race (with values 1 = Black, 2 = Hispanic, 3 = Asian, 4 = White), predicted turnout in 2012, name of user, description of user, followers_count, statuses_count, friends_count, location field in user’s profile, and lang (account language).

More information about how this dataset was collected is available in this paper.

# user-level table
query_exec(
  "SELECT COUNT(*) AS user_count
  FROM [usc-barbera:twitter_panel.users_subset_20160718]",
    project = project)

query_exec(
  "SELECT *
  FROM [usc-barbera:twitter_panel.users_subset_20160718]
  LIMIT 2",
    project = project)

get_table(project=project, dataset="twitter_panel", table="users_subset_20160718")

Now let’s run a few sample queries. First, let’s count the number of tweets by year and month. See that the syntax is exactly the same we were using with SQLite database. The only difference is that now we’re using the YEAR and MONTH operators to extract the relevant components of the datetime field.

query_exec(
  "SELECT YEAR(datetime) AS year,
    MONTH(datetime) AS month,
    COUNT(*)
  FROM [usc-barbera:twitter_panel.tweets_20170721]
  GROUP BY year, month
  ORDER BY year, month",
  project = project)

## 2.9 gigabytes processed

##    year month      f0_
## 1  2015     1  8879841
## 2  2015     2  8376297
## 3  2015     3  9379934
## 4  2015     4  9299342
## 5  2015     5  9840638
## 6  2015     6 10467819
## 7  2015     7 11483014
## 8  2015     8 12337227
## 9  2015     9 12012601
## 10 2015    10 12933774
## 11 2015    11 14258066
## 12 2015    12 15295636
## 13 2016     1 17148328
## 14 2016     2 17721276
## 15 2016     3 16347090
## 16 2016     4 16883142
## 17 2016     5 17555398
## 18 2016     6 15185441
## 19 2016     7 17617628
## 20 2016     8 16854735
## 21 2016     9 11670785
## 22 2016    10 10924096
## 23 2016    11 12383489
## 24 2016    12  9997604
## 25 2017     1 12135073
## 26 2017     2 12304796
## 27 2017     3 10763978
## 28 2017     4  8721059
## 29 2017     5 10350285
## 30 2017     6 11964361
## 31 2017     7  7673271

Now, let’s switch to the user-level table and look at the distribution of languages by party.

query_exec(
  "SELECT lang, party, COUNT(*) AS user_count
  FROM [usc-barbera:twitter_panel.users_subset_20160718]
  GROUP BY party, lang
  ORDER BY party, user_count DESC",
  project = project)

## 10.0 megabytes processed

##                        lang party user_count
## 1                        en     1      61521
## 2                        es     1       9783
## 3                        ja     1       1645
## 4                        id     1       1302
## 5                        pt     1        744
## 6                        fr     1        681
## 7                        ar     1        660
## 8                        tr     1        459
## 9                        ru     1        404
## 10                    en-gb     1        383
## 11                       ko     1        342
## 12                       de     1        170
## 13                       it     1        154
## 14                       NA     1        126
## 15                       th     1        125
## 16                       nl     1         94
## 17                    zh-cn     1         87
## 18                    zh-tw     1         49
## 19                       pl     1         49
## 20                       el     1         49
## 21                      fil     1         33
## 22                       sv     1         32
## 23                       vi     1         31
## 24                       cs     1         30
## 25                       hu     1         29
## 26                    en-GB     1         27
## 27                       da     1         23
## 28                       no     1         18
## 29                       bn     1         16
## 30                       ca     1         13
## 31       Select Language...     1         13
## 32                       fa     1         12
## 33                    xx-lc     1          7
## 34                      msa     1          6
## 35                       ro     1          6
## 36                       hi     1          6
## 37                       uk     1          5
## 38                       fi     1          5
## 39                  zh-Hans     1          3
## 40                       eu     1          3
## 41                       zh     1          2
## 42                       hr     1          2
## 43                       he     1          2
## 44                       af     1          2
## 45                    es-MX     1          2
## 46  Selecciona un idioma...     1          2
## 47                       ur     1          2
## 48                    zh-TW     1          2
## 49                    zh-CN     1          1
## 50                    pt-PT     1          1
## 51                       ta     1          1
## 52                       sr     1          1
## 53                       en     2      56718
## 54                       es     2       9937
## 55                       ja     2       1484
## 56                       id     2       1159
## 57                       pt     2        803
## 58                       ar     2        668
## 59                       fr     2        575
## 60                       tr     2        486
## 61                    en-gb     2        385
## 62                       ru     2        284
## 63                       ko     2        268
## 64                       de     2        154
## 65                       it     2        130
## 66                       NA     2        122
## 67                       th     2        113
## 68                       nl     2        109
## 69                    zh-cn     2         87
## 70                       pl     2         52
## 71                    zh-tw     2         48
## 72                       el     2         47
## 73                    en-GB     2         32
## 74                       sv     2         32
## 75                      fil     2         31
## 76                       hu     2         27
## 77                       cs     2         23
## 78                       vi     2         22
## 79                       no     2         20
## 80                       da     2         13
## 81                       fa     2         11
## 82                       ro     2         11
## 83                       ca     2          9
## 84       Select Language...     2          9
## 85                       fi     2          8
## 86                       bn     2          7
## 87                       hi     2          6
## 88                  zh-Hans     2          6
## 89                      msa     2          4
## 90                       he     2          3
## 91                    es-MX     2          2
## 92                    xx-lc     2          2
## 93                       hr     2          2
## 94                       ka     2          2
## 95                       uk     2          2
## 96                       eu     2          1
## 97                    en-AU     2          1
## 98                    zh-CN     2          1
## 99                       zh     2          1
## 100                      sk     2          1
## 101                      ur     2          1
## 102 Selecciona un idioma...     2          1
## 103                      en     3      38227
## 104                      es     3       5561
## 105                      ja     3       1113
## 106                      id     3        771
## 107                      pt     3        485
## 108                      ar     3        413
## 109                      fr     3        372
## 110                      tr     3        289
## 111                   en-gb     3        235
## 112                      ru     3        235
## 113                      ko     3        206
## 114                      it     3        100
## 115                      de     3         94
## 116                      th     3         71
## 117                      NA     3         67
## 118                      nl     3         53
## 119                   zh-cn     3         50
## 120                      el     3         34
## 121                      vi     3         31
## 122                   zh-tw     3         29
## 123                      sv     3         28
## 124                      pl     3         26
## 125                     fil     3         23
## 126                      da     3         17
## 127                   en-GB     3         17
## 128                      no     3         15
## 129                      cs     3         11
## 130      Select Language...     3          9
## 131                     msa     3          7
## 132                      hu     3          7
## 133                      ro     3          6
## 134                   zh-CN     3          6
## 135                      hi     3          5
## 136                      fa     3          5
## 137                      ca     3          5
## 138                      bn     3          5
## 139                      fi     3          3
## 140                   xx-lc     3          3
## 141                 zh-Hans     3          3
## 142                   en-AU     3          2
## 143                      uk     3          2
## 144                   pt-PT     3          1
## 145                      ga     3          1
## 146                      ur     3          1
## 147 Selecciona un idioma...     3          1
## 148                      hr     3          1
## 149                      sr     3          1
## 150                   es-MX     3          1
## 151                      gl     3          1
## 152                      lt     3          1
## 153                   zh-TW     3          1
## 154                      he     3          1

We can also use LIKE to search for tweets based on their text.

query_exec(
  "SELECT COUNT(*)
  FROM [usc-barbera:twitter_panel.tweets_20170721]
  WHERE lower(text) LIKE '%obama%'",
  project = project)

## 36.5 gigabytes processed

##      f0_
## 1 967995

query_exec(
  "SELECT COUNT(*)
  FROM [usc-barbera:twitter_panel.tweets_20170721]
  WHERE lower(text) LIKE '%trump%'",
  project = project)

## 36.5 gigabytes processed

##       f0_
## 1 4067171

Let’s look at another example for the user-level table. As a validation of the party predictions, let’s look at mentions of key political words in users’ profiles across parties. Note that instead of LIKE we can directly use regular expressions, which give us a bit more flexibility.

query_exec(
  "SELECT party, COUNT(*) AS user_count
  FROM [usc-barbera:twitter_panel.users_subset_20160718]
  WHERE REGEXP_MATCH(LOWER(description), 'conservative|republican')
  GROUP BY party
  ORDER BY party",
  project = project)

## 12.0 megabytes processed

##   party user_count
## 1     1         85
## 2     2         49
## 3     3        139

But one potential issue here: the number of users across groups may also vary, so what we need to compute is the proportion of users that mention these keywords on their profile. How can we do this? One option is to do it manually after collecting the data:

(df <- query_exec(
  "SELECT party, 
      SUM(REGEXP_MATCH(LOWER(description), 'conservative|republican')) AS user_count, 
    COUNT(UNIQUE(user_id_str)) AS total_users
  FROM [usc-barbera:twitter_panel.users_subset_20160718]
  GROUP BY party
  ORDER BY party",
  project = project))

## 15.0 megabytes processed

##   party user_count total_users
## 1     1         85       79165
## 2     2         49       73920
## 3     3        139       48652

(df$user_count / df$total_users) * 100

## [1] 0.10737068 0.06628788 0.28570254

But we can also do it in one step using a subselect:

query_exec(
  "SELECT party, user_count, total_users, 100 * (user_count / total_users) AS proportion
  FROM (SELECT party, 
      SUM(REGEXP_MATCH(LOWER(description), 'conservative|republican')) AS user_count, 
      COUNT(UNIQUE(user_id_str)) AS total_users
      FROM [usc-barbera:twitter_panel.users_subset_20160718]
      GROUP BY PARTY)
  ORDER BY party, proportion",
  project=project)

## 15.0 megabytes processed

##   party user_count total_users proportion
## 1     1         85       79165 0.10737068
## 2     2         49       73920 0.06628788
## 3     3        139       48652 0.28570254

What is happening here? We are using a nested query, creating a new table on the spot. Let’s try to understand by looking at that table we created:

query_exec(
  "SELECT party, 
      SUM(REGEXP_MATCH(LOWER(description), 'conservative|republican')) AS user_count, 
      COUNT(UNIQUE(user_id_str)) AS total_users
  FROM [usc-barbera:twitter_panel.users_subset_20160718]
  GROUP BY PARTY",
  project=project)

## 15.0 megabytes processed

##   party user_count total_users
## 1     2         49       73920
## 2     1         85       79165
## 3     3        139       48652

Note that this last query is equivalent to:

query_exec(
  "SELECT party, COUNT(*) AS user_count
  FROM [usc-barbera:twitter_panel.users_subset_20160718]
  WHERE lower(description) LIKE '%liberal%' 
      OR lower(description) LIKE '%democrat%'
  GROUP BY party
  ORDER BY party",
  project = project)

## 12.0 megabytes processed

##   party user_count
## 1     1         65
## 2     2        101
## 3     3         41

We can also compute counts at the date level to obtain a plot showing the evolution in the number of tweets over time that e.g. mention the word happy.

df <- query_exec(
  "SELECT date, COUNT(*) AS happy_count
  FROM [usc-barbera:twitter_panel.tweets_20170721]
  WHERE lower(text) LIKE '%happy%'
  GROUP BY date",
  project = project)

## 40.9 gigabytes processed

df <- df[order(df$date),]
plot(as.Date(df$date), df$happy_count, type="l")

Which are the happiest days of the year? Two ways of doing this…

head(df[order(df$happy_count, decreasing = TRUE),])

##           date happy_count
## 473 2016-01-01       37044
## 280 2015-01-01       27600
## 93  2015-11-26       25590
## 697 2017-01-01       24376
## 371 2016-02-14       22021
## 302 2016-05-08       20952

query_exec(
  "SELECT date, COUNT(*) AS happy_count
  FROM [usc-barbera:twitter_panel.tweets_20170721]
  WHERE lower(text) LIKE '%happy%'
  GROUP BY date
  ORDER BY happy_count DESC
  LIMIT 10",
  project = project)

## 40.9 gigabytes processed

##          date happy_count
## 1  2016-01-01       37044
## 2  2015-01-01       27600
## 3  2015-11-26       25590
## 4  2017-01-01       24376
## 5  2016-02-14       22021
## 6  2016-05-08       20952
## 7  2015-12-31       19279
## 8  2016-07-04       18973
## 9  2016-03-27       18951
## 10 2016-11-24       18496

How did the popularity of the #BlackLivesMatter hashtag evolve over time?

df <- query_exec(
  "SELECT date, COUNT(*) AS blm_count
  FROM [usc-barbera:twitter_panel.tweets_20170721]
  WHERE lower(text) LIKE '%blacklivesmatter%'
  GROUP BY date",
  project = project)

## 40.9 gigabytes processed

df <- df[order(df$date),]
plot(as.Date(df$date), df$blm_count, type="l")

And in combination with JOIN, it gets really easy to merge databases to, for example, count the number of tweets mentioning a specific keyword based on user-level characteristics:

query_exec(
  "SELECT users.party, 
    COUNT(*) AS tweet_count
  FROM [usc-barbera:twitter_panel.users_subset_20160718] AS users
  JOIN [usc-barbera:twitter_panel.tweets_20170721] AS tweets
  ON tweets.user_id_str = users.user_id_str
  WHERE lower(text) LIKE '%immigra%'
  GROUP BY users.party,
  ORDER BY users.party",
  project = project)

## 40.6 gigabytes processed

##   users_party tweet_count
## 1           1       28273
## 2           2       34074
## 3           3       24264

Introduction to Google BigQuery databases

Pablo Barbera

August 7, 2018

Connecting to Google BigQuery