Scraping web data from Twitter

Collecting data from Twitter’s REST API

It is possible to download recent tweets, but only those less than 7 days old, and in some cases not all of them. We will use the netdemR package for this (and for the other functions that scrape Twitter’s REST API).

library(netdemR)
## Loading required package: ROAuth
## Loading required package: rjson
## Loading required package: jsonlite
## 
## Attaching package: 'jsonlite'
## The following objects are masked from 'package:rjson':
## 
##     fromJSON, toJSON
## Loading required package: httr
## ##
## ## netdemR: tools for analysis of Twitter data
## ## Networked Democracy Lab at USC
## ## netdem.org
## ##
library(streamR)
## Loading required package: RCurl
## Loading required package: bitops
# Query the search endpoint for tweets matching the two keywords and store
# up to 1,000 of them in senator-tweets.json (the file parsed right after).
# NOTE(review): "mcconell" looks like a misspelling of "mcconnell"; the
# logged output below was produced with the query exactly as written.
searchTweets(
  q = c("mcconell", "mccain"),
  filename = "senator-tweets.json",
  n = 1000,
  until = "2017-07-30",
  oauth_folder = "../credentials"
)
## 100 tweets. Max id: 891445335234453504
## 178 hits left
## 200 tweets. Max id: 891294647305347072
## 177 hits left
## 300 tweets. Max id: 891281645373984768
## 176 hits left
## 400 tweets. Max id: 891034587585032192
## 175 hits left
## 500 tweets. Max id: 890919249757298688
## 174 hits left
## 600 tweets. Max id: 890817610996080640
## 173 hits left
## 700 tweets. Max id: 890811430835896320
## 172 hits left
## 800 tweets. Max id: 890809553905786880
## 171 hits left
## 900 tweets. Max id: 890699258701262848
## 170 hits left
## 1000 tweets. Max id: 890033272696209408
# Parse the downloaded JSON file into `tweets`; its `text` column feeds the
# hashtag counts below.
tweets <- parseTweets("senator-tweets.json")
## 1000 tweets have been parsed.

What are the most popular hashtags?

library(stringr)
# Pull every "#hashtag" token out of the tweet text and flatten the
# per-tweet matches into a single character vector.
ht <- unlist(str_extract_all(tweets$text, "#(\\d|\\w)+"))
# Frequency table, most common first; head() shows the top six.
head(sort(table(ht), decreasing = TRUE))
## ht
## #SkinnyRepeal       #google          #ACA     #mcconell     #McConell 
##            19             7             6             6             6 
## #skinnyrepeal 
##             6

You can check the documentation about the options for string search here.

This is how you would extract information from user profiles:

# Screen names of the accounts whose profiles we want to look up.
wh <- c("realDonaldTrump", "POTUS", "VP", "FLOTUS")
users <- getUsersBatch(
  screen_names = wh,
  oauth_folder = "../credentials"
)
## 1--4 users left
# Inspect the structure of the returned profile data.
str(users)
## 'data.frame':    4 obs. of  14 variables:
##  $ id_str           : chr  "25073877" "822215679726100480" "818876014390603776" "818910970567344128"
##  $ screen_name      : chr  "realDonaldTrump" "POTUS" "FLOTUS" "VP"
##  $ name             : chr  "Donald J. Trump" "President Trump" "Melania Trump" "Vice President Pence"
##  $ description      : chr  "45th President of the United States of America" "45th President of the United States of America, @realDonaldTrump. Tweets archived: https://t.co/eVVzoBb3Zr" "This account is run by the Office of First Lady Melania Trump. Tweets may be archived. More at https://t.co/eVVzoBb3Zr" "Husband, father, and honored to serve as the 48th Vice President of the United States. Tweets may be archived: "| __truncated__
##  $ followers_count  : int  35035138 19676841 7908612 4083400
##  $ statuses_count   : int  35426 953 104 1862
##  $ friends_count    : int  45 42 4 6
##  $ created_at       : chr  "Wed Mar 18 13:46:38 +0000 2009" "Thu Jan 19 22:54:28 +0000 2017" "Tue Jan 10 17:43:50 +0000 2017" "Tue Jan 10 20:02:44 +0000 2017"
##  $ location         : chr  "Washington, DC" "Washington, D.C." "Washington, D.C." "Washington, D.C."
##  $ lang             : chr  "en" "en" "en" "en"
##  $ time_zone        : chr  "Eastern Time (US & Canada)" "Eastern Time (US & Canada)" "Eastern Time (US & Canada)" "Eastern Time (US & Canada)"
##  $ status.id_str    : chr  "892383242535481344" "890977634972307456" "892076805674455042" "892551283281932288"
##  $ status.created_at: chr  "Tue Aug 01 13:55:19 +0000 2017" "Fri Jul 28 16:49:56 +0000 2017" "Mon Jul 31 17:37:39 +0000 2017" "Wed Aug 02 01:03:03 +0000 2017"
##  $ status.text      : chr  "Only the Fake News Media and Trump enemies want me to stop using Social Media (110 million people). Only way fo"| __truncated__ "RT @realDonaldTrump: Departing for Long Island now. An area under siege from #MS13 gang members. We will not re"| __truncated__ "Thank you to all of our East wing interns for their dedication! The future is bright with these hard workers! h"| __truncated__ "Arrests of @leopoldolopez &amp; @alcaldeledezma by corrupt Maduro regime is an outrage. Spoke to Lopez on Frida"| __truncated__

Which of these has the most followers?

# Full profile row of the account with the largest follower count
# (which.max() picks the first one in case of ties).
users[which.max(users$followers_count), ]
##     id_str     screen_name            name
## 1 25073877 realDonaldTrump Donald J. Trump
##                                      description followers_count
## 1 45th President of the United States of America        35035138
##   statuses_count friends_count                     created_at
## 1          35426            45 Wed Mar 18 13:46:38 +0000 2009
##         location lang                  time_zone      status.id_str
## 1 Washington, DC   en Eastern Time (US & Canada) 892383242535481344
##                status.created_at
## 1 Tue Aug 01 13:55:19 +0000 2017
##                                                                                                                                 status.text
## 1 Only the Fake News Media and Trump enemies want me to stop using Social Media (110 million people). Only way for me to get the truth out!
# Same lookup, but return only the screen name of that account.
users[["screen_name"]][which.max(users[["followers_count"]])]
## [1] "realDonaldTrump"

Download up to 3,200 recent tweets from a Twitter account:

# Download up to 1,000 tweets from the @realDonaldTrump timeline into
# realDonaldTrump.json.
getTimeline(
  filename = "realDonaldTrump.json",
  screen_name = "realDonaldTrump",
  n = 1000,
  oauth_folder = "../credentials"
)
## 200 tweets. Max id: 883444736849305600
## 898 hits left
## 400 tweets. Max id: 873878061778907137
## 897 hits left
## 600 tweets. Max id: 859143061678501892
## 896 hits left
## 800 tweets. Max id: 844536536930619393
## 895 hits left
## 1000 tweets. Max id: 829299566344359936

What are the most common hashtags?

# Load the timeline that was just downloaded.
tweets <- parseTweets("realDonaldTrump.json")
## 1000 tweets have been parsed.
# Extract all "#hashtag" tokens, flatten them, and show the six most common.
ht <- unlist(str_extract_all(tweets$text, "#(\\d|\\w)+"))
head(sort(table(ht), decreasing = TRUE))
## ht
##         #MAGA        #ICYMI          #USA     #FakeNews #AmericaFirst 
##            15            11            10             9             8 
##    #Obamacare 
##             6

Download friends and followers:

# Download the followers of the @ecpr account.
followers <- getFollowers(
  "ecpr",
  oauth_folder = "../credentials"
)
## ../credentials/twitter-token.Rdata
## 15 API calls left
## 5000 followers. Next cursor: 1505108097996709997
## 14 API calls left
## 10000 followers. Next cursor: 1370131470074099998
## 13 API calls left
## 10339 followers. Next cursor: 0
## 12 API calls left
# Download the friends of the @ecpr account; `friends` is passed as `ids`
# to getUsersBatch() further down.
friends <- getFriends(
  "ecpr",
  oauth_folder = "../credentials"
)
## ../credentials/twitter-token.Rdata
## 15 API calls left
## 981 friends. Next cursor: 0
## 14 API calls left

What are the most common words that friends of the ECPR Twitter account use to describe themselves on Twitter?

# extract profile descriptions: look up the profile of every account in
# `friends`; the `description` column is analysed below
users <- getUsersBatch(
  ids = friends,
  oauth_folder = "../credentials"
)
## 1--981 users left
## 2--881 users left
## 3--781 users left
## 4--681 users left
## 5--581 users left
## 6--481 users left
## 7--381 users left
## 8--281 users left
## 9--181 users left
## 10--81 users left
# create table with frequency of word use
library(quanteda)
## quanteda version 0.9.9.65
## Using 3 of 4 cores for parallel computing
## 
## Attaching package: 'quanteda'
## The following object is masked from 'package:utils':
## 
##     View
# Keep only descriptions that are neither missing nor empty. is.na() is
# checked explicitly because `NA != ""` evaluates to NA, and an NA index
# would insert NA documents into the corpus.
has_text <- !is.na(users$description) & users$description != ""
tw <- corpus(users$description[has_text])
# Document-feature matrix without punctuation, English/Spanish stopwords,
# and URL / retweet fragments.
dfm <- dfm(tw, remove = c(stopwords("english"), stopwords("spanish"),
                          "t.co", "https", "rt", "rts", "http"),
           remove_punct = TRUE)
# create wordcloud; par() returns the previous settings, which are restored
# afterwards so the zero margins do not leak into later plots
old_par <- par(mar = c(0, 0, 0, 0))
textplot_wordcloud(dfm, rot.per = 0, scale = c(3, .50), max.words = 100)
par(old_par)

The REST API also offers a long list of other endpoints that could be of use at some point, depending on your research interests.

  1. You can search users related to specific keywords:
# Search for user accounts matching the keyword "ecpr" (count = 100).
users <- searchUsers(
  q = "ecpr",
  count = 100,
  oauth_folder = "../credentials"
)
## ../credentials/twitter-token.Rdata
  1. If you know the IDs of the tweets, you can download them directly from the API. This is useful because tweets cannot be redistributed as part of the replication materials of a published paper, but the list of tweet IDs can be shared:
# Downloading tweets when you know the ID: the two tweets are written to
# old-tweets.json
getStatuses(
  ids = c("474134260149157888", "266038556504494082"),
  filename = "old-tweets.json",
  oauth_folder = "../credentials"
)
## ../credentials/twitter-token.Rdata
## 900 API calls left
## 899 API calls left
# Parse the file written by getStatuses() and print the result (it is not
# assigned to a variable).
parseTweets("old-tweets.json")
## 2 tweets have been parsed.
##                                                             text
## 1 Are you allowed to impeach a president for gross incompetence?
## 2           The electoral college is a disaster for a democracy.
##   retweet_count favorited truncated             id_str
## 1             0     FALSE     FALSE 474134260149157888
## 2             0     FALSE     FALSE 266038556504494082
##   in_reply_to_screen_name
## 1                      NA
## 2                      NA
##                                                                                 source
## 1 <a href="http://twitter.com/download/android" rel="nofollow">Twitter for Android</a>
## 2                   <a href="http://twitter.com" rel="nofollow">Twitter Web Client</a>
##   retweeted                     created_at in_reply_to_status_id_str
## 1     FALSE Wed Jun 04 10:23:11 +0000 2014                        NA
## 2     FALSE Wed Nov 07 04:45:09 +0000 2012                        NA
##   in_reply_to_user_id_str lang listed_count verified       location
## 1                      NA   en        73836     TRUE Washington, DC
## 2                      NA   en        73836     TRUE Washington, DC
##   user_id_str                                    description geo_enabled
## 1    25073877 45th President of the United States of America        TRUE
## 2    25073877 45th President of the United States of America        TRUE
##                  user_created_at statuses_count followers_count
## 1 Wed Mar 18 13:46:38 +0000 2009          35426        35035148
## 2 Wed Mar 18 13:46:38 +0000 2009          35426        35035148
##   favourites_count protected user_url            name
## 1               12     FALSE       NA Donald J. Trump
## 2               12     FALSE       NA Donald J. Trump
##                    time_zone user_lang utc_offset friends_count
## 1 Eastern Time (US & Canada)        en     -14400            45
## 2 Eastern Time (US & Canada)        en     -14400            45
##       screen_name country_code country place_type full_name place_name
## 1 realDonaldTrump           NA      NA         NA        NA         NA
## 2 realDonaldTrump           NA      NA         NA        NA         NA
##   place_id place_lat place_lon lat lon expanded_url url
## 1       NA       NaN       NaN  NA  NA           NA  NA
## 2       NA       NaN       NaN  NA  NA           NA  NA
  1. Lists of Twitter users, compiled by other users, are also accessible through the API.
# download user information from a list: the members of the
# "new-members-of-congress" list owned by @cspan
MCs <- getList(
  list_name = "new-members-of-congress",
  screen_name = "cspan",
  oauth_folder = "../credentials"
)
## ../credentials/twitter-token.Rdata
## 900 API calls left
## 20 users in list. Next cursor: 5427698142933319684
## 899 API calls left
## 40 users in list. Next cursor: 4611686021745187729
## 898 API calls left
## 60 users in list. Next cursor: 0
## 897 API calls left
# Show the first six rows of the list members' profile data.
head(MCs)
##             id             id_str                 name     screen_name
## 1 8.272798e+17 827279765287559171    Rep. Mike Johnson  RepMikeJohnson
## 2 8.235530e+17 823552974253342721     Anthony G. Brown RepAnthonyBrown
## 3 8.171385e+17 817138492614524928             Ted Budd      RepTedBudd
## 4 8.170763e+17 817076257770835968    Adriano Espaillat    RepEspaillat
## 5 8.170502e+17 817050219007328258 Rep. Blunt Rochester   RepBRochester
## 6 8.168339e+17 816833925456789505  Nanette D. Barragán     RepBarragan
##                         location
## 1                 Washington, DC
## 2                 Washington, DC
## 3  Davie County, North Carolina 
## 4 https://www.facebook.com/Congr
## 5 Delaware, USA - Washington, DC
## 6                  San Pedro, CA
##                                                                                                                                           description
## 1                                                  Proudly serving Louisiana's 4th Congressional District. Member on @HouseJudiciary & @NatResources.
## 2   Congressman proudly representing Maryland's Fourth District. Member of @HASCDemocrats & @NRDems. Father, husband & retired @USArmyReserve Colonel
## 3                                                                                          Proudly serving the 13th district of North Carolina. #NC13
## 4                                 U. S. Representative proudly serving New York’s 13th Congressional District. Follow my work in Washington and #NY13
## 5                            Official Twitter page for U.S. Representative Lisa Blunt Rochester (D-DE). Tweets from Rep. Blunt Rochester signed -LBR.
## 6 Official account. Honored to represent California's 44th Congressional District #CA44. Member of the @HispanicCaucus @USProgressives  @Dodgers fan.
##                       url followers_count friends_count
## 1 https://t.co/qLAyhrFbRl            1603           361
## 2 https://t.co/2u5X332ICM            1561           120
## 3 https://t.co/VTsvWe0pia            1822           120
## 4 https://t.co/lcRqmQFAbz            5553          1013
## 5 https://t.co/Fe3XCG51wO            2530           264
## 6 https://t.co/Mt3nPi7hSH            4814           439
##                       created_at                  time_zone lang
## 1 Thu Feb 02 22:17:20 +0000 2017 Eastern Time (US & Canada)   en
## 2 Mon Jan 23 15:28:24 +0000 2017 Eastern Time (US & Canada)   en
## 3 Thu Jan 05 22:39:33 +0000 2017 Pacific Time (US & Canada)   en
## 4 Thu Jan 05 18:32:15 +0000 2017                       <NA>   en
## 5 Thu Jan 05 16:48:47 +0000 2017                       <NA>   en
## 6 Thu Jan 05 02:29:18 +0000 2017                       <NA>   en

This is also useful if e.g. you’re interested in compiling lists of journalists, because media outlets offer these lists in their profiles.

  1. List of users who retweeted a particular tweet – unfortunately, it’s limited to the 100 most recent retweets.
# Download list of users who retweeted a tweet (unfortunately, only up to 100)
rts <- getRetweets(
  id = "892147656319004672",
  oauth_folder = "../credentials"
)
## ../credentials/twitter-token.Rdata
## 75 API calls left
## 87 retweeters. Next cursor: 0
## 74 API calls left
# https://twitter.com/realDonaldTrump/status/892147656319004672
# Look up the profiles of the accounts returned by getRetweets().
users <- getUsersBatch(
  ids = rts,
  oauth_folder = "../credentials"
)
## 1--87 users left
# create table with frequency of word use
library(quanteda)
# Keep only descriptions that are neither missing nor empty. is.na() is
# checked explicitly because `NA != ""` evaluates to NA, and an NA index
# would insert NA documents into the corpus.
has_text <- !is.na(users$description) & users$description != ""
tw <- corpus(users$description[has_text])
# Document-feature matrix without punctuation, English/Spanish stopwords,
# and URL / retweet fragments.
dfm <- dfm(tw, remove = c(stopwords("english"), stopwords("spanish"),
                          "t.co", "https", "rt", "rts", "http"),
           remove_punct = TRUE)
# create wordcloud; par() returns the previous settings, which are restored
# afterwards so the zero margins do not leak into later plots (e.g. the
# histogram further down)
old_par <- par(mar = c(0, 0, 0, 0))
textplot_wordcloud(dfm, rot.per = 0, scale = c(5, .50), max.words = 100)
par(old_par)

  1. And one final function to convert dates in their internal Twitter format to another format we could work with in R:
# format Twitter dates to facilitate analysis
tweets <- parseTweets("realDonaldTrump.json")
## 1000 tweets have been parsed.
# Convert the raw `created_at` strings with format = "date", then plot the
# number of tweets per month.
tweets[["date"]] <- formatTwDate(tweets[["created_at"]], format = "date")
hist(tweets[["date"]], breaks = "month")