Scraping web data from Twitter

Collecting data from Twitter’s REST API

It is possible to download recent tweets, but only those less than 7 days old, and in some cases not all of them. We will use the netdemR package for this (and for the other functions that scrape Twitter’s REST API).

library(netdemR)
## Loading required package: ROAuth
## Loading required package: rjson
## Loading required package: jsonlite
## 
## Attaching package: 'jsonlite'
## The following objects are masked from 'package:rjson':
## 
##     fromJSON, toJSON
## Loading required package: httr
## ##
## ## netdemR: tools for analysis of Twitter data
## ## Networked Democracy Lab at USC
## ## netdem.org
## ##
library(streamR)
## Loading required package: RCurl
## Loading required package: bitops
# Search recent tweets for the keywords in `q` and write the results
# to senator-tweets.json
searchTweets(
  q = c("graham", "mccain"),
  filename = "senator-tweets.json",
  n = 1000,
  until = "2017-06-25",
  oauth_folder = "credentials"
)
## 100 tweets. Max id: 878764666939486208
## 176 hits left
## 200 tweets. Max id: 878762868426014720
## 175 hits left
## 300 tweets. Max id: 878760976190853120
## 174 hits left
## 400 tweets. Max id: 878758939575418880
## 173 hits left
## 500 tweets. Max id: 878756808906682368
## 172 hits left
## 600 tweets. Max id: 878755032631570432
## 171 hits left
## 700 tweets. Max id: 878753089578205184
## 170 hits left
## 800 tweets. Max id: 878751611119706112
## 169 hits left
## 900 tweets. Max id: 878749885968920576
## 168 hits left
## 1000 tweets. Max id: 878748094564712448
# Read the downloaded JSON file back into R as `tweets`
tweets <- parseTweets("senator-tweets.json")
## 1000 tweets have been parsed.

What are the most popular hashtags?

library(stringr)
# Pull every hashtag out of the tweet text, flatten the per-tweet matches
# into a single vector, and show the most frequent ones
ht <- unlist(str_extract_all(tweets$text, "#(\\d|\\w)+"))
head(sort(table(ht), decreasing = TRUE))
## ht
##      #America #AmericaFirst      #Liberal         #Love         #MAGA 
##             4             4             4             4             4 
##       #People 
##             4

You can check the documentation about the options for string search here.

This is how you would extract information from user profiles:

# Screen names of the accounts whose profiles we want to download
wh <- c(
  "realDonaldTrump",
  "POTUS",
  "VP",
  "FLOTUS"
)
users <- getUsersBatch(screen_names = wh, oauth_folder = "credentials")
## 1--4 users left
# Inspect the columns returned for each profile
str(users)
## 'data.frame':    4 obs. of  14 variables:
##  $ id_str           : chr  "818910970567344128" "822215679726100480" "25073877" "818876014390603776"
##  $ screen_name      : chr  "VP" "POTUS" "realDonaldTrump" "FLOTUS"
##  $ name             : chr  "Vice President Pence" "President Trump" "Donald J. Trump" "Melania Trump"
##  $ description      : chr  "Husband, father, and honored to serve as the 48th Vice President of the United States. Tweets may be archived: "| __truncated__ "45th President of the United States of America, @realDonaldTrump. Tweets archived: https://t.co/eVVzoBb3Zr" "45th President of the United States of America" "This account is run by the Office of First Lady Melania Trump. Tweets may be archived. More at https://t.co/eVVzoBb3Zr"
##  $ followers_count  : int  3792386 19035524 32903842 7510506
##  $ statuses_count   : int  1510 754 35162 87
##  $ friends_count    : int  6 41 45 4
##  $ created_at       : chr  "Tue Jan 10 20:02:44 +0000 2017" "Thu Jan 19 22:54:28 +0000 2017" "Wed Mar 18 13:46:38 +0000 2009" "Tue Jan 10 17:43:50 +0000 2017"
##  $ location         : chr  "Washington, D.C." "Washington, D.C." "Washington, DC" "Washington, D.C."
##  $ lang             : chr  "en" "en" "en" "en"
##  $ time_zone        : chr  "Eastern Time (US & Canada)" "Eastern Time (US & Canada)" "Eastern Time (US & Canada)" "Eastern Time (US & Canada)"
##  $ status.id_str    : chr  "880399037228339200" "880256538056830976" "880242238000955392" "879506255852777472"
##  $ status.created_at: chr  "Thu Jun 29 12:14:22 +0000 2017" "Thu Jun 29 02:48:07 +0000 2017" "Thu Jun 29 01:51:18 +0000 2017" "Tue Jun 27 01:06:46 +0000 2017"
##  $ status.text      : chr  "Was such a pleasure to meet Barbara &amp; Teeba Marlowe in Cleveland. Inspiring to talk w/ them. See their stor"| __truncated__ "RT @GOPLeader: .@HouseGOP will send a strong signal tomorrow by passing #KatesLaw. Defiance of our laws will no"| __truncated__ "Tomorrow the House votes on #KatesLaw &amp; No Sanctuary For Criminals Act. Lawmakers must vote to put American"| __truncated__ "Happy belated 3rd bday (yesterday) sweet Monty!  Thank you for visiting us at @WhiteHouse today! https://t.co/ZnENYjB7zE"

Which of these has the most followers?

# Full profile row of the account with the largest follower count
users[which.max(users[["followers_count"]]), ]
##     id_str     screen_name            name
## 3 25073877 realDonaldTrump Donald J. Trump
##                                      description followers_count
## 3 45th President of the United States of America        32903842
##   statuses_count friends_count                     created_at
## 3          35162            45 Wed Mar 18 13:46:38 +0000 2009
##         location lang                  time_zone      status.id_str
## 3 Washington, DC   en Eastern Time (US & Canada) 880242238000955392
##                status.created_at
## 3 Thu Jun 29 01:51:18 +0000 2017
##                                                                                                                                       status.text
## 3 Tomorrow the House votes on #KatesLaw &amp; No Sanctuary For Criminals Act. Lawmakers must vote to put American safety… https://t.co/p3vd8IXXaD
# Just the screen name of that account
users[["screen_name"]][which.max(users[["followers_count"]])]
## [1] "realDonaldTrump"

Download up to 3,200 recent tweets from a Twitter account:

# Download the 1,000 most recent tweets from the account's timeline
# and write them to realDonaldTrump.json
getTimeline(
  filename = "realDonaldTrump.json",
  screen_name = "realDonaldTrump",
  n = 1000,
  oauth_folder = "credentials"
)
## 200 tweets. Max id: 869553853750013953
## 898 hits left
## 400 tweets. Max id: 855517053238870019
## 897 hits left
## 600 tweets. Max id: 839084268991229952
## 896 hits left
## 800 tweets. Max id: 825127844066054144
## 895 hits left
## 1000 tweets. Max id: 814114980983427073

What are the most common hashtags?

# Read the downloaded timeline back into R, replacing the earlier `tweets`
tweets <- parseTweets("realDonaldTrump.json")
## 1000 tweets have been parsed.
# Extract all hashtags from the timeline, flatten them into one vector,
# and list the most frequent ones
ht <- unlist(str_extract_all(tweets$text, "#(\\d|\\w)+"))
head(sort(table(ht), decreasing = TRUE))
## ht
##         #MAGA #AmericaFirst        #ICYMI    #Obamacare     #FakeNews 
##            13             9             9             6             5 
##          #USA 
##             5

Download friends and followers:

# Download the list of accounts that follow RECSM_UPF
followers <- getFollowers(
  "RECSM_UPF",
  oauth_folder = "credentials"
)
## credentials/twitter-token.Rdata
## 15 API calls left
## 744 followers. Next cursor: 0
## 14 API calls left

What are the most common words that followers of the Center for Survey Methodology use to describe themselves on Twitter?

# Look up the profile (including the description) of every follower
users <- getUsersBatch(
  ids = followers,
  oauth_folder = "credentials"
)
## 1--744 users left
## 2--644 users left
## 3--544 users left
## 4--444 users left
## 5--344 users left
## 6--244 users left
## 7--144 users left
## 8--44 users left
# create table with frequency of word use
library(quanteda)
## quanteda version 0.9.9.65
## Using 3 of 4 cores for parallel computing
## 
## Attaching package: 'quanteda'
## The following object is masked from 'package:utils':
## 
##     View
# Keep only profiles that actually have a description. A plain `!= ""`
# comparison yields NA for missing descriptions, which would carry NA
# documents into the corpus.
descr <- users$description
tw <- corpus(descr[!is.na(descr) & descr != ""])
# Build the document-feature matrix, dropping English and Spanish stopwords
# plus URL / retweet artefacts. The argument is called `remove`; the old
# name `ignoredFeatures` is ignored (with a warning) by this quanteda
# version, so nothing was being removed.
# The result is not named `dfm`, to avoid shadowing the dfm() function.
twdfm <- dfm(tw, remove = c(stopwords("english"), stopwords("spanish"),
                            "t.co", "https", "rt", "rts", "http"))
wf <- tfidf(twdfm)
# create wordcloud
# plot() on a dfm is deprecated; textplot_wordcloud() is the replacement.
# Remember the previous graphical parameters and restore them afterwards so
# that later plots are not drawn with zero margins.
op <- par(mar = c(0, 0, 0, 0))
textplot_wordcloud(wf, rot.per = 0, scale = c(3, .50), max.words = 100)
par(op)

Other functions that could be of use at some point:

# Find user accounts that match a keyword query
users <- searchUsers(
  q = "pompeu fabra",
  count = 100,
  oauth_folder = "credentials"
)
## credentials/twitter-token.Rdata
# Download specific tweets by their IDs and write them to old-tweets.json
getStatuses(
  ids = c("474134260149157888", "266038556504494082"),
  filename = "old-tweets.json",
  oauth_folder = "credentials"
)
## credentials/twitter-token.Rdata
## 900 API calls left
## 899 API calls left
# Parse the two downloaded tweets; the result is not assigned, so it is
# printed in full
parseTweets("old-tweets.json")
## 2 tweets have been parsed.
##                                                             text
## 1 Are you allowed to impeach a president for gross incompetence?
## 2           The electoral college is a disaster for a democracy.
##   retweet_count favorited truncated             id_str
## 1             0     FALSE     FALSE 474134260149157888
## 2             0     FALSE     FALSE 266038556504494082
##   in_reply_to_screen_name
## 1                      NA
## 2                      NA
##                                                                                 source
## 1 <a href="http://twitter.com/download/android" rel="nofollow">Twitter for Android</a>
## 2                   <a href="http://twitter.com" rel="nofollow">Twitter Web Client</a>
##   retweeted                     created_at in_reply_to_status_id_str
## 1     FALSE Wed Jun 04 10:23:11 +0000 2014                        NA
## 2     FALSE Wed Nov 07 04:45:09 +0000 2012                        NA
##   in_reply_to_user_id_str lang listed_count verified       location
## 1                      NA   en        71952     TRUE Washington, DC
## 2                      NA   en        71952     TRUE Washington, DC
##   user_id_str                                    description geo_enabled
## 1    25073877 45th President of the United States of America        TRUE
## 2    25073877 45th President of the United States of America        TRUE
##                  user_created_at statuses_count followers_count
## 1 Wed Mar 18 13:46:38 +0000 2009          35162        32903864
## 2 Wed Mar 18 13:46:38 +0000 2009          35162        32903864
##   favourites_count protected user_url            name
## 1               24     FALSE       NA Donald J. Trump
## 2               24     FALSE       NA Donald J. Trump
##                    time_zone user_lang utc_offset friends_count
## 1 Eastern Time (US & Canada)        en     -14400            45
## 2 Eastern Time (US & Canada)        en     -14400            45
##       screen_name country_code country place_type full_name place_name
## 1 realDonaldTrump           NA      NA         NA        NA         NA
## 2 realDonaldTrump           NA      NA         NA        NA         NA
##   place_id place_lat place_lon lat lon expanded_url url
## 1       NA       NaN       NaN  NA  NA           NA  NA
## 2       NA       NaN       NaN  NA  NA           NA  NA
# Download profile information for every member of a Twitter list
MCs <- getList(
  list_name = "new-members-of-congress",
  screen_name = "cspan",
  oauth_folder = "credentials"
)
## credentials/twitter-token.Rdata
## 900 API calls left
## 20 users in list. Next cursor: 5427671057913225216
## 899 API calls left
## 40 users in list. Next cursor: 4611686021468931847
## 898 API calls left
## 57 users in list. Next cursor: 0
## 897 API calls left
# Preview the first rows of the list members' profile data
head(MCs)
##             id             id_str                 name    screen_name
## 1 8.272798e+17 827279765287559171    Rep. Mike Johnson RepMikeJohnson
## 2 8.171385e+17 817138492614524928             Ted Budd     RepTedBudd
## 3 8.170763e+17 817076257770835968    Adriano Espaillat   RepEspaillat
## 4 8.170502e+17 817050219007328258 Rep. Blunt Rochester  RepBRochester
## 5 8.168339e+17 816833925456789505  Nanette D. Barragán    RepBarragan
## 6 8.167198e+17 816719802328715264      Rep. Liz Cheney   RepLizCheney
##                         location
## 1                 Washington, DC
## 2  Davie County, North Carolina 
## 3 https://www.facebook.com/Congr
## 4 Delaware, USA - Washington, DC
## 5                  San Pedro, CA
## 6                   Wyoming, USA
##                                                                                                                                           description
## 1                                                  Proudly serving Louisiana's 4th Congressional District. Member on @HouseJudiciary & @NatResources.
## 2                                                                                          Proudly serving the 13th district of North Carolina. #NC13
## 3                                 U. S. Representative proudly serving New York’s 13th Congressional District. Follow my work in Washington and #NY13
## 4                            Official Twitter page for U.S. Representative Lisa Blunt Rochester (D-DE). Tweets from Rep. Blunt Rochester signed -LBR.
## 5 Official account. Honored to represent California's 44th Congressional District #CA44. Member of the @HispanicCaucus @USProgressives  @Dodgers fan.
## 6                                                                Official Page of Congressman Liz Cheney, proudly representing the people of Wyoming.
##                       url followers_count friends_count
## 1 https://t.co/qLAyhrFbRl            1380           361
## 2 https://t.co/VTsvWe0pia            1675           117
## 3 https://t.co/lcRqmQFAbz            5319          1010
## 4 https://t.co/Fe3XCG51wO            2271           259
## 5 https://t.co/Mt3nPi7hSH            4500           406
## 6 https://t.co/lBZ7grDQC9            1653            65
##                       created_at                   time_zone lang
## 1 Thu Feb 02 22:17:20 +0000 2017  Eastern Time (US & Canada)   en
## 2 Thu Jan 05 22:39:33 +0000 2017  Pacific Time (US & Canada)   en
## 3 Thu Jan 05 18:32:15 +0000 2017                        <NA>   en
## 4 Thu Jan 05 16:48:47 +0000 2017                        <NA>   en
## 5 Thu Jan 05 02:29:18 +0000 2017                        <NA>   en
## 6 Wed Jan 04 18:55:49 +0000 2017 Mountain Time (US & Canada)   en
# format Twitter dates to facilitate analysis
# Re-read the timeline so that this example does not depend on the state of
# `tweets` left by earlier examples
tweets <- parseTweets("realDonaldTrump.json")
## 1000 tweets have been parsed.
# Convert the raw `created_at` strings and store the result in a new column
tweets$date <- formatTwDate(tweets$created_at, format="date")
# Histogram of tweet dates with one bin per month
hist(tweets$date, breaks="month")

# Download list of users who retweeted a tweet (unfortunately, only up to 100)
rts <- getRetweets(
  id = '474134260149157888',
  oauth_folder = "credentials"
)
## credentials/twitter-token.Rdata
## 75 API calls left
## 91 retweeters. Next cursor: 0
## 74 API calls left
# Look up the profiles of the retweeters, replacing the earlier `users`
users <- getUsersBatch(ids=rts, oauth_folder="credentials")
## 1--91 users left
# create table with frequency of word use
library(quanteda)
# Keep only profiles that actually have a description. A plain `!= ""`
# comparison yields NA for missing descriptions, which would carry NA
# documents into the corpus.
descr <- users$description
tw <- corpus(descr[!is.na(descr) & descr != ""])
# Build the document-feature matrix, dropping English and Spanish stopwords
# plus URL / retweet artefacts. The argument is called `remove`; the old
# name `ignoredFeatures` is ignored (with a warning) by this quanteda
# version, so nothing was being removed.
# The result is not named `dfm`, to avoid shadowing the dfm() function.
rt_dfm <- dfm(tw, remove = c(stopwords("english"), stopwords("spanish"),
                             "t.co", "https", "rt", "rts", "http"))
wf <- tfidf(rt_dfm)
# create wordcloud
# plot() on a dfm is deprecated; textplot_wordcloud() is the replacement.
# Remember the previous graphical parameters and restore them afterwards so
# that later plots are not drawn with zero margins.
op <- par(mar = c(0, 0, 0, 0))
textplot_wordcloud(wf, rot.per = 0, scale = c(3, .50), max.words = 100)
par(op)