We’ll now turn to a different type of Twitter data – static data, either recent tweets or user-level information. This type of data can be retrieved with Twitter’s REST API. We will use the tweetscores package here – this is a package that I created to facilitate the collection and analysis of Twitter data.

Searching recent tweets

It is possible to download recent tweets, but only those less than 7 days old, and in some cases not all of them.

# Load previously saved OAuth credentials for the Twitter REST API.
# NOTE(review): assumes the file stores an object named `my_oauth` -- confirm.
load("~/my_oauth")
library(tweetscores)
## Loading required package: R2WinBUGS
## Loading required package: coda
## Loading required package: boot
## ##
## ## tweetscores: tools for the analysis of Twitter data
## ## Pablo Barbera (LSE)
## ## www.tweetscores.com
## ##
library(streamR)
## Loading required package: RCurl
## Loading required package: bitops
## Loading required package: rjson
## Loading required package: ndjson
# Search up to n recent tweets matching the query terms (Search API only
# covers roughly the last 7 days) and append them, one JSON object per line,
# to `filename`. The "hits left" messages report the remaining rate limit.
# NOTE(review): presumably the terms in q are combined -- check ?searchTweets
# for whether this is AND or OR.
searchTweets(q=c("brexit", "survey"), 
  filename="~/data/recent-brexit-tweets.json",
  n=1000, until="2019-06-22", 
  oauth=my_oauth)
## 92 tweets. Max id: 1142220629019648000
## 175 hits left
## 183 tweets. Max id: 1142219622571237376
## 174 hits left
## 262 tweets. Max id: 1142218524221464576
## 173 hits left
## 352 tweets. Max id: 1142217557488865280
## 172 hits left
## 444 tweets. Max id: 1142216627129323520
## 171 hits left
## 527 tweets. Max id: 1142215712997593088
## 170 hits left
## 626 tweets. Max id: 1142214913097494528
## 169 hits left
## 716 tweets. Max id: 1142214291237605376
## 168 hits left
## 807 tweets. Max id: 1142213545079103488
## 167 hits left
## 905 tweets. Max id: 1142212726330134528
## 166 hits left
## 994 tweets. Max id: 1142211961150758912
## 165 hits left
## 1083 tweets. Max id: 1.142211197305e+18
# Read the JSON file back into a data frame, one row per tweet.
# NOTE(review): 2083 parsed vs 1083 just downloaded -- presumably the file
# already contained tweets from an earlier run, since downloads append. Confirm.
tweets <- parseTweets("~/data/recent-brexit-tweets.json")
## 2083 tweets have been parsed.

What are the most popular hashtags?

library(stringr)
# Extract every hashtag (# followed by letters, digits or underscore) from
# each tweet; str_extract_all returns a list with one character vector per tweet.
ht <- str_extract_all(tweets$text, '#[A-Za-z0-9_]+')
# Flatten to a single vector of hashtag occurrences.
ht <- unlist(ht)
# Frequency table, most common first. Note the match is case-sensitive, so
# #Brexit and #brexit are counted separately.
head(sort(table(ht), decreasing = TRUE))
## ht
##       #Brexit       #brexit  #PeoplesVote   #StopBrexit #BorisJohnson 
##           107            30            17            16            15 
##      #ItsTime 
##            10

You can check the documentation about the options for string search here.

Extracting users’ profile information

This is how you would extract information from user profiles:

# Screen names of the White House accounts we want profile data for.
wh <- c("realDonaldTrump", "POTUS", "VP", "FLOTUS")
# Download the public profile information for these accounts in one batch.
# Note the rows are not returned in the same order as `wh`.
users <- getUsersBatch(screen_names=wh,
                       oauth=my_oauth)
## 1--4 users left
str(users)
## 'data.frame':    4 obs. of  9 variables:
##  $ id_str         : chr  "25073877" "818876014390603776" "818910970567344128" "822215679726100480"
##  $ screen_name    : chr  "realDonaldTrump" "FLOTUS" "VP" "POTUS"
##  $ name           : chr  "Donald J. Trump" "Melania Trump" "Vice President Mike Pence" "President Trump"
##  $ description    : chr  "45th President of the United States of America\U0001f1fa\U0001f1f8" "This account is run by the Office of First Lady Melania Trump. Tweets may be archived. More at https://t.co/eVVzoBb3Zr" "Vice President Mike Pence. Husband, father, & honored to serve as the 48th Vice President of the United States."| __truncated__ "45th President of the United States of America, @realDonaldTrump. Tweets archived: https://t.co/eVVzoBb3Zr"
##  $ followers_count: int  61302459 12009888 7531132 26062657
##  $ statuses_count : int  42506 621 6634 6265
##  $ friends_count  : int  47 7 13 39
##  $ created_at     : chr  "Wed Mar 18 13:46:38 +0000 2009" "Tue Jan 10 17:43:50 +0000 2017" "Tue Jan 10 20:02:44 +0000 2017" "Thu Jan 19 22:54:28 +0000 2017"
##  $ location       : chr  "Washington, DC" "Washington, D.C." "Washington, D.C." "Washington, D.C."
# Row of the account with the largest follower count.
users[which.max(users$followers_count),]
##     id_str     screen_name            name
## 1 25073877 realDonaldTrump Donald J. Trump
##                                                          description
## 1 45th President of the United States of America\U0001f1fa\U0001f1f8
##   followers_count statuses_count friends_count
## 1        61302459          42506            47
##                       created_at       location
## 1 Wed Mar 18 13:46:38 +0000 2009 Washington, DC
# Just the screen name of that account.
users$screen_name[which.max(users$followers_count)]
## [1] "realDonaldTrump"

Download up to 3,200 recent tweets from a Twitter account:

# Download the most recent n tweets from one account's timeline (the API caps
# this at the last ~3,200 tweets) and save them as JSON, one tweet per line.
getTimeline(filename="~/data/realDonaldTrump.json", screen_name="realDonaldTrump", n=1000, oauth=my_oauth)
## 200 tweets. Max id: 1138404143222206464
## 890 hits left
## 400 tweets. Max id: 1133572279324037121
## 889 hits left
## 600 tweets. Max id: 1129853974247550977
## 888 hits left
## 799 tweets. Max id: 1126662643677319168
## 887 hits left
## 997 tweets. Max id: 1123529034292506624
## 886 hits left
## 1196 tweets. Max id: 1119372874442108928
# Parse the file into a data frame and tabulate the hashtags, as before.
tweets <- parseTweets("~/data/realDonaldTrump.json")
## 1196 tweets have been parsed.
ht <- str_extract_all(tweets$text, '#[A-Za-z0-9_]+')
ht <- unlist(ht)
head(sort(table(ht), decreasing = TRUE))
## ht
##                #MAGA        #USStateVisit #DDay75thAnniversary 
##                   30                    9                    7 
##           #Trump2020        #POTUSinJapan              #DDay75 
##                    7                    5                    4

Building friend and follower networks

Download friends and followers:

# Vector of user IDs of the accounts that follow @MethodologyLSE.
# "Next cursor: 0" means all pages were retrieved.
followers <- getFollowers("MethodologyLSE", 
    oauth=my_oauth)
## 14 API calls left
## 1902 followers. Next cursor: 0
## 13 API calls left
# Vector of user IDs of the accounts that @MethodologyLSE follows ("friends").
friends <- getFriends("MethodologyLSE", 
    oauth=my_oauth)
## 10 API calls left
## 136 friends. Next cursor: 0
## 9 API calls left

What are the most common words that friends of the MethodologyLSE account use to describe themselves on Twitter?

# extract profile descriptions for all friend IDs collected above
users <- getUsersBatch(ids=friends, oauth=my_oauth)
## 1--136 users left
## 2--36 users left
# create table with frequency of word use
library(quanteda)
## Package version: 1.3.14
## Parallel computing: 2 of 4 threads used.
## See https://quanteda.io for tutorials and examples.
## 
## Attaching package: 'quanteda'
## The following object is masked from 'package:utils':
## 
##     View
# Build a corpus from the non-empty profile descriptions, then a
# document-feature matrix, dropping English/Spanish stopwords, punctuation
# and Twitter boilerplate tokens (URL fragments, "rt", etc.).
tw <- corpus(users$description[users$description!=""])
dfm <- dfm(tw, remove=c(stopwords("english"), stopwords("spanish"),
                                 "t.co", "https", "rt", "rts", "http"),
           remove_punct=TRUE)
# 30 most frequent words across all descriptions.
topfeatures(dfm, n = 30)
##         lse    research     science      london   political   economics 
##          52          42          33          26          25          24 
##      school      social  university      centre  department      events 
##          23          20          15          12          12          11 
##          uk        data      public      policy        news   professor 
##          11          10          10          10          10           9 
##     society    teaching    politics       lse's     twitter    academic 
##           9           9           9           9           8           8 
##      health    analysis   programme          us       urban development 
##           8           8           8           7           7           7
# create wordcloud; zero margins so the cloud fills the plotting region
par(mar=c(0,0,0,0))
textplot_wordcloud(dfm, rotation=0, min_size=1, max_size=5, max_words=100)

Estimating ideology based on Twitter networks

The tweetscores package also includes functions to replicate the method developed in the Political Analysis paper Birds of a Feather Tweet Together. Bayesian Ideal Point Estimation Using Twitter Data. For an application of this method, see also this Monkey Cage blog post.

# download list of friends for an account
user <- "DonaldJTrumpJr"
friends <- getFriends(user, oauth=my_oauth)
## 9 API calls left
## 1329 friends. Next cursor: 0
## 8 API calls left
# estimating ideology with correspondence analysis method; the estimate is
# based only on which "elite" (political/media) accounts the user follows.
# Positive values indicate a more conservative position on the latent scale.
(theta <- estimateIdeology2(user, friends, verbose=FALSE))
## DonaldJTrumpJr follows 142 elites: ABC, AEI, AlbertBrooks, AllenWest, AmbJohnBolton, andersoncooper, AndreaTantaros, AnnCoulter, AP, AriFleischer, azizansari, BBCBreaking, BBCWorld, benshapiro, BillHemmer, BillyHallowell, BreakingNews, BreitbartNews, BretBaier, brithume, BuzzFeed, ByronYork, CharlieDaniels, chrisrock, chucktodd, chuckwoolery, CNBC, CNN, cnnbrk, DailyCaller, daveweigel, davidgregory, DennisDMZ, DineshDSouza, DRUDGE, DRUDGE_REPORT, ericbolling, FareedZakaria, FinancialTimes, ForAmerica, Forbes, foxandfriends, FoxNews, foxnewspolitics, funnyordie, gatewaypundit, Gawker, ggreenwald, GOP, gopleader, GovChristie, GovernorPerry, GovMikeHuckabee, greggutfeld, GStephanopoulos, hardball_chris, Heritage, HeyTammyBruce, HouseGOP, HuffingtonPost, iamjohnoliver, IngrahamAngle, iowahawkblog, JamesOKeefeIII, JamesRosenFNC, jasoninthehouse, jim_jordan, JimDeMint, jimmyfallon, Judgenap, JudicialWatch, kanyewest, KatiePavlich, kilmeade, kimguilfoyle, krauthammer, KurtSchlichter, larryelder, LindaSuhler, loudobbsnews, LukeRussert, marcorubio, marklevinshow, MarkSteynOnline, marthamaccallum, michellemalkin, mitchellreports, MittRomney, MonicaCrowley, mtaibbi, NASA, neiltyson, newsbusters, newtgingrich, NolteNC, NRA, NRO, nytimes, oreillyfactor, politico, ppppolls, RealBenCarson, realDonaldTrump, RealJamesWoods, Reince, repdianeblack, reploubarletta, replouiegohmert, reppaulryan, Reuters, rushlimbaugh, SarahPalinUSA, scrowder, seanhannity, secupp, Senate_GOPs, senmikelee, sentedcruz, ShannonBream, SharylAttkisson, stevekingia, SteveMartinToGo, tamronhall, TeamCavuto, tedcruz, TedNugent, TEDTalks, tgowdysc, TheAtlantic, TheEconomist, thenation, TheOnion, ThomasSowell, TODAYshow, townhallcom, TuckerCarlson, TwitchyTeam, VanityFair, washingtonpost, wikileaks, WSJ, YoungCons
## [1] 1.412358
# download list of friends for an account
user <- "realDonaldTrump"
friends <- getFriends(user, oauth=my_oauth)
## 8 API calls left
## 47 friends. Next cursor: 0
## 7 API calls left
# estimating ideology with correspondence analysis method
(theta <- estimateIdeology2(user, friends, verbose=FALSE))
## realDonaldTrump follows 10 elites: DRUDGE_REPORT, ericbolling, foxandfriends, IngrahamAngle, jim_jordan, oreillyfactor, piersmorgan, Reince, seanhannity, TuckerCarlson
## [1] 1.668441

Other types of data

The REST API offers also a long list of other endpoints that could be of use at some point, depending on your research interests.

  1. You can search users related to specific keywords:
# Search user accounts whose profile matches the query string; returns up to
# `count` matching profiles as a data frame.
users <- searchUsers(q="MethodologyLSE", count=100, oauth=my_oauth)
# First ten matching screen names.
users$screen_name[1:10]
##  [1] "thosjleeper"    "MethodologyLSE" "NoamTitelman"   "p_barbera"     
##  [5] "milamila07"     "joannaaa"       "Cris_Monteneg"  "ellie_knott"   
##  [9] "ROliveiraT"     "kateesummers"
  2. If you know the IDs of the tweets, you can download them directly from the API. This is useful because tweets cannot be redistributed as part of the replication materials of a published paper, but the list of tweet IDs can be shared. You could use this function to hydrate the lists of tweets available in https://www.docnow.io/catalog/.
# Downloading tweets when you know the ID ("hydrating"): fetch the full tweet
# objects for a vector of tweet IDs and save them as JSON.
getStatuses(ids=c("474134260149157888", "266038556504494082"),
            filename="~/data/old-tweets.json",
            oauth=my_oauth)
## 896 API calls left
## 2 tweets left.
## 0 tweets left.
## 895 API calls left
# legacy=TRUE because these older tweets use the pre-2016 JSON layout.
parseTweets("~/data/old-tweets.json", legacy=TRUE)
## 2 tweets have been parsed.
##   text retweet_count favorite_count favorited truncated             id_str
## 1   NA        214730         190843     FALSE     FALSE 474134260149157888
## 2   NA        143511         108432     FALSE     FALSE 266038556504494082
##   in_reply_to_screen_name
## 1                      NA
## 2                      NA
##                                                                                 source
## 1 <a href="http://twitter.com/download/android" rel="nofollow">Twitter for Android</a>
## 2                   <a href="http://twitter.com" rel="nofollow">Twitter Web Client</a>
##   retweeted                     created_at in_reply_to_status_id_str
## 1     FALSE Wed Jun 04 10:23:11 +0000 2014                        NA
## 2     FALSE Wed Nov 07 04:45:09 +0000 2012                        NA
##   in_reply_to_user_id_str lang listed_count verified       location
## 1                      NA   en       104464     TRUE Washington, DC
## 2                      NA   en       104464     TRUE Washington, DC
##   user_id_str                                    description geo_enabled
## 1    25073877 45th President of the United States of America        TRUE
## 2    25073877 45th President of the United States of America        TRUE
##                  user_created_at statuses_count followers_count
## 1 Wed Mar 18 13:46:38 +0000 2009          42506        61302466
## 2 Wed Mar 18 13:46:38 +0000 2009          42506        61302466
##   favourites_count protected                user_url            name
## 1                7     FALSE https://t.co/OMxB0x7xC5 Donald J. Trump
## 2                7     FALSE https://t.co/OMxB0x7xC5 Donald J. Trump
##   time_zone user_lang utc_offset friends_count     screen_name
## 1        NA        NA         NA            47 realDonaldTrump
## 2        NA        NA         NA            47 realDonaldTrump
##   country_code country place_type full_name place_name place_id place_lat
## 1           NA      NA         NA        NA         NA       NA       NaN
## 2           NA      NA         NA        NA         NA       NA       NaN
##   place_lon lat lon expanded_url url
## 1       NaN  NA  NA           NA  NA
## 2       NaN  NA  NA           NA  NA
  3. Lists of Twitter users, compiled by other users, are also accessible through the API.
# download user information for every member of a public Twitter list,
# identified by the list's slug and the screen name of its owner
MCs <- getList(list_name="new-members-of-congress", 
               screen_name="cspan", oauth=my_oauth)
## 894 API calls left
## 20 users in list. Next cursor: 5504177645515399168
## 893 API calls left
## 40 users in list. Next cursor: 5361732962332446720
## 892 API calls left
## 60 users in list. Next cursor: 4611686019261061818
## 891 API calls left
## 80 users in list. Next cursor: 4611686018645680191
## 890 API calls left
## 100 users in list. Next cursor: 4611686018453445113
## 889 API calls left
## 109 users in list. Next cursor: 0
## 888 API calls left
# One row per list member, with the usual profile fields.
head(MCs)
##             id              id_str            name     screen_name
## 1 1.029094e+18 1029094268542099457    Lance Gooden     Lancegooden
## 2 9.917210e+17  991721030631780354    Jahana Hayes   JahanaHayesCT
## 3 9.864236e+17  986423555025002497     Bryan Steil BryanSteilforWI
## 4 9.772474e+17  977247448820305920     Joe Morelle     votemorelle
## 5 9.693281e+17  969328139485802496      John Joyce  JohnJoyceForPA
## 6 9.649970e+17  964996994623311874 Kelly Armstrong    Armstrong_ND
##            location
## 1       Terrell, TX
## 2     Waterbury, CT
## 3                  
## 4   Irondequoit, NY
## 5 Pennsylvania, USA
## 6                  
##                                                                                                                                                       description
## 1                                                                         Husband, Father, TX State Rep and U.S. Congressman for TX's 5th Congressional District.
## 2                                                                                                                     CT 5th Congressional District Congresswoman
## 3                                                                                         Problem Solver. Badger. Manufacturing. Running for Congress. #TeamSteil
## 4 #NY25 Democratic candidate. Husband, father, believer in the promise of a future that is as strong, resilient & bold as the people who call Monroe County home.
## 5                   Father, husband, granddad, doctor, advocate, PSU alum. Congressman representing #PA13. Fighting everyday for central Pennsylvania. #TeamJoyce
## 6                                      Lifelong North Dakotan. Proud husband and dad. Republican. U.S. House of Representatives. Real Conservative. Real results.
##                       url followers_count friends_count
## 1 https://t.co/se9SXZNb3A             434             0
## 2 https://t.co/M3SQ8jj5UE           26724           164
## 3 https://t.co/YCyHtH7BJc            1707            67
## 4 https://t.co/rK3kkPMWMY            1136           214
## 5 https://t.co/jMnrGGJVsx             754           178
## 6 https://t.co/olKerxuUWz            1621           579
##                       created_at time_zone lang
## 1 Mon Aug 13 19:56:08 +0000 2018        NA   NA
## 2 Wed May 02 16:48:13 +0000 2018        NA   NA
## 3 Wed Apr 18 01:57:57 +0000 2018        NA   NA
## 4 Fri Mar 23 18:15:22 +0000 2018        NA   NA
## 5 Thu Mar 01 21:46:52 +0000 2018        NA   NA
## 6 Sat Feb 17 22:56:27 +0000 2018        NA   NA

This is also useful if e.g. you’re interested in compiling lists of journalists, because media outlets offer these lists in their profiles.

  4. List of users who retweeted a particular tweet – unfortunately, it’s limited to the 100 most recent retweets.
# Download list of users who retweeted a tweet (unfortunately, only up to 100)
rts <- getRetweets(id='942123433873281024', oauth=my_oauth)
## 74 API calls left
## 66 retweeters. Next cursor: 0
## 73 API calls left
# https://twitter.com/realDonaldTrump/status/942123433873281024
# Fetch the profile data for each retweeter ID.
users <- getUsersBatch(ids=rts, oauth=my_oauth)
## 1--66 users left
# create table with frequency of word use
library(quanteda)
tw <- corpus(users$description[users$description!=""])
dfm <- dfm(tw, remove=c(stopwords("english"), stopwords("spanish"),
                                 "t.co", "https", "rt", "rts", "http"),
           remove_punct = TRUE)
# create wordcloud; zero margins so the cloud fills the plotting region
par(mar=c(0,0,0,0))
# Use the current textplot_wordcloud() arguments (rotation/min_size/max_size/
# max_words), consistent with the earlier chunk; the old rot.per/scale/
# max.words names are deprecated and triggered a warning here.
textplot_wordcloud(dfm, rotation=0, min_size=0.5, max_size=3, max_words=100)

  5. And one final function to convert dates from their internal Twitter format to another format we could work with in R:
# format Twitter dates to facilitate analysis: convert the API's
# "Wed Jun 04 10:23:11 +0000 2014" strings into R Date objects
tweets <- parseTweets("~/data/realDonaldTrump.json")
## 1196 tweets have been parsed.
tweets$date <- formatTwDate(tweets$created_at, format="date")
# With proper Date values we can plot the volume of tweets over time.
hist(tweets$date, breaks="month")

Checking for bots

# adapted from the botcheck package by @marsha5814
botometer = function(user, my_oauth, mashape_key, verbose=TRUE) {
  require(httr)
  # creating OAuth token
  myapp = oauth_app("twitter", key=my_oauth$consumer_key, 
                    secret=my_oauth$consumer_secret)
  sig = sign_oauth1.0(myapp, token=my_oauth$access_token,
                    token_secret=my_oauth$access_token_secret)
  
  users_url = "https://api.twitter.com/1.1/users/show.json?screen_name="
  statuses_url = "https://api.twitter.com/1.1/statuses/user_timeline.json?screen_name="
  search_url = "https://api.twitter.com/1.1/search/tweets.json?q=%40"
  opts = "&count=200"
  
  # API call to get user
  if (verbose) message("Downloading user profile...")
  userdata = GET(paste0(users_url,user,opts), sig)
  
  # API call to get tweets
  if (verbose) message("Downloading user tweets...")
  tweets = GET(paste0(statuses_url,user,opts), sig)
  
  # API call to get mentions
  if (verbose) message("Downloading user mentions...")
  mentions = GET(paste0(search_url,user,opts), sig)
  
  
  # Put everything in a list
  body = list(
    timeline = content(tweets, type="application/json"),
    mentions = content(mentions, type="application/json"),
    user = content(userdata, type="application/json")
  )
  
  # Convert to JSON
  body_json = RJSONIO::toJSON(body, auto_unbox = T, pretty = T)
  
  # Make the API request
  if (verbose) message("Checking Botometer scores...")
  result = POST("https://osome-botometer.p.mashape.com/2/check_account",
                 encode="json",
                 add_headers(`X-Mashape-Key`=mashape_key),
                 body=body_json)
  
  # Parse result
  result = content(result, as = "parsed")
  
  # Return "English" score
  return(result)
}


# NOTE(review): a live Mashape API key was previously hard-coded (and
# published) here -- that key is a leaked credential and should be revoked.
# Read the key from the environment instead, e.g. set MASHAPE_KEY=... in
# ~/.Renviron so it never appears in the document.
mashape_key <- Sys.getenv("MASHAPE_KEY")
results <- botometer("MethodologyLSE", my_oauth, 
          mashape_key = mashape_key)
## Loading required package: httr
## Downloading user profile...
## Downloading user tweets...
## Downloading user mentions...
## Checking Botometer scores...
# Overall bot-likelihood scores (closer to 1 = more bot-like).
results$scores
## $english
## [1] 0.1355826
## 
## $universal
## [1] 0.2217957
# Sub-scores by feature category.
results$categories
## $content
## [1] 0.2225713
## 
## $friend
## [1] 0.2975815
## 
## $network
## [1] 0.1041661
## 
## $sentiment
## [1] 0.4940333
## 
## $temporal
## [1] 0.1109869
## 
## $user
## [1] 0.05836197
results <- botometer("realDonaldTrump", my_oauth, 
          mashape_key = mashape_key)
## Downloading user profile...
## Downloading user tweets...
## Downloading user mentions...
## Checking Botometer scores...
results$scores
## $english
## [1] 0.03265991
## 
## $universal
## [1] 0.03239875
results$categories
## $content
## [1] 0.0662062
## 
## $friend
## [1] 0.07829814
## 
## $network
## [1] 0.05312993
## 
## $sentiment
## [1] 0.0562195
## 
## $temporal
## [1] 0.09392178
## 
## $user
## [1] 0.02392906
# A known automated account scores noticeably higher.
results <- botometer("everyword", my_oauth, 
          mashape_key = mashape_key)
## Downloading user profile...
## Downloading user tweets...
## Downloading user mentions...
## Checking Botometer scores...
results$scores
## $english
## [1] 0.2368838
## 
## $universal
## [1] 0.5361761
results$categories
## $content
## [1] 0.2107299
## 
## $friend
## [1] 0.5857204
## 
## $network
## [1] 0.6686587
## 
## $sentiment
## [1] 0.3798948
## 
## $temporal
## [1] 0.670126
## 
## $user
## [1] 0.05042453
results <- botometer("Horse_3books", my_oauth, 
          mashape_key = mashape_key)
## Downloading user profile...
## Downloading user tweets...
## Downloading user mentions...
## Checking Botometer scores...
results$scores
## $english
## [1] 0.1937495
## 
## $universal
## [1] 0.03508264
results$categories
## $content
## [1] 0.9036625
## 
## $friend
## [1] 0.9136224
## 
## $network
## [1] 0.9114359
## 
## $sentiment
## [1] 0.8849749
## 
## $temporal
## [1] 0.9105558
## 
## $user
## [1] 0.04684971

Now time for another challenge!