Scraping data from Twitter’s REST API

We’ll now turn to a different type of Twitter data – static data, either recent tweets or user-level information. This type of data can be retrieved with Twitter’s REST API. We will use the tweetscores package here – this is a package that I created to facilitate the collection and analysis of Twitter data.

Searching recent tweets

It is possible to download recent tweets, but only those less than 7 days old, and in some cases not all of them.

# Restore the saved R objects in ~/my_oauth; this file is expected to provide
# the `my_oauth` credentials object that is passed as `oauth=` below.
load("~/my_oauth")
# tweetscores: the package used in this tutorial to query Twitter's REST API.
library(tweetscores)

## Loading required package: R2WinBUGS

## Loading required package: coda

## Loading required package: boot

## ##
## ## tweetscores: tools for the analysis of Twitter data

## ## Pablo Barbera (LSE)

## ## www.tweetscores.com
## ##

library(streamR)

## Loading required package: RCurl

## Loading required package: bitops

## Loading required package: rjson

## Warning: package 'rjson' was built under R version 3.4.4

## Loading required package: ndjson

## Warning: package 'ndjson' was built under R version 3.4.4

# Search recent tweets for the given keywords, requesting 1,000 of them.
# The results go to the JSON file named by `filename`, which is parsed below.
searchTweets(
  q = c("brexit", "survey"),
  filename = "~/data/survey-tweets.json",
  n = 1000,
  until = "2018-07-31",
  oauth = my_oauth
)

## 100 tweets. Max id: 1024082209102282752

## 168 hits left

## 200 tweets. Max id: 1024081776484839424

## 167 hits left

## 300 tweets. Max id: 1024081377489313792

## 166 hits left

## 400 tweets. Max id: 1024080981903458304

## 165 hits left

## 500 tweets. Max id: 1024080636372504576

## 164 hits left

## 600 tweets. Max id: 1024080257140379648

## 163 hits left

## 700 tweets. Max id: 1024079875890716672

## 162 hits left

## 800 tweets. Max id: 1024079568678936576

## 161 hits left

## 900 tweets. Max id: 1024079294723760128

## 160 hits left

## 1000 tweets. Max id: 1024079085377441792

# Read the downloaded JSON file into `tweets`; the tweet text is then
# available as tweets$text.
tweets <- parseTweets("~/data/survey-tweets.json")

## 1000 tweets have been parsed.

What are the most popular hashtags?

library(stringr)
# Pull every hashtag out of each tweet's text. `\\w` already matches digits,
# so the previous `(\\d|\\w)` alternation was redundant; `#\\w+` matches
# exactly the same strings.
ht <- str_extract_all(tweets$text, "#\\w+")
# str_extract_all() yields one character vector per tweet; flatten them into
# a single vector before tabulating.
ht <- unlist(ht)
# Show the six most frequent hashtags. Matching is case-sensitive, so
# "#Brexit" and "#brexit" are counted separately.
head(sort(table(ht), decreasing = TRUE))

## ht
##      #Brexit      #brexit #PeoplesVote      #BREXIT  #LoveIsland 
##          120           25           17           13           11 
##   #newsnight 
##            8

You can check the documentation about the options for string search here.

Extracting users’ profile information

This is how you would extract information from user profiles:

# Screen names of the accounts whose profile information we want.
wh <- c("realDonaldTrump", "POTUS", "VP", "FLOTUS")
# Look up the profiles for those screen names in one batch request.
users <- getUsersBatch(screen_names = wh, oauth = my_oauth)

## 1--4 users left

# Compact overview of the returned data frame: one row per account, one
# column per profile field.
str(users)

## 'data.frame':    4 obs. of  9 variables:
##  $ id_str         : chr  "818876014390603776" "25073877" "822215679726100480" "818910970567344128"
##  $ screen_name    : chr  "FLOTUS" "realDonaldTrump" "POTUS" "VP"
##  $ name           : chr  "Melania Trump" "Donald J. Trump" "President Trump" "Vice President Mike Pence"
##  $ description    : chr  "This account is run by the Office of First Lady Melania Trump. Tweets may be archived. More at https://t.co/eVVzoBb3Zr" "45th President of the United States of America\U0001f1fa\U0001f1f8" "45th President of the United States of America, @realDonaldTrump. Tweets archived: https://t.co/eVVzoBb3Zr" "Vice President Mike Pence. Husband, father, & honored to serve as the 48th Vice President of the United States."| __truncated__
##  $ followers_count: int  10714631 53444080 23682056 6432401
##  $ statuses_count : int  345 38408 3655 4642
##  $ friends_count  : int  6 47 39 11
##  $ created_at     : chr  "Tue Jan 10 17:43:50 +0000 2017" "Wed Mar 18 13:46:38 +0000 2009" "Thu Jan 19 22:54:28 +0000 2017" "Tue Jan 10 20:02:44 +0000 2017"
##  $ location       : chr  "Washington, D.C." "Washington, DC" "Washington, D.C." "Washington, D.C."

Which of these has the most followers?

# Full row of the account with the largest follower count (which.max returns
# the index of the first maximum).
users[which.max(users$followers_count),]

##     id_str     screen_name            name
## 2 25073877 realDonaldTrump Donald J. Trump
##                                                          description
## 2 45th President of the United States of America\U0001f1fa\U0001f1f8
##   followers_count statuses_count friends_count
## 2        53444080          38408            47
##                       created_at       location
## 2 Wed Mar 18 13:46:38 +0000 2009 Washington, DC

# Same lookup, but returning only the screen name of that account.
users$screen_name[which.max(users$followers_count)]

## [1] "realDonaldTrump"

Download recent tweets from a Twitter account (the API returns at most the 3,200 most recent tweets; this example requests 1,000):

# Request the 1,000 most recent tweets of @realDonaldTrump; they are stored
# in the JSON file named by `filename`, which is parsed further below.
getTimeline(
  filename = "~/data/realDonaldTrump.json",
  screen_name = "realDonaldTrump",
  n = 1000,
  oauth = my_oauth
)

## 200 tweets. Max id: 1018956970143858688

## 893 hits left

## 400 tweets. Max id: 1011577303023980544

## 892 hits left

## 600 tweets. Max id: 1006286790536323074

## 891 hits left

## 800 tweets. Max id: 999626347361206274

## 890 hits left

## 1000 tweets. Max id: 989834048796266498

What are the most common hashtags?

# Read the downloaded timeline into `tweets`. This replaces the `tweets`
# object created from the search results earlier.
tweets <- parseTweets("~/data/realDonaldTrump.json")

## 1000 tweets have been parsed.

# Pull every hashtag out of each tweet's text. `\\w` already matches digits,
# so the previous `(\\d|\\w)` alternation was redundant; `#\\w+` matches
# exactly the same strings (including purely numeric tags such as "#1").
ht <- str_extract_all(tweets$text, "#\\w+")
# Flatten the per-tweet results into a single vector before tabulating.
ht <- unlist(ht)
# Show the six most frequent hashtags (case-sensitive).
head(sort(table(ht), decreasing = TRUE))

## ht
##         #MAGA            #1 #HELSINKI2018   #RightToTry     #G7Summit 
##            15             5             4             4             3 
##       #SCOTUS 
##             3

Building friend and follower networks

Download friends and followers:

# Download the followers of the ECPR account. The function pages through the
# results and reports a cursor after each API call (see the log below).
followers <- getFollowers("ECPR", oauth = my_oauth)

## 12 API calls left

## 5000 followers. Next cursor: 1556878267714917122

## 11 API calls left

## 10000 followers. Next cursor: 1464105674343453827

## 10 API calls left

## 13172 followers. Next cursor: 0

## 9 API calls left

# Download the friends (accounts followed) of the ECPR account; the result
# is passed as `ids=` to getUsersBatch() below.
friends <- getFriends("ECPR", oauth = my_oauth)

## 12 API calls left

## 1072 friends. Next cursor: 0

## 11 API calls left

What are the most common words that friends of the ECPR account use to describe themselves on Twitter?

# extract profile descriptions
# Look up the profile of every friend by user ID; the log below shows the
# lookups being made in batches of 100. Overwrites the earlier `users` object.
users <- getUsersBatch(ids=friends, oauth=my_oauth)

## 1--1072 users left

## 2--972 users left

## 3--872 users left

## 4--772 users left

## 5--672 users left

## 6--572 users left

## 7--472 users left

## 8--372 users left

## 9--272 users left

## 10--172 users left

## 11--72 users left

# create table with frequency of word use
library(quanteda)

## Warning: package 'quanteda' was built under R version 3.4.4

## Package version: 1.3.0

## Parallel computing: 2 of 4 threads used.

## See https://quanteda.io for tutorials and examples.

## 
## Attaching package: 'quanteda'

## The following object is masked from 'package:utils':
## 
##     View

# Build a corpus from the non-empty profile descriptions.
tw <- corpus(users$description[users$description!=""])
# Document-feature matrix without English/Spanish stopwords, URL fragments
# and retweet markers, and without punctuation.
# NOTE(review): the result is stored under the name `dfm`, the same name as
# the function. Later `dfm(...)` calls still work because R skips
# non-function objects when resolving a call, but a distinct name would be
# clearer.
dfm <- dfm(tw, remove=c(stopwords("english"), stopwords("spanish"),
                                 "t.co", "https", "rt", "rts", "http"),
           remove_punct=TRUE)
# Show the 30 most frequent features.
topfeatures(dfm, n = 30)

##      politics    university     political      research international 
##           267           267           256           176           137 
##       science     professor        social      european        policy 
##           130           107            97            92            76 
##     relations       studies       twitter        tweets      official 
##            69            67            65            64            63 
##        public    department      sciences      lecturer          news 
##            57            55            54            51            51 
##       account        school        centre     institute           der 
##            48            46            43            41            41 
##            uk     scientist        editor            eu      director 
##            40            39            38            36            36

# create wordcloud
# Zero-width margins so the cloud can use the whole plotting device.
par(mar=c(0,0,0,0))
# Plot at most 100 words, unrotated, with sizes between min_size and max_size.
textplot_wordcloud(dfm, rotation=0, min_size=1, max_size=5, max_words=100)

Estimating ideology based on Twitter networks

The tweetscores package also includes functions to replicate the method developed in the Political Analysis paper Birds of a Feather Tweet Together. Bayesian Ideal Point Estimation Using Twitter Data. For an application of this method, see also this Monkey Cage blog post.

# download list of friends for an account
# `user` is reused below as the first argument of estimateIdeology2().
user <- "DonaldJTrumpJr"
friends <- getFriends(user, oauth=my_oauth)

## 11 API calls left

## 1260 friends. Next cursor: 0

## 10 API calls left

# estimating ideology with correspondence analysis method
# The outer parentheses make R print the value as well as assign it to `theta`.
(theta <- estimateIdeology2(user, friends, verbose=FALSE))

## DonaldJTrumpJr follows 140 elites: ABC, AEI, ajam, AlbertBrooks, AllenWest, AmbJohnBolton, andersoncooper, AndreaTantaros, AnnCoulter, AP, AriFleischer, azizansari, BBCBreaking, BBCWorld, benshapiro, BillyHallowell, BreakingNews, BreitbartNews, BretBaier, brithume, BuzzFeed, ByronYork, CharlieDaniels, chrisrock, chucktodd, chuckwoolery, CNBC, CNN, cnnbrk, DailyCaller, DanaPerino, daveweigel, davidgregory, DennisDMZ, DineshDSouza, DRUDGE, DRUDGE_REPORT, ericbolling, FareedZakaria, FinancialTimes, ForAmerica, Forbes, foxandfriends, FoxNews, foxnewspolitics, funnyordie, gatewaypundit, Gawker, ggreenwald, GOP, gopleader, GovChristie, GovernorPerry, GovMikeHuckabee, greggutfeld, GStephanopoulos, hardball_chris, Heritage, HeyTammyBruce, HouseGOP, howardfineman, HuffingtonPost, iamjohnoliver, IngrahamAngle, iowahawkblog, JamesOKeefeIII, JamesRosenFNC, jasoninthehouse, jim_jordan, JimDeMint, jimmyfallon, Judgenap, JudicialWatch, kanyewest, KatiePavlich, kilmeade, kimguilfoyle, krauthammer, KurtSchlichter, larryelder, loudobbsnews, LukeRussert, marcorubio, marklevinshow, MarkSteynOnline, marthamaccallum, michellemalkin, mitchellreports, MittRomney, MonicaCrowley, NASA, neiltyson, newsbusters, newtgingrich, NolteNC, NRA, NRO, nytimes, oreillyfactor, politico, ppppolls, RealBenCarson, realDonaldTrump, RealJamesWoods, Reince, repdianeblack, reploubarletta, replouiegohmert, reppaulryan, Reuters, SarahPalinUSA, scrowder, seanhannity, secupp, Senate_GOPs, senmikelee, sentedcruz, ShannonBream, SharylAttkisson, stevekingia, SteveMartinToGo, tamronhall, TeamCavuto, tedcruz, TedNugent, TEDTalks, tgowdysc, TheAtlantic, TheEconomist, TheOnion, ThomasSowell, TODAYshow, townhallcom, TuckerCarlson, TwitchyTeam, VanityFair, washingtonpost, wikileaks, WSJ, YoungCons

## [1] 1.437454

# download list of friends for an account
# Overwrites `user` and `friends` from the previous example.
user <- "realDonaldTrump"
friends <- getFriends(user, oauth=my_oauth)

## 10 API calls left

## 47 friends. Next cursor: 0

## 9 API calls left

# estimating ideology with correspondence analysis method
# The outer parentheses make R print the value as well as assign it to `theta`.
(theta <- estimateIdeology2(user, friends, verbose=FALSE))

## realDonaldTrump follows 10 elites: AnnCoulter, DRUDGE_REPORT, ericbolling, foxandfriends, IngrahamAngle, oreillyfactor, piersmorgan, Reince, seanhannity, TuckerCarlson

## [1] 1.833987

Other types of data

The REST API also offers a long list of other endpoints that could be of use at some point, depending on your research interests.

You can search users related to specific keywords:

# Search for accounts matching the keyword "ecpr", asking for 100 results.
users <- searchUsers(q="ecpr", count=100, oauth=my_oauth)
# Screen names of the first ten matches.
users$screen_name[1:10]

##  [1] "ECPR"           "ECPRMethods"    "ECPR_PolCom"    "ECPR_SGEU"     
##  [5] "EPSRjournal"    "jamayoralda"    "littvay"        "ECPRKnowledge" 
##  [9] "ECPR_SGOC"      "ECPR_Migration"

If you know the IDs of the tweets, you can download them directly from the API. This is useful because tweets cannot be redistributed as part of the replication materials of a published paper, but the list of tweet IDs can be shared:

# Downloading tweets when you know the ID
# IDs are passed as strings: they are larger than 2^53, so storing them as
# R numerics would lose precision.
getStatuses(
  ids = c("474134260149157888", "266038556504494082"),
  filename = "~/data/old-tweets.json",
  oauth = my_oauth
)

## 899 API calls left

## 898 API calls left

# Parse the downloaded tweets; the result is not assigned, so the data frame
# is printed directly.
parseTweets("~/data/old-tweets.json")

## 2 tweets have been parsed.

##                                                             text
## 1 Are you allowed to impeach a president for gross incompetence?
## 2           The electoral college is a disaster for a democracy.
##   retweet_count favorite_count favorited truncated             id_str
## 1            NA             NA     FALSE     FALSE 474134260149157888
## 2            NA             NA     FALSE     FALSE 266038556504494082
##   in_reply_to_screen_name
## 1                      NA
## 2                      NA
##                                                                                 source
## 1 <a href="http://twitter.com/download/android" rel="nofollow">Twitter for Android</a>
## 2                   <a href="http://twitter.com" rel="nofollow">Twitter Web Client</a>
##   retweeted                     created_at in_reply_to_status_id_str
## 1     FALSE Wed Jun 04 10:23:11 +0000 2014                        NA
## 2     FALSE Wed Nov 07 04:45:09 +0000 2012                        NA
##   in_reply_to_user_id_str lang listed_count verified       location
## 1                      NA   en        91423     TRUE Washington, DC
## 2                      NA   en        91423     TRUE Washington, DC
##   user_id_str
## 1    25073877
## 2    25073877
##                                                          description
## 1 45th President of the United States of America\U0001f1fa\U0001f1f8
## 2 45th President of the United States of America\U0001f1fa\U0001f1f8
##   geo_enabled                user_created_at statuses_count
## 1        TRUE Wed Mar 18 13:46:38 +0000 2009          38408
## 2        TRUE Wed Mar 18 13:46:38 +0000 2009          38408
##   followers_count favourites_count protected                user_url
## 1        53444082               25     FALSE https://t.co/OMxB0x7xC5
## 2        53444082               25     FALSE https://t.co/OMxB0x7xC5
##              name time_zone user_lang utc_offset friends_count
## 1 Donald J. Trump        NA        en         NA            47
## 2 Donald J. Trump        NA        en         NA            47
##       screen_name country_code country place_type full_name place_name
## 1 realDonaldTrump           NA      NA         NA        NA         NA
## 2 realDonaldTrump           NA      NA         NA        NA         NA
##   place_id place_lat place_lon lat lon expanded_url url
## 1       NA       NaN       NaN  NA  NA           NA  NA
## 2       NA       NaN       NaN  NA  NA           NA  NA

Lists of Twitter users, compiled by other users, are also accessible through the API.

# download user information from a list
# The list is identified by its name plus the screen name given in
# `screen_name`; results arrive in pages (see the cursor log below).
MCs <- getList(
  list_name = "new-members-of-congress",
  screen_name = "cspan",
  oauth = my_oauth
)

## 897 API calls left

## 20 users in list. Next cursor: 5427698142933319684

## 896 API calls left

## 40 users in list. Next cursor: 4611686021745187729

## 895 API calls left

## 60 users in list. Next cursor: 0

## 894 API calls left

# First six rows of the list members' profile data.
head(MCs)

##             id             id_str                 name     screen_name
## 1 8.272798e+17 827279765287559171    Rep. Mike Johnson  RepMikeJohnson
## 2 8.235530e+17 823552974253342721     Anthony G. Brown RepAnthonyBrown
## 3 8.171385e+17 817138492614524928             Ted Budd      RepTedBudd
## 4 8.170763e+17 817076257770835968    Adriano Espaillat    RepEspaillat
## 5 8.170502e+17 817050219007328258 Rep. Blunt Rochester          RepLBR
## 6 8.168339e+17 816833925456789505  Nanette D. Barragán     RepBarragan
##                         location
## 1                 Washington, DC
## 2                 Washington, DC
## 3   Davie County, North Carolina
## 4 https://www.facebook.com/Congr
## 5 Delaware, USA - Washington, DC
## 6                  San Pedro, CA
##                                                                                                                                          description
## 1                                                 Proudly serving Louisiana's 4th Congressional District. Member on @HouseJudiciary & @NatResources.
## 2  Congressman proudly representing Maryland's Fourth District. Member of @HASCDemocrats & @NRDems. Father, husband & retired @USArmyReserve Colonel
## 3                                                                                         Proudly serving the 13th district of North Carolina. #NC13
## 4                                U. S. Representative proudly serving New York’s 13th Congressional District. Follow my work in Washington and #NY13
## 5                           Official Twitter page for U.S. Representative Lisa Blunt Rochester (D-DE). Tweets from Rep. Blunt Rochester signed -LBR.
## 6 Official account. Honored to represent California's 44th Congressional District. #CA44 Member of the @HispanicCaucus @USProgressives @Dodgers fan.
##                       url followers_count friends_count
## 1 https://t.co/qLAyhrFbRl            3517           402
## 2 https://t.co/2u5X332ICM           10326           849
## 3 https://t.co/VTsvWe0pia            5579           195
## 4 https://t.co/lcRqmQFAbz           10924          1257
## 5 https://t.co/Fe3XCG51wO            7421           317
## 6 https://t.co/Mt3nPi7hSH            8266           618
##                       created_at time_zone lang
## 1 Thu Feb 02 22:17:20 +0000 2017        NA   en
## 2 Mon Jan 23 15:28:24 +0000 2017        NA   en
## 3 Thu Jan 05 22:39:33 +0000 2017        NA   en
## 4 Thu Jan 05 18:32:15 +0000 2017        NA   en
## 5 Thu Jan 05 16:48:47 +0000 2017        NA   en
## 6 Thu Jan 05 02:29:18 +0000 2017        NA   en

This is also useful if e.g. you’re interested in compiling lists of journalists, because media outlets offer these lists in their profiles.

You can also download the list of users who retweeted a particular tweet – unfortunately, it is limited to the 100 most recent retweets.

# Download list of users who retweeted a tweet (unfortunately, only up to 100)
# The tweet ID is given as a string; the result is passed as `ids=` to
# getUsersBatch() below.
rts <- getRetweets(id='942123433873281024', oauth=my_oauth)

## 74 API calls left

## 75 retweeters. Next cursor: 0

## 73 API calls left

# https://twitter.com/realDonaldTrump/status/942123433873281024
# (the URL above is the tweet whose retweeters were just downloaded)
# Look up the profiles of the retweeters; overwrites the earlier `users`.
users <- getUsersBatch(ids=rts, oauth=my_oauth)

## 1--75 users left

# create table with frequency of word use
library(quanteda)
# Build a corpus from the non-empty profile descriptions of the retweeters.
tw <- corpus(users$description[users$description != ""])
# Document-feature matrix without English/Spanish stopwords, URL fragments
# and retweet markers, and without punctuation.
dfm <- dfm(tw, remove = c(stopwords("english"), stopwords("spanish"),
                          "t.co", "https", "rt", "rts", "http"),
           remove_punct = TRUE)
# create wordcloud
par(mar = c(0, 0, 0, 0))
# Use the current argument names, as in the first word cloud above. The old
# wordcloud-style names (rot.per, scale, max.words) are deprecated and only
# produced a warning. scale = c(3, .50) meant c(largest, smallest), hence
# max_size = 3 and min_size = 0.5.
textplot_wordcloud(dfm, rotation = 0, min_size = 0.5, max_size = 3,
                   max_words = 100)

And one final function to convert dates in their internal Twitter format to another format we could work with in R:

# format Twitter dates to facilitate analysis
# Reload the timeline file, because `tweets` is needed again here.
tweets <- parseTweets("~/data/realDonaldTrump.json")

## 1000 tweets have been parsed.

# Convert Twitter's `created_at` strings (e.g. "Wed Mar 18 13:46:38 +0000
# 2009") into dates and store them in a new `date` column.
tweets$date <- formatTwDate(tweets$created_at, format="date")
# Histogram of tweet dates with one bin per calendar month.
hist(tweets$date, breaks="month")

Checking for bots

# adapted from the botcheck package by @marsha5814
#
# Query the Botometer API for the bot-likelihood scores of a Twitter account.
#
# Downloads the account's profile, its recent timeline and recent mentions
# from the Twitter REST API, bundles the three responses into one JSON payload
# and posts that payload to Botometer's `check_account` endpoint.
#
# Arguments:
#   user         Screen name of the account to check (single string, no "@").
#   my_oauth     List with elements consumer_key, consumer_secret,
#                access_token and access_token_secret.
#   mashape_key  API key sent in the `X-Mashape-Key` request header.
#   verbose      If TRUE, report progress with message().
#
# Returns the parsed Botometer response as a list (the whole response, not
# just the "English" score); the examples in this file read its `scores` and
# `categories` elements.
#
# Stops with an error if httr or RJSONIO is unavailable, if `user` or
# `mashape_key` is not a single non-empty string, or if any of the HTTP
# requests comes back with an error status.
botometer <- function(user, my_oauth, mashape_key, verbose = TRUE) {
  # require() merely returns FALSE when the package is missing, so check the
  # result and fail here instead of at the first httr call. require() is kept
  # (rather than requireNamespace()) because the function has always attached
  # httr for the rest of the session and later code may rely on that.
  if (!require(httr)) {
    stop("Package 'httr' is required by botometer()", call. = FALSE)
  }
  if (!requireNamespace("RJSONIO", quietly = TRUE)) {
    stop("Package 'RJSONIO' is required by botometer()", call. = FALSE)
  }
  stopifnot(
    is.character(user), length(user) == 1L, !is.na(user), nzchar(user),
    is.character(mashape_key), length(mashape_key) == 1L,
    !is.na(mashape_key), nzchar(mashape_key)
  )

  # creating OAuth token
  myapp <- httr::oauth_app("twitter", key = my_oauth$consumer_key,
                           secret = my_oauth$consumer_secret)
  sig <- httr::sign_oauth1.0(myapp, token = my_oauth$access_token,
                             token_secret = my_oauth$access_token_secret)

  users_url <- "https://api.twitter.com/1.1/users/show.json?screen_name="
  statuses_url <- "https://api.twitter.com/1.1/statuses/user_timeline.json?screen_name="
  search_url <- "https://api.twitter.com/1.1/search/tweets.json?q=%40"
  opts <- "&count=200"

  # Percent-encode the screen name so that unexpected characters (e.g. "&" or
  # spaces) cannot alter the query string. Valid screen names are unchanged.
  user_enc <- utils::URLencode(user, reserved = TRUE)

  # Signed GET that fails loudly: without this check an error response from
  # Twitter would be forwarded to Botometer as if it were account data.
  fetch <- function(url, what) {
    resp <- httr::GET(url, sig)
    httr::stop_for_status(resp, task = paste("download", what))
    resp
  }

  # API call to get user
  if (verbose) message("Downloading user profile...")
  userdata <- fetch(paste0(users_url, user_enc, opts), "the user profile")

  # API call to get tweets
  if (verbose) message("Downloading user tweets...")
  tweets <- fetch(paste0(statuses_url, user_enc, opts), "the user timeline")

  # API call to get mentions
  if (verbose) message("Downloading user mentions...")
  mentions <- fetch(paste0(search_url, user_enc, opts), "the user mentions")

  # Put everything in a list
  body <- list(
    timeline = httr::content(tweets, type = "application/json"),
    mentions = httr::content(mentions, type = "application/json"),
    user = httr::content(userdata, type = "application/json")
  )

  # Convert to JSON
  body_json <- RJSONIO::toJSON(body, auto_unbox = TRUE, pretty = TRUE)

  # Make the API request
  if (verbose) message("Checking Botometer scores...")
  result <- httr::POST("https://osome-botometer.p.mashape.com/2/check_account",
                       encode = "json",
                       httr::add_headers(`X-Mashape-Key` = mashape_key),
                       body = body_json)
  httr::stop_for_status(result, task = "retrieve Botometer scores")

  # Parse and return the full response
  httr::content(result, as = "parsed")
}


# The Botometer API key is a secret, so it is read from the environment
# instead of being hard-coded in the script. Set it once, e.g. with a line
# `MASHAPE_KEY=<your key>` in ~/.Renviron.
results <- botometer("ECPR", my_oauth,
                     mashape_key = Sys.getenv("MASHAPE_KEY"))

## Loading required package: httr

## Downloading user profile...

## Downloading user tweets...

## Downloading user mentions...

## Checking Botometer scores...

# Overall scores returned by Botometer.
results$scores

## $english
## [1] 0.1167954
## 
## $universal
## [1] 0.3742668

# Scores broken down by feature category.
results$categories

## $content
## [1] 0.431992
## 
## $friend
## [1] 0.2975815
## 
## $network
## [1] 0.09863455
## 
## $sentiment
## [1] 0.2517499
## 
## $temporal
## [1] 0.09933142
## 
## $user
## [1] 0.06746011

# The Botometer API key is a secret, so it is read from the environment
# instead of being hard-coded in the script. Set it once, e.g. with a line
# `MASHAPE_KEY=<your key>` in ~/.Renviron.
results <- botometer("realDonaldTrump", my_oauth,
                     mashape_key = Sys.getenv("MASHAPE_KEY"))

## Downloading user profile...

## Downloading user tweets...

## Downloading user mentions...

## Checking Botometer scores...

# Overall scores returned by Botometer.
results$scores

## $english
## [1] 0.03546593
## 
## $universal
## [1] 0.03798014

# Scores broken down by feature category.
results$categories

## $content
## [1] 0.0580824
## 
## $friend
## [1] 0.1782331
## 
## $network
## [1] 0.09336611
## 
## $sentiment
## [1] 0.08157836
## 
## $temporal
## [1] 0.05938086
## 
## $user
## [1] 0.02392906

# The Botometer API key is a secret, so it is read from the environment
# instead of being hard-coded in the script. Set it once, e.g. with a line
# `MASHAPE_KEY=<your key>` in ~/.Renviron.
results <- botometer("everyword", my_oauth,
                     mashape_key = Sys.getenv("MASHAPE_KEY"))

## Downloading user profile...

## Downloading user tweets...

## Downloading user mentions...

## Checking Botometer scores...

# Overall scores returned by Botometer.
results$scores

## $english
## [1] 0.2691022
## 
## $universal
## [1] 0.182079

# Scores broken down by feature category.
results$categories

## $content
## [1] 0.1584955
## 
## $friend
## [1] 0.4036753
## 
## $network
## [1] 0.5538789
## 
## $sentiment
## [1] 0.4608188
## 
## $temporal
## [1] 0.5984031
## 
## $user
## [1] 0.1447754

# The Botometer API key is a secret, so it is read from the environment
# instead of being hard-coded in the script. Set it once, e.g. with a line
# `MASHAPE_KEY=<your key>` in ~/.Renviron.
results <- botometer("Horse_3books", my_oauth,
                     mashape_key = Sys.getenv("MASHAPE_KEY"))

## Downloading user profile...

## Downloading user tweets...

## Downloading user mentions...

## Checking Botometer scores...

# Overall scores returned by Botometer.
results$scores

## $english
## [1] 0.7238002
## 
## $universal
## [1] 0.182079

# Scores broken down by feature category.
results$categories

## $content
## [1] 0.9036625
## 
## $friend
## [1] 0.8049524
## 
## $network
## [1] 0.9044903
## 
## $sentiment
## [1] 0.8849749
## 
## $temporal
## [1] 0.8287204
## 
## $user
## [1] 0.1649844

Now time for another challenge!