We’ll now turn to a different type of Twitter data – static data, either recent tweets or user-level information. This type of data can be retrieved with Twitter’s REST API. We will use the tweetscores
package here – this is a package that I created to facilitate the collection and analysis of Twitter data.
It is possible to download recent tweets, but only those less than 7 days old, and in some cases not all of them.
load("~/my_oauth")
library(tweetscores)
## Loading required package: R2WinBUGS
## Loading required package: coda
## Loading required package: boot
## ##
## ## tweetscores: tools for the analysis of Twitter data
## ## Pablo Barbera (LSE)
## ## www.tweetscores.com
## ##
library(streamR)
## Loading required package: RCurl
## Loading required package: bitops
## Loading required package: rjson
## Loading required package: ndjson
# search recent tweets matching the query terms; results are streamed to a
# JSON file on disk rather than returned in memory. until= restricts results
# to tweets posted before that date (the REST search endpoint only reaches
# back ~7 days, per the note above).
# NOTE(review): presumably multiple query terms are combined with OR/AND by
# tweetscores -- confirm in the package documentation.
searchTweets(q=c("brexit", "survey"),
filename="~/data/recent-brexit-tweets.json",
n=1000, until="2019-06-22",
oauth=my_oauth)
## 92 tweets. Max id: 1142220629019648000
## 175 hits left
## 183 tweets. Max id: 1142219622571237376
## 174 hits left
## 262 tweets. Max id: 1142218524221464576
## 173 hits left
## 352 tweets. Max id: 1142217557488865280
## 172 hits left
## 444 tweets. Max id: 1142216627129323520
## 171 hits left
## 527 tweets. Max id: 1142215712997593088
## 170 hits left
## 626 tweets. Max id: 1142214913097494528
## 169 hits left
## 716 tweets. Max id: 1142214291237605376
## 168 hits left
## 807 tweets. Max id: 1142213545079103488
## 167 hits left
## 905 tweets. Max id: 1142212726330134528
## 166 hits left
## 994 tweets. Max id: 1142211961150758912
## 165 hits left
## 1083 tweets. Max id: 1.142211197305e+18
tweets <- parseTweets("~/data/recent-brexit-tweets.json")
## 2083 tweets have been parsed.
What are the most popular hashtags?
library(stringr)
# pull every hashtag out of the tweet text and tabulate the most frequent ones
# (the regex matches '#' followed by letters, digits, or underscores)
ht <- unlist(str_extract_all(tweets$text, "#[A-Za-z0-9_]+"))
head(sort(table(ht), decreasing = TRUE))
## ht
## #Brexit #brexit #PeoplesVote #StopBrexit #BorisJohnson
## 107 30 17 16 15
## #ItsTime
## 10
You can check the stringr documentation for the full set of options for string matching and regular expressions.
This is how you would extract information from user profiles:
# fetch profile metadata for a vector of screen names in a single batched call
wh <- c("realDonaldTrump", "POTUS", "VP", "FLOTUS")
users <- getUsersBatch(screen_names=wh,
oauth=my_oauth)
## 1--4 users left
str(users)
## 'data.frame': 4 obs. of 9 variables:
## $ id_str : chr "25073877" "818876014390603776" "818910970567344128" "822215679726100480"
## $ screen_name : chr "realDonaldTrump" "FLOTUS" "VP" "POTUS"
## $ name : chr "Donald J. Trump" "Melania Trump" "Vice President Mike Pence" "President Trump"
## $ description : chr "45th President of the United States of America\U0001f1fa\U0001f1f8" "This account is run by the Office of First Lady Melania Trump. Tweets may be archived. More at https://t.co/eVVzoBb3Zr" "Vice President Mike Pence. Husband, father, & honored to serve as the 48th Vice President of the United States."| __truncated__ "45th President of the United States of America, @realDonaldTrump. Tweets archived: https://t.co/eVVzoBb3Zr"
## $ followers_count: int 61302459 12009888 7531132 26062657
## $ statuses_count : int 42506 621 6634 6265
## $ friends_count : int 47 7 13 39
## $ created_at : chr "Wed Mar 18 13:46:38 +0000 2009" "Tue Jan 10 17:43:50 +0000 2017" "Tue Jan 10 20:02:44 +0000 2017" "Thu Jan 19 22:54:28 +0000 2017"
## $ location : chr "Washington, DC" "Washington, D.C." "Washington, D.C." "Washington, D.C."
Which of these has the most followers?
users[which.max(users$followers_count),]
## id_str screen_name name
## 1 25073877 realDonaldTrump Donald J. Trump
## description
## 1 45th President of the United States of America\U0001f1fa\U0001f1f8
## followers_count statuses_count friends_count
## 1 61302459 42506 47
## created_at location
## 1 Wed Mar 18 13:46:38 +0000 2009 Washington, DC
users$screen_name[which.max(users$followers_count)]
## [1] "realDonaldTrump"
Download up to 3,200 recent tweets from a Twitter account:
getTimeline(filename="~/data/realDonaldTrump.json", screen_name="realDonaldTrump", n=1000, oauth=my_oauth)
## 200 tweets. Max id: 1138404143222206464
## 890 hits left
## 400 tweets. Max id: 1133572279324037121
## 889 hits left
## 600 tweets. Max id: 1129853974247550977
## 888 hits left
## 799 tweets. Max id: 1126662643677319168
## 887 hits left
## 997 tweets. Max id: 1123529034292506624
## 886 hits left
## 1196 tweets. Max id: 1119372874442108928
What are the most common hashtags?
tweets <- parseTweets("~/data/realDonaldTrump.json")
## 1196 tweets have been parsed.
ht <- str_extract_all(tweets$text, '#[A-Za-z0-9_]+')
ht <- unlist(ht)
head(sort(table(ht), decreasing = TRUE))
## ht
## #MAGA #USStateVisit #DDay75thAnniversary
## 30 9 7
## #Trump2020 #POTUSinJapan #DDay75
## 7 5 4
Download friends and followers:
# download the complete follower list of an account; the function paginates
# through the API cursor and reports remaining rate-limit calls as it goes
followers <- getFollowers("MethodologyLSE",
oauth=my_oauth)
## 14 API calls left
## 1902 followers. Next cursor: 0
## 13 API calls left
# same for the accounts this user follows ("friends" in Twitter's API terms)
friends <- getFriends("MethodologyLSE",
oauth=my_oauth)
## 10 API calls left
## 136 friends. Next cursor: 0
## 9 API calls left
What are the most common words that friends of the MethodologyLSE account use to describe themselves on Twitter?
# extract profile descriptions
users <- getUsersBatch(ids=friends, oauth=my_oauth)
## 1--136 users left
## 2--36 users left
# create table with frequency of word use
library(quanteda)
## Package version: 1.3.14
## Parallel computing: 2 of 4 threads used.
## See https://quanteda.io for tutorials and examples.
##
## Attaching package: 'quanteda'
## The following object is masked from 'package:utils':
##
## View
# build a corpus from the non-empty profile descriptions
tw <- corpus(users$description[users$description!=""])
# document-feature matrix, dropping English/Spanish stopwords and URL/retweet
# artifacts. Don't name the object `dfm` -- that masks quanteda::dfm() and
# makes later calls to the function error-prone.
desc_dfm <- dfm(tw, remove=c(stopwords("english"), stopwords("spanish"),
"t.co", "https", "rt", "rts", "http"),
remove_punct=TRUE)
# 30 most frequent features across all descriptions
topfeatures(desc_dfm, n = 30)
## lse research science london political economics
## 52 42 33 26 25 24
## school social university centre department events
## 23 20 15 12 12 11
## uk data public policy news professor
## 11 10 10 10 10 9
## society teaching politics lse's twitter academic
## 9 9 9 9 8 8
## health analysis programme us urban development
## 8 8 8 7 7 7
# create wordcloud
par(mar=c(0,0,0,0))
textplot_wordcloud(desc_dfm, rotation=0, min_size=1, max_size=5, max_words=100)
The tweetscores
package also includes functions to replicate the method developed in the Political Analysis paper Birds of a Feather Tweet Together. Bayesian Ideal Point Estimation Using Twitter Data. For an application of this method, see also this Monkey Cage blog post.
# download list of friends for an account
user <- "DonaldJTrumpJr"
friends <- getFriends(user, oauth=my_oauth)
## 9 API calls left
## 1329 friends. Next cursor: 0
## 8 API calls left
# estimating ideology with correspondence analysis method
(theta <- estimateIdeology2(user, friends, verbose=FALSE))
## DonaldJTrumpJr follows 142 elites: ABC, AEI, AlbertBrooks, AllenWest, AmbJohnBolton, andersoncooper, AndreaTantaros, AnnCoulter, AP, AriFleischer, azizansari, BBCBreaking, BBCWorld, benshapiro, BillHemmer, BillyHallowell, BreakingNews, BreitbartNews, BretBaier, brithume, BuzzFeed, ByronYork, CharlieDaniels, chrisrock, chucktodd, chuckwoolery, CNBC, CNN, cnnbrk, DailyCaller, daveweigel, davidgregory, DennisDMZ, DineshDSouza, DRUDGE, DRUDGE_REPORT, ericbolling, FareedZakaria, FinancialTimes, ForAmerica, Forbes, foxandfriends, FoxNews, foxnewspolitics, funnyordie, gatewaypundit, Gawker, ggreenwald, GOP, gopleader, GovChristie, GovernorPerry, GovMikeHuckabee, greggutfeld, GStephanopoulos, hardball_chris, Heritage, HeyTammyBruce, HouseGOP, HuffingtonPost, iamjohnoliver, IngrahamAngle, iowahawkblog, JamesOKeefeIII, JamesRosenFNC, jasoninthehouse, jim_jordan, JimDeMint, jimmyfallon, Judgenap, JudicialWatch, kanyewest, KatiePavlich, kilmeade, kimguilfoyle, krauthammer, KurtSchlichter, larryelder, LindaSuhler, loudobbsnews, LukeRussert, marcorubio, marklevinshow, MarkSteynOnline, marthamaccallum, michellemalkin, mitchellreports, MittRomney, MonicaCrowley, mtaibbi, NASA, neiltyson, newsbusters, newtgingrich, NolteNC, NRA, NRO, nytimes, oreillyfactor, politico, ppppolls, RealBenCarson, realDonaldTrump, RealJamesWoods, Reince, repdianeblack, reploubarletta, replouiegohmert, reppaulryan, Reuters, rushlimbaugh, SarahPalinUSA, scrowder, seanhannity, secupp, Senate_GOPs, senmikelee, sentedcruz, ShannonBream, SharylAttkisson, stevekingia, SteveMartinToGo, tamronhall, TeamCavuto, tedcruz, TedNugent, TEDTalks, tgowdysc, TheAtlantic, TheEconomist, thenation, TheOnion, ThomasSowell, TODAYshow, townhallcom, TuckerCarlson, TwitchyTeam, VanityFair, washingtonpost, wikileaks, WSJ, YoungCons
## [1] 1.412358
# download list of friends for an account
user <- "realDonaldTrump"
friends <- getFriends(user, oauth=my_oauth)
## 8 API calls left
## 47 friends. Next cursor: 0
## 7 API calls left
# estimating ideology with correspondence analysis method
(theta <- estimateIdeology2(user, friends, verbose=FALSE))
## realDonaldTrump follows 10 elites: DRUDGE_REPORT, ericbolling, foxandfriends, IngrahamAngle, jim_jordan, oreillyfactor, piersmorgan, Reince, seanhannity, TuckerCarlson
## [1] 1.668441
The REST API offers also a long list of other endpoints that could be of use at some point, depending on your research interests.
users <- searchUsers(q="MethodologyLSE", count=100, oauth=my_oauth)
users$screen_name[1:10]
## [1] "thosjleeper" "MethodologyLSE" "NoamTitelman" "p_barbera"
## [5] "milamila07" "joannaaa" "Cris_Monteneg" "ellie_knott"
## [9] "ROliveiraT" "kateesummers"
# Downloading tweets when you know the ID
getStatuses(ids=c("474134260149157888", "266038556504494082"),
filename="~/data/old-tweets.json",
oauth=my_oauth)
## 896 API calls left
## 2 tweets left.
## 0 tweets left.
## 895 API calls left
parseTweets("~/data/old-tweets.json", legacy=TRUE)
## 2 tweets have been parsed.
## text retweet_count favorite_count favorited truncated id_str
## 1 NA 214730 190843 FALSE FALSE 474134260149157888
## 2 NA 143511 108432 FALSE FALSE 266038556504494082
## in_reply_to_screen_name
## 1 NA
## 2 NA
## source
## 1 <a href="http://twitter.com/download/android" rel="nofollow">Twitter for Android</a>
## 2 <a href="http://twitter.com" rel="nofollow">Twitter Web Client</a>
## retweeted created_at in_reply_to_status_id_str
## 1 FALSE Wed Jun 04 10:23:11 +0000 2014 NA
## 2 FALSE Wed Nov 07 04:45:09 +0000 2012 NA
## in_reply_to_user_id_str lang listed_count verified location
## 1 NA en 104464 TRUE Washington, DC
## 2 NA en 104464 TRUE Washington, DC
## user_id_str description geo_enabled
## 1 25073877 45th President of the United States of America TRUE
## 2 25073877 45th President of the United States of America TRUE
## user_created_at statuses_count followers_count
## 1 Wed Mar 18 13:46:38 +0000 2009 42506 61302466
## 2 Wed Mar 18 13:46:38 +0000 2009 42506 61302466
## favourites_count protected user_url name
## 1 7 FALSE https://t.co/OMxB0x7xC5 Donald J. Trump
## 2 7 FALSE https://t.co/OMxB0x7xC5 Donald J. Trump
## time_zone user_lang utc_offset friends_count screen_name
## 1 NA NA NA 47 realDonaldTrump
## 2 NA NA NA 47 realDonaldTrump
## country_code country place_type full_name place_name place_id place_lat
## 1 NA NA NA NA NA NA NaN
## 2 NA NA NA NA NA NA NaN
## place_lon lat lon expanded_url url
## 1 NaN NA NA NA NA
## 2 NaN NA NA NA NA
# download user information from a list
MCs <- getList(list_name="new-members-of-congress",
screen_name="cspan", oauth=my_oauth)
## 894 API calls left
## 20 users in list. Next cursor: 5504177645515399168
## 893 API calls left
## 40 users in list. Next cursor: 5361732962332446720
## 892 API calls left
## 60 users in list. Next cursor: 4611686019261061818
## 891 API calls left
## 80 users in list. Next cursor: 4611686018645680191
## 890 API calls left
## 100 users in list. Next cursor: 4611686018453445113
## 889 API calls left
## 109 users in list. Next cursor: 0
## 888 API calls left
head(MCs)
## id id_str name screen_name
## 1 1.029094e+18 1029094268542099457 Lance Gooden Lancegooden
## 2 9.917210e+17 991721030631780354 Jahana Hayes JahanaHayesCT
## 3 9.864236e+17 986423555025002497 Bryan Steil BryanSteilforWI
## 4 9.772474e+17 977247448820305920 Joe Morelle votemorelle
## 5 9.693281e+17 969328139485802496 John Joyce JohnJoyceForPA
## 6 9.649970e+17 964996994623311874 Kelly Armstrong Armstrong_ND
## location
## 1 Terrell, TX
## 2 Waterbury, CT
## 3
## 4 Irondequoit, NY
## 5 Pennsylvania, USA
## 6
## description
## 1 Husband, Father, TX State Rep and U.S. Congressman for TX's 5th Congressional District.
## 2 CT 5th Congressional District Congresswoman
## 3 Problem Solver. Badger. Manufacturing. Running for Congress. #TeamSteil
## 4 #NY25 Democratic candidate. Husband, father, believer in the promise of a future that is as strong, resilient & bold as the people who call Monroe County home.
## 5 Father, husband, granddad, doctor, advocate, PSU alum. Congressman representing #PA13. Fighting everyday for central Pennsylvania. #TeamJoyce
## 6 Lifelong North Dakotan. Proud husband and dad. Republican. U.S. House of Representatives. Real Conservative. Real results.
## url followers_count friends_count
## 1 https://t.co/se9SXZNb3A 434 0
## 2 https://t.co/M3SQ8jj5UE 26724 164
## 3 https://t.co/YCyHtH7BJc 1707 67
## 4 https://t.co/rK3kkPMWMY 1136 214
## 5 https://t.co/jMnrGGJVsx 754 178
## 6 https://t.co/olKerxuUWz 1621 579
## created_at time_zone lang
## 1 Mon Aug 13 19:56:08 +0000 2018 NA NA
## 2 Wed May 02 16:48:13 +0000 2018 NA NA
## 3 Wed Apr 18 01:57:57 +0000 2018 NA NA
## 4 Fri Mar 23 18:15:22 +0000 2018 NA NA
## 5 Thu Mar 01 21:46:52 +0000 2018 NA NA
## 6 Sat Feb 17 22:56:27 +0000 2018 NA NA
This is also useful if e.g. you’re interested in compiling lists of journalists, because media outlets offer these lists in their profiles.
# Download list of users who retweeted a tweet (unfortunately, only up to 100)
rts <- getRetweets(id='942123433873281024', oauth=my_oauth)
## 74 API calls left
## 66 retweeters. Next cursor: 0
## 73 API calls left
# https://twitter.com/realDonaldTrump/status/942123433873281024
users <- getUsersBatch(ids=rts, oauth=my_oauth)
## 1--66 users left
# create table with frequency of word use in the retweeters' profile bios
library(quanteda)
tw <- corpus(users$description[users$description!=""])
# avoid naming the object `dfm`, which would mask quanteda::dfm()
rts_dfm <- dfm(tw, remove=c(stopwords("english"), stopwords("spanish"),
"t.co", "https", "rt", "rts", "http"),
remove_punct = TRUE)
# create wordcloud -- use the current quanteda argument names (rotation,
# min_size/max_size, max_words); the old rot.per/scale/max.words names are
# deprecated and trigger a warning
par(mar=c(0,0,0,0))
textplot_wordcloud(rts_dfm, rotation=0, min_size=.50, max_size=3, max_words=100)
# format Twitter dates to facilitate analysis
tweets <- parseTweets("~/data/realDonaldTrump.json")
## 1196 tweets have been parsed.
# convert Twitter's "created_at" strings into Date objects so they can be
# binned and plotted
tweets$date <- formatTwDate(tweets$created_at, format="date")
# tweet volume over time, binned by month
hist(tweets$date, breaks="month")
# adapted from the botcheck package by @marsha5814
botometer = function(user, my_oauth, mashape_key, verbose=TRUE) {
require(httr)
# creating OAuth token
myapp = oauth_app("twitter", key=my_oauth$consumer_key,
secret=my_oauth$consumer_secret)
sig = sign_oauth1.0(myapp, token=my_oauth$access_token,
token_secret=my_oauth$access_token_secret)
users_url = "https://api.twitter.com/1.1/users/show.json?screen_name="
statuses_url = "https://api.twitter.com/1.1/statuses/user_timeline.json?screen_name="
search_url = "https://api.twitter.com/1.1/search/tweets.json?q=%40"
opts = "&count=200"
# API call to get user
if (verbose) message("Downloading user profile...")
userdata = GET(paste0(users_url,user,opts), sig)
# API call to get tweets
if (verbose) message("Downloading user tweets...")
tweets = GET(paste0(statuses_url,user,opts), sig)
# API call to get mentions
if (verbose) message("Downloading user mentions...")
mentions = GET(paste0(search_url,user,opts), sig)
# Put everything in a list
body = list(
timeline = content(tweets, type="application/json"),
mentions = content(mentions, type="application/json"),
user = content(userdata, type="application/json")
)
# Convert to JSON
body_json = RJSONIO::toJSON(body, auto_unbox = T, pretty = T)
# Make the API request
if (verbose) message("Checking Botometer scores...")
result = POST("https://osome-botometer.p.mashape.com/2/check_account",
encode="json",
add_headers(`X-Mashape-Key`=mashape_key),
body=body_json)
# Parse result
result = content(result, as = "parsed")
# Return "English" score
return(result)
}
# NOTE(review): avoid hard-coding API keys in scripts/documents -- this key is
# now public and should be revoked; read it from an environment variable
# (Sys.getenv("MASHAPE_KEY")) instead.
results <- botometer("MethodologyLSE", my_oauth,
mashape_key = 'Ujq7AAd3igmshqCBvI1LWbz0J8Hlp1hvVOYjsnMOx8z6bg4U68')
## Loading required package: httr
## Downloading user profile...
## Downloading user tweets...
## Downloading user mentions...
## Checking Botometer scores...
results$scores
## $english
## [1] 0.1355826
##
## $universal
## [1] 0.2217957
results$categories
## $content
## [1] 0.2225713
##
## $friend
## [1] 0.2975815
##
## $network
## [1] 0.1041661
##
## $sentiment
## [1] 0.4940333
##
## $temporal
## [1] 0.1109869
##
## $user
## [1] 0.05836197
results <- botometer("realDonaldTrump", my_oauth,
mashape_key = 'Ujq7AAd3igmshqCBvI1LWbz0J8Hlp1hvVOYjsnMOx8z6bg4U68')
## Downloading user profile...
## Downloading user tweets...
## Downloading user mentions...
## Checking Botometer scores...
results$scores
## $english
## [1] 0.03265991
##
## $universal
## [1] 0.03239875
results$categories
## $content
## [1] 0.0662062
##
## $friend
## [1] 0.07829814
##
## $network
## [1] 0.05312993
##
## $sentiment
## [1] 0.0562195
##
## $temporal
## [1] 0.09392178
##
## $user
## [1] 0.02392906
results <- botometer("everyword", my_oauth,
mashape_key = 'Ujq7AAd3igmshqCBvI1LWbz0J8Hlp1hvVOYjsnMOx8z6bg4U68')
## Downloading user profile...
## Downloading user tweets...
## Downloading user mentions...
## Checking Botometer scores...
results$scores
## $english
## [1] 0.2368838
##
## $universal
## [1] 0.5361761
results$categories
## $content
## [1] 0.2107299
##
## $friend
## [1] 0.5857204
##
## $network
## [1] 0.6686587
##
## $sentiment
## [1] 0.3798948
##
## $temporal
## [1] 0.670126
##
## $user
## [1] 0.05042453
results <- botometer("Horse_3books", my_oauth,
mashape_key = 'Ujq7AAd3igmshqCBvI1LWbz0J8Hlp1hvVOYjsnMOx8z6bg4U68')
## Downloading user profile...
## Downloading user tweets...
## Downloading user mentions...
## Checking Botometer scores...
results$scores
## $english
## [1] 0.1937495
##
## $universal
## [1] 0.03508264
results$categories
## $content
## [1] 0.9036625
##
## $friend
## [1] 0.9136224
##
## $network
## [1] 0.9114359
##
## $sentiment
## [1] 0.8849749
##
## $temporal
## [1] 0.9105558
##
## $user
## [1] 0.04684971
Now time for another challenge!