One of the most common applications of dictionary methods is sentiment analysis: using a dictionary of positive and negative words, we compute a sentiment score for each individual document.
Let’s apply this technique to tweets by the four leading candidates in the 2016 Presidential primaries.
library(quanteda)
## Warning: package 'quanteda' was built under R version 3.4.4
## Package version: 1.3.0
## Parallel computing: 2 of 4 threads used.
## See https://quanteda.io for tutorials and examples.
##
## Attaching package: 'quanteda'
## The following object is masked from 'package:utils':
##
## View
tweets <- read.csv('../data/candidate-tweets.csv', stringsAsFactors=F)
We will use the LIWC dictionary to measure the extent to which these candidates adopted a positive or negative tone during the election campaign. (Note: LIWC is provided here for teaching purposes only and will not be distributed publicly.) LIWC has many other categories, but for now let’s just use positive
and negative
liwc <- read.csv("../data/liwc-dictionary.csv",
stringsAsFactors = FALSE)
table(liwc$class)
##
## adjective affect anger anxiety cause
## 235 445 46 92 46
## cognition compare differ discrepancy female
## 252 101 46 92 46
## future insight interrogation male negate
## 46 92 47 46 47
## negative number past positive power
## 230 36 123 211 184
## present quant reward risk social
## 138 47 46 46 230
## tentative verb
## 23 329
pos.words <- liwc$word[liwc$class=="positive"]
neg.words <- liwc$word[liwc$class=="negative"]
# a look at a random sample of positive and negative words
sample(pos.words, 10)
## [1] "kind*" "finest" "keen*" "(will) like" "rich"
## [6] "fairer" "intelligent" "benefits" "secur*" "proudest"
sample(neg.words, 10)
## [1] "anger*" "emptiest" "inferiority" "anxiously" "trickier"
## [6] "diss" "aggress" "rigidly" "angry" "inferior"
As earlier today, we will convert our text to a corpus object.
twcorpus <- corpus(tweets)
Now we’re ready to run the sentiment analysis! First we will construct a dictionary object.
mydict <- dictionary(list(positive = pos.words,
negative = neg.words))
And now we apply it to the corpus in order to count the number of words that appear in each category
sent <- dfm(twcorpus, dictionary = mydict)
We can then extract the score and add it to the data frame as a new variable
tweets$score <- as.numeric(sent[,1]) - as.numeric(sent[,2])
And now start answering some descriptive questions…
# what is the average sentiment score?
mean(tweets$score)
## [1] 0.2056106
# what is the most positive and most negative tweet?
tweets[which.max(tweets$score),]
## screen_name
## 3125 realDonaldTrump
## text
## 3125 .@robertjeffress I greatly appreciate your kind words last night on @FoxNews. Have great love for the evangelicals -- great respect for you.
## datetime
## 3125 2015-09-11 19:24:44
## source
## 3125 <a href="http://twitter.com" rel="nofollow">Twitter Web Client</a>
## lang score
## 3125 en 5
tweets[which.min(tweets$score),]
## screen_name
## 6642 realDonaldTrump
## text
## 6642 Lindsey Graham is all over T.V., much like failed 47% candidate Mitt Romney. These nasty, angry, jealous failures have ZERO credibility!
## datetime
## 6642 2016-03-07 13:03:59
## source
## 6642 <a href="http://twitter.com/download/android" rel="nofollow">Twitter for Android</a>
## lang score
## 6642 en -4
# what is the proportion of positive, neutral, and negative tweets?
tweets$sentiment <- "neutral"
tweets$sentiment[tweets$score<0] <- "negative"
tweets$sentiment[tweets$score>0] <- "positive"
table(tweets$sentiment)
##
## negative neutral positive
## 1265 19602 5868
We can also disaggregate by groups of tweets, for example according to the party they mention.
# loop over candidates
candidates <- c("realDonaldTrump", "HillaryClinton", "tedcruz", "BernieSanders")
for (cand in candidates){
message(cand, " -- average sentiment: ",
round(mean(tweets$score[tweets$screen_name==cand]), 4)
)
}
## realDonaldTrump -- average sentiment: 0.2911
## HillaryClinton -- average sentiment: 0.1736
## tedcruz -- average sentiment: 0.1853
## BernieSanders -- average sentiment: 0.1384
But what happens if we now run the analysis excluding a single word?
pos.words <- pos.words[-which(pos.words=="great")]
mydict <- dictionary(list(positive = pos.words,
negative = neg.words))
sent <- dfm(twcorpus, dictionary = mydict)
tweets$score <- as.numeric(sent[,1]) - as.numeric(sent[,2])
for (cand in candidates){
message(cand, " -- average sentiment: ",
round(mean(tweets$score[tweets$screen_name==cand]), 4)
)
}
## realDonaldTrump -- average sentiment: 0.1431
## HillaryClinton -- average sentiment: 0.1547
## tedcruz -- average sentiment: 0.1573
## BernieSanders -- average sentiment: 0.1265
How would we normalize by text length? (Maybe not necessary here given that tweets have roughly the same length.)
# collapse by account into 4 documents
twdfm <- dfm(twcorpus, groups = "screen_name")
twdfm
## Document-feature matrix of: 4 documents, 43,426 features (66.9% sparse).
# turn word counts into proportions
twdfm[1:4, 1:10]
## Document-feature matrix of: 4 documents, 10 features (30% sparse).
## 4 x 10 sparse Matrix of class "dfm"
## features
## docs rt @geraldorivera : recruit @realdonaldtrump to
## BernieSanders 1018 0 4186 0 11 2407
## HillaryClinton 1449 0 7800 0 33 3389
## realDonaldTrump 607 8 7138 2 2278 2537
## tedcruz 4464 0 18871 3 203 4045
## features
## docs finish that horrid eyesore
## BernieSanders 0 747 0 0
## HillaryClinton 5 561 0 0
## realDonaldTrump 7 714 2 1
## tedcruz 6 429 0 0
twdfm <- dfm_weight(twdfm, scheme="prop")
twdfm[1:4, 1:10]
## Document-feature matrix of: 4 documents, 10 features (30% sparse).
## 4 x 10 sparse Matrix of class "dfm"
## features
## docs rt @geraldorivera : recruit
## BernieSanders 0.010252175 0 0.04215678 0
## HillaryClinton 0.009177857 0 0.04940461 0
## realDonaldTrump 0.003413027 4.498223e-05 0.04013540 1.124556e-05
## tedcruz 0.018250652 0 0.07715234 1.226522e-05
## features
## docs @realdonaldtrump to finish that
## BernieSanders 0.0001107799 0.02424065 0 0.007522962
## HillaryClinton 0.0002090195 0.02146567 3.166962e-05 0.003553332
## realDonaldTrump 0.0128086906 0.01426499 3.935945e-05 0.004014664
## tedcruz 0.0008299468 0.01653761 2.453045e-05 0.001753927
## features
## docs horrid eyesore
## BernieSanders 0 0
## HillaryClinton 0 0
## realDonaldTrump 1.124556e-05 5.622779e-06
## tedcruz 0 0
# Apply dictionary using `dfm_lookup()` function:
sent <- dfm_lookup(twdfm, dictionary = mydict)
sent
## Document-feature matrix of: 4 documents, 2 features (0% sparse).
## 4 x 2 sparse Matrix of class "dfm"
## features
## docs positive negative
## BernieSanders 0.008237995 0.003111908
## HillaryClinton 0.007467697 0.001868508
## realDonaldTrump 0.010553956 0.004486978
## tedcruz 0.007494051 0.001418677
(sent[,1]-sent[,2])*100
## 4 x 1 sparse Matrix of class "dgCMatrix"
## features
## docs positive
## BernieSanders 0.5126088
## HillaryClinton 0.5599189
## realDonaldTrump 0.6066979
## tedcruz 0.6075374