Supervised machine learning

We’ll be working again with tweets about the 2014 EP elections in the UK. Now we will be using the polite variable, which indicates whether each tweet was hand-coded as being polite (a tweet that adheres to politeness standards, i.e. it is written in a well-mannered and non-offensive way) or impolite (an ill-mannered, disrespectful tweet that may contain offensive language).

The source of the dataset is an article co-authored with Yannis Theocharis, Zoltan Fazekas, and Sebastian Popa, published in the Journal of Communication. The link is here. Our goal was to understand to what extent candidates avoid engaging with voters on Twitter because the messages they receive are mostly impolite.

Let’s start by reading the dataset and creating a dummy variable indicating whether each tweet is impolite.

library(quanteda)
## quanteda version 0.9.9.65
## Using 3 of 4 cores for parallel computing
## 
## Attaching package: 'quanteda'
## The following object is masked from 'package:utils':
## 
##     View
tweets <- read.csv("data/EP-elections-tweets.csv", stringsAsFactors=F)
tweets$impolite <- ifelse(tweets$polite=="polite", 0, 1)
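Before going further, it is worth checking how balanced the classes are; impolite tweets are a small minority, which will matter when we pick a probability cutoff later. A quick check:

# distribution of the hand-coded labels and the dummy
table(tweets$polite)
prop.table(table(tweets$impolite))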

We’ll do some cleaning as well, replacing all Twitter handles with a generic @. Why? Specific usernames may be strongly predictive in the training set without generalizing to new tweets, so collapsing them helps prevent overfitting. For example, a tweet like “@some_user thanks for the reply” becomes “@ thanks for the reply”.

tweets$text <- gsub('@[0-9_A-Za-z]+', '@', tweets$text)

Create the dfm and trim it so that only tokens that appear in 2 or more tweets are included.

twcorpus <- corpus(tweets$text)
twdfm <- dfm(twcorpus, remove_punct=TRUE, remove=c(
  stopwords("english"), "t.co", "https", "rt", "amp", "http", "t.c", "can"))
twdfm <- dfm_trim(twdfm, min_docfreq = 2)
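It can be useful to check the size of the resulting matrix at this point, since the number of features determines how long the models below take to run:

# number of documents and features after trimming
dim(twdfm)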

And split the dataset into training and test sets. We’ll go with 80% for training and 20% for testing. Note the use of a random seed to make sure our results are replicable.

set.seed(123)
training <- sample(1:nrow(tweets), floor(.80 * nrow(tweets)))
test <- setdiff(1:nrow(tweets), training)

Our first step is to train the classifier using cross-validation. There are many packages in R to run machine learning models. For regularized regression, glmnet is in my opinion the best. It’s much faster than caret or mlr (in my experience at least), and it has cross-validation already built-in, so we don’t need to code it from scratch.

library(glmnet)
## Loading required package: Matrix
## Loading required package: foreach
## Loaded glmnet 2.0-10
require(doMC)
## Loading required package: doMC
## Loading required package: iterators
## Loading required package: parallel
registerDoMC(cores=3)
ridge <- cv.glmnet(twdfm[training,], tweets$impolite[training], 
    family="binomial", alpha=0, nfolds=5, parallel=TRUE, intercept=TRUE,
    type.measure="deviance")
plot(ridge)

We can now compute the performance metrics on the test set.

# function to compute accuracy
accuracy <- function(ypred, y){
    tab <- table(ypred, y)
    return(sum(diag(tab))/sum(tab))
}
# function to compute precision
precision <- function(ypred, y){
    tab <- table(ypred, y)
    return((tab[2,2])/(tab[2,1]+tab[2,2]))
}
# function to compute recall
recall <- function(ypred, y){
    tab <- table(ypred, y)
    return(tab[2,2]/(tab[1,2]+tab[2,2]))
}
# computing predicted values
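# note: because impolite tweets are rare, we use their observed rate in the
# test set as the probability cutoff rather than the default 0.5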
preds <- predict(ridge, twdfm[test,], type="response") > mean(tweets$impolite[test])
# confusion matrix
table(preds, tweets$impolite[test])
##        
## preds      0    1
##   FALSE 1598   24
##   TRUE   304   74
# performance metrics
accuracy(preds, tweets$impolite[test])
## [1] 0.836
precision(preds, tweets$impolite[test])
## [1] 0.1957672
recall(preds, tweets$impolite[test])
## [1] 0.755102
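Note that accuracy looks high largely because polite tweets dominate the test set; precision for the impolite class is much lower. With precision and recall in hand, we can also summarize performance with the F1 score, their harmonic mean. A small helper building on the functions defined above:

# function to compute F1 score (harmonic mean of precision and recall)
f1 <- function(ypred, y){
    p <- precision(ypred, y)
    r <- recall(ypred, y)
    return(2 * p * r / (p + r))
}
f1(preds, tweets$impolite[test])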

It is often very useful to look at the actual estimated coefficients and see which features have the highest or lowest values:

# from the different values of lambda, let's pick the best one
best.lambda <- which(ridge$lambda==ridge$lambda.min)
beta <- ridge$glmnet.fit$beta[,best.lambda]
head(beta)
##         ukip       thanks         will           eu         vote 
##  0.001933743 -0.058822024 -0.040979641 -0.042785624 -0.005402521 
##         good 
## -0.035085212
# identifying predictive features
df <- data.frame(coef = as.numeric(beta),
                word = names(beta), stringsAsFactors=F)

df <- df[order(df$coef),]
head(df[,c("coef", "word")], n=30)
##            coef           word
## 5510 -0.2335195         haaaaa
## 5091 -0.1857681     agg0ib2pgc
## 5092 -0.1857643          bantz
## 5325 -0.1671897     yugjvg8lvm
## 5833 -0.1534823        villain
## 5831 -0.1534818           bald
## 5832 -0.1534803        cockney
## 2150 -0.1527478       somebody
## 7196 -0.1517809      responses
## 5088 -0.1496727       sherlock
## 7930 -0.1479727      corollary
## 7931 -0.1479704     cl9my4ob75
## 4483 -0.1464797            giv
## 6815 -0.1442375     t4qlnqsllv
## 6377 -0.1435600 #leftiesinfear
## 6376 -0.1435585      unnoticed
## 1373 -0.1408639        kippers
## 6478 -0.1393758         makers
## 4005 -0.1387464       password
## 3271 -0.1383189            cry
## 5508 -0.1382327          sheep
## 3441 -0.1380439          whose
## 8189 -0.1342053        billy's
## 5153 -0.1333133          apace
## 3427 -0.1324824        tweeted
## 3959 -0.1308682          jokes
## 5591 -0.1291067        #ukitty
## 2953 -0.1291052        hacking
## 2821 -0.1281738           weed
## 5137 -0.1277565            yea
paste(df$word[1:30], collapse=", ")
## [1] "haaaaa, agg0ib2pgc, bantz, yugjvg8lvm, villain, bald, cockney, somebody, responses, sherlock, corollary, cl9my4ob75, giv, t4qlnqsllv, #leftiesinfear, unnoticed, kippers, makers, password, cry, sheep, whose, billy's, apace, tweeted, jokes, #ukitty, hacking, weed, yea"
df <- df[order(df$coef, decreasing=TRUE),]
head(df[,c("coef", "word")], n=30)
##          coef          word
## 3511 1.567545         nigga
## 4627 1.558831        warner
## 6349 1.546070         shush
## 6834 1.544518          sham
## 6912 1.525034        rumour
## 3611 1.523492          scum
## 4393 1.493816         steps
## 4358 1.489541        ripped
## 7010 1.433516          fits
## 7787 1.416251      #fuckoff
## 5379 1.410230     #nigelpls
## 6419 1.396834 maryhoneyball
## 1481 1.385162        wanker
## 7913 1.369533          sieg
## 606  1.368929          cunt
## 5575 1.366665       fuckers
## 5263 1.362895          mein
## 1792 1.358397         cunts
## 8109 1.354591         messy
## 3342 1.347614         man's
## 4352 1.344486  carperbagger
## 3619 1.325238        hitler
## 5459 1.262744          1814
## 4510 1.257586       mothers
## 3733 1.247559         dicks
## 7964 1.240060      slipping
## 2170 1.234104          cock
## 7116 1.227212          #mug
## 3240 1.227165       unlucky
## 5808 1.222314      mindless
paste(df$word[1:30], collapse=", ")
## [1] "nigga, warner, shush, sham, rumour, scum, steps, ripped, fits, #fuckoff, #nigelpls, maryhoneyball, wanker, sieg, cunt, fuckers, mein, cunts, messy, man's, carperbagger, hitler, 1814, mothers, dicks, slipping, cock, #mug, unlucky, mindless"

A special case: wordscores

Let’s see an example of wordscores. Here we have tweets from a random sample of 100 Members of the U.S. Congress, as well as their ideal points based on roll-call votes. Can we replicate the ideal points using only the text of their tweets?

cong <- read.csv("data/congress-tweets.csv", stringsAsFactors=F)
# creating the corpus and dfm objects
ccorpus <- corpus(cong$text)
docnames(ccorpus) <- cong$screen_name
cdfm <- dfm(ccorpus, remove_punct=TRUE, remove=c(stopwords("english"), "t.co", "https", "rt", "amp", "http", "t.c", "can"))
cdfm <- dfm_trim(cdfm, min_docfreq = 2)
# running wordscores
ws <- textmodel(cdfm, cong$idealPoint, model="wordscores", smooth=.5)
ws
## Fitted wordscores model:
## Call:
##  textmodel_wordscores(x = x, y = y, smooth = 0.5)
## 
## Reference documents and reference scores:
## 
##        Documents Ref scores
##      usreprodney       0.52
##      reprichmond      -0.78
##      rephanabusa      -0.98
##  repstevestivers       0.64
##   chrisvanhollen      -1.15
##  marshablackburn       1.21
##     repjohnlewis      -1.54
##      repstutzman       1.14
##      replipinski      -0.49
##   jacksonleetx18      -1.01
##        repveasey      -0.92
##   senatorisakson       0.90
##      repmurphyfl      -0.41
##      replankford       1.01
##  juliabrownley26      -0.77
##   repterrisewell      -0.74
##    repsusandavis      -1.14
##   senatorbaldwin      -1.29
##   reploisfrankel      -1.16
##    repmarktakano      -1.24
##  repjohnsarbanes      -1.49
##  vancemcallister       1.03
##     reptimmurphy       0.84
##    senatorwicker       0.89
##     repgwenmoore      -1.32
##  gracenapolitano      -1.35
##     senatorleahy      -1.01
##       senmikelee       1.48
##  repdonnaedwards      -1.70
##      repcummings      -1.40
##        sentoomey       1.07
##  replindasanchez      -1.63
##  repmickmulvaney       1.00
##      senatorreid      -0.35
##  blumenauermedia      -1.14
##      darrellissa       0.75
##      reptomprice       1.18
##    repcartwright      -1.27
##  jefffortenberry       0.58
##      repcardenas      -0.87
##   repjohnconyers      -1.46
##    senorrinhatch       0.78
##       repwalberg       0.99
##  repraulgrijalva      -1.29
##   senatorcollins       0.31
##      howardcoble       0.82
##   repkevincramer       0.64
##      rephultgren       1.07
##      sendonnelly      -0.24
##        drphilroe       0.94
##      johnboozman       1.09
##      greggharper       0.64
##    repjaredpolis      -0.68
##       kencalvert       0.68
##   repderekkilmer      -0.76
##  jasoninthehouse       0.91
##    senatorcardin      -1.26
##    senbillnelson      -0.76
##       toddrokita       1.14
##       repgarrett       1.03
##       buckmckeon       0.68
##  replarrybucshon       1.04
##     repdinatitus      -0.93
##          mariodb       0.57
##    repgarypeters      -0.61
##  repbrianhiggins      -1.24
##     repvisclosky      -0.88
##         repdavid       1.18
##      billowensny      -0.26
##        reptipton       0.69
##  repthomasmassie       0.64
##  repstephenlynch      -0.80
##   repmikemichaud      -0.81
##  robert_aderholt       0.85
##     senatorhagan      -0.22
##   repmikecoffman       0.70
##    repkaygranger       0.93
##    repsandylevin      -1.18
##   repfitzpatrick       0.34
##      repandybarr       1.01
##  repchriscollins       0.73
##    waxmanclimate      -1.24
##     reptomgraves       1.23
##        repmullin       1.02
##    senblumenthal      -1.28
##    reptimgriffin       0.81
##    repbobbyscott      -1.12
##         cbrangel      -1.08
##     repmarkpocan      -2.12
##    repjoecrowley      -1.32
##  benniegthompson      -0.91
##      jimlangevin      -0.99
##       repsamfarr      -1.46
##   jimpressoffice       0.86
##     repduckworth      -0.67
##      judgecarter       0.83
##     repdanmaffei      -0.40
##     repkarenbass      -1.65
##    senatorharkin      -0.80
##  repandyharrismd       1.10
## 
## Word scores: showing first 30 scored features
## 
##     today      will         w     house     great      bill    thanks 
##    -0.105    -0.037    -0.308     0.063    -0.112    -0.074    -0.052 
##       act       new       now    bit.ly       day        us      jobs 
##    -0.219    -0.159    -0.181     0.188    -0.162    -0.265    -0.098 
##      work    health      time         2   support     watch      live 
##    -0.368    -0.195    -0.227    -0.341    -0.258     0.052     0.136 
##    senate      just      need     thank  congress      help    budget 
##     0.023     0.035    -0.319    -0.196    -0.209    -0.286     0.289 
## president      vote 
##     0.148    -0.238
# let's look at the most discriminant words
sw <- sort(ws@Sw)
head(sw, n=20)
##              #wi    @repmarkpocan @replindasanchez              #p2 
##       -1.3352153       -1.2675807       -1.2542263       -1.2069635 
##  @usprogressives    @repgwenmoore     @repcummings  @repjohnconyers 
##       -1.1726553       -1.1138345       -1.1095812       -1.1079106 
##   @oversightdems    #raisethewage         #renewui               tb 
##       -1.0948210       -1.0943028       -1.0780798       -1.0773701 
##   #climatechange     #gopshutdown @repdonnaedwards   @repcartwright 
##       -1.0456582       -1.0422948       -1.0421310       -1.0267977 
##    @repjohnlewis   @senatorcardin   @waysmeanscmte    #actonclimate 
##       -1.0255346       -1.0089428       -0.9947460       -0.9942281
tail(sw, n=20)
##         #pjnet     #obamacare @gopconference          is.gd          ow.ly 
##      0.6778709      0.6834714      0.6857207      0.6878029      0.6943258 
##          #az05   @edworkforce  @gopoversight      obamacare          #utah 
##      0.7026601      0.7044792      0.7068355      0.7129993      0.7423978 
##           #ar2    @toddrokita #cutcapbalance      #sctweets      @repdavid 
##      0.7446651      0.7696289      0.7708878      0.7791884      0.8269042 
##          #tcot    @senmikelee         #4jobs         #utsen         #utpol 
##      0.8358937      0.8610052      0.8776630      0.9463618      1.0911883

Now let’s split the data into training and test sets and see what we can learn…

set.seed(123)
test <- sample(1:nrow(cong), floor(.20 * nrow(cong)))
# extracting ideal points and replacing them with missing values
refpoints <- cong$idealPoint
refpoints[test] <- NA
# running wordscores
ws <- textmodel(cdfm, refpoints, model="wordscores", smooth=.5)
# predicted values (this will take a while...)
preds <- predict(ws, rescaling="lbg")
scores <- preds@textscores
# and let's compare
plot(scores$textscore_lbg[test], cong$idealPoint[test])

cor(scores$textscore_lbg[test], cong$idealPoint[test])
## [1] 0.8435688
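The correlation suggests the ranking of the held-out legislators is recovered quite well. To gauge how close the rescaled scores come to the ideal points on their original scale, we can also compute the root mean squared error, a quick check using the objects above:

# RMSE between rescaled wordscores and roll-call ideal points
sqrt(mean((scores$textscore_lbg[test] - cong$idealPoint[test])^2))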