Wordscores

Let’s look at an example of wordscores. Here we have tweets from a random sample of 100 Members of the U.S. Congress, as well as their ideal points based on roll-call votes. Can we replicate the ideal points using only the text of their tweets?

# Read the Congress tweets data set. FALSE is spelled out because `F` is an
# ordinary variable that can be reassigned, whereas FALSE is a reserved word.
cong <- read.csv("../data/congress-tweets.csv", stringsAsFactors = FALSE)
# creating the corpus and dfm objects
library(quanteda)
## Warning: package 'quanteda' was built under R version 3.4.2
## quanteda version 0.99.9
## Using 3 of 4 threads for parallel computing
## 
## Attaching package: 'quanteda'
## The following object is masked from 'package:utils':
## 
##     View
# Build a corpus from the tweet text and name each document after the
# corresponding screen name.
ccorpus <- corpus(cong$text)
docnames(ccorpus) <- cong$screen_name
# Convert to a document-feature matrix, removing punctuation, English
# stopwords, and a handful of extra tokens ("t.co", "https", "rt", ...).
cdfm <- dfm(
  ccorpus,
  remove_punct = TRUE,
  remove = c(
    stopwords("english"),
    "t.co", "https", "rt", "amp", "http", "t.c", "can"
  )
)
# Trim the matrix with a minimum document frequency of 2.
cdfm <- dfm_trim(cdfm, min_docfreq = 2)
# running wordscores, with the roll-call ideal points as reference scores
ws <- textmodel(cdfm, cong$idealPoint, model = "wordscores", smooth = 0.5)
ws
## Fitted wordscores model:
## Call:
##  textmodel_wordscores.dfm(x = x, y = y, smooth = 0.5)
## 
## Reference documents and reference scores:
## 
##        Documents Ref scores
##      usreprodney       0.52
##      reprichmond      -0.78
##      rephanabusa      -0.98
##  repstevestivers       0.64
##   chrisvanhollen      -1.15
##  marshablackburn       1.21
##     repjohnlewis      -1.54
##      repstutzman       1.14
##      replipinski      -0.49
##   jacksonleetx18      -1.01
##        repveasey      -0.92
##   senatorisakson       0.90
##      repmurphyfl      -0.41
##      replankford       1.01
##  juliabrownley26      -0.77
##   repterrisewell      -0.74
##    repsusandavis      -1.14
##   senatorbaldwin      -1.29
##   reploisfrankel      -1.16
##    repmarktakano      -1.24
##  repjohnsarbanes      -1.49
##  vancemcallister       1.03
##     reptimmurphy       0.84
##    senatorwicker       0.89
##     repgwenmoore      -1.32
##  gracenapolitano      -1.35
##     senatorleahy      -1.01
##       senmikelee       1.48
##  repdonnaedwards      -1.70
##      repcummings      -1.40
##        sentoomey       1.07
##  replindasanchez      -1.63
##  repmickmulvaney       1.00
##      senatorreid      -0.35
##  blumenauermedia      -1.14
##      darrellissa       0.75
##      reptomprice       1.18
##    repcartwright      -1.27
##  jefffortenberry       0.58
##      repcardenas      -0.87
##   repjohnconyers      -1.46
##    senorrinhatch       0.78
##       repwalberg       0.99
##  repraulgrijalva      -1.29
##   senatorcollins       0.31
##      howardcoble       0.82
##   repkevincramer       0.64
##      rephultgren       1.07
##      sendonnelly      -0.24
##        drphilroe       0.94
##      johnboozman       1.09
##      greggharper       0.64
##    repjaredpolis      -0.68
##       kencalvert       0.68
##   repderekkilmer      -0.76
##  jasoninthehouse       0.91
##    senatorcardin      -1.26
##    senbillnelson      -0.76
##       toddrokita       1.14
##       repgarrett       1.03
##       buckmckeon       0.68
##  replarrybucshon       1.04
##     repdinatitus      -0.93
##          mariodb       0.57
##    repgarypeters      -0.61
##  repbrianhiggins      -1.24
##     repvisclosky      -0.88
##         repdavid       1.18
##      billowensny      -0.26
##        reptipton       0.69
##  repthomasmassie       0.64
##  repstephenlynch      -0.80
##   repmikemichaud      -0.81
##  robert_aderholt       0.85
##     senatorhagan      -0.22
##   repmikecoffman       0.70
##    repkaygranger       0.93
##    repsandylevin      -1.18
##   repfitzpatrick       0.34
##      repandybarr       1.01
##  repchriscollins       0.73
##    waxmanclimate      -1.24
##     reptomgraves       1.23
##        repmullin       1.02
##    senblumenthal      -1.28
##    reptimgriffin       0.81
##    repbobbyscott      -1.12
##         cbrangel      -1.08
##     repmarkpocan      -2.12
##    repjoecrowley      -1.32
##  benniegthompson      -0.91
##      jimlangevin      -0.99
##       repsamfarr      -1.46
##   jimpressoffice       0.86
##     repduckworth      -0.67
##      judgecarter       0.83
##     repdanmaffei      -0.40
##     repkarenbass      -1.65
##    senatorharkin      -0.80
##  repandyharrismd       1.10
## 
## Word scores: showing first 30 scored features
## 
##         just        voted  legislation          end          air 
##        0.035        0.145        0.022       -0.383       -0.051 
##      traffic      control       delays   nationwide         good 
##       -0.062        0.060        0.193       -0.269       -0.109 
##         news    traveling       public        great         time 
##       -0.018        0.223       -0.290       -0.112       -0.227 
##        taste   bloomfield     cultural       center       sunday 
##       -0.225       -0.177       -0.279       -0.273       -0.172 
##    afternoon         food       people      visited      morning 
##        0.207       -0.398       -0.219       -0.051        0.195 
## conversation      getting      economy       moving      service 
##       -0.048       -0.115       -0.230       -0.329       -0.156
# let's look at the most discriminating words: sort the fitted word scores
# in ascending order, so the most negative scores are listed first
sw <- sort(ws@Sw, decreasing = FALSE)
head(sw, n = 20)
##              #wi    @repmarkpocan @replindasanchez              #p2 
##       -1.3351011       -1.2673891       -1.2540401       -1.2068895 
##  @usprogressives    @repgwenmoore     @repcummings  @repjohnconyers 
##       -1.1725307       -1.1138718       -1.1096208       -1.1078346 
##   @oversightdems    #raisethewage         #renewui               tb 
##       -1.0948422       -1.0941956       -1.0778527       -1.0772057 
##   #climatechange     #gopshutdown @repdonnaedwards   @repcartwright 
##       -1.0455227       -1.0422560       -1.0420281       -1.0268366 
##    @repjohnlewis   @senatorcardin   @waysmeanscmte    #actonclimate 
##       -1.0255089       -1.0091856       -0.9947236       -0.9940664
# the 20 entries at the upper end of the sorted word scores
tail(sw, n = 20)
##         #pjnet     #obamacare @gopconference          is.gd          ow.ly 
##      0.6779170      0.6836844      0.6859472      0.6889041      0.6950465 
##          #az05   @edworkforce  @gopoversight      obamacare          #utah 
##      0.7030204      0.7046986      0.7069328      0.7131235      0.7436576 
##           #ar2    @toddrokita #cutcapbalance      #sctweets      @repdavid 
##      0.7447113      0.7698070      0.7715263      0.7791193      0.8271905 
##          #tcot    @senmikelee         #4jobs         #utsen         #utpol 
##      0.8360768      0.8621148      0.8777774      0.9475660      1.0917893

Now let’s split the data into training and test sets and see what we can learn…

# Fix the RNG seed so the train/test split is reproducible.
set.seed(123)
# Draw 20% of the row indices as the test set. seq_len() is used instead of
# 1:nrow(cong) because 1:n counts down to c(1, 0) when n is 0.
test <- sample(seq_len(nrow(cong)), floor(0.20 * nrow(cong)))
# extracting ideal points and replacing them with missing values
refpoints <- cong$idealPoint
refpoints[test] <- NA
# running wordscores
ws <- textmodel(cdfm, refpoints, model = "wordscores", smooth = 0.5)
# predicted values (this will take a while...)
preds <- predict(ws, rescaling = "lbg")
scores <- preds@textscores
# and let's compare the predicted scores with the held-out ideal points
plot(scores$textscore_lbg[test], cong$idealPoint[test])

cor(scores$textscore_lbg[test], cong$idealPoint[test])
## [1] 0.8446185

Wordfish

To explore an unsupervised approach to ideological scaling, let’s continue with our previous example: tweets by Members of Congress. Can we recover a latent ideological dimension based on the text of their tweets?

# note heavy feature selection! The matrix is trimmed again, now with a
# minimum document frequency of 25.
cdfm <- dfm_trim(cdfm, min_docfreq = 25)
# running wordfish; `dir` is given two document indices (10 and 8)
# NOTE(review): presumably these anchor the direction of the scale -- confirm
wf <- textmodel(cdfm, model = "wordfish", dir = c(10, 8))
wf
## Fitted wordfish model:
## Call:
##  textmodel_wordfish.dfm(x = x, dir = ..1)
## 
## Estimated document positions:
## 
##           Documents       theta          SE       lower        upper
## 1       usreprodney -0.34654711 0.066273857 -0.47644387 -0.216650348
## 2       reprichmond -0.72432926 0.025602151 -0.77450947 -0.674149042
## 3       rephanabusa -0.30766156 0.036754170 -0.37969974 -0.235623388
## 4   repstevestivers  0.98444120 0.025086341  0.93527197  1.033610427
## 5    chrisvanhollen -0.89944961 0.022193493 -0.94294885 -0.855950361
## 6   marshablackburn  1.22568161 0.017790847  1.19081155  1.260551670
## 7      repjohnlewis -1.00507822 0.020014654 -1.04430694 -0.965849496
## 8       repstutzman  1.29810297 0.026802135  1.24557078  1.350635151
## 9       replipinski -0.26182664 0.052416972 -0.36456391 -0.159089377
## 10   jacksonleetx18 -1.03401450 0.014446623 -1.06232988 -1.005699120
## 11        repveasey -1.13959495 0.018219471 -1.17530511 -1.103884784
## 12   senatorisakson  0.28151171 0.046035626  0.19128189  0.371741541
## 13      repmurphyfl -0.84787586 0.027220426 -0.90122790 -0.794523830
## 14      replankford  1.59320490 0.011154795  1.57134150  1.615068294
## 15  juliabrownley26 -1.15087233 0.021920472 -1.19383646 -1.107908204
## 16   repterrisewell -0.97753742 0.019160577 -1.01509215 -0.939982693
## 17    repsusandavis -1.11875480 0.020803835 -1.15953031 -1.077979279
## 18   senatorbaldwin -0.99652727 0.014701204 -1.02534163 -0.967712915
## 19   reploisfrankel -1.23053378 0.013956845 -1.25788919 -1.203178362
## 20    repmarktakano -0.92185557 0.026373487 -0.97354760 -0.870163533
## 21  repjohnsarbanes -1.04229087 0.029854862 -1.10080640 -0.983775344
## 22  vancemcallister  0.35800148 0.123996731  0.11496788  0.601035069
## 23     reptimmurphy  0.86695056 0.026515327  0.81498052  0.918920598
## 24    senatorwicker  0.59910852 0.027913240  0.54439857  0.653818468
## 25     repgwenmoore -1.18940358 0.008315103 -1.20570118 -1.173105976
## 26  gracenapolitano -0.84620389 0.026941754 -0.89900973 -0.793398053
## 27     senatorleahy -0.98590997 0.012119844 -1.00966486 -0.962155075
## 28       senmikelee  1.17538880 0.017102927  1.14186706  1.208910533
## 29  repdonnaedwards -1.02544716 0.012807266 -1.05054940 -1.000344917
## 30      repcummings -0.69484257 0.026706552 -0.74718741 -0.642497728
## 31        sentoomey  0.60527623 0.021957724  0.56223909  0.648313368
## 32  replindasanchez -1.18441885 0.012918929 -1.20973995 -1.159097748
## 33  repmickmulvaney  1.28189071 0.025965570  1.23099819  1.332783223
## 34      senatorreid -0.75013297 0.015106225 -0.77974117 -0.720524765
## 35  blumenauermedia  0.06936877 0.065169668 -0.05836378  0.197101315
## 36      darrellissa  1.75632281 0.007041777  1.74252092  1.770124690
## 37      reptomprice  1.50291937 0.010370060  1.48259405  1.523244685
## 38    repcartwright -0.98474962 0.017590992 -1.01922796 -0.950271274
## 39  jefffortenberry  0.90425235 0.027941293  0.84948741  0.959017280
## 40      repcardenas -1.01062314 0.013698222 -1.03747166 -0.983774625
## 41   repjohnconyers -1.29340107 0.009927528 -1.31285903 -1.273943117
## 42    senorrinhatch  1.10791125 0.022239133  1.06432255  1.151499955
## 43       repwalberg  1.51722422 0.016219209  1.48543457  1.549013874
## 44  repraulgrijalva -0.92172561 0.016705612 -0.95446861 -0.888982609
## 45   senatorcollins -0.38473872 0.031901043 -0.44726476 -0.322212673
## 46      howardcoble  0.57609254 0.047126641  0.48372432  0.668460755
## 47   repkevincramer  0.87382874 0.040668654  0.79411818  0.953539305
## 48      rephultgren  0.81180144 0.022607189  0.76749135  0.856111533
## 49      sendonnelly -0.40656198 0.026714152 -0.45892171 -0.354202237
## 50        drphilroe  1.21638039 0.023997390  1.16934551  1.263415277
## 51      johnboozman  0.78489712 0.028136237  0.72975009  0.840044141
## 52      greggharper  1.18435054 0.019120048  1.14687524  1.221825830
## 53    repjaredpolis -1.14443070 0.016533330 -1.17683603 -1.112025376
## 54       kencalvert  1.26061396 0.016478406  1.22831628  1.292911636
## 55   repderekkilmer -0.84546844 0.035822740 -0.91568101 -0.775255874
## 56  jasoninthehouse  1.44465546 0.013973157  1.41726807  1.472042850
## 57    senatorcardin -1.27968469 0.006168287 -1.29177453 -1.267594844
## 58    senbillnelson  0.17371651 0.049326283  0.07703699  0.270396021
## 59       toddrokita  0.97901972 0.018383911  0.94298726  1.015052186
## 60       repgarrett  1.26365258 0.036970448  1.19119050  1.336114661
## 61       buckmckeon  1.26653064 0.021412158  1.22456281  1.308498473
## 62  replarrybucshon  1.32208714 0.014911655  1.29286029  1.351313980
## 63     repdinatitus -1.19308757 0.010996056 -1.21463984 -1.171535301
## 64          mariodb -0.03453142 0.026905696 -0.08726659  0.018203739
## 65    repgarypeters -0.94325151 0.018598970 -0.97970549 -0.906797526
## 66  repbrianhiggins -0.73073403 0.016648164 -0.76336443 -0.698103628
## 67     repvisclosky -0.30909502 0.030965725 -0.36978784 -0.248402199
## 68         repdavid  0.95262556 0.018893108  0.91559507  0.989656052
## 69      billowensny  0.15292718 0.026970313  0.10006537  0.205788997
## 70        reptipton  0.99501947 0.029227040  0.93773447  1.052304468
## 71  repthomasmassie  1.08584526 0.028732741  1.02952909  1.142161431
## 72  repstephenlynch -0.39488319 0.040142589 -0.47356267 -0.316203718
## 73   repmikemichaud -0.22561108 0.044107581 -0.31206194 -0.139160225
## 74  robert_aderholt  1.13310557 0.027482621  1.07923963  1.186971506
## 75     senatorhagan -0.70057420 0.016285957 -0.73249468 -0.668653727
## 76   repmikecoffman  1.30233208 0.019454339  1.26420158  1.340462584
## 77    repkaygranger  0.94161035 0.028952577  0.88486330  0.998357397
## 78    repsandylevin -1.27682647 0.009249804 -1.29495609 -1.258696857
## 79   repfitzpatrick  0.26292355 0.021007397  0.22174905  0.304098044
## 80      repandybarr  0.69609199 0.027114331  0.64294790  0.749236076
## 81  repchriscollins  1.08848542 0.020757678  1.04780037  1.129170466
## 82    waxmanclimate -1.53671530 0.004293674 -1.54513090 -1.528299697
## 83     reptomgraves  1.32378324 0.016484310  1.29147399  1.356092486
## 84        repmullin  1.01500480 0.030269603  0.95567638  1.074333220
## 85    senblumenthal -0.94253947 0.013414743 -0.96883237 -0.916246574
## 86    reptimgriffin  1.19009896 0.016537659  1.15768515  1.222512770
## 87    repbobbyscott -0.80465890 0.026591994 -0.85677921 -0.752538595
## 88         cbrangel -1.04647435 0.011774522 -1.06955241 -1.023396286
## 89     repmarkpocan -1.15302147 0.012509385 -1.17753986 -1.128503075
## 90    repjoecrowley -1.19854386 0.012559083 -1.22315967 -1.173928060
## 91  benniegthompson -0.12721879 0.062978467 -0.25065659 -0.003780997
## 92      jimlangevin -0.85679309 0.016522624 -0.88917744 -0.824408752
## 93       repsamfarr -0.99488811 0.020732126 -1.03552308 -0.954253147
## 94   jimpressoffice  1.24012896 0.067677990  1.10748010  1.372777825
## 95     repduckworth -0.66963484 0.031653250 -0.73167521 -0.607594468
## 96      judgecarter  1.37632360 0.013169031  1.35051230  1.402134904
## 97     repdanmaffei -0.38061978 0.043800577 -0.46646891 -0.294770652
## 98     repkarenbass -0.82506777 0.019043761 -0.86239355 -0.787742003
## 99    senatorharkin -0.83134290 0.016884405 -0.86443634 -0.798249469
## 100 repandyharrismd  1.08711566 0.026995562  1.03420436  1.140026966
## 
## Estimated feature scores: showing first 30 beta-hats for features
## 
##         just        voted  legislation          end          air 
##  0.194182768  0.285992671  0.132697790 -0.218869690  0.056187447 
##      traffic      control       delays   nationwide         good 
##  0.118271601  0.246467115  0.594608733 -0.409474969 -0.001504398 
##         news    traveling       public        great         time 
##  0.136417126  0.605587397 -0.137197270 -0.019235023 -0.067728778 
##        taste     cultural       center       sunday    afternoon 
## -0.386907589 -0.518190924 -0.138950408 -0.017914261  0.303544837 
##         food       people      visited      morning conversation 
## -0.345301076 -0.044568124 -0.005870757  0.314990104  0.082962744 
##      getting      economy       moving      service      academy 
##  0.035210498 -0.113627242 -0.280083823 -0.059922465  0.222496870
# let's look at the most discriminating words (note same notation as in slides)
# Pair every feature with its estimated beta, then order rows by ascending beta.
sw <- data.frame(beta = wf@beta, word = wf@features)
sw <- sw[order(sw$beta), ]
head(sw, n = 20)
##            beta              word
## 3903 -12.718477    #climatechange
## 3736 -10.066701     #actonclimate
## 2404  -5.865594   @repjohnconyers
## 3738  -5.525047    @waysmeanscmte
## 3751  -5.311831    @repsandylevin
## 3228  -4.862391               ben
## 1477  -4.652580         pollution
## 2728  -3.798844         @lcvoters
## 3675  -3.777414     #womensucceed
## 2937  -3.607759             #lgbt
## 2372  -3.350516        #timeisnow
## 2165  -3.227202     @keithellison
## 2401  -3.035978          #renewui
## 3726  -2.948591              #cir
## 4124  -2.928738         #equality
## 4142  -2.704724    @janschakowsky
## 2374  -2.678966      #demandavote
## 2844  -2.594303             #enda
## 2320  -2.486419      #gopshutdown
## 2156  -2.442094 #stopthesequester
# the 20 rows with the largest beta values
tail(sw, n = 20)
##          beta             word
## 3089 1.486601       @washtimes
## 3067 1.526249  #balancedbudget
## 3170 1.526542      #fullrepeal
## 3033 1.611470            #tcot
## 3280 1.611940      #trainwreck
## 3262 1.705790             #irs
## 1070 1.717157           #4jobs
## 3909 1.734734  #fairnessforall
## 3082 1.742444     house-passed
## 3786 1.749518      bureaucrats
## 3140 1.776164   @gopconference
## 3041 1.802925         @gopwhip
## 2576 1.887696             j.mp
## 3159 1.912347 @waysandmeansgop
## 3374 2.518736 @republicanstudy
## 3790 2.562191           emails
## 3632 2.670262     @reptomprice
## 3423 3.107402             lois
## 3721 3.544742     @darrellissa
## 3252 7.064208    @gopoversight
# and now we can compare the estimated positions with the ideal points...
plot(wf@theta, cong$idealPoint)

# correlation across all documents
cor(wf@theta, cong$idealPoint)
## [1] 0.9170117
# correlation within each party
with(cong, cor(wf@theta[party == "R"], idealPoint[party == "R"]))
## [1] 0.4368558
with(cong, cor(wf@theta[party == "D"], idealPoint[party == "D"]))
## [1] 0.4844951