Let’s work through an example of Wordscores. Here we have tweets from a random sample of 100 Members of the U.S. Congress, along with their ideal points estimated from roll-call votes. Can we replicate those ideal points using only the text of their tweets?
# load the tweets, one row per member; later code reads the text, screen_name,
# idealPoint and party columns. FALSE is spelled out because the shorthand F
# is an ordinary variable that can be reassigned.
cong <- read.csv("../data/congress-tweets.csv", stringsAsFactors = FALSE)
# creating the corpus and dfm objects
library(quanteda)
## Warning: package 'quanteda' was built under R version 3.4.2
## quanteda version 0.99.9
## Using 3 of 4 threads for parallel computing
##
## Attaching package: 'quanteda'
## The following object is masked from 'package:utils':
##
## View
# one document per member: the tweet text, labelled with the screen name
ccorpus <- corpus(cong$text)
docnames(ccorpus) <- cong$screen_name
# document-feature matrix without punctuation, English stopwords, or the
# listed URL / Twitter boilerplate tokens
cdfm <- dfm(
  ccorpus,
  remove_punct = TRUE,
  remove = c(
    stopwords("english"),
    "t.co", "https", "rt", "amp", "http", "t.c", "can"
  )
)
# drop features that do not reach a document frequency of 2
cdfm <- dfm_trim(cdfm, min_docfreq = 2)
# running wordscores with the roll-call ideal points as reference scores;
# textmodel() dispatches to textmodel_wordscores (see the printed Call)
ws <- textmodel(cdfm, cong$idealPoint, model = "wordscores", smooth = 0.5)
ws
## Fitted wordscores model:
## Call:
## textmodel_wordscores.dfm(x = x, y = y, smooth = 0.5)
##
## Reference documents and reference scores:
##
## Documents Ref scores
## usreprodney 0.52
## reprichmond -0.78
## rephanabusa -0.98
## repstevestivers 0.64
## chrisvanhollen -1.15
## marshablackburn 1.21
## repjohnlewis -1.54
## repstutzman 1.14
## replipinski -0.49
## jacksonleetx18 -1.01
## repveasey -0.92
## senatorisakson 0.90
## repmurphyfl -0.41
## replankford 1.01
## juliabrownley26 -0.77
## repterrisewell -0.74
## repsusandavis -1.14
## senatorbaldwin -1.29
## reploisfrankel -1.16
## repmarktakano -1.24
## repjohnsarbanes -1.49
## vancemcallister 1.03
## reptimmurphy 0.84
## senatorwicker 0.89
## repgwenmoore -1.32
## gracenapolitano -1.35
## senatorleahy -1.01
## senmikelee 1.48
## repdonnaedwards -1.70
## repcummings -1.40
## sentoomey 1.07
## replindasanchez -1.63
## repmickmulvaney 1.00
## senatorreid -0.35
## blumenauermedia -1.14
## darrellissa 0.75
## reptomprice 1.18
## repcartwright -1.27
## jefffortenberry 0.58
## repcardenas -0.87
## repjohnconyers -1.46
## senorrinhatch 0.78
## repwalberg 0.99
## repraulgrijalva -1.29
## senatorcollins 0.31
## howardcoble 0.82
## repkevincramer 0.64
## rephultgren 1.07
## sendonnelly -0.24
## drphilroe 0.94
## johnboozman 1.09
## greggharper 0.64
## repjaredpolis -0.68
## kencalvert 0.68
## repderekkilmer -0.76
## jasoninthehouse 0.91
## senatorcardin -1.26
## senbillnelson -0.76
## toddrokita 1.14
## repgarrett 1.03
## buckmckeon 0.68
## replarrybucshon 1.04
## repdinatitus -0.93
## mariodb 0.57
## repgarypeters -0.61
## repbrianhiggins -1.24
## repvisclosky -0.88
## repdavid 1.18
## billowensny -0.26
## reptipton 0.69
## repthomasmassie 0.64
## repstephenlynch -0.80
## repmikemichaud -0.81
## robert_aderholt 0.85
## senatorhagan -0.22
## repmikecoffman 0.70
## repkaygranger 0.93
## repsandylevin -1.18
## repfitzpatrick 0.34
## repandybarr 1.01
## repchriscollins 0.73
## waxmanclimate -1.24
## reptomgraves 1.23
## repmullin 1.02
## senblumenthal -1.28
## reptimgriffin 0.81
## repbobbyscott -1.12
## cbrangel -1.08
## repmarkpocan -2.12
## repjoecrowley -1.32
## benniegthompson -0.91
## jimlangevin -0.99
## repsamfarr -1.46
## jimpressoffice 0.86
## repduckworth -0.67
## judgecarter 0.83
## repdanmaffei -0.40
## repkarenbass -1.65
## senatorharkin -0.80
## repandyharrismd 1.10
##
## Word scores: showing first 30 scored features
##
## just voted legislation end air
## 0.035 0.145 0.022 -0.383 -0.051
## traffic control delays nationwide good
## -0.062 0.060 0.193 -0.269 -0.109
## news traveling public great time
## -0.018 0.223 -0.290 -0.112 -0.227
## taste bloomfield cultural center sunday
## -0.225 -0.177 -0.279 -0.273 -0.172
## afternoon food people visited morning
## 0.207 -0.398 -0.219 -0.051 0.195
## conversation getting economy moving service
## -0.048 -0.115 -0.230 -0.329 -0.156
# let's look at the most discriminating words: order the word scores from
# lowest to highest and show the 20 lowest
sw <- sort(ws@Sw, decreasing = FALSE)
head(sw, n = 20L)
## #wi @repmarkpocan @replindasanchez #p2
## -1.3351011 -1.2673891 -1.2540401 -1.2068895
## @usprogressives @repgwenmoore @repcummings @repjohnconyers
## -1.1725307 -1.1138718 -1.1096208 -1.1078346
## @oversightdems #raisethewage #renewui tb
## -1.0948422 -1.0941956 -1.0778527 -1.0772057
## #climatechange #gopshutdown @repdonnaedwards @repcartwright
## -1.0455227 -1.0422560 -1.0420281 -1.0268366
## @repjohnlewis @senatorcardin @waysmeanscmte #actonclimate
## -1.0255089 -1.0091856 -0.9947236 -0.9940664
# the 20 highest word scores (sw is sorted in increasing order)
tail(sw, n = 20L)
## #pjnet #obamacare @gopconference is.gd ow.ly
## 0.6779170 0.6836844 0.6859472 0.6889041 0.6950465
## #az05 @edworkforce @gopoversight obamacare #utah
## 0.7030204 0.7046986 0.7069328 0.7131235 0.7436576
## #ar2 @toddrokita #cutcapbalance #sctweets @repdavid
## 0.7447113 0.7698070 0.7715263 0.7791193 0.8271905
## #tcot @senmikelee #4jobs #utsen #utpol
## 0.8360768 0.8621148 0.8777774 0.9475660 1.0917893
Now let’s split the data into a training set and a test set and see what we can learn…
# fix the RNG seed so the same documents are held out on every run
set.seed(123)
# hold out 20% of the documents as the test set; seq_len() stays correct for
# an empty data frame, where 1:nrow(cong) would yield c(1, 0)
test <- sample(seq_len(nrow(cong)), floor(0.20 * nrow(cong)))
# extracting ideal points and replacing them with missing values
refpoints <- cong$idealPoint
refpoints[test] <- NA
# running wordscores (held-out documents now carry NA instead of a score)
ws <- textmodel(cdfm, refpoints, model = "wordscores", smooth = 0.5)
# predicted values (this will take a while...)
preds <- predict(ws, rescaling = "lbg")
scores <- preds@textscores
# and let's compare predicted scores with the true ideal points, looking only
# at the held-out documents
plot(scores$textscore_lbg[test], cong$idealPoint[test])
cor(scores$textscore_lbg[test], cong$idealPoint[test])
## [1] 0.8446185
To explore an unsupervised approach to ideological scaling, let’s continue with our previous example: tweets by Members of Congress. Can we recover a latent ideological dimension based only on the text of their tweets?
# note heavy feature selection! the document-frequency threshold goes up to
# 25, and cdfm is overwritten with the trimmed matrix
cdfm <- dfm_trim(cdfm, min_docfreq = 25)
# running wordfish; dir names documents 10 and 8 as the pair that fixes the
# direction of the scale (document 10 ends up below document 8 in the output)
wf <- textmodel(cdfm, dir = c(10, 8), model = "wordfish")
wf
## Fitted wordfish model:
## Call:
## textmodel_wordfish.dfm(x = x, dir = ..1)
##
## Estimated document positions:
##
## Documents theta SE lower upper
## 1 usreprodney -0.34654711 0.066273857 -0.47644387 -0.216650348
## 2 reprichmond -0.72432926 0.025602151 -0.77450947 -0.674149042
## 3 rephanabusa -0.30766156 0.036754170 -0.37969974 -0.235623388
## 4 repstevestivers 0.98444120 0.025086341 0.93527197 1.033610427
## 5 chrisvanhollen -0.89944961 0.022193493 -0.94294885 -0.855950361
## 6 marshablackburn 1.22568161 0.017790847 1.19081155 1.260551670
## 7 repjohnlewis -1.00507822 0.020014654 -1.04430694 -0.965849496
## 8 repstutzman 1.29810297 0.026802135 1.24557078 1.350635151
## 9 replipinski -0.26182664 0.052416972 -0.36456391 -0.159089377
## 10 jacksonleetx18 -1.03401450 0.014446623 -1.06232988 -1.005699120
## 11 repveasey -1.13959495 0.018219471 -1.17530511 -1.103884784
## 12 senatorisakson 0.28151171 0.046035626 0.19128189 0.371741541
## 13 repmurphyfl -0.84787586 0.027220426 -0.90122790 -0.794523830
## 14 replankford 1.59320490 0.011154795 1.57134150 1.615068294
## 15 juliabrownley26 -1.15087233 0.021920472 -1.19383646 -1.107908204
## 16 repterrisewell -0.97753742 0.019160577 -1.01509215 -0.939982693
## 17 repsusandavis -1.11875480 0.020803835 -1.15953031 -1.077979279
## 18 senatorbaldwin -0.99652727 0.014701204 -1.02534163 -0.967712915
## 19 reploisfrankel -1.23053378 0.013956845 -1.25788919 -1.203178362
## 20 repmarktakano -0.92185557 0.026373487 -0.97354760 -0.870163533
## 21 repjohnsarbanes -1.04229087 0.029854862 -1.10080640 -0.983775344
## 22 vancemcallister 0.35800148 0.123996731 0.11496788 0.601035069
## 23 reptimmurphy 0.86695056 0.026515327 0.81498052 0.918920598
## 24 senatorwicker 0.59910852 0.027913240 0.54439857 0.653818468
## 25 repgwenmoore -1.18940358 0.008315103 -1.20570118 -1.173105976
## 26 gracenapolitano -0.84620389 0.026941754 -0.89900973 -0.793398053
## 27 senatorleahy -0.98590997 0.012119844 -1.00966486 -0.962155075
## 28 senmikelee 1.17538880 0.017102927 1.14186706 1.208910533
## 29 repdonnaedwards -1.02544716 0.012807266 -1.05054940 -1.000344917
## 30 repcummings -0.69484257 0.026706552 -0.74718741 -0.642497728
## 31 sentoomey 0.60527623 0.021957724 0.56223909 0.648313368
## 32 replindasanchez -1.18441885 0.012918929 -1.20973995 -1.159097748
## 33 repmickmulvaney 1.28189071 0.025965570 1.23099819 1.332783223
## 34 senatorreid -0.75013297 0.015106225 -0.77974117 -0.720524765
## 35 blumenauermedia 0.06936877 0.065169668 -0.05836378 0.197101315
## 36 darrellissa 1.75632281 0.007041777 1.74252092 1.770124690
## 37 reptomprice 1.50291937 0.010370060 1.48259405 1.523244685
## 38 repcartwright -0.98474962 0.017590992 -1.01922796 -0.950271274
## 39 jefffortenberry 0.90425235 0.027941293 0.84948741 0.959017280
## 40 repcardenas -1.01062314 0.013698222 -1.03747166 -0.983774625
## 41 repjohnconyers -1.29340107 0.009927528 -1.31285903 -1.273943117
## 42 senorrinhatch 1.10791125 0.022239133 1.06432255 1.151499955
## 43 repwalberg 1.51722422 0.016219209 1.48543457 1.549013874
## 44 repraulgrijalva -0.92172561 0.016705612 -0.95446861 -0.888982609
## 45 senatorcollins -0.38473872 0.031901043 -0.44726476 -0.322212673
## 46 howardcoble 0.57609254 0.047126641 0.48372432 0.668460755
## 47 repkevincramer 0.87382874 0.040668654 0.79411818 0.953539305
## 48 rephultgren 0.81180144 0.022607189 0.76749135 0.856111533
## 49 sendonnelly -0.40656198 0.026714152 -0.45892171 -0.354202237
## 50 drphilroe 1.21638039 0.023997390 1.16934551 1.263415277
## 51 johnboozman 0.78489712 0.028136237 0.72975009 0.840044141
## 52 greggharper 1.18435054 0.019120048 1.14687524 1.221825830
## 53 repjaredpolis -1.14443070 0.016533330 -1.17683603 -1.112025376
## 54 kencalvert 1.26061396 0.016478406 1.22831628 1.292911636
## 55 repderekkilmer -0.84546844 0.035822740 -0.91568101 -0.775255874
## 56 jasoninthehouse 1.44465546 0.013973157 1.41726807 1.472042850
## 57 senatorcardin -1.27968469 0.006168287 -1.29177453 -1.267594844
## 58 senbillnelson 0.17371651 0.049326283 0.07703699 0.270396021
## 59 toddrokita 0.97901972 0.018383911 0.94298726 1.015052186
## 60 repgarrett 1.26365258 0.036970448 1.19119050 1.336114661
## 61 buckmckeon 1.26653064 0.021412158 1.22456281 1.308498473
## 62 replarrybucshon 1.32208714 0.014911655 1.29286029 1.351313980
## 63 repdinatitus -1.19308757 0.010996056 -1.21463984 -1.171535301
## 64 mariodb -0.03453142 0.026905696 -0.08726659 0.018203739
## 65 repgarypeters -0.94325151 0.018598970 -0.97970549 -0.906797526
## 66 repbrianhiggins -0.73073403 0.016648164 -0.76336443 -0.698103628
## 67 repvisclosky -0.30909502 0.030965725 -0.36978784 -0.248402199
## 68 repdavid 0.95262556 0.018893108 0.91559507 0.989656052
## 69 billowensny 0.15292718 0.026970313 0.10006537 0.205788997
## 70 reptipton 0.99501947 0.029227040 0.93773447 1.052304468
## 71 repthomasmassie 1.08584526 0.028732741 1.02952909 1.142161431
## 72 repstephenlynch -0.39488319 0.040142589 -0.47356267 -0.316203718
## 73 repmikemichaud -0.22561108 0.044107581 -0.31206194 -0.139160225
## 74 robert_aderholt 1.13310557 0.027482621 1.07923963 1.186971506
## 75 senatorhagan -0.70057420 0.016285957 -0.73249468 -0.668653727
## 76 repmikecoffman 1.30233208 0.019454339 1.26420158 1.340462584
## 77 repkaygranger 0.94161035 0.028952577 0.88486330 0.998357397
## 78 repsandylevin -1.27682647 0.009249804 -1.29495609 -1.258696857
## 79 repfitzpatrick 0.26292355 0.021007397 0.22174905 0.304098044
## 80 repandybarr 0.69609199 0.027114331 0.64294790 0.749236076
## 81 repchriscollins 1.08848542 0.020757678 1.04780037 1.129170466
## 82 waxmanclimate -1.53671530 0.004293674 -1.54513090 -1.528299697
## 83 reptomgraves 1.32378324 0.016484310 1.29147399 1.356092486
## 84 repmullin 1.01500480 0.030269603 0.95567638 1.074333220
## 85 senblumenthal -0.94253947 0.013414743 -0.96883237 -0.916246574
## 86 reptimgriffin 1.19009896 0.016537659 1.15768515 1.222512770
## 87 repbobbyscott -0.80465890 0.026591994 -0.85677921 -0.752538595
## 88 cbrangel -1.04647435 0.011774522 -1.06955241 -1.023396286
## 89 repmarkpocan -1.15302147 0.012509385 -1.17753986 -1.128503075
## 90 repjoecrowley -1.19854386 0.012559083 -1.22315967 -1.173928060
## 91 benniegthompson -0.12721879 0.062978467 -0.25065659 -0.003780997
## 92 jimlangevin -0.85679309 0.016522624 -0.88917744 -0.824408752
## 93 repsamfarr -0.99488811 0.020732126 -1.03552308 -0.954253147
## 94 jimpressoffice 1.24012896 0.067677990 1.10748010 1.372777825
## 95 repduckworth -0.66963484 0.031653250 -0.73167521 -0.607594468
## 96 judgecarter 1.37632360 0.013169031 1.35051230 1.402134904
## 97 repdanmaffei -0.38061978 0.043800577 -0.46646891 -0.294770652
## 98 repkarenbass -0.82506777 0.019043761 -0.86239355 -0.787742003
## 99 senatorharkin -0.83134290 0.016884405 -0.86443634 -0.798249469
## 100 repandyharrismd 1.08711566 0.026995562 1.03420436 1.140026966
##
## Estimated feature scores: showing first 30 beta-hats for features
##
## just voted legislation end air
## 0.194182768 0.285992671 0.132697790 -0.218869690 0.056187447
## traffic control delays nationwide good
## 0.118271601 0.246467115 0.594608733 -0.409474969 -0.001504398
## news traveling public great time
## 0.136417126 0.605587397 -0.137197270 -0.019235023 -0.067728778
## taste cultural center sunday afternoon
## -0.386907589 -0.518190924 -0.138950408 -0.017914261 0.303544837
## food people visited morning conversation
## -0.345301076 -0.044568124 -0.005870757 0.314990104 0.082962744
## getting economy moving service academy
## 0.035210498 -0.113627242 -0.280083823 -0.059922465 0.222496870
# let's look at the most discriminating words (beta: same notation as in the
# slides); pair each feature with its beta, order by beta, show the 20 lowest
sw <- data.frame(beta = wf@beta, word = wf@features)
sw <- sw[order(sw[["beta"]]), ]
head(sw, n = 20L)
## beta word
## 3903 -12.718477 #climatechange
## 3736 -10.066701 #actonclimate
## 2404 -5.865594 @repjohnconyers
## 3738 -5.525047 @waysmeanscmte
## 3751 -5.311831 @repsandylevin
## 3228 -4.862391 ben
## 1477 -4.652580 pollution
## 2728 -3.798844 @lcvoters
## 3675 -3.777414 #womensucceed
## 2937 -3.607759 #lgbt
## 2372 -3.350516 #timeisnow
## 2165 -3.227202 @keithellison
## 2401 -3.035978 #renewui
## 3726 -2.948591 #cir
## 4124 -2.928738 #equality
## 4142 -2.704724 @janschakowsky
## 2374 -2.678966 #demandavote
## 2844 -2.594303 #enda
## 2320 -2.486419 #gopshutdown
## 2156 -2.442094 #stopthesequester
# the 20 features with the highest beta (sw is ordered by increasing beta)
tail(sw, n = 20L)
## beta word
## 3089 1.486601 @washtimes
## 3067 1.526249 #balancedbudget
## 3170 1.526542 #fullrepeal
## 3033 1.611470 #tcot
## 3280 1.611940 #trainwreck
## 3262 1.705790 #irs
## 1070 1.717157 #4jobs
## 3909 1.734734 #fairnessforall
## 3082 1.742444 house-passed
## 3786 1.749518 bureaucrats
## 3140 1.776164 @gopconference
## 3041 1.802925 @gopwhip
## 2576 1.887696 j.mp
## 3159 1.912347 @waysandmeansgop
## 3374 2.518736 @republicanstudy
## 3790 2.562191 emails
## 3632 2.670262 @reptomprice
## 3423 3.107402 lois
## 3721 3.544742 @darrellissa
## 3252 7.064208 @gopoversight
# and now we can compare the estimated positions with the ideal points,
# across all documents...
plot(x = wf@theta, y = cong$idealPoint)
cor(x = wf@theta, y = cong$idealPoint)
## [1] 0.9170117
# same correlation within each party: rows with party "R" first
cor(x = wf@theta[cong$party == "R"], y = cong$idealPoint[cong$party == "R"])
## [1] 0.4368558
# then rows with party "D"
cor(x = wf@theta[cong$party == "D"], y = cong$idealPoint[cong$party == "D"])
## [1] 0.4844951