XGBoost

If we really want the best performance at a low computational cost, a cutting-edge method many people are using is distributed gradient boosting, which builds on the same ideas as boosted trees and random forests and is implemented in the xgboost package. You can read more about the history of this package here.

First, let’s prepare the data…

library(quanteda)
## Warning: package 'quanteda' was built under R version 3.4.4
## Package version: 1.3.0
## Parallel computing: 2 of 4 threads used.
## See https://quanteda.io for tutorials and examples.
## 
## Attaching package: 'quanteda'
## The following object is masked from 'package:utils':
## 
##     View
tweets <- read.csv("~/data/UK-tweets.csv", stringsAsFactors=F)
tweets$engaging <- ifelse(tweets$communication=="engaging", 1, 0)
tweets <- tweets[!is.na(tweets$engaging),]
# clean text and create DFM
tweets$text <- gsub('@[0-9_A-Za-z]+', '@', tweets$text)
twcorpus <- corpus(tweets$text)
twdfm <- dfm(twcorpus, remove=stopwords("english"), remove_url=TRUE, 
             ngrams=1:2, verbose=TRUE)
## Creating a dfm from a corpus input...
##    ... lowercasing
##    ... found 4,566 documents, 48,657 features
##    ... removed 169 features
##    ... created a 4,566 x 48,488 sparse dfm
##    ... complete. 
## Elapsed time: 0.872 seconds.
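
(Note: the code above uses the quanteda 1.x interface shown in the session output. Current quanteda releases (3.x) no longer accept these arguments in dfm(); the same pipeline would look roughly like the sketch below, which is written against the newer API and was not run here.)

# equivalent pipeline in the quanteda >= 3 API (sketch, not run)
toks <- tokens(twcorpus, remove_url = TRUE)
toks <- tokens_remove(toks, stopwords("english"))
toks <- tokens_ngrams(toks, n = 1:2)
twdfm <- dfm(toks)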
twdfm <- dfm_trim(twdfm, min_docfreq = 2, verbose=TRUE)
## Removing features occurring: 
##   - in fewer than 2 documents: 38,258
##   Total features removed: 38,258 (78.9%).
# training and test sets
set.seed(123)
training <- sample(1:nrow(tweets), floor(.80 * nrow(tweets)))
test <- setdiff(1:nrow(tweets), training)
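
Before training, it is worth checking how balanced the outcome is, since raw accuracy is only meaningful relative to the majority-class baseline. A minimal check (output not shown):

# proportion of engaging vs. non-engaging tweets
prop.table(table(tweets$engaging))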

Now we can train the model, using 5-fold cross-validation to choose the learning rate (eta) and the tree depth:

library(xgboost)
# convert the DFM to a sparse matrix class that xgboost accepts
X <- as(twdfm, "dgCMatrix")
# parameter values to explore
tryEta <- c(1, 2)
tryDepths <- c(1, 2, 4)
# placeholders for the best values found so far
bestEta <- NA
bestDepth <- NA
bestAcc <- 0

for (eta in tryEta){
  for (dp in tryDepths){
    bst <- xgb.cv(data = X[training,],
                  label = tweets$engaging[training],
                  max.depth = dp,
                  eta = eta,
                  nthread = 4,
                  nround = 500,
                  nfold = 5,
                  print_every_n = 100L,
                  objective = "binary:logistic")
    # cross-validated accuracy, averaged over the last few iterations
    acc <- 1 - mean(tail(bst$evaluation_log$test_error_mean))
    cat("Results for eta=", eta, " and depth=", dp, " : ",
        acc, " accuracy.\n", sep="")
    if (acc > bestAcc){
      bestEta <- eta
      bestAcc <- acc
      bestDepth <- dp
    }
  }
}
## [1]  train-error:0.143484+0.003302   test-error:0.143492+0.013204 
## [101]    train-error:0.084201+0.002176   test-error:0.113915+0.007685 
## [201]    train-error:0.071947+0.002044   test-error:0.114463+0.008919 
## [301]    train-error:0.065786+0.001797   test-error:0.114738+0.012973 
## [401]    train-error:0.062910+0.002123   test-error:0.114737+0.011022 
## [500]    train-error:0.059556+0.001976   test-error:0.114738+0.011267 
## Results for eta=1 and depth=1 : 0.885627 accuracy.
## [1]  train-error:0.141635+0.004059   test-error:0.144033+0.006434 
## [101]    train-error:0.062363+0.002086   test-error:0.111444+0.007603 
## [201]    train-error:0.047234+0.002674   test-error:0.111444+0.008669 
## [301]    train-error:0.037787+0.001308   test-error:0.113363+0.010731 
## [401]    train-error:0.030942+0.001508   test-error:0.113088+0.011953 
## [500]    train-error:0.025740+0.002640   test-error:0.113634+0.012855 
## Results for eta=1 and depth=2 : 0.8856361 accuracy.
## [1]  train-error:0.130065+0.004314   test-error:0.140464+0.012396 
## [101]    train-error:0.035323+0.001358   test-error:0.116649+0.007075 
## [201]    train-error:0.017661+0.001178   test-error:0.118837+0.006757 
## [301]    train-error:0.010816+0.001029   test-error:0.118292+0.006537 
## [401]    train-error:0.009789+0.001158   test-error:0.120210+0.008825 
## [500]    train-error:0.008625+0.001394   test-error:0.120757+0.006748 
## Results for eta=1 and depth=4 : 0.8790151 accuracy.
## [1]  train-error:0.143483+0.002159   test-error:0.143483+0.008635 
## [101]    train-error:0.419455+0.196886   test-error:0.402685+0.195750 
## [201]    train-error:0.419455+0.196886   test-error:0.402685+0.195750 
## [301]    train-error:0.419455+0.196886   test-error:0.402685+0.195750 
## [401]    train-error:0.419455+0.196886   test-error:0.402685+0.195750 
## [500]    train-error:0.419455+0.196886   test-error:0.402685+0.195750 
## Results for eta=2 and depth=1 : 0.5973154 accuracy.
## [1]  train-error:0.141704+0.005646   test-error:0.143486+0.010365 
## [101]    train-error:0.238015+0.090648   test-error:0.244821+0.091305 
## [201]    train-error:0.231784+0.100037   test-error:0.246189+0.089610 
## [301]    train-error:0.227881+0.106240   test-error:0.244821+0.091305 
## [401]    train-error:0.225074+0.110827   test-error:0.245915+0.089945 
## [500]    train-error:0.223157+0.114013   test-error:0.246463+0.089277 
## Results for eta=2 and depth=2 : 0.7539478 accuracy.
## [1]  train-error:0.131708+0.002685   test-error:0.138007+0.003166 
## [101]    train-error:0.297999+0.080431   test-error:0.291021+0.086205 
## [201]    train-error:0.297999+0.080431   test-error:0.291021+0.086205 
## [301]    train-error:0.297999+0.080431   test-error:0.291021+0.086205 
## [401]    train-error:0.297999+0.080431   test-error:0.291021+0.086205 
## [500]    train-error:0.297999+0.080431   test-error:0.291021+0.086205 
## Results for eta=2 and depth=4 : 0.708979 accuracy.
cat("Best model has eta=",bestEta," and depth=", bestDepth, " : ",
    bestAcc," accuracy.\n",sep="")
## Best model has eta=1 and depth=2 : 0.8856361 accuracy.
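
Note in the CV logs above that the test error stops improving after roughly 100 rounds while the training error keeps falling, a sign of overfitting. Instead of fixing the number of rounds in advance, we could let cross-validation stop automatically; a minimal sketch, assuming an xgboost version that supports early_stopping_rounds (nrounds is the current spelling of nround; output not shown):

# stop once the CV test error has not improved for 10 rounds
cv <- xgb.cv(data = X[training,],
             label = tweets$engaging[training],
             max.depth = bestDepth,
             eta = bestEta,
             nthread = 4,
             nrounds = 500,
             nfold = 5,
             early_stopping_rounds = 10,
             objective = "binary:logistic",
             verbose = 0)
cv$best_iteration  # suggested number of boosting rounds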

How well does it perform out-of-sample?

# running best model
rf <- xgboost(data = X[training,],
              label = tweets$engaging[training],
              max.depth = bestDepth,
              eta = bestEta,
              nthread = 4,
              nround = 1000,
              print_every_n = 100L,
              objective = "binary:logistic")
## [1]  train-error:0.143483 
## [101]    train-error:0.060241 
## [201]    train-error:0.044907 
## [301]    train-error:0.038061 
## [401]    train-error:0.032311 
## [501]    train-error:0.026561 
## [601]    train-error:0.023275 
## [701]    train-error:0.018072 
## [801]    train-error:0.016977 
## [901]    train-error:0.014786 
## [1000]   train-error:0.011774
# out-of-sample accuracy
preds <- predict(rf, X[test,])
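
predict() returns predicted probabilities, which we dichotomize at 0.5 below. A quick confusion matrix makes the two kinds of errors visible (a sketch; output not shown):

# cross-tabulate thresholded predictions against the true labels
table(predicted = as.numeric(preds > .50),
      actual = tweets$engaging[test])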

## function to compute accuracy
accuracy <- function(ypred, y){
    sum(ypred==y) / length(y)
}
# function to compute precision: true positives / predicted positives
precision <- function(ypred, y){
    sum((ypred==1)&(y==1)) /
        (sum((ypred==1)&(y==0)) + sum((ypred==1)&(y==1)))
}
# function to compute recall: true positives / actual positives
recall <- function(ypred, y){
    sum((ypred==1)&(y==1)) /
        (sum((ypred==0)&(y==1)) + sum((ypred==1)&(y==1)))
}

cat("\nAccuracy on test set=", round(accuracy(preds>.50, tweets$engaging[test]),3))
## 
## Accuracy on test set= 0.862
cat("\nPrecision(1) on test set=", round(precision(preds>.50, tweets$engaging[test]),3))
## 
## Precision(1) on test set= 0.913
cat("\nRecall(1) on test set=", round(recall(preds>.50, tweets$engaging[test]),3))
## 
## Recall(1) on test set= 0.913
cat("\nPrecision(0) on test set=", round(precision(preds<.50, tweets$engaging[test]==0),3))
## 
## Precision(0) on test set= 0.674
cat("\nRecall(0) on test set=", round(recall(preds<.50, tweets$engaging[test]==0),3))
## 
## Recall(0) on test set= 0.674
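
Precision and recall can also be combined into a single F1 score, their harmonic mean. A minimal sketch reusing the functions above (f1 is a helper added here for illustration; output not shown):

# F1 = 2 * precision * recall / (precision + recall)
f1 <- function(ypred, y){
    p <- precision(ypred, y)
    r <- recall(ypred, y)
    2 * p * r / (p + r)
}
cat("\nF1(1) on test set=", round(f1(preds>.50, tweets$engaging[test]), 3))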

What we sacrifice is interpretability (yet again!). We can check feature importance, but it’s often hard to tell what’s going on exactly. Why? We only learn which features “matter”, not why they matter.

# feature importance
labels <- dimnames(X)[[2]]
importance <- xgb.importance(labels, model = rf, data=X, label=tweets$engaging)
## Warning: 'cBind' is deprecated.
##  Since R version 3.2.0, base's cbind() should work fine with S4 objects
## Warning in `[.data.table`(result, , `:=`("RealCover", as.numeric(vec)), :
## with=FALSE ignored, it isn't needed when using :=. See ?':=' for examples.
importance <- importance[order(importance$Gain, decreasing=TRUE),]
head(importance, n=20)
##         Feature        Split        Gain       Cover    Frequency
##  1:           @           18 0.264736859 0.005880419 0.0029461279
##  2:         @_@           16 0.030717230 0.002698893 0.0025252525
##  3:       via_@ -9.53674e-07 0.029703724 0.003286964 0.0021043771
##  4:         @_:            4 0.026794880 0.006501015 0.0054713805
##  5:        to_@ -9.53674e-07 0.022473426 0.004029277 0.0037878788
##  6:      with_@            4 0.021574196 0.003906800 0.0029461279
##  7:           @          8.5 0.020637223 0.001307227 0.0004208754
##  8:     #ep2014 -9.53674e-07 0.015337161 0.002806951 0.0021043771
##  9:      thanks          1.5 0.014170553 0.004756773 0.0033670034
## 10:        on_@            4 0.013816053 0.003468362 0.0025252525
## 11:        @_is            4 0.012531153 0.003347890 0.0025252525
## 12:       and_@            4 0.011006502 0.003971663 0.0033670034
## 13:         @_i -9.53674e-07 0.009840859 0.004742656 0.0033670034
## 14:       today            4 0.009604410 0.003892204 0.0033670034
## 15:         @_' -9.53674e-07 0.009143074 0.002285718 0.0016835017
## 16:           ?           16 0.007883604 0.001391986 0.0008417508
## 17:         @_@          5.5 0.007024904 0.001431421 0.0012626263
## 18:           "          1.5 0.006335513 0.001906599 0.0012626263
## 19: #votelabour -9.53674e-07 0.006274174 0.001554370 0.0008417508
## 20:           .          1.5 0.006246246 0.001023718 0.0046296296
##     RealCover  RealCover %
##  1:      1410 0.3962900506
##  2:       751 0.2110736369
##  3:         5 0.0014052839
##  4:        43 0.0120854413
##  5:         9 0.0025295110
##  6:         6 0.0016863406
##  7:      1371 0.3853288364
##  8:        19 0.0053400787
##  9:       161 0.0452501405
## 10:         2 0.0005621135
## 11:        21 0.0059021922
## 12:        13 0.0036537381
## 13:       124 0.0348510399
## 14:        20 0.0056211355
## 15:        10 0.0028105677
## 16:       296 0.0831928049
## 17:       772 0.2169758291
## 18:        50 0.0140528387
## 19:         0 0.0000000000
## 20:       691 0.1942102305
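
The xgboost package also ships a plotting helper for this table; a minimal sketch (some xgboost versions require the Ckmeans.1d.dp package for the grouped bar plot; output not shown):

# bar plot of the top 20 features by Gain
xgb.plot.importance(head(importance, n = 20))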
# adding sign: does each feature appear more often among
# non-engaging (0) or engaging (1) tweets?
sums <- list()
for (v in 0:1){
    sums[[v+1]] <- colSums(X[tweets[,"engaging"]==v,])
}
sums <- do.call(cbind, sums)
sign <- apply(sums, 1, which.max)

df <- data.frame(
    Feature = labels,
    sign = sign - 1, # 0 = non-engaging, 1 = engaging
    stringsAsFactors = FALSE)
importance <- merge(importance, df, by="Feature")
    
## best predictors
for (v in 0:1){
    cat("\n\n")
    cat("value==", v)
    importance <- importance[order(importance$Gain, decreasing=TRUE),]
    print(head(importance[importance$sign==v,], n=50))
    cat("\n")
    cat(paste(unique(head(importance$Feature[importance$sign==v], n=50)), collapse=", "))
}
## 
## 
## value== 0            Feature        Split         Gain        Cover    Frequency
##  1:           via_@ -9.53674e-07 0.0297037239 0.0032869640 0.0021043771
##  2:            to_@ -9.53674e-07 0.0224734264 0.0040292769 0.0037878788
##  3:          with_@            4 0.0215741960 0.0039067998 0.0029461279
##  4:         #ep2014 -9.53674e-07 0.0153371611 0.0028069514 0.0021043771
##  5:            on_@            4 0.0138160528 0.0034683619 0.0025252525
##  6:           and_@            4 0.0110065020 0.0039716634 0.0033670034
##  7:           today            4 0.0096044104 0.0038922038 0.0033670034
##  8:             @_' -9.53674e-07 0.0091430744 0.0022857176 0.0016835017
##  9:     #votelabour -9.53674e-07 0.0062741742 0.0015543696 0.0008417508
## 10:          hacked -9.53674e-07 0.0061983928 0.0026976335 0.0016835017
## 11:           green          1.5 0.0055692617 0.0025298120 0.0021043771
## 12:              rt -9.53674e-07 0.0052969882 0.0066042520 0.0058922559
## 13:  #votegreen2014            4 0.0048746039 0.0045265023 0.0033670034
## 14: #labourdoorstep -9.53674e-07 0.0046951683 0.0030986651 0.0021043771
## 15:             ,_@          1.5 0.0040519138 0.0043943198 0.0037878788
## 16:     campaigning -9.53674e-07 0.0040082426 0.0020104380 0.0016835017
## 17:        european          1.5 0.0037851511 0.0028840753 0.0021043771
## 18:            meet            4 0.0033863859 0.0036983090 0.0029461279
## 19:             -_@ -9.53674e-07 0.0032590750 0.0046142144 0.0037878788
## 20:            east          1.5 0.0027484878 0.0023004109 0.0021043771
## 21:       about_the -9.53674e-07 0.0023804112 0.0055826606 0.0050505051
## 22:            at_@ -9.53674e-07 0.0023029333 0.0013917170 0.0008417508
## 23:          from_@            4 0.0020383574 0.0013920705 0.0008417508
## 24:      candidates          1.5 0.0020284477 0.0046441814 0.0037878788
## 25:           for_@            2 0.0020202807 0.0004494746 0.0004208754
## 26:           the_@            4 0.0019224182 0.0040660753 0.0037878788
## 27:          that_@ -9.53674e-07 0.0017646319 0.0039263663 0.0029461279
## 28:          #bbcqt            4 0.0017434346 0.0043493136 0.0037878788
## 29:         english          1.5 0.0016774384 0.0025028088 0.0016835017
## 30:             :_" -9.53674e-07 0.0016420779 0.0007533589 0.0004208754
## 31:            seat            2 0.0016039763 0.0003220450 0.0004208754
## 32:             #eu -9.53674e-07 0.0015926600 0.0023519217 0.0016835017
## 33:            if_@ -9.53674e-07 0.0015377188 0.0035687285 0.0025252525
## 34:            of_@ -9.53674e-07 0.0015271896 0.0033199790 0.0029461279
## 35:       candidate -9.53674e-07 0.0014925319 0.0033006845 0.0025252525
## 36:               |            6 0.0013987770 0.0060535947 0.0054713805
## 37:    nigel_farage -9.53674e-07 0.0013672303 0.0007599091 0.0004208754
## 38:             ,_@          2.5 0.0013550773 0.0011676976 0.0008417508
## 39:         tonight          1.5 0.0012717055 0.0027763563 0.0025252525
## 40:             new -9.53674e-07 0.0012583799 0.0023763417 0.0021043771
## 41:            euro -9.53674e-07 0.0012144607 0.0017581809 0.0012626263
## 42:   conservatives -9.53674e-07 0.0010936143 0.0034493229 0.0029461279
## 43:         meeting -9.53674e-07 0.0010546601 0.0018953448 0.0012626263
## 44:        hustings -9.53674e-07 0.0009584558 0.0013092718 0.0008417508
## 45:          london            4 0.0009316371 0.0012456236 0.0008417508
## 46:           see_@ -9.53674e-07 0.0009230591 0.0007342954 0.0004208754
## 47:          with_@          1.5 0.0009139735 0.0007283325 0.0004208754
## 48:          london          1.5 0.0008857997 0.0015457012 0.0012626263
## 49:               |          2.5 0.0008069986 0.0035385141 0.0033670034
## 50:             @_) -9.53674e-07 0.0007572001 0.0042732664 0.0037878788
##             Feature        Split         Gain        Cover    Frequency
##     RealCover  RealCover % sign
##  1:         5 0.0014052839    0
##  2:         9 0.0025295110    0
##  3:         6 0.0016863406    0
##  4:        19 0.0053400787    0
##  5:         2 0.0005621135    0
##  6:        13 0.0036537381    0
##  7:        20 0.0056211355    0
##  8:        10 0.0028105677    0
##  9:         0 0.0000000000    0
## 10:         3 0.0008431703    0
## 11:        14 0.0039347948    0
## 12:         8 0.0022484542    0
## 13:        11 0.0030916245    0
## 14:         2 0.0005621135    0
## 15:        11 0.0030916245    0
## 16:         8 0.0022484542    0
## 17:        17 0.0047779651    0
## 18:         4 0.0011242271    0
## 19:         4 0.0011242271    0
## 20:         5 0.0014052839    0
## 21:         4 0.0011242271    0
## 22:         1 0.0002810568    0
## 23:         4 0.0011242271    0
## 24:        12 0.0033726813    0
## 25:         4 0.0011242271    0
## 26:         5 0.0014052839    0
## 27:         2 0.0005621135    0
## 28:         6 0.0016863406    0
## 29:         7 0.0019673974    0
## 30:         1 0.0002810568    0
## 31:         5 0.0014052839    0
## 32:         5 0.0014052839    0
## 33:         1 0.0002810568    0
## 34:         6 0.0016863406    0
## 35:         8 0.0022484542    0
## 36:         0 0.0000000000    0
## 37:         2 0.0005621135    0
## 38:        13 0.0036537381    0
## 39:         9 0.0025295110    0
## 40:        12 0.0033726813    0
## 41:        15 0.0042158516    0
## 42:         3 0.0008431703    0
## 43:         1 0.0002810568    0
## 44:         0 0.0000000000    0
## 45:         8 0.0022484542    0
## 46:         2 0.0005621135    0
## 47:         7 0.0019673974    0
## 48:         7 0.0019673974    0
## 49:         1 0.0002810568    0
## 50:         3 0.0008431703    0
##     RealCover  RealCover % sign
## 
## via_@, to_@, with_@, #ep2014, on_@, and_@, today, @_', #votelabour, hacked, green, rt, #votegreen2014, #labourdoorstep, ,_@, campaigning, european, meet, -_@, east, about_the, at_@, from_@, candidates, for_@, the_@, that_@, #bbcqt, english, :_", seat, #eu, if_@, of_@, candidate, |, nigel_farage, tonight, new, euro, conservatives, meeting, hustings, london, see_@, @_)
## 
## value== 1            Feature        Split        Gain        Cover    Frequency
##  1:               @           18 0.264736859 5.880419e-03 0.0029461279
##  2:             @_@           16 0.030717230 2.698893e-03 0.0025252525
##  3:             @_:            4 0.026794880 6.501015e-03 0.0054713805
##  4:               @          8.5 0.020637223 1.307227e-03 0.0004208754
##  5:          thanks          1.5 0.014170553 4.756773e-03 0.0033670034
##  6:            @_is            4 0.012531153 3.347890e-03 0.0025252525
##  7:             @_i -9.53674e-07 0.009840859 4.742656e-03 0.0033670034
##  8:               ?           16 0.007883604 1.391986e-03 0.0008417508
##  9:             @_@          5.5 0.007024904 1.431421e-03 0.0012626263
## 10:               "          1.5 0.006335513 1.906599e-03 0.0012626263
## 11:               .          1.5 0.006246246 1.023718e-03 0.0046296296
## 12:             @_:            2 0.005917055 1.721824e-05 0.0008417508
## 13:               .          1.5 0.005512524 1.683578e-03 0.0079966330
## 14:               @          1.5 0.005454286 5.685892e-03 0.0172558923
## 15:               @          1.5 0.005217535 9.442590e-04 0.0172558923
## 16:       thank_you -9.53674e-07 0.004996416 2.150748e-03 0.0012626263
## 17:       good_luck            4 0.004459443 1.897449e-03 0.0012626263
## 18:            meps -9.53674e-07 0.004380918 8.833716e-04 0.0004208754
## 19:               @          2.5 0.004198886 2.022188e-03 0.0126262626
## 20:               (          1.5 0.004117353 1.461347e-04 0.0004208754
## 21:              eu          1.5 0.003891656 2.670747e-03 0.0021043771
## 22:            @_on -9.53674e-07 0.003737015 2.609601e-03 0.0021043771
## 23:           #ukip          1.5 0.003700234 1.950539e-03 0.0016835017
## 24:               !          1.5 0.003693330 1.461580e-03 0.0021043771
## 25:         another -9.53674e-07 0.003514251 3.083479e-03 0.0025252525
## 26:               .          2.5 0.003422497 9.493924e-04 0.0071548822
## 27:          really          1.5 0.003123756 3.429944e-03 0.0025252525
## 28:             yes          2.5 0.003123623 2.615903e-03 0.0016835017
## 29:               .          7.5 0.003110102 1.395100e-03 0.0008417508
## 30: congratulations            4 0.002948321 4.204675e-03 0.0033670034
## 31:              uk            6 0.002940197 5.549896e-03 0.0046296296
## 32:            need            4 0.002837481 4.140961e-03 0.0029461279
## 33:          please -9.53674e-07 0.002832092 5.390337e-03 0.0042087542
## 34:               .           16 0.002818621 3.350547e-04 0.0008417508
## 35:            vote          1.5 0.002804096 4.040649e-03 0.0037878788
## 36:         council            4 0.002785229 2.880497e-03 0.0021043771
## 37:             !_@ -9.53674e-07 0.002752099 7.179162e-03 0.0058922559
## 38:         @_great -9.53674e-07 0.002660405 1.352579e-03 0.0008417508
## 39:               @          2.5 0.002543536 1.229023e-03 0.0054713805
## 40:               ?            4 0.002530869 1.311744e-04 0.0012626263
## 41:             :_- -9.53674e-07 0.002505698 1.950093e-03 0.0012626263
## 42:               .           12 0.002480572 2.220131e-04 0.0037878788
## 43:             can          1.5 0.002434854 3.343689e-03 0.0029461279
## 44:             see            4 0.002431834 5.078348e-04 0.0008417508
## 45:               ,          2.5 0.002356436 2.778229e-03 0.0050505051
## 46:               .            8 0.002286422 9.671921e-05 0.0042087542
## 47:           voted -9.53674e-07 0.002250851 3.707100e-03 0.0033670034
## 48:            back            2 0.002237645 8.208192e-04 0.0016835017
## 49:             @_,           12 0.002225158 2.368821e-03 0.0025252525
## 50:             @_@          3.5 0.002216377 5.267791e-04 0.0050505051
##             Feature        Split        Gain        Cover    Frequency
##     RealCover RealCover % sign
##  1:      1410 0.396290051    1
##  2:       751 0.211073637    1
##  3:        43 0.012085441    1
##  4:      1371 0.385328836    1
##  5:       161 0.045250141    1
##  6:        21 0.005902192    1
##  7:       124 0.034851040    1
##  8:       296 0.083192805    1
##  9:       772 0.216975829    1
## 10:        50 0.014052839    1
## 11:       691 0.194210230    1
## 12:        39 0.010961214    1
## 13:       675 0.189713322    1
## 14:      1421 0.399381675    1
## 15:      1403 0.394322653    1
## 16:        55 0.015458123    1
## 17:        38 0.010680157    1
## 18:        10 0.002810568    1
## 19:      1410 0.396290051    1
## 20:        33 0.009274874    1
## 21:        74 0.020798201    1
## 22:        14 0.003934795    1
## 23:        61 0.017144463    1
## 24:       334 0.093872962    1
## 25:         8 0.002248454    1
## 26:       659 0.185216414    1
## 27:        32 0.008993817    1
## 28:        49 0.013771782    1
## 29:       688 0.193367060    1
## 30:        19 0.005340079    1
## 31:        42 0.011804384    1
## 32:        37 0.010399101    1
## 33:        22 0.006183249    1
## 34:       688 0.193367060    1
## 35:        62 0.017425520    1
## 36:        11 0.003091625    1
## 37:        12 0.003372681    1
## 38:        12 0.003372681    1
## 39:      1387 0.389825745    1
## 40:       317 0.089094997    1
## 41:        37 0.010399101    1
## 42:       680 0.191118606    1
## 43:        82 0.023046655    1
## 44:        64 0.017987634    1
## 45:       402 0.112984823    1
## 46:       680 0.191118606    1
## 47:        12 0.003372681    1
## 48:        28 0.007869590    1
## 49:        10 0.002810568    1
## 50:       759 0.213322091    1
##     RealCover RealCover % sign
## 
## @, @_@, @_:, thanks, @_is, @_i, ?, ", ., thank_you, good_luck, meps, (, eu, @_on, #ukip, !, another, really, yes, congratulations, uk, need, please, vote, council, !_@, @_great, :_-, can, see, ,, voted, back, @_,