If we really want the best performance at a low computational cost, a cutting-edge method that many people are now using is distributed gradient boosting, which builds on the same ideas as boosted trees and random forests and is implemented in the xgboost package. You can read more about the history of this package here.
First, let’s prepare the data…
library(quanteda)
## Warning: package 'quanteda' was built under R version 3.4.4
## Package version: 1.3.0
## Parallel computing: 2 of 4 threads used.
## See https://quanteda.io for tutorials and examples.
##
## Attaching package: 'quanteda'
## The following object is masked from 'package:utils':
##
## View
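As an aside, quanteda reports that it is only using 2 of the 4 available threads. If you want it to use more cores, quanteda has an options function for this (a quick illustration; the threads option assumes a recent quanteda version):
quanteda_options(threads = 4)  # let quanteda use all four cores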
tweets <- read.csv("~/data/UK-tweets.csv", stringsAsFactors=F)
# recode the outcome as binary: 1 = engaging, 0 = otherwise
tweets$engaging <- ifelse(tweets$communication=="engaging", 1, 0)
# drop tweets with a missing label
tweets <- tweets[!is.na(tweets$engaging),]
# clean text (replacing Twitter handles with a generic '@' token) and create DFM
tweets$text <- gsub('@[0-9_A-Za-z]+', '@', tweets$text)
twcorpus <- corpus(tweets$text)
twdfm <- dfm(twcorpus, remove=stopwords("english"), remove_url=TRUE,
             ngrams=1:2, verbose=TRUE)
## Creating a dfm from a corpus input...
## ... lowercasing
## ... found 4,566 documents, 48,657 features
## ... removed 169 features
## ... created a 4,566 x 48,488 sparse dfm
## ... complete.
## Elapsed time: 1.03 seconds.
twdfm <- dfm_trim(twdfm, min_docfreq = 2, verbose=TRUE)
## Removing features occurring:
## - in fewer than 2 documents: 38,258
## Total features removed: 38,258 (78.9%).
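Before modeling, it is worth a quick look at what survived the trimming. quanteda's topfeatures() function lists the most frequent features in the dfm (a quick check, not part of the original pipeline):
# most frequent features in the trimmed dfm
topfeatures(twdfm, n = 20)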
# training and test sets
set.seed(123)
training <- sample(1:nrow(tweets), floor(.80 * nrow(tweets)))
# test set: all rows not sampled into the training set
test <- setdiff(1:nrow(tweets), training)
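As a quick sanity check (an addition, not in the original code), the two index sets should partition the data with no overlap:
# the split should be exhaustive and non-overlapping
length(intersect(training, test))               # expected: 0
length(training) + length(test) == nrow(tweets) # expected: TRUE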
Now we can train the model:
library(xgboost)
# convert the dfm to a sparse matrix format that xgboost can handle
X <- as(twdfm, "dgCMatrix")
# parameters to explore
tryEta <- c(1,2)       # learning rate (shrinkage)
tryDepths <- c(1,2,4)  # maximum tree depth
# placeholders for now
bestEta <- NA
bestDepth <- NA
bestAcc <- 0
for (eta in tryEta){
  for (dp in tryDepths){
    bst <- xgb.cv(data = X[training,],
                  label = tweets$engaging[training],
                  max.depth = dp,
                  eta = eta,
                  nthread = 4,
                  nround = 500,
                  nfold = 5,
                  print_every_n = 100L,
                  objective = "binary:logistic")
    # cross-validated accuracy: 1 minus the mean test error over the final rounds
    acc <- 1 - mean(tail(bst$evaluation_log$test_error_mean))
    cat("Results for eta=", eta, " and depth=", dp, " : ",
        acc, " accuracy.\n", sep="")
    # keep track of the best-performing parameter combination
    if (acc > bestAcc){
      bestEta <- eta
      bestAcc <- acc
      bestDepth <- dp
    }
  }
}
## [1] train-error:0.143484+0.003302 test-error:0.143492+0.013204
## [101] train-error:0.084201+0.002176 test-error:0.113915+0.007685
## [201] train-error:0.071947+0.002044 test-error:0.114463+0.008919
## [301] train-error:0.065786+0.001797 test-error:0.114738+0.012973
## [401] train-error:0.062910+0.002123 test-error:0.114737+0.011022
## [500] train-error:0.059556+0.001976 test-error:0.114738+0.011267
## Results for eta=1 and depth=1 : 0.885627 accuracy.
## [1] train-error:0.141635+0.004059 test-error:0.144033+0.006434
## [101] train-error:0.062363+0.002086 test-error:0.111444+0.007603
## [201] train-error:0.047234+0.002674 test-error:0.111444+0.008669
## [301] train-error:0.037787+0.001308 test-error:0.113363+0.010731
## [401] train-error:0.030942+0.001508 test-error:0.113088+0.011953
## [500] train-error:0.025740+0.002640 test-error:0.113634+0.012855
## Results for eta=1 and depth=2 : 0.8856361 accuracy.
## [1] train-error:0.130065+0.004314 test-error:0.140464+0.012396
## [101] train-error:0.035323+0.001358 test-error:0.116649+0.007075
## [201] train-error:0.017661+0.001178 test-error:0.118837+0.006757
## [301] train-error:0.010816+0.001029 test-error:0.118292+0.006537
## [401] train-error:0.009789+0.001158 test-error:0.120210+0.008825
## [500] train-error:0.008625+0.001394 test-error:0.120757+0.006748
## Results for eta=1 and depth=4 : 0.8790151 accuracy.
## [1] train-error:0.143483+0.002159 test-error:0.143483+0.008635
## [101] train-error:0.419455+0.196886 test-error:0.402685+0.195750
## [201] train-error:0.419455+0.196886 test-error:0.402685+0.195750
## [301] train-error:0.419455+0.196886 test-error:0.402685+0.195750
## [401] train-error:0.419455+0.196886 test-error:0.402685+0.195750
## [500] train-error:0.419455+0.196886 test-error:0.402685+0.195750
## Results for eta=2 and depth=1 : 0.5973154 accuracy.
## [1] train-error:0.141704+0.005646 test-error:0.143486+0.010365
## [101] train-error:0.238015+0.090648 test-error:0.244821+0.091305
## [201] train-error:0.231784+0.100037 test-error:0.246189+0.089610
## [301] train-error:0.227881+0.106240 test-error:0.244821+0.091305
## [401] train-error:0.225074+0.110827 test-error:0.245915+0.089945
## [500] train-error:0.223157+0.114013 test-error:0.246463+0.089277
## Results for eta=2 and depth=2 : 0.7539478 accuracy.
## [1] train-error:0.131708+0.002685 test-error:0.138007+0.003166
## [101] train-error:0.297999+0.080431 test-error:0.291021+0.086205
## [201] train-error:0.297999+0.080431 test-error:0.291021+0.086205
## [301] train-error:0.297999+0.080431 test-error:0.291021+0.086205
## [401] train-error:0.297999+0.080431 test-error:0.291021+0.086205
## [500] train-error:0.297999+0.080431 test-error:0.291021+0.086205
## Results for eta=2 and depth=4 : 0.708979 accuracy.
cat("Best model has eta=",bestEta," and depth=", bestDepth, " : ",
bestAcc," accuracy.\n",sep="")
## Best model has eta=1 and depth=2 : 0.8856361 accuracy.
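A side note: instead of fixing nround at 500, xgboost can stop cross-validation automatically once the test error stops improving. A minimal sketch, assuming a version of xgboost recent enough to support the early_stopping_rounds argument:
# stop CV once the test error has not improved for 25 rounds
bst <- xgb.cv(data = X[training,],
              label = tweets$engaging[training],
              max.depth = bestDepth,
              eta = bestEta,
              nthread = 4,
              nround = 500,
              nfold = 5,
              early_stopping_rounds = 25,
              print_every_n = 100L,
              objective = "binary:logistic")
bst$best_iteration  # round with the best cross-validated error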
How well does it perform out-of-sample?
# running the best model on the full training set
rf <- xgboost(data = X[training,],
              label = tweets$engaging[training],
              max.depth = bestDepth,
              eta = bestEta,
              nthread = 4,
              nround = 1000,
              print_every_n = 100L,
              objective = "binary:logistic")
## [1] train-error:0.143483
## [101] train-error:0.060241
## [201] train-error:0.044907
## [301] train-error:0.038061
## [401] train-error:0.032311
## [501] train-error:0.026561
## [601] train-error:0.023275
## [701] train-error:0.018072
## [801] train-error:0.016977
## [901] train-error:0.014786
## [1000] train-error:0.011774
# predicted probabilities for the test set
preds <- predict(rf, X[test,])
# function to compute accuracy
accuracy <- function(ypred, y){
  tab <- table(ypred, y)
  return(sum(diag(tab))/sum(tab))
}
# function to compute precision
precision <- function(ypred, y){
  tab <- table(ypred, y)
  return((tab[2,2])/(tab[2,1]+tab[2,2]))
}
# function to compute recall
recall <- function(ypred, y){
  tab <- table(ypred, y)
  return(tab[2,2]/(tab[1,2]+tab[2,2]))
}
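If we want a single number that balances precision and recall, the same building blocks give us the F1 score (a small addition to the original set of functions):
# function to compute F1 (harmonic mean of precision and recall)
f1 <- function(ypred, y){
  p <- precision(ypred, y)
  r <- recall(ypred, y)
  return(2 * p * r / (p + r))
}
# e.g.: f1(preds > .50, tweets$engaging[test])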
cat("\nAccuracy on test set=", round(accuracy(preds>.50, tweets$engaging[test]),3))
##
## Accuracy on test set= 0.862
cat("\nPrecision(1) on test set=", round(precision(preds>.50, tweets$engaging[test]),3))
##
## Precision(1) on test set= 0.913
cat("\nRecall(1) on test set=", round(recall(preds>.50, tweets$engaging[test]),3))
##
## Recall(1) on test set= 0.913
cat("\nPrecision(0) on test set=", round(precision(preds<.50, tweets$engaging[test]==0),3))
##
## Precision(0) on test set= 0.674
cat("\nRecall(0) on test set=", round(recall(preds<.50, tweets$engaging[test]==0),3))
##
## Recall(0) on test set= 0.674
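Note that the metrics for class 0 are obtained by flipping both the predictions and the labels. The full confusion matrix shows both perspectives at once:
# full confusion matrix on the test set (rows = predicted, columns = actual)
table(predicted = preds > .50, actual = tweets$engaging[test])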
What we sacrifice is interpretability (yet again!). We can check feature importance, but it's often hard to tell exactly what's going on. Why? We only learn which features "matter", but not why they matter! In the importance table below, Gain measures each feature's relative contribution to the model, Cover the relative number of observations affected by its splits, and Frequency how often it is used in the trees.
# feature importance
labels <- dimnames(X)[[2]]
importance <- xgb.importance(labels, model = rf, data=X, label=tweets$engaging)
## Warning: 'cBind' is deprecated.
## Since R version 3.2.0, base's cbind() should work fine with S4 objects
## Warning in `[.data.table`(result, , `:=`("RealCover", as.numeric(vec)), :
## with=FALSE ignored, it isn't needed when using :=. See ?':=' for examples.
importance <- importance[order(importance$Gain, decreasing=TRUE),]
head(importance, n=20)
## Feature Split Gain Cover Frequency
## 1: @ 18 0.264736859 0.005880419 0.0029461279
## 2: @_@ 16 0.030717230 0.002698893 0.0025252525
## 3: via_@ -9.53674e-07 0.029703724 0.003286964 0.0021043771
## 4: @_: 4 0.026794880 0.006501015 0.0054713805
## 5: to_@ -9.53674e-07 0.022473426 0.004029277 0.0037878788
## 6: with_@ 4 0.021574196 0.003906800 0.0029461279
## 7: @ 8.5 0.020637223 0.001307227 0.0004208754
## 8: #ep2014 -9.53674e-07 0.015337161 0.002806951 0.0021043771
## 9: thanks 1.5 0.014170553 0.004756773 0.0033670034
## 10: on_@ 4 0.013816053 0.003468362 0.0025252525
## 11: @_is 4 0.012531153 0.003347890 0.0025252525
## 12: and_@ 4 0.011006502 0.003971663 0.0033670034
## 13: @_i -9.53674e-07 0.009840859 0.004742656 0.0033670034
## 14: today 4 0.009604410 0.003892204 0.0033670034
## 15: @_' -9.53674e-07 0.009143074 0.002285718 0.0016835017
## 16: ? 16 0.007883604 0.001391986 0.0008417508
## 17: @_@ 5.5 0.007024904 0.001431421 0.0012626263
## 18: " 1.5 0.006335513 0.001906599 0.0012626263
## 19: #votelabour -9.53674e-07 0.006274174 0.001554370 0.0008417508
## 20: . 1.5 0.006246246 0.001023718 0.0046296296
## RealCover RealCover %
## 1: 1410 0.3962900506
## 2: 751 0.2110736369
## 3: 5 0.0014052839
## 4: 43 0.0120854413
## 5: 9 0.0025295110
## 6: 6 0.0016863406
## 7: 1371 0.3853288364
## 8: 19 0.0053400787
## 9: 161 0.0452501405
## 10: 2 0.0005621135
## 11: 21 0.0059021922
## 12: 13 0.0036537381
## 13: 124 0.0348510399
## 14: 20 0.0056211355
## 15: 10 0.0028105677
## 16: 296 0.0831928049
## 17: 772 0.2169758291
## 18: 50 0.0140528387
## 19: 0 0.0000000000
## 20: 691 0.1942102305
# adding sign: is each feature more frequent among engaging (1) or
# non-engaging (0) tweets?
sums <- list()
for (v in 0:1){
  sums[[v+1]] <- colSums(X[tweets[,"engaging"]==v,])
}
sums <- do.call(cbind, sums)
# for each feature, the class in which it appears most often
sign <- apply(sums, 1, which.max)
df <- data.frame(
  Feature = labels,
  sign = sign - 1,
  stringsAsFactors = F)
importance <- merge(importance, df, by="Feature")
# best predictors for each class
for (v in 0:1){
  cat("\n\n")
  cat("value==", v)
  importance <- importance[order(importance$Gain, decreasing=TRUE),]
  print(head(importance[importance$sign==v,], n=50))
  cat("\n")
  cat(paste(unique(head(importance$Feature[importance$sign==v], n=50)), collapse=", "))
}
##
##
## value== 0 Feature Split Gain Cover Frequency
## 1: via_@ -9.53674e-07 0.0297037239 0.0032869640 0.0021043771
## 2: to_@ -9.53674e-07 0.0224734264 0.0040292769 0.0037878788
## 3: with_@ 4 0.0215741960 0.0039067998 0.0029461279
## 4: #ep2014 -9.53674e-07 0.0153371611 0.0028069514 0.0021043771
## 5: on_@ 4 0.0138160528 0.0034683619 0.0025252525
## 6: and_@ 4 0.0110065020 0.0039716634 0.0033670034
## 7: today 4 0.0096044104 0.0038922038 0.0033670034
## 8: @_' -9.53674e-07 0.0091430744 0.0022857176 0.0016835017
## 9: #votelabour -9.53674e-07 0.0062741742 0.0015543696 0.0008417508
## 10: hacked -9.53674e-07 0.0061983928 0.0026976335 0.0016835017
## 11: green 1.5 0.0055692617 0.0025298120 0.0021043771
## 12: rt -9.53674e-07 0.0052969882 0.0066042520 0.0058922559
## 13: #votegreen2014 4 0.0048746039 0.0045265023 0.0033670034
## 14: #labourdoorstep -9.53674e-07 0.0046951683 0.0030986651 0.0021043771
## 15: ,_@ 1.5 0.0040519138 0.0043943198 0.0037878788
## 16: campaigning -9.53674e-07 0.0040082426 0.0020104380 0.0016835017
## 17: european 1.5 0.0037851511 0.0028840753 0.0021043771
## 18: meet 4 0.0033863859 0.0036983090 0.0029461279
## 19: -_@ -9.53674e-07 0.0032590750 0.0046142144 0.0037878788
## 20: east 1.5 0.0027484878 0.0023004109 0.0021043771
## 21: about_the -9.53674e-07 0.0023804112 0.0055826606 0.0050505051
## 22: at_@ -9.53674e-07 0.0023029333 0.0013917170 0.0008417508
## 23: from_@ 4 0.0020383574 0.0013920705 0.0008417508
## 24: candidates 1.5 0.0020284477 0.0046441814 0.0037878788
## 25: for_@ 2 0.0020202807 0.0004494746 0.0004208754
## 26: the_@ 4 0.0019224182 0.0040660753 0.0037878788
## 27: that_@ -9.53674e-07 0.0017646319 0.0039263663 0.0029461279
## 28: #bbcqt 4 0.0017434346 0.0043493136 0.0037878788
## 29: english 1.5 0.0016774384 0.0025028088 0.0016835017
## 30: :_" -9.53674e-07 0.0016420779 0.0007533589 0.0004208754
## 31: seat 2 0.0016039763 0.0003220450 0.0004208754
## 32: #eu -9.53674e-07 0.0015926600 0.0023519217 0.0016835017
## 33: if_@ -9.53674e-07 0.0015377188 0.0035687285 0.0025252525
## 34: of_@ -9.53674e-07 0.0015271896 0.0033199790 0.0029461279
## 35: candidate -9.53674e-07 0.0014925319 0.0033006845 0.0025252525
## 36: | 6 0.0013987770 0.0060535947 0.0054713805
## 37: nigel_farage -9.53674e-07 0.0013672303 0.0007599091 0.0004208754
## 38: ,_@ 2.5 0.0013550773 0.0011676976 0.0008417508
## 39: tonight 1.5 0.0012717055 0.0027763563 0.0025252525
## 40: new -9.53674e-07 0.0012583799 0.0023763417 0.0021043771
## 41: euro -9.53674e-07 0.0012144607 0.0017581809 0.0012626263
## 42: conservatives -9.53674e-07 0.0010936143 0.0034493229 0.0029461279
## 43: meeting -9.53674e-07 0.0010546601 0.0018953448 0.0012626263
## 44: hustings -9.53674e-07 0.0009584558 0.0013092718 0.0008417508
## 45: london 4 0.0009316371 0.0012456236 0.0008417508
## 46: see_@ -9.53674e-07 0.0009230591 0.0007342954 0.0004208754
## 47: with_@ 1.5 0.0009139735 0.0007283325 0.0004208754
## 48: london 1.5 0.0008857997 0.0015457012 0.0012626263
## 49: | 2.5 0.0008069986 0.0035385141 0.0033670034
## 50: @_) -9.53674e-07 0.0007572001 0.0042732664 0.0037878788
## Feature Split Gain Cover Frequency
## RealCover RealCover % sign
## 1: 5 0.0014052839 0
## 2: 9 0.0025295110 0
## 3: 6 0.0016863406 0
## 4: 19 0.0053400787 0
## 5: 2 0.0005621135 0
## 6: 13 0.0036537381 0
## 7: 20 0.0056211355 0
## 8: 10 0.0028105677 0
## 9: 0 0.0000000000 0
## 10: 3 0.0008431703 0
## 11: 14 0.0039347948 0
## 12: 8 0.0022484542 0
## 13: 11 0.0030916245 0
## 14: 2 0.0005621135 0
## 15: 11 0.0030916245 0
## 16: 8 0.0022484542 0
## 17: 17 0.0047779651 0
## 18: 4 0.0011242271 0
## 19: 4 0.0011242271 0
## 20: 5 0.0014052839 0
## 21: 4 0.0011242271 0
## 22: 1 0.0002810568 0
## 23: 4 0.0011242271 0
## 24: 12 0.0033726813 0
## 25: 4 0.0011242271 0
## 26: 5 0.0014052839 0
## 27: 2 0.0005621135 0
## 28: 6 0.0016863406 0
## 29: 7 0.0019673974 0
## 30: 1 0.0002810568 0
## 31: 5 0.0014052839 0
## 32: 5 0.0014052839 0
## 33: 1 0.0002810568 0
## 34: 6 0.0016863406 0
## 35: 8 0.0022484542 0
## 36: 0 0.0000000000 0
## 37: 2 0.0005621135 0
## 38: 13 0.0036537381 0
## 39: 9 0.0025295110 0
## 40: 12 0.0033726813 0
## 41: 15 0.0042158516 0
## 42: 3 0.0008431703 0
## 43: 1 0.0002810568 0
## 44: 0 0.0000000000 0
## 45: 8 0.0022484542 0
## 46: 2 0.0005621135 0
## 47: 7 0.0019673974 0
## 48: 7 0.0019673974 0
## 49: 1 0.0002810568 0
## 50: 3 0.0008431703 0
## RealCover RealCover % sign
##
## via_@, to_@, with_@, #ep2014, on_@, and_@, today, @_', #votelabour, hacked, green, rt, #votegreen2014, #labourdoorstep, ,_@, campaigning, european, meet, -_@, east, about_the, at_@, from_@, candidates, for_@, the_@, that_@, #bbcqt, english, :_", seat, #eu, if_@, of_@, candidate, |, nigel_farage, tonight, new, euro, conservatives, meeting, hustings, london, see_@, @_)
##
## value== 1 Feature Split Gain Cover Frequency
## 1: @ 18 0.264736859 5.880419e-03 0.0029461279
## 2: @_@ 16 0.030717230 2.698893e-03 0.0025252525
## 3: @_: 4 0.026794880 6.501015e-03 0.0054713805
## 4: @ 8.5 0.020637223 1.307227e-03 0.0004208754
## 5: thanks 1.5 0.014170553 4.756773e-03 0.0033670034
## 6: @_is 4 0.012531153 3.347890e-03 0.0025252525
## 7: @_i -9.53674e-07 0.009840859 4.742656e-03 0.0033670034
## 8: ? 16 0.007883604 1.391986e-03 0.0008417508
## 9: @_@ 5.5 0.007024904 1.431421e-03 0.0012626263
## 10: " 1.5 0.006335513 1.906599e-03 0.0012626263
## 11: . 1.5 0.006246246 1.023718e-03 0.0046296296
## 12: @_: 2 0.005917055 1.721824e-05 0.0008417508
## 13: . 1.5 0.005512524 1.683578e-03 0.0079966330
## 14: @ 1.5 0.005454286 5.685892e-03 0.0172558923
## 15: @ 1.5 0.005217535 9.442590e-04 0.0172558923
## 16: thank_you -9.53674e-07 0.004996416 2.150748e-03 0.0012626263
## 17: good_luck 4 0.004459443 1.897449e-03 0.0012626263
## 18: meps -9.53674e-07 0.004380918 8.833716e-04 0.0004208754
## 19: @ 2.5 0.004198886 2.022188e-03 0.0126262626
## 20: ( 1.5 0.004117353 1.461347e-04 0.0004208754
## 21: eu 1.5 0.003891656 2.670747e-03 0.0021043771
## 22: @_on -9.53674e-07 0.003737015 2.609601e-03 0.0021043771
## 23: #ukip 1.5 0.003700234 1.950539e-03 0.0016835017
## 24: ! 1.5 0.003693330 1.461580e-03 0.0021043771
## 25: another -9.53674e-07 0.003514251 3.083479e-03 0.0025252525
## 26: . 2.5 0.003422497 9.493924e-04 0.0071548822
## 27: really 1.5 0.003123756 3.429944e-03 0.0025252525
## 28: yes 2.5 0.003123623 2.615903e-03 0.0016835017
## 29: . 7.5 0.003110102 1.395100e-03 0.0008417508
## 30: congratulations 4 0.002948321 4.204675e-03 0.0033670034
## 31: uk 6 0.002940197 5.549896e-03 0.0046296296
## 32: need 4 0.002837481 4.140961e-03 0.0029461279
## 33: please -9.53674e-07 0.002832092 5.390337e-03 0.0042087542
## 34: . 16 0.002818621 3.350547e-04 0.0008417508
## 35: vote 1.5 0.002804096 4.040649e-03 0.0037878788
## 36: council 4 0.002785229 2.880497e-03 0.0021043771
## 37: !_@ -9.53674e-07 0.002752099 7.179162e-03 0.0058922559
## 38: @_great -9.53674e-07 0.002660405 1.352579e-03 0.0008417508
## 39: @ 2.5 0.002543536 1.229023e-03 0.0054713805
## 40: ? 4 0.002530869 1.311744e-04 0.0012626263
## 41: :_- -9.53674e-07 0.002505698 1.950093e-03 0.0012626263
## 42: . 12 0.002480572 2.220131e-04 0.0037878788
## 43: can 1.5 0.002434854 3.343689e-03 0.0029461279
## 44: see 4 0.002431834 5.078348e-04 0.0008417508
## 45: , 2.5 0.002356436 2.778229e-03 0.0050505051
## 46: . 8 0.002286422 9.671921e-05 0.0042087542
## 47: voted -9.53674e-07 0.002250851 3.707100e-03 0.0033670034
## 48: back 2 0.002237645 8.208192e-04 0.0016835017
## 49: @_, 12 0.002225158 2.368821e-03 0.0025252525
## 50: @_@ 3.5 0.002216377 5.267791e-04 0.0050505051
## Feature Split Gain Cover Frequency
## RealCover RealCover % sign
## 1: 1410 0.396290051 1
## 2: 751 0.211073637 1
## 3: 43 0.012085441 1
## 4: 1371 0.385328836 1
## 5: 161 0.045250141 1
## 6: 21 0.005902192 1
## 7: 124 0.034851040 1
## 8: 296 0.083192805 1
## 9: 772 0.216975829 1
## 10: 50 0.014052839 1
## 11: 691 0.194210230 1
## 12: 39 0.010961214 1
## 13: 675 0.189713322 1
## 14: 1421 0.399381675 1
## 15: 1403 0.394322653 1
## 16: 55 0.015458123 1
## 17: 38 0.010680157 1
## 18: 10 0.002810568 1
## 19: 1410 0.396290051 1
## 20: 33 0.009274874 1
## 21: 74 0.020798201 1
## 22: 14 0.003934795 1
## 23: 61 0.017144463 1
## 24: 334 0.093872962 1
## 25: 8 0.002248454 1
## 26: 659 0.185216414 1
## 27: 32 0.008993817 1
## 28: 49 0.013771782 1
## 29: 688 0.193367060 1
## 30: 19 0.005340079 1
## 31: 42 0.011804384 1
## 32: 37 0.010399101 1
## 33: 22 0.006183249 1
## 34: 688 0.193367060 1
## 35: 62 0.017425520 1
## 36: 11 0.003091625 1
## 37: 12 0.003372681 1
## 38: 12 0.003372681 1
## 39: 1387 0.389825745 1
## 40: 317 0.089094997 1
## 41: 37 0.010399101 1
## 42: 680 0.191118606 1
## 43: 82 0.023046655 1
## 44: 64 0.017987634 1
## 45: 402 0.112984823 1
## 46: 680 0.191118606 1
## 47: 12 0.003372681 1
## 48: 28 0.007869590 1
## 49: 10 0.002810568 1
## 50: 759 0.213322091 1
## RealCover RealCover % sign
##
## @, @_@, @_:, thanks, @_is, @_i, ?, ", ., thank_you, good_luck, meps, (, eu, @_on, #ukip, !, another, really, yes, congratulations, uk, need, please, vote, council, !_@, @_great, :_-, can, see, ,, voted, back, @_,
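For a more visual summary of the same information, xgboost also includes a plotting helper. A sketch (assuming the merged importance table still has the Feature and Gain columns that xgb.plot.importance() expects; older versions of the function also require the Ckmeans.1d.dp package):
# bar chart of the top 20 features by Gain
importance <- importance[order(importance$Gain, decreasing=TRUE),]
xgb.plot.importance(importance_matrix = head(importance, n = 20))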