Continuing with our previous example, we’ll now try other classifiers. First up is SVMs – the code below illustrates a common trade-off between performance and interpretability of the model.
library(quanteda)
## Warning: package 'quanteda' was built under R version 3.4.4
## Package version: 1.3.0
## Parallel computing: 2 of 4 threads used.
## See https://quanteda.io for tutorials and examples.
##
## Attaching package: 'quanteda'
## The following object is masked from 'package:utils':
##
## View
# Load labeled tweets; use FALSE explicitly (T/F are reassignable variables)
tweets <- read.csv("~/data/UK-tweets.csv", stringsAsFactors=FALSE)
# binary outcome: 1 = "engaging" tweet, 0 = anything else; drop unlabeled rows
tweets$engaging <- ifelse(tweets$communication=="engaging", 1, 0)
tweets <- tweets[!is.na(tweets$engaging),]
# clean text and create DFM
# collapse every @mention to a bare "@" so the classifier learns from the
# presence of a mention rather than from specific handles
tweets$text <- gsub('@[0-9_A-Za-z]+', '@', tweets$text)
twcorpus <- corpus(tweets$text)
# unigrams + bigrams; remove English stopwords and URLs
twdfm <- dfm(twcorpus, remove=stopwords("english"), remove_url=TRUE,
ngrams=1:2, verbose=TRUE)
## Creating a dfm from a corpus input...
## ... lowercasing
## ... found 4,566 documents, 48,657 features
## ... removed 169 features
## ... created a 4,566 x 48,488 sparse dfm
## ... complete.
## Elapsed time: 2.49 seconds.
twdfm <- dfm_trim(twdfm, min_docfreq = 5, verbose=TRUE)
## Removing features occurring:
## - in fewer than 5 documents: 45,755
## Total features removed: 45,755 (94.4%).
# training and test sets
# fixed seed so the 80/20 split is reproducible
set.seed(123)
training <- sample(1:nrow(tweets), floor(.80 * nrow(tweets)))
# setdiff() is the clear, equivalent replacement for the old
# `x[x %in% training == FALSE]` idiom ("row indices not sampled")
test <- setdiff(1:nrow(tweets), training)
First, let’s run SVM by choosing manually the cost parameter (regularization term).
library(e1071)
## Warning: package 'e1071' was built under R version 3.4.4
# Linear-kernel SVM with manually chosen regularization cost (C = 10);
# probability=TRUE enables class-probability estimates at prediction time
fit <- svm(x=twdfm[training,], y=factor(tweets$engaging[training]),
kernel="linear", cost=10, probability=TRUE)
preds <- predict(fit, twdfm[test,])
## accuracy: share of predictions that match the true labels,
## computed from the confusion matrix
accuracy <- function(ypred, y){
confusion <- table(ypred, y)
sum(diag(confusion)) / sum(confusion)
}
# precision for the positive class (second row/column of the table):
# TP / (TP + FP). Positional indexing is deliberate -- it works whether
# predictions are coded 0/1 or FALSE/TRUE.
precision <- function(ypred, y){
confusion <- table(ypred, y)
true_pos <- confusion[2, 2]
false_pos <- confusion[2, 1]
true_pos / (true_pos + false_pos)
}
# recall for the positive class: TP / (TP + FN); the denominator is the
# second *column* of the table (all observations truly labeled positive)
recall <- function(ypred, y){
confusion <- table(ypred, y)
true_pos <- confusion[2, 2]
false_neg <- confusion[1, 2]
true_pos / (true_pos + false_neg)
}
# confusion matrix (rows = predicted class, columns = true labels)
table(preds, tweets$engaging[test])
##
## preds 0 1
## 0 113 92
## 1 80 629
# performance metrics
# (precision and recall refer to the positive class, engaging = 1)
accuracy(preds, tweets$engaging[test])
## [1] 0.8118162
precision(preds, tweets$engaging[test])
## [1] 0.887165
recall(preds, tweets$engaging[test])
## [1] 0.8723994
Unlike regularized regression, here looking at the estimated coefficients is not as informative because they only tell us what support vectors were estimated in the model. But we can have a sense of what observations are more “important” or “separate” better the data by extracting the support vectors in the data matrix and then their corresponding coefficients (times the training labels):
# support vectors (rows of the training data) and their dual coefficients
# (coefficients times training labels); larger magnitude = more influential
df <- data.frame(
vector = tweets$text[training][fit$index],
coef = fit$coefs,
stringsAsFactors = FALSE
)
# most negative coefficients first
df <- df[order(df$coef),]
head(df[,c("coef", "vector")], n=10)
## coef
## 761 -10
## 767 -10
## 775 -10
## 790 -10
## 809 -10
## 814 -10
## 871 -10
## 885 -10
## 907 -10
## 985 -10
## vector
## 761 Well done Chevron B34 Paolo Barillla wins Monaco F3 @
## 767 Viva Rod Petrie
## 775 #EP2014 cands SWEngland @ @ @ @ @ @ @ @
## 790 The one-world selfie http://t.co/vcgqA0RrDu
## 809 Rotten Relations in TurkishFootball which protects MatchFixers http://t.co/yN2TrOC5aM …@ @ @ @
## 814 A modern jazzy reversion of Mien Kampf @ #twat http://t.co/wRRKomnUB6 via @
## 871 @ Old Etonian isn't he?
## 885 http://t.co/IrhS1H6kzY @ outclassing Ed Milliand
## 907 @ elected with 159,815.
## 985 Mandy discussing the manifesto document in Wolverhampton @ http://t.co/8DT0LOEyKM
# re-sort with the largest (positive) coefficients first
df <- df[order(df$coef, decreasing=TRUE),]
head(df[,c("coef", "vector")], n=10)
## coef
## 214 10
## 437 10
## 532 10
## 544 10
## 545 10
## 548 10
## 581 10
## 626 10
## 685 10
## 713 10
## vector
## 214 @ #opportunists #transient
## 437 Why Pray? http://t.co/F2C5ZZCJ6v
## 532 @ #sobs
## 544 @ #ohyesyouare
## 545 @ tragi-comedy
## 548 @ starts at 9am
## 581 @ https://t.co/0lMtYKTfVz germany
## 626 MT '@: I've supported Vote #TUSC on May 22nd on @ // @ http://t.co/LnvjxTm4o4
## 685 GREEN NEWS Peterborough UK is out! http://t.co/tLn6M2uTi8 Stories via @ @
## 713 @ congratulation
Note that this will only work with linear kernels.
There’s a version of svm with cross-validation already built-in. We’ll now use it to find the best cost parameter:
# tune() performs a grid search over `cost` using 10-fold
# cross-validation (the default) on the training set
fit <- tune(svm, train.x=twdfm[training,],
train.y=factor(tweets$engaging[training]),
kernel="linear",
ranges=list(cost=c(0.001, 0.01, 0.1, 1, 5, 10, 100)))
summary(fit)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost
## 0.1
##
## - best performance: 0.1114395
##
## - Detailed performance results:
## cost error dispersion
## 1 1e-03 0.2231679 0.01707819
## 2 1e-02 0.1883928 0.01187115
## 3 1e-01 0.1114395 0.01761187
## 4 1e+00 0.1388278 0.01747049
## 5 5e+00 0.1667557 0.02373938
## 6 1e+01 0.1703166 0.02220133
## 7 1e+02 0.1768860 0.02423627
# best model, refit on the full training set with the winning cost
bestmodel <- fit$best.model
summary(bestmodel)
##
## Call:
## best.tune(method = svm, train.x = twdfm[training, ], train.y = factor(tweets$engaging[training]),
## ranges = list(cost = c(0.001, 0.01, 0.1, 1, 5, 10, 100)),
## kernel = "linear")
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 0.1
## gamma: 0.0003658983
##
## Number of Support Vectors: 1523
##
## ( 881 642 )
##
##
## Number of Classes: 2
##
## Levels:
## 0 1
# evaluating performance of the tuned model on the held-out test set
preds <- predict(bestmodel, twdfm[test,])
# confusion matrix (rows = predicted class, columns = true labels)
table(preds, tweets$engaging[test])
##
## preds 0 1
## 0 120 42
## 1 73 679
# performance metrics
# (precision and recall refer to the positive class, engaging = 1)
accuracy(preds, tweets$engaging[test])
## [1] 0.8741794
precision(preds, tweets$engaging[test])
## [1] 0.9029255
recall(preds, tweets$engaging[test])
## [1] 0.9417476
# support vectors of the tuned model and their dual coefficients
# (coefficients times training labels)
df <- data.frame(
vector = tweets$text[training][bestmodel$index],
coef = bestmodel$coefs,
stringsAsFactors = FALSE
)
# most negative coefficients first
df <- df[order(df$coef),]
head(df[,c("coef", "vector")], n=10)
## coef
## 882 -0.1
## 883 -0.1
## 884 -0.1
## 885 -0.1
## 887 -0.1
## 888 -0.1
## 889 -0.1
## 891 -0.1
## 892 -0.1
## 893 -0.1
## vector
## 882 .@ Sally Ibbotson @ comes in second after Labour. http://t.co/MM0KdqPl7A
## 883 @ is Britain's Berlusconi – someone foreigners look at and say, "How can you vote for this buffoon?"
## 884 “@: Revealed - the #fossilfuel lobbies behind attacks on #renewables - News - The Ecologist http://t.co/dhVXjtWn95 #climate”
## 885 Unusual polling stations - in pictures http://t.co/sGmewcCHZd
## 887 While UKIP miss or oppose key votes, @ are working hard to stand up for UK citizens http://t.co/jxwJAF71a5 http://t.co/RHkFhUsG7n
## 888 @ spotted among good friends at @ event in Manchester #EU2014 @ http://t.co/AAI63wqLzE
## 889 Congratulations to @ on being re-elected!
## 891 Forest carbon loss 'underestimated' http://t.co/fXjglnu849
## 892 "seedbombing" one of the more innovative suggestions at #leithnp
## 893 @ @ Well done to everyone in #Winchester, great result!
# re-sort with the largest (positive) coefficients first
df <- df[order(df$coef, decreasing=TRUE),]
head(df[,c("coef", "vector")], n=10)
## coef
## 1 0.1
## 4 0.1
## 7 0.1
## 13 0.1
## 21 0.1
## 24 0.1
## 29 0.1
## 33 0.1
## 34 0.1
## 38 0.1
## vector
## 1 @ to be fair I think people have been listening to your leader rather than labour this last day or two
## 4 Thanks to @ & @ for covering our https://t.co/GcPbvFVvnY report on BBC ‘Click’. Listen again at http://t.co/iB4Wx9ZuyQ
## 7 @ @ Is this truly the best use of his time? WTF...
## 13 @ UKIP: for people incapable of eating without assistance
## 21 Congratulations to @ and @ elected Yorkshire &Humber MEP Labour
## 24 Greens to launch LGBTIQ European manifesto @ http://t.co/XzqauVWLIk
## 29 Look @ @
## 33 @ @ in metropolitan areas, the wealthy (a lot of foreign oligarchs and billionaires) have priced out ordinary Britons
## 34 Out + about with #Peterborough's MEP @ #cllrdarrenfower http://t.co/VN7g94AcJw
## 38 @ he is Lenny not Kenny and the person who said that remark has been removed
Turning to random forests, we’ll need to dramatically reduce the size of our matrix to make sure it doesn’t take hours to run; so we’ll just pick features that are not too frequent but also not too rare. We’ll see that random forests tend to perform quite well, but at the price of longer running times and even less interpretability (we can only learn of “feature importance” – the extent to which a variable tends to separate classes more frequently than others).
library(randomForest)
## Warning: package 'randomForest' was built under R version 3.4.4
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
set.seed(777)
# keep only mid-frequency features (in >= 50 docs but <= 80% of docs) so
# the dense matrix that randomForest requires stays small enough
X <- as.matrix(dfm_trim(twdfm, min_docfreq = 50,
max_docfreq=0.80*nrow(twdfm), verbose=TRUE))
## Removing features occurring:
## - in fewer than 50 documents: 2,609
## - in more than 3652.8 documents: 1
## Total features removed: 2,610 (95.5%).
dim(X)
## [1] 4566 123
# 100 trees, 20 candidate features tried at each split; supplying
# xtest/ytest makes randomForest also report test-set error with the fit
rf <- randomForest(x=X[training,],
y=factor(tweets$engaging[training]),
xtest=X[test,],
ytest=factor(tweets$engaging[test]),
importance=TRUE,
mtry=20,
ntree=100
)
rf
##
## Call:
## randomForest(x = X[training, ], y = factor(tweets$engaging[training]), xtest = X[test, ], ytest = factor(tweets$engaging[test]), ntree = 100, mtry = 20, importance = TRUE)
## Type of random forest: classification
## Number of trees: 100
## No. of variables tried at each split: 20
##
## OOB estimate of error rate: 18.07%
## Confusion matrix:
## 0 1 class.error
## 0 408 407 0.49938650
## 1 253 2584 0.08917871
## Test set error rate: 20.13%
## Confusion matrix:
## 0 1 class.error
## 0 91 102 0.5284974
## 1 82 639 0.1137309
importance(rf)
## 0 1 MeanDecreaseAccuracy
## thank 0.404034591 3.331322524 4.086123157
## ! 0.951601140 5.060794919 4.766557273
## look 0.441457091 -0.460575003 -0.127346111
## @_@ 34.891786749 15.293939691 36.236265663
## @_thank 4.860124501 4.102580349 5.039955957
## thank_you 4.634467423 4.196317354 5.108207654
## have_a -3.362470887 -0.078766628 -1.723293352
## . 3.785506224 7.196413239 8.209250429
## : 1.328411888 16.757946462 15.360255748
## ' 11.091733535 5.561275037 10.084594671
## go 1.700440524 -1.867451924 -0.611152517
## ? 3.494834241 11.328889484 12.115317330
## @_and 6.582342540 5.906028384 8.688767945
## really 3.089598032 -1.255732909 1.078010715
## " 3.732292699 6.204320330 8.056073428
## done -1.583779330 -1.048253344 -1.612880288
## , -0.181748509 5.829864729 4.489022625
## now 2.989190584 3.640882215 4.980673054
## time -1.998960499 0.002802874 -0.942076883
## is_a -1.985114853 1.159323050 0.352833870
## new -0.005745222 1.366208938 0.996225976
## election -0.191472351 2.533014200 2.343492680
## campaign 3.971290055 8.774066970 8.257557590
## "_@ -1.858572212 3.057888911 2.169099542
## @_: -3.849810817 6.675199511 4.740958406
## for_the -3.603365923 2.350208769 -0.064047167
## ._. 0.021003542 1.833973472 1.786137593
## think -2.520282913 1.802751596 -0.318773972
## i_think 1.871914999 -0.913632536 0.116101594
## / 0.201732892 2.706647199 2.283671374
## much -2.475588923 3.494119839 2.271244158
## know -0.810362944 -1.448363190 -1.477923517
## ._@ 0.674330817 1.855324116 2.119906708
## - -0.996242766 8.023298655 6.055051001
## ,_but 0.954155658 -0.345417073 0.196331166
## @_is 5.842738137 6.830571280 8.637270404
## ._i -1.011114186 3.500015839 2.411817154
## ukip 0.350906176 -0.996556267 -0.688625396
## & 0.339414734 1.935558297 2.308806119
## ; -0.803896006 2.705179947 2.169465817
## of_the 3.079070189 3.979643049 4.503209851
## @_i 17.418517401 14.064506625 21.403769641
## to_be 2.367056092 -0.619363099 0.983618782
## rt 8.208351735 6.218143337 9.803599127
## #ep2014 12.439750594 15.420549716 18.146959587
## good 1.055520245 4.394517758 4.203372279
## well -1.213801223 -1.460259012 -1.824212811
## ) 3.362513052 5.328077417 6.164466483
## see 0.157597908 -2.773622814 -2.635145140
## :_) 2.898655105 5.963651699 6.379533205
## support 0.907175966 3.968272082 3.562716103
## going -2.269631615 1.481857761 0.098453725
## today 11.712390322 14.291756958 16.345111889
## one -0.493209091 -2.232646966 -2.174116629
## racist -0.443808527 0.150490822 -0.161824548
## :_- -0.725606771 6.507868300 6.673531645
## -_) 0.005181277 1.712665047 1.655656414
## best 2.654534573 1.582554840 2.555976327
## will_be 2.964001149 -0.265911188 1.323178234
## vote -0.537568563 4.249194358 3.094594938
## green 5.954951500 9.791998236 10.182036818
## back -0.093874214 -1.211945220 -1.197024918
## people -0.835358484 4.976164040 3.707270246
## on_the -0.908813800 3.617408259 1.969514686
## uk -2.074077672 4.702749031 3.068297707
## europe 2.181913460 5.110384007 6.078944360
## eu -0.646807094 1.740343704 1.106785521
## great 2.619948732 4.568933509 5.157028101
## amp 0.666904464 2.377118711 2.565121376
## day 2.570970249 3.416027425 4.171936888
## #votegreen2014 6.316994577 12.644365709 13.609257466
## &_amp 1.112106702 3.864792992 4.293535277
## amp_; 0.638059248 3.268250426 3.351111837
## ( 2.182503967 1.497880484 2.575270916
## mep -2.568540007 4.627909917 2.766344356
## the_eu -3.155230728 -1.153057364 -2.459379549
## get 0.190859135 -2.086780105 -1.531214983
## want 2.437879153 -1.728013414 -0.191181804
## @_what 0.651495545 -1.469004137 -1.180258235
## please 0.477932293 3.049734706 2.738425393
## in_the -1.244111116 4.867043816 3.198348419
## thanks 4.637968445 5.928493180 8.200997740
## @_thanks 8.220978832 5.473586015 7.769828415
## to_the -1.275008431 -0.909581060 -1.406871765
## #ukip -0.407034900 -0.444619233 -0.573312271
## party -1.964984977 1.262613227 -0.393214750
## way -0.699849976 -2.360272033 -2.146061648
## like 1.244664249 0.464373655 1.034988743
## @_you 7.369987492 -0.463791757 4.529605672
## work -1.770263781 -2.106555879 -2.399986541
## @_no 4.809913651 2.173406603 4.388547189
## luck 2.447156064 3.234026724 3.875170243
## good_luck 1.457225301 2.353977326 2.747122042
## can 2.425922025 2.729193052 3.575810232
## just -2.005805154 -1.587330972 -2.334865948
## still -0.506652006 0.329679623 -0.007416713
## labour 0.770948467 1.785537893 2.035368386
## right 1.335052763 -0.080614827 0.546724825
## european 7.143375041 6.616500629 8.715770577
## 2 -1.134690711 0.342446231 -0.592701177
## hope -1.104465122 1.172102313 0.382984440
## many 3.228478960 1.909629487 3.202432080
## @_the 4.685123991 3.519770496 4.714373278
## !_! 0.006983534 0.486278721 0.465232893
## yes 5.598397414 1.680094883 4.348092709
## @_yes 4.385878983 0.283045585 2.485273097
## i_am -2.652984867 4.295525266 2.441105493
## need 4.399443623 4.811293019 6.140065915
## % -2.354133551 4.483086692 2.753348304
## to_@ 9.163820830 9.818137412 12.208980599
## us -2.861644878 1.248209330 -0.872472856
## may -0.835713234 2.959699694 1.982006337
## make -1.707092535 1.630070018 0.624450526
## say -0.783604242 2.923903418 1.952338871
## scotland -1.640678813 -2.404904419 -2.722782773
## @_many 3.028252414 3.139524995 3.713227588
## via 7.924418590 6.672043049 9.028597128
## via_@ 6.576209114 7.272763770 10.094883778
## got -0.565687581 0.147636766 -0.188471128
## take 0.484343729 2.555875528 2.465263492
## news 2.244690886 4.501742781 4.748984371
## many_thanks 1.889222858 2.136940996 2.346039950
## thanks_for -1.005037815 1.475266665 1.285919705
## MeanDecreaseGini
## thank 3.8086147
## ! 20.6795067
## look 3.1189271
## @_@ 131.0505636
## @_thank 2.9175207
## thank_you 4.3413932
## have_a 2.6760461
## . 40.1241392
## : 23.7147682
## ' 14.3334494
## go 2.6510825
## ? 18.9284633
## @_and 10.6795108
## really 3.0066332
## " 15.3118585
## done 5.6135986
## , 30.5562428
## now 8.4939488
## time 4.7918707
## is_a 3.8843367
## new 5.7062401
## election 7.2413857
## campaign 6.7577393
## "_@ 3.0296721
## @_: 8.7444766
## for_the 6.1329890
## ._. 10.0500689
## think 3.8386296
## i_think 1.7774490
## / 8.5631857
## much 2.9537076
## know 5.4272155
## ._@ 7.4143133
## - 21.1819884
## ,_but 4.7765901
## @_is 9.4180788
## ._i 3.0732465
## ukip 10.2003650
## & 5.0813516
## ; 6.8824949
## of_the 9.3936462
## @_i 15.9445960
## to_be 3.3721352
## rt 9.6587039
## #ep2014 20.6742640
## good 7.9230155
## well 4.6345347
## ) 13.2032154
## see 6.7281591
## :_) 3.6594238
## support 4.1969586
## going 2.3610418
## today 17.4127413
## one 4.2818113
## racist 2.9058923
## :_- 5.8308746
## -_) 2.7750152
## best 4.2049285
## will_be 6.7849319
## vote 11.4907029
## green 13.8229901
## back 5.6373261
## people 5.7073554
## on_the 5.4215207
## uk 7.1971464
## europe 5.2207813
## eu 10.2733928
## great 8.7304794
## amp 4.6327151
## day 5.5867717
## #votegreen2014 13.0889782
## &_amp 4.9026270
## amp_; 4.9736058
## ( 8.9212199
## mep 7.0426111
## the_eu 2.0231436
## get 7.8439676
## want 2.6526643
## @_what 1.0291934
## please 4.5759146
## in_the 7.9217515
## thanks 12.5953460
## @_thanks 7.4841878
## to_the 3.6847906
## #ukip 7.7284453
## party 5.7572878
## way 3.7218336
## like 5.2385220
## @_you 4.9908577
## work 4.2155889
## @_no 2.8962307
## luck 2.8726736
## good_luck 2.3319884
## can 9.1602322
## just 8.0596035
## still 2.0784527
## labour 7.5242958
## right 5.4737919
## european 12.7455022
## 2 2.9060540
## hope 3.7314689
## many 4.5871990
## @_the 4.1863564
## !_! 2.1930673
## yes 4.0587560
## @_yes 1.8883227
## i_am 3.0193288
## need 4.1507210
## % 3.8657799
## to_@ 14.4110898
## us 3.9937197
## may 6.3954885
## make 4.0346282
## say 2.4059946
## scotland 2.1853072
## @_many 0.9120189
## via 11.6982583
## via_@ 13.6049394
## got 3.2102596
## take 3.6316162
## news 4.7665145
## many_thanks 0.8964799
## thanks_for 0.5753982
varImpPlot(rf)
One interesting way to run supervised learning algorithms is to combine them using ensemble methods. The idea is the following: run each method, check how well each performs, and then produce a prediction for the units in the test set that is weighted according to how well each classifier performs. The SuperLearner
package in R is probably the best library for ensemble methods in ML.
library(SuperLearner)
## Warning: package 'SuperLearner' was built under R version 3.4.4
## Loading required package: nnls
## Super Learner
## Version: 2.0-23
## Package created on 2018-03-09
# X needs to be a data frame
# trim harder here (min_docfreq = 100) to keep the ensemble fast
Xtrain <- as.data.frame(as.matrix(dfm_trim(twdfm, min_docfreq = 100,
max_docfreq=0.80*nrow(twdfm), verbose=TRUE)))
## Removing features occurring:
## - in fewer than 100 documents: 2,684
## - in more than 3652.8 documents: 1
## Total features removed: 2,685 (98.2%).
# rename features to syntactic names (V1, V2, ...) -- some learners fail
# on punctuation/ngram column names; originals are kept in `features`
features <- names(Xtrain)
names(Xtrain) <- paste0("V", 1:ncol(Xtrain))
set.seed(777)
# ensemble of three learners (mean baseline, lasso, stepwise logit);
# learner weights are chosen by 5-fold cross-validation on the training set
sl <- SuperLearner(Y = tweets$engaging[training], X = Xtrain[training,],
family = binomial(),
SL.library = c("SL.mean", "SL.glmnet",
"SL.step"),
cvControl=list(V=5))
## Loading required package: glmnet
## Loading required package: Matrix
## Warning: package 'Matrix' was built under R version 3.4.4
## Loading required package: foreach
## Loaded glmnet 2.0-13
sl
##
## Call:
## SuperLearner(Y = tweets$engaging[training], X = Xtrain[training, ],
## family = binomial(), SL.library = c("SL.mean", "SL.glmnet", "SL.step"),
## cvControl = list(V = 5))
##
##
## Risk Coef
## SL.mean_All 0.1734652 0.0000000
## SL.glmnet_All 0.1401416 0.4493452
## SL.step_All 0.1400493 0.5506548
# what other classifiers are available?
# (each SL.* wrapper requires its backing package to be installed)
listWrappers()
## All prediction algorithm wrappers in SuperLearner:
## [1] "SL.bartMachine" "SL.bayesglm" "SL.biglasso"
## [4] "SL.caret" "SL.caret.rpart" "SL.cforest"
## [7] "SL.dbarts" "SL.earth" "SL.extraTrees"
## [10] "SL.gam" "SL.gbm" "SL.glm"
## [13] "SL.glm.interaction" "SL.glmnet" "SL.ipredbagg"
## [16] "SL.kernelKnn" "SL.knn" "SL.ksvm"
## [19] "SL.lda" "SL.leekasso" "SL.lm"
## [22] "SL.loess" "SL.logreg" "SL.mean"
## [25] "SL.nnet" "SL.nnls" "SL.polymars"
## [28] "SL.qda" "SL.randomForest" "SL.ranger"
## [31] "SL.ridge" "SL.rpart" "SL.rpartPrune"
## [34] "SL.speedglm" "SL.speedlm" "SL.step"
## [37] "SL.step.forward" "SL.step.interaction" "SL.stepAIC"
## [40] "SL.svm" "SL.template" "SL.xgboost"
##
## All screening algorithm wrappers in SuperLearner:
## [1] "All"
## [1] "screen.corP" "screen.corRank" "screen.glmnet"
## [4] "screen.randomForest" "screen.SIS" "screen.template"
## [7] "screen.ttest" "write.screen.template"
# measuring performance
preds <- predict(sl, Xtrain[test,])
# predicted probabilities from each individual learner, first 6 test tweets
head(preds$library.predict)
## SL.mean_All SL.glmnet_All SL.step_All
## [1,] 0.7768346 0.9926007 0.9972790
## [2,] 0.7768346 0.5873418 0.5825969
## [3,] 0.7768346 0.9145715 0.9237239
## [4,] 0.7768346 0.6367596 0.6152044
## [5,] 0.7768346 0.7397472 0.7820490
## [6,] 0.7768346 0.8825395 0.8837987
# binarize the ensemble probabilities at 0.5 (preds becomes logical)
preds <- preds$pred > 0.50
# confusion matrix (FALSE/TRUE rows correspond to predicted 0/1)
table(preds, tweets$engaging[test])
##
## preds 0 1
## FALSE 46 29
## TRUE 147 692
# performance metrics
# (positional indexing in precision/recall still works here because the
# logical FALSE/TRUE rows align with the 0/1 columns)
accuracy(preds, tweets$engaging[test])
## [1] 0.8074398
precision(preds, tweets$engaging[test])
## [1] 0.8247914
recall(preds, tweets$engaging[test])
## [1] 0.9597781