library(topicmodels)
# reading data and preparing corpus object
nyt <- read.csv("../data/nytimes.csv", stringsAsFactors = FALSE)
library(quanteda)
## Warning: package 'quanteda' was built under R version 3.4.4
nytcorpus <- corpus(nyt$lead_paragraph)
nytdfm <- dfm(nytcorpus, remove=stopwords("english"), verbose=TRUE,
              remove_punct=TRUE, remove_numbers=TRUE)
cdfm <- dfm_trim(nytdfm, min_docfreq = 2)
Most text corpora contain not only the documents themselves but also a good deal of associated metadata – we know the author, characteristics of the author, when the document was produced, etc. The structural topic model (STM) takes advantage of this metadata to improve the discovery of topics: document-level covariates can affect how much each document discusses each topic (topical prevalence) and, optionally, which words are used to discuss it (topical content). Here we will learn how it works, how we can interpret the output, and some issues related to its use in research.
We will continue with the previous example, but now adding one covariate: the party of the president.
library(stm)
## stm v1.3.3 (2018-1-26) successfully loaded. See ?stm for help.
## Papers, resources, and other materials at structuraltopicmodel.com
# extracting covariates
year <- as.numeric(substr(nyt$datetime, 1, 4))
repub <- ifelse(year %in% c(1981:1992, 2000:2008), 1, 0)
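Before fitting the model, it is worth checking that the covariate is coded as intended. A quick sanity check (just a sketch; it creates no new objects):
# how many paragraphs fall under each value of the indicator?
table(repub)
# and the span of years covered by the data
range(year)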
And now we’re ready to run stm!
# metadata into a data frame
meta <- data.frame(year=year, repub=repub)
# running STM
stm <- stm(documents=cdfm, K=30, prevalence=~repub+s(year),
           data=meta, seed=123)
save(stm, file="../backup/stm-output.Rdata")
stm offers a series of features to explore the output. First, just like LDA, we can look at the words that are most associated with each topic.
load("../backup/stm-output.Rdata")
# looking at a few topics
labelTopics(stm, topics=1)
## Topic 1 Top Words:
## Highest Prob: tax, house, congress, plan, bill, president, senate
## FREX: y, senate, package, bill, stimulus, plan, legislation
## Lift: accompany, boren, boren's, fiscally, furious, lame-duck, lower-income
## Score: y, tax, senate, democrats, republicans, bill, congress
labelTopics(stm, topics=4)
## Topic 4 Top Words:
## Highest Prob: federal, rates, reserve, interest, fed, rate, policy
## FREX: reserve, fed, fed's, greenspan, rates, interest, reserve's
## Lift: accommodative, affirming, c1, deliberate, f.o.m.c, fed's, groundwork
## Score: reserve, fed, rates, federal, interest, greenspan, fed's
labelTopics(stm, topics=7)
## Topic 7 Top Words:
## Highest Prob: general, auto, industry, motors, said, japanese, cars
## FREX: motors, auto, motor, ford, general, g.m, detroit
## Lift: downs, stocked, thatcher's, ups, autoworkers, basement, caldwell
## Score: motors, auto, motor, g.m, japanese, ford, automotive
labelTopics(stm, topics=10)
## Topic 10 Top Words:
## Highest Prob: year, corporate, last, retailers, sales, like, america
## FREX: wal-mart, retailers, stores, christmas, shopping, microsoft, holiday
## Lift: buffett's, kingdom, motorola, naisbitt, retailing, teenagers, abercrombie
## Score: retailers, stores, wal-mart, sales, corporate, dividends, microsoft
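labelTopics is not the only way to explore the fitted model. For example, plotting the STM object with type "summary" displays the expected proportion of the corpus devoted to each topic next to its most likely words – a quick sketch (output not shown here):
# expected topic proportions, labeled with the top 5 words per topic
plot(stm, type="summary", n=5)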
But unlike LDA, we can now estimate the effect of the covariates we included on the prevalence of each topic.
# effects
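# note: uncertainty="None" ignores uncertainty in the estimated topic proportions;
# the default, uncertainty="Global", propagates it at a higher computational cost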
est <- estimateEffect(~repub, stm, uncertainty="None")
labelTopics(stm, topics=1)
## Topic 1 Top Words:
## Highest Prob: tax, house, congress, plan, bill, president, senate
## FREX: y, senate, package, bill, stimulus, plan, legislation
## Lift: accompany, boren, boren's, fiscally, furious, lame-duck, lower-income
## Score: y, tax, senate, democrats, republicans, bill, congress
summary(est, topics=1)
##
## Call:
## estimateEffect(formula = ~repub, stmobj = stm, uncertainty = "None")
##
##
## Topic 1:
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.054158 0.002531 21.399 <2e-16 ***
## repub -0.026772 0.003223 -8.307 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
labelTopics(stm, topics=4)
## Topic 4 Top Words:
## Highest Prob: federal, rates, reserve, interest, fed, rate, policy
## FREX: reserve, fed, fed's, greenspan, rates, interest, reserve's
## Lift: accommodative, affirming, c1, deliberate, f.o.m.c, fed's, groundwork
## Score: reserve, fed, rates, federal, interest, greenspan, fed's
summary(est, topics=4)
##
## Call:
## estimateEffect(formula = ~repub, stmobj = stm, uncertainty = "None")
##
##
## Topic 4:
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.054050 0.002612 20.692 < 2e-16 ***
## repub -0.026623 0.003242 -8.213 2.72e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
labelTopics(stm, topics=7)
## Topic 7 Top Words:
## Highest Prob: general, auto, industry, motors, said, japanese, cars
## FREX: motors, auto, motor, ford, general, g.m, detroit
## Lift: downs, stocked, thatcher's, ups, autoworkers, basement, caldwell
## Score: motors, auto, motor, g.m, japanese, ford, automotive
summary(est, topics=7)
##
## Call:
## estimateEffect(formula = ~repub, stmobj = stm, uncertainty = "None")
##
##
## Topic 7:
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.053989 0.002405 22.445 <2e-16 ***
## repub -0.026598 0.003143 -8.462 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
labelTopics(stm, topics=9)
## Topic 9 Top Words:
## Highest Prob: one, editor, us, people, can, many, front
## FREX: op-ed, re, front, column, us, story, editor
## Lift: brooks, carroll, first-class, pearl, prejudice, scenario, solace
## Score: editor, op-ed, re, column, us, bible, bob
summary(est, topics=9)
##
## Call:
## estimateEffect(formula = ~repub, stmobj = stm, uncertainty = "None")
##
##
## Topic 9:
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.054139 0.002647 20.455 <2e-16 ***
## repub -0.026755 0.003155 -8.479 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
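These effects can also be visualized. As a sketch (output not shown), plotting the estimateEffect object with method "pointestimate" shows the estimated prevalence of a topic under each value of the covariate:
# estimated prevalence of topic 1 when repub=0 (Democratic president) vs repub=1 (Republican)
plot(est, covariate="repub", topics=1, model=stm, method="pointestimate")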
Let’s say we’re interested in finding the most partisan topics. How would we do this?
# let's look at the structure of the output object...
names(est)
## [1] "parameters" "topics" "call" "uncertainty" "formula"
## [6] "data" "modelframe" "varlist"
length(est$parameters)
## [1] 30
est$parameters[[1]]
## [[1]]
## [[1]]$est
## (Intercept) repub
## 0.05412944 -0.02675423
##
## [[1]]$vcov
## [,1] [,2]
## [1,] 6.650603e-06 -6.650603e-06
## [2,] -6.650603e-06 1.022855e-05
# aha! we'll just extract the coefficients for each topic
coef <- se <- rep(NA, 30)
for (i in 1:30){
  coef[i] <- est$parameters[[i]][[1]]$est[2]
  se[i] <- sqrt(est$parameters[[i]][[1]]$vcov[2,2])
}
df <- data.frame(topic = 1:30, coef=coef, se=se)
df <- df[order(df$coef),] # sorting by "partisanship"
head(df)
## topic coef se
## 1 1 -0.02675423 0.003198210
## 5 5 -0.02213393 0.003469592
## 26 26 -0.01843612 0.002570712
## 27 27 -0.01757819 0.003072289
## 6 6 -0.01741150 0.004230113
## 24 24 -0.01252548 0.003358187
tail(df)
## topic coef se
## 12 12 0.007762979 0.003108421
## 2 2 0.007990383 0.004256040
## 18 18 0.015119115 0.002677673
## 11 11 0.017551712 0.002733533
## 28 28 0.021337990 0.002986835
## 16 16 0.028718732 0.003195332
# three most "democratic" topics
labelTopics(stm, topics=df$topic[1])
## Topic 1 Top Words:
## Highest Prob: tax, house, congress, plan, bill, president, senate
## FREX: y, senate, package, bill, stimulus, plan, legislation
## Lift: accompany, boren, boren's, fiscally, furious, lame-duck, lower-income
## Score: y, tax, senate, democrats, republicans, bill, congress
labelTopics(stm, topics=df$topic[5])
## Topic 6 Top Words:
## Highest Prob: president, mr, bush, obama, clinton, campaign, economy
## FREX: romney, kerry, voters, presidential, campaign, bush, clinton
## Lift: accomplishments, audacious, bachmann, battery, big-spending, boasts, campaign's
## Score: obama, bush, voters, mr, clinton, president, senator
labelTopics(stm, topics=df$topic[26])
## Topic 2 Top Words:
## Highest Prob: percent, prices, inflation, increase, said, today, consumer
## FREX: producer, inflation, increase, two-tenths, rise, one-tenth, prices
## Lift: d6, fixed-rate, moskowitz, skidded, utilities, york-northeastern, 12-month
## Score: index, percent, prices, producer, rose, two-tenths, inflation
# three most "republican" topics
labelTopics(stm, topics=df$topic[16])
## Topic 19 Top Words:
## Highest Prob: said, report, economy, yesterday, economic, index, consumer
## FREX: activity, survey, confidence, manufacturing, purchasing, released, pace
## Lift: contracted, cooled, disruptions, fabian, leveled, linden, management-chicago
## Score: index, survey, purchasing, pace, activity, consumer, confidence
labelTopics(stm, topics=df$topic[28])
## Topic 11 Top Words:
## Highest Prob: mr, said, economic, president, secretary, treasury, advisers
## FREX: regan, advisers, secretary, council, mondale, vice, t
## Lift: beryl, feldstein, fulfill, haig, jordan, nicholas, parry
## Score: secretary, mr, advisers, treasury, regan, mondale, rubin
labelTopics(stm, topics=df$topic[11])
## Topic 9 Top Words:
## Highest Prob: one, editor, us, people, can, many, front
## FREX: op-ed, re, front, column, us, story, editor
## Lift: brooks, carroll, first-class, pearl, prejudice, scenario, solace
## Score: editor, op-ed, re, column, us, bible, bob
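As a final note, the stm package also offers a built-in alternative to the manual extraction above: plotting the estimateEffect object with method "difference" displays, for each topic, the estimated difference in prevalence between two values of the covariate. A sketch, assuming the objects created above (output not shown):
# difference in topic prevalence, Republican (repub=1) minus Democratic (repub=0),
# with each topic labeled by its three highest-FREX words
plot(est, covariate="repub", topics=1:30, model=stm, method="difference",
     cov.value1=1, cov.value2=0, labeltype="frex", n=3)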