The file data/congress-facebook.rdata contains a data frame with the Facebook posts of all current Members of Congress. Each row corresponds to a different document: all the Facebook posts from a single member, collapsed into one character string.
We’re going to run LDA to explore the different topics that Members of Congress discuss on Facebook, and whether there is variation across different characteristics, such as party and chamber.
You can skip to the end of this script if you want to see how I collected and cleaned the data, but for now let’s jump straight to the topic modeling part:
load("data/congress-facebook.rdata")
nrow(posts)
## [1] 502
str(posts)
## 'data.frame': 502 obs. of 6 variables:
## $ text : chr "Enjoyed seeing all the TX-02 constituents up in D.C. this week! Lots of young Texans have been coming by the of"| __truncated__ "A foreign adversary launched an attack on our democracy. That is a threat which every American, regardless of "| __truncated__ "Watch video here of my statement calling for increased federal support of 1890s Land-grant HBCUs. ICYMI: Earlie"| __truncated__ "NA Children's health should be bipartisan. #TrumpCare cuts billions from Medicaid and leaves 3 million children"| __truncated__ ...
## $ account: chr "106631626049851" "109135405838588" "113303673339" "115356957005" ...
## $ name : chr "Ted Poe" "Wm. Lacy Clay" "David Scott" "Henry C. \"Hank\" Johnson, Jr." ...
## $ party : chr "Republican" "Democrat" "Democrat" "Democrat" ...
## $ type : chr "rep" "rep" "rep" "rep" ...
## $ gender : chr "M" "M" "M" "M" ...
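Before jumping into the modeling, it may be worth a quick look at the composition of the dataset. This check is my own addition (output omitted):
# party and chamber composition of the 502 documents
table(posts$party, posts$type)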
Creating the document-feature matrix in quanteda:
library(quanteda)
## quanteda version 0.9.9.65
## Using 3 of 4 cores for parallel computing
##
## Attaching package: 'quanteda'
## The following object is masked from 'package:utils':
##
## View
fbcorpus <- corpus(posts$text)
cdfm <- dfm(fbcorpus, remove=stopwords("english"), verbose=TRUE,
remove_punct=TRUE, remove_numbers=TRUE)
## Creating a dfm from a corpus ...
## ... tokenizing texts
## ... lowercasing
## ... found 502 documents, 92,444 features
## ...
## dfm_select removed 171 features and 0 documents, padding 0s for 0 features and 0 documents.
## ... created a 502 x 90,753 sparse dfm
## ... complete.
## Elapsed time: 0.488 seconds.
cdfm <- dfm_trim(cdfm, min_docfreq = 2, verbose=TRUE)
## Removing features occurring:
## - in fewer than 2 document: 54,988
## Total features removed: 54,988 (60.6%).
# we now export to a format that we can run the topic model with
dtm <- convert(cdfm, to="topicmodels")
## Loading required namespace: tm
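Before estimating the model, it can also be useful to glance at the most frequent features in the trimmed dfm, to spot terms we may later want to remove. This check is my own addition (output omitted):
# 25 most frequent features in the document-feature matrix
topfeatures(cdfm, n = 25)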
And now we can run the topic model:
# estimate LDA with K topics
library(topicmodels)
K <- 50
lda <- LDA(dtm, k = K, method = "Gibbs",
control = list(verbose=25L, seed = 123, burnin = 100, iter = 500))
save(lda, file="backup/lda-output.Rdata")
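The choice of K = 50 here is somewhat arbitrary. One informal way to compare alternatives is to fit models with a few different values of K and compare their log-likelihoods (or, better, held-out perplexity on a validation set). A minimal sketch of that comparison, which is my own addition and also slow to run:
# compare log-likelihoods across a few candidate values of K
# (note: more topics tends to increase the in-sample log-likelihood,
# so this is only a rough guide)
candidate.K <- c(20, 50, 80)
lls <- sapply(candidate.K, function(k){
  fit <- LDA(dtm, k = k, method = "Gibbs",
             control = list(seed = 123, burnin = 100, iter = 500))
  as.numeric(logLik(fit))
})
data.frame(K = candidate.K, logLik = lls)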
Estimating the model takes a while, so I ran it beforehand and saved the output in the backup folder. Let's start looking at the results…
library(topicmodels)
load("backup/lda-output.Rdata")
terms(lda)
## Topic 1 Topic 2 Topic 3 Topic 4
## "washington" "grassley" "connecticut" "san"
## Topic 5 Topic 6 Topic 7 Topic 8
## "colorado" "http" "president" "texas"
## Topic 9 Topic 10 Topic 11 Topic 12
## "today" "president" "new" "trump"
## Topic 13 Topic 14 Topic 15 Topic 16
## "illinois" "will" "senate" "bit.ly"
## Topic 17 Topic 18 Topic 19 Topic 20
## "health" "military" "school" "virginia"
## Topic 21 Topic 22 Topic 23 Topic 24
## "north" "tulsi" "joe" "florida"
## Topic 25 Topic 26 Topic 27 Topic 28
## "maine" "people" "new" "politics"
## Topic 29 Topic 30 Topic 31 Topic 32
## "veterans" "de" "michigan" "bill"
## Topic 33 Topic 34 Topic 35 Topic 36
## "massachusetts" "said" "new" "maryland"
## Topic 37 Topic 38 Topic 39 Topic 40
## "nevada" "north" "california" "kansas"
## Topic 41 Topic 42 Topic 43 Topic 44
## "georgia" "new" "alabama" "trump"
## Topic 45 Topic 46 Topic 47 Topic 48
## "https" "town" "pennsylvania" "ohio"
## Topic 49 Topic 50
## "act" "minnesota"
# top 10 words associated to each topic
trms <- t(terms(lda, k=10))
# some topics are easy to identify
trms[6,] # media appearances
## [1] "http" "https" "news"
## [4] "press-releases" "media-center" "watch"
## [7] "read" "www.youtube.com" "v"
## [10] "documentsingle.aspx"
trms[15,] # supreme court nomination
## [1] "senate" "senator" "judge" "secretary" "court"
## [6] "nomination" "general" "nominee" "gorsuch" "hearing"
trms[17,] # repealing obamacare
## [1] "health" "care" "obamacare" "house" "american"
## [6] "act" "tax" "americans" "will" "repeal"
trms[18,] # foreign policy
## [1] "military" "u.s" "security" "united" "today" "foreign"
## [7] "israel" "allies" "defense" "north"
trms[32,] # budgetary issues?
## [1] "bill" "repeal" "spending" "debt" "healthcare"
## [6] "budget" "paul" "rep" "vote" "government"
trms[33,] # energy policy
## [1] "massachusetts" "climate" "u.s" "trump"
## [5] "clean" "energy" "nuclear" "oil"
## [9] "change" "boston"
trms[44,] # Trump-Russia investigation
## [1] "trump" "house" "president" "investigation"
## [5] "committee" "russian" "russia" "trump's"
## [9] "intelligence" "election"
trms[46,] # town halls
## [1] "town" "hall" "office" "district"
## [5] "will" "questions" "congressional" "congressman"
## [9] "constituents" "na"
trms[49,] # legislative terms
## [1] "act" "h.r" "house" "bill" "passed"
## [6] "voted" "legislation" "vote" "regulations" "support"
# but many others appear to simply refer to states
trms[1,]
## [1] "washington" "oregon" "nebraska" "seattle" "community"
## [6] "sound" "oregonians" "rep" "eastern" "pacific"
trms[2,]
## [1] "grassley" "senator" "kentucky" "idaho" "wyoming"
## [6] "comer" "delaware" "#kentucky" "https" "said"
# so we would probably want to clean the data to remove state names before re-estimating the model
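One way to do that would be to drop state-name tokens from the dfm and re-estimate the model. A sketch of that cleaning step (my own addition, not run here):
# build a vector of state-name tokens, splitting multi-word names
# (note: this also drops generic words such as "new", so it is a crude fix)
state.words <- unique(unlist(strsplit(tolower(state.name), " ")))
cdfm <- dfm_remove(cdfm, state.words)
dtm <- convert(cdfm, to="topicmodels")
# ... and then re-run LDA() as above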
Let’s look at how often different Members of Congress mention each topic…
# matrix with topic distributions
mat <- lda@gamma
dim(mat)
## [1] 502 50
mat[1, 1:50]
## [1] 0.0001518142 0.0003036284 0.0004554425 0.0003036284 0.0001518142
## [6] 0.0027326552 0.0198876575 0.0154850463 0.0156368605 0.0139669045
## [11] 0.0004554425 0.0006072567 0.0015181418 0.1381509033 0.0010626993
## [16] 0.0001518142 0.0347654471 0.1263093973 0.1017155002 0.0003036284
## [21] 0.0001518142 0.0001518142 0.0007590709 0.0003036284 0.0001518142
## [26] 0.1694246243 0.0044026112 0.0004554425 0.0200394717 0.0007590709
## [31] 0.0001518142 0.0004554425 0.0003036284 0.2899650827 0.0003036284
## [36] 0.0015181418 0.0021253985 0.0001518142 0.0003036284 0.0003036284
## [41] 0.0003036284 0.0003036284 0.0001518142 0.0036435403 0.0004554425
## [46] 0.0159404888 0.0001518142 0.0001518142 0.0113860635 0.0012145134
# i.e. Member of Congress 1 devotes 16.9% of his/her attention to Topic 26
# who is the Member of Congress that talks the most about...
# the Trump-Russia investigation?
posts$name[which.max(mat[,44])]
## [1] "Maxine Waters"
posts$name[which.max(mat[,17])] # repealing obamacare
## [1] "Steve Scalise"
# the Pacific Northwest?
posts$name[which.max(mat[,1])]
## [1] "Rick Larsen"
# supreme court nomination
posts$name[which.max(mat[,15])]
## [1] "Patrick J. Leahy"
# looking at differences across parties
rep.props <- apply(lda@gamma[posts$party=="Republican",], 2, mean)
dem.props <- apply(lda@gamma[posts$party=="Democrat",], 2, mean)
ratio <- rep.props / (dem.props + rep.props)
# proportion of each topic's usage that comes from Republicans
ratio[15] # supreme court nomination
## [1] 0.4614485
ratio[17] # repealing obamacare
## [1] 0.902871
ratio[18] # foreign policy
## [1] 0.6692792
ratio[32] # budgetary issues?
## [1] 0.7922182
ratio[33] # energy policy
## [1] 0.184171
ratio[44] # Trump-Russia investigation
## [1] 0.1612373
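To get a quicker overview of which topics are used most disproportionately by each party, we can sort all 50 topics by this ratio. This summary is my own addition (output omitted):
# topics most associated with Republicans (ratio close to 1)
terms(lda)[order(ratio, decreasing=TRUE)[1:5]]
# topics most associated with Democrats (ratio close to 0)
terms(lda)[order(ratio)[1:5]]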
And now, here’s how I collected and cleaned the data.
First of all, let’s scrape the dataset with the social media accounts of Members of Congress.
scrapeCongressData <- function(){
## Downloading Congress data
txt <- httr::content(httr::GET(paste0("https://raw.githubusercontent.com/unitedstates/",
"congress-legislators/master/legislators-current.yaml")), 'text')
congress <- yaml::yaml.load(txt)
congress <- data.frame(
id = unlistCongress(congress, c('id', 'thomas')),
bioid = unlistCongress(congress, c('id', 'bioguide')),
name = unlistCongress(congress, c('name', 'official_full')),
gender = unlistCongress(congress, c('bio', 'gender')),
type = unlist(lapply(congress, function(x)
x$terms[[length(x$terms)]]$type)),
party = unlist(lapply(congress, function(x)
x$terms[[length(x$terms)]]$party)),
stringsAsFactors=F)
## Downloading List of Social Media Accounts
txt <- httr::content(httr::GET(paste0("https://raw.githubusercontent.com/unitedstates/",
"congress-legislators/master/legislators-social-media.yaml")), 'text')
sm <- yaml::yaml.load(txt)
sm <- data.frame(
bioid = unlistCongress(sm, c('id', 'bioguide')),
twitter = unlistCongress(sm, c('social', 'twitter')),
facebook = unlistCongress(sm, c('social', 'facebook')),
youtube = unlistCongress(sm, c('social', 'youtube')),
stringsAsFactors=F)
## merging
df <- merge(congress, sm, all.x=TRUE)
return(df)
}
unlistCongress <- function(lst, field){
if (length(field)==1){
notnulls <- unlist(lapply(lst, function(x) !is.null(x[[field]])))
vect <- rep(NA, length(lst))
vect[notnulls] <- unlist(lapply(lst, '[[', field))
}
if (length(field)==2){
notnulls <- unlist(lapply(lst, function(x) !is.null(x[[field[1]]][[field[2]]])))
vect <- rep(NA, length(lst))
vect[notnulls] <- unlist(lapply(lst, function(x) x[[field[1]]][[field[2]]]))
}
return(vect)
}
congress <- scrapeCongressData()
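The resulting data frame can then be saved to disk; the next chunk reads it back from a CSV file, so presumably a step like this (my own reconstruction) happened in between:
write.csv(congress, file="data/congress-social-media.csv", row.names=FALSE)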
And this is a copy of the file I downloaded…
congress <- read.csv("data/congress-social-media.csv", stringsAsFactors=F)
Keeping only Members of Congress with Facebook accounts:
table(is.na(congress$facebook))
congress <- congress[!is.na(congress$facebook),]
Collecting Facebook page data
library(Rfacebook)
fb_oauth <- "YOUR_TOKEN_HERE"
# get yours from: https://developers.facebook.com/tools/explorer/
# list of accounts
accounts <- congress$facebook
# removing those already downloaded
accounts.done <- list.files("data/facebook")
accounts.left <- accounts[tolower(accounts) %in% tolower(gsub(".csv", "", accounts.done)) == FALSE]
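# before launching the full loop, it can help to test the download with a
# single account (my own addition; requires a valid token)
test <- getPage(accounts.left[1], token=fb_oauth, n=10, since='2017/01/01',
                api="v2.9")
str(test)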
# loop over accounts
for (account in accounts.left){
message(account)
# download page data (with error handling)
error <- tryCatch(df <- getPage(account, token=fb_oauth, n=5000, since='2017/01/01',
reactions=FALSE, api="v2.9"),
error=function(e) e)
if (inherits(error, 'error')){ next }
## cleaning text from \n (line breaks)
df$message <- gsub("\n", " ", df$message)
## save page data csv format
write.csv(df, file=paste0("data/facebook/", account, ".csv"),
row.names=F)
}
Merging the text from all Facebook pages into a single data frame, with each member's posts collapsed into one string:
accounts <- sort(gsub(".csv", "", list.files("data/facebook")))
congress <- read.csv("data/congress-social-media.csv", stringsAsFactors=F)
congress <- congress[congress$facebook %in% accounts,]
congress <- congress[order(congress$facebook),]
posts <- data.frame(
text = "",
account = accounts,
name = congress$name,
party = congress$party,
type = congress$type,
gender = congress$gender, stringsAsFactors=F )
for (account in accounts){
message(account)
d <- read.csv(paste0("data/facebook/", account, ".csv"),
stringsAsFactors=F)
posts$text[posts$account==account] <- paste(d$message, collapse=" ")
}
nrow(posts)
save(posts, file="data/congress-facebook.rdata")
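As a final sanity check (my own addition; output omitted), we can confirm that every account ended up with some text:
# distribution of document lengths after collapsing all posts
summary(nchar(posts$text))
# accounts that ended up with an empty string, if any
sum(posts$text == "")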