The first part of this guided coding session will be devoted to learning how to work with shape files in R. The dataset we will contains the coordinates of around 16,000 tweets that mentioned the word “shutdown,” collected on October 1st, 2013.
df <- read.csv("../data/shutdown_locations.csv", stringsAsFactors=F)
str(df)
## 'data.frame': 16721 obs. of 2 variables:
## $ Longitude: num -81.4 -81.8 -96.7 -90.1 -77 ...
## $ Latitude : num 41.2 32.4 40.9 29.9 38.9 ...
R already has some shape files built in that we can quickly work with. For example, at the country level:
library(maps)
countries <- map.where(database="world", x=df$Longitude, y=df$Latitude)
tail(sort(table(countries)))
## countries
## Indonesia:Java Australia Canada
## 87 96 177
## USA:New York:Long Island UK:Great Britain USA
## 265 290 13925
map("world")
At the state level in the U.S.:
states <- map.where(database="state", x=df$Longitude, y=df$Latitude)
tail(sort(table(states)))
## states
## illinois pennsylvania florida
## 522 588 591
## district of columbia texas california
## 660 1249 1535
map("state")
And at the county level in the U.S:
counties <- map.where(database="county", x=df$Longitude, y=df$Latitude)
tail(sort(table(counties)))
## counties
## texas,harris arizona,maricopa
## 172 205
## new york,new york illinois,cook
## 268 274
## california,los angeles district of columbia,washington
## 547 660
map("county")
But we can also load our own shape file. For example, the official region shape files, available here.
library(maptools)
## Loading required package: sp
## Warning: package 'sp' was built under R version 3.4.1
## Checking rgeos availability: TRUE
region <- readShapePoly("../data/region/cb_2016_us_region_20m.shp")
## Warning: use rgdal::readOGR or sf::st_read
plot(region)
region@data
## REGIONCE AFFGEOID GEOID NAME LSAD ALAND AWATER
## 0 1 0200000US1 1 Northeast 68 4.193571e+11 50260036355
## 1 2 0200000US2 2 Midwest 68 1.943880e+12 184377979016
## 2 3 0200000US3 3 South 68 2.249494e+12 134461796380
## 3 4 0200000US4 4 West 68 4.535284e+12 316520461413
# now convert to a Spatial Polygons object to identify region for each point
cc <- SpatialPolygons2map(region, namefield="NAME")
plot(cc) # this is what is internally in the object, just a set of points!
regions <- map.where(database=cc, x=df$Longitude, y=df$Latitude)
table(states)
## states
## alabama arizona
## 280 298
## arkansas california
## 124 1535
## colorado connecticut
## 231 137
## delaware district of columbia
## 45 660
## florida georgia
## 591 437
## idaho illinois
## 48 522
## indiana iowa
## 275 113
## kansas kentucky
## 173 247
## louisiana maine
## 154 25
## maryland massachusetts:main
## 407 381
## massachusetts:nantucket michigan:north
## 1 5
## michigan:south minnesota
## 352 161
## mississippi missouri
## 127 285
## montana nebraska
## 23 78
## nevada new hampshire
## 139 36
## new jersey new mexico
## 317 89
## new york:long island new york:main
## 238 365
## new york:manhattan north carolina:knotts
## 268 3
## north carolina:main north dakota
## 406 36
## ohio oklahoma
## 513 161
## oregon pennsylvania
## 173 588
## rhode island south carolina
## 54 218
## south dakota tennessee
## 27 328
## texas utah
## 1249 98
## vermont virginia:chesapeake
## 15 1
## virginia:main washington:main
## 467 243
## washington:whidbey island west virginia
## 1 88
## wisconsin wyoming
## 200 17
Now imagine we want to produce a choropleth map, that is, a map where each area is shaded as a function of some parameter of the data. Let’s say we want to produce a map of the U.S. states with each region shaded by the number of tweets.
First, we will create a data frame with the data
states <- map.where(database="state", x=df$Longitude, y=df$Latitude)
tail(sort(table(states)))
## states
## illinois pennsylvania florida
## 522 588 591
## district of columbia texas california
## 660 1249 1535
# cleaning state names and creating data frame with counts
states <- gsub('(.*):.*', states, repl="\\1")
tab <- table(states, exclude=NULL)
tab <- data.frame(tab); names(tab) <- c("state", "tweets")
head(tab)
## state tweets
## 1 alabama 280
## 2 arizona 298
## 3 arkansas 124
## 4 california 1535
## 5 colorado 231
## 6 connecticut 137
Now let’s start with a simple map:
library(ggplot2)
library(maps)
states_map <- map_data('state')
p <- ggplot(tab, aes(map_id=state))
p + geom_map(aes(fill=tweets), map = states_map) +
expand_limits(x = states_map$long, y = states_map$lat)
We can then start playing around with the color and size of the borders.
p + geom_map(aes(fill=tweets), map = states_map, color="yellow") +
expand_limits(x = states_map$long, y = states_map$lat)
pq <- p + geom_map(aes(fill=tweets), map = states_map, color="grey", size=.1) +
expand_limits(x = states_map$long, y = states_map$lat)
pq
And try with different color scales:
pq + scale_fill_continuous(low="white", high="black") ## same as...
pq + scale_fill_gradient(low="white", high="black") ## smoothing btw 2 colors
pq + scale_fill_gradientn(colours = rainbow(7)) ## for >2 colors
pq + scale_fill_gradientn(colours = c("red", "white", "blue")) ## for >2 colors
library(RColorBrewer) ## by far the best. Check: http://colorbrewer2.org/
colors <- rev(brewer.pal(7,"YlOrRd"))
pq + scale_fill_continuous(low=colors[7], high=colors[1])
Given the skewed distribution of the dataset, we probably want to transform it by taking the log:
pq + scale_fill_continuous(trans="log")
pq + scale_fill_continuous(trans="log", breaks=c(10, 100, 1000),
labels = c("10", "100", "1K"))
The two final steps will be to remove axes and background, and to change the orientation and location of the legend
pq <- pq + theme(
# removing axes
axis.line = element_blank(), axis.text = element_blank(),
axis.ticks = element_blank(), axis.title = element_blank(),
# removing background / borders
panel.background = element_blank(), plot.background = element_blank(),
panel.border = element_blank(), panel.grid = element_blank()
)
pq
pq + theme(legend.direction="horizontal",
legend.position = c(.20, .10),
legend.key.size = unit(c(.8), "cm"))
The other type of map we may want to produce is one where you overlay dots over a map. For example, let’s produce a map of tweets in Europe, colored by language. The dataset we will use contains 80,300 tweets collected on September 15, 2013.
We will start by preparing the underlying map of Europe:
library(cshapes)
## Loading required package: plyr
##
## Attaching package: 'plyr'
## The following object is masked from 'package:maps':
##
## ozone
library(scales)
library(ggplot2)
library(grid)
library(mapproj)
# preparing data frame with current country boundaries
world.data <- fortify(cshp(date=as.Date("2012-06-30")))
## Warning in strptime(xx, f <- "%Y-%m-%d", tz = "GMT"): unknown timezone
## 'zone/tz/2017c.1.0/zoneinfo/America/Los_Angeles'
## Warning: use rgdal::readOGR or sf::st_read
## Regions defined for each Polygons
# base layer: map of Europe
p <- ggplot(world.data, aes(long,lat,group=group)) +
geom_polygon(fill="black", color="grey80", size=0.25) +
coord_map("lagrange", xlim=c(-14.5, 35), ylim=c(32.5,61))
p
# removing axes, ticks, changing background...
pq <- p + theme(
# dark background
panel.background = element_rect(fill = "black"),
plot.background = element_rect(fill="black"),
# removing axis lines and ticks
axis.line = element_blank(), axis.text = element_blank(),
axis.ticks = element_blank(), axis.title = element_blank(),
panel.border = element_blank(),
panel.grid.major = element_blank(), panel.grid.minor = element_blank()
)
pq
Now let’s clean the tweet data:
df <- read.csv("../data/loc_lang.csv", stringsAsFactors=F)
df <- df[df$lang!="und",] ## removing "undetermined" language
df <- df[df$lat>32.5 & df$lon>(-14.5) & df$lat<61 & df$lon<35,]
tab <- sort(table(df$lang), dec=TRUE) ## table for tweets by language
top15lang <- names(tab)[1:15] ## top 15 languages
df <- df[df$lang %in% top15lang,]
And we’re ready to plot it!
pq <- ggplot(world.data, aes(long,lat,group=group)) +
geom_polygon(fill="black", color="grey80", size=0.25) +
geom_point(data = df,
aes(x = lon, y=lat, color=lang, group=NA), size=0.3) +
coord_map("lagrange", xlim=c(-14.5, 35), ylim=c(32.5,61)) +
theme(
panel.background = element_rect(fill = "black"),
plot.background = element_rect(fill="black"),
axis.line = element_blank(), axis.text = element_blank(),
axis.ticks = element_blank(), axis.title = element_blank(),
panel.border = element_blank(),
panel.grid = element_blank(),
legend.background = element_rect(colour = F, fill = "black"),
legend.key = element_rect(fill = "black", colour = F),
legend.title = element_text(color="white"),
legend.text = element_text(color="white", size=10)
) +
guides(color = guide_legend(override.aes = list(size=5)))
# adding language labels
langs <- c("German", "English", "Spanish", "Estonian",
"French", "Indonesian", "Italian", "Latvian", "Dutch",
"Portuguese", "Russian", "Slovak", "Slovenian",
"Tagalog", "Turkish")
pq <- pq + scale_color_discrete("Language", labels = langs)
pq