Working with shape files

The first part of this guided coding session will be devoted to learning how to work with shape files in R. The dataset we will use contains the coordinates of around 16,000 tweets that mentioned the word “shutdown,” collected on October 1st, 2013.

df <- read.csv("../data/shutdown_locations.csv", stringsAsFactors=F)
str(df)
## 'data.frame':    16721 obs. of  2 variables:
##  $ Longitude: num  -81.4 -81.8 -96.7 -90.1 -77 ...
##  $ Latitude : num  41.2 32.4 40.9 29.9 38.9 ...
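
Before we bring in any shape files, a quick base R scatterplot of the raw coordinates is a useful sanity check (just a sketch; no map projection involved):

plot(df$Longitude, df$Latitude, pch=".",
    xlab="Longitude", ylab="Latitude") # raw tweet coordinates, no map yet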

R already has some shape files built in that we can quickly work with. For example, at the country level:

library(maps)
countries <- map.where(database="world", x=df$Longitude, y=df$Latitude)
tail(sort(table(countries)))
## countries
##           Indonesia:Java                Australia                   Canada 
##                       87                       96                      177 
## USA:New York:Long Island         UK:Great Britain                      USA 
##                      265                      290                    13925
map("world")

At the state level in the U.S.:

states <- map.where(database="state", x=df$Longitude, y=df$Latitude)
tail(sort(table(states)))
## states
##             illinois         pennsylvania              florida 
##                  522                  588                  591 
## district of columbia                texas           california 
##                  660                 1249                 1535
map("state")

And at the county level in the U.S.:

counties <- map.where(database="county", x=df$Longitude, y=df$Latitude)
tail(sort(table(counties)))
## counties
##                    texas,harris                arizona,maricopa 
##                             172                             205 
##               new york,new york                   illinois,cook 
##                             268                             274 
##          california,los angeles district of columbia,washington 
##                             547                             660
map("county")

But we can also load our own shape files. For example, the official U.S. Census Bureau region shape files, available here.

library(maptools)
## Loading required package: sp
## Warning: package 'sp' was built under R version 3.4.1
## Checking rgeos availability: TRUE
region <- readShapePoly("../data/region/cb_2016_us_region_20m.shp")
## Warning: use rgdal::readOGR or sf::st_read
plot(region)
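
As the warning notes, readShapePoly() is deprecated; the same shape file could also be read with rgdal::readOGR() or sf::st_read(). A minimal sketch of the sf route (assuming the sf package is installed):

library(sf)
region_sf <- st_read("../data/region/cb_2016_us_region_20m.shp")
plot(st_geometry(region_sf)) # same polygons, read as an sf object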

region@data
##   REGIONCE   AFFGEOID GEOID      NAME LSAD        ALAND       AWATER
## 0        1 0200000US1     1 Northeast   68 4.193571e+11  50260036355
## 1        2 0200000US2     2   Midwest   68 1.943880e+12 184377979016
## 2        3 0200000US3     3     South   68 2.249494e+12 134461796380
## 3        4 0200000US4     4      West   68 4.535284e+12 316520461413
# now convert the SpatialPolygonsDataFrame to a 'map' object so we can identify the region of each point
cc <- SpatialPolygons2map(region, namefield="NAME")
plot(cc) # what the map object stores internally: just the polygon outlines, as a set of points!

regions <- map.where(database=cc, x=df$Longitude, y=df$Latitude)
# for comparison, the full state-level breakdown we computed earlier:
table(states)
## states
##                   alabama                   arizona 
##                       280                       298 
##                  arkansas                california 
##                       124                      1535 
##                  colorado               connecticut 
##                       231                       137 
##                  delaware      district of columbia 
##                        45                       660 
##                   florida                   georgia 
##                       591                       437 
##                     idaho                  illinois 
##                        48                       522 
##                   indiana                      iowa 
##                       275                       113 
##                    kansas                  kentucky 
##                       173                       247 
##                 louisiana                     maine 
##                       154                        25 
##                  maryland        massachusetts:main 
##                       407                       381 
##   massachusetts:nantucket            michigan:north 
##                         1                         5 
##            michigan:south                 minnesota 
##                       352                       161 
##               mississippi                  missouri 
##                       127                       285 
##                   montana                  nebraska 
##                        23                        78 
##                    nevada             new hampshire 
##                       139                        36 
##                new jersey                new mexico 
##                       317                        89 
##      new york:long island             new york:main 
##                       238                       365 
##        new york:manhattan     north carolina:knotts 
##                       268                         3 
##       north carolina:main              north dakota 
##                       406                        36 
##                      ohio                  oklahoma 
##                       513                       161 
##                    oregon              pennsylvania 
##                       173                       588 
##              rhode island            south carolina 
##                        54                       218 
##              south dakota                 tennessee 
##                        27                       328 
##                     texas                      utah 
##                      1249                        98 
##                   vermont       virginia:chesapeake 
##                        15                         1 
##             virginia:main           washington:main 
##                       467                       243 
## washington:whidbey island             west virginia 
##                         1                        88 
##                 wisconsin                   wyoming 
##                       200                        17
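
And now that each tweet has been matched to a Census region, we can tabulate by region in the same way (a sketch; the counts are not shown here):

table(regions) # tweets per region: Midwest, Northeast, South, West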

Producing a choropleth map

Now imagine we want to produce a choropleth map, that is, a map where each area is shaded according to some variable in the data. Let’s say we want to produce a map of the U.S. states, with each state shaded by the number of tweets sent from it.

First, we will create a data frame with the number of tweets per state:

states <- map.where(database="state", x=df$Longitude, y=df$Latitude)
tail(sort(table(states)))
## states
##             illinois         pennsylvania              florida 
##                  522                  588                  591 
## district of columbia                texas           california 
##                  660                 1249                 1535
# cleaning state names and creating data frame with counts
states <- gsub('(.*):.*', "\\1", states) # e.g. "new york:long island" -> "new york"
tab <- table(states, exclude=NULL)
tab <- data.frame(tab); names(tab) <- c("state", "tweets")
head(tab)
##         state tweets
## 1     alabama    280
## 2     arizona    298
## 3    arkansas    124
## 4  california   1535
## 5    colorado    231
## 6 connecticut    137

Now let’s start with a simple map:

library(ggplot2)
library(maps)
states_map <- map_data('state') # data frame with the polygon coordinates of U.S. states

p <- ggplot(tab, aes(map_id=state)) # map_id links each state name to its polygon
p + geom_map(aes(fill=tweets), map = states_map) +
         expand_limits(x = states_map$long, y = states_map$lat)
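
If any state shows up as an empty polygon, the usual culprit is a mismatch between the names in our table and the region names used by map_data('state'). A quick check (just a sketch):

# names in our table that do not match the map (an NA here corresponds to
# tweets that fell outside any state)
setdiff(as.character(tab$state), unique(states_map$region))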

We can then start playing around with the color and size of the borders.

p + geom_map(aes(fill=tweets), map = states_map, color="yellow") +
         expand_limits(x = states_map$long, y = states_map$lat)

pq <- p + geom_map(aes(fill=tweets), map = states_map, color="grey", size=.1) +
         expand_limits(x = states_map$long, y = states_map$lat)
pq

And try different color scales:

pq + scale_fill_continuous(low="white", high="black") ## same as...

pq + scale_fill_gradient(low="white", high="black") ## smoothing btw 2 colors

pq + scale_fill_gradientn(colours = rainbow(7)) ## for >2 colors

pq + scale_fill_gradientn(colours = c("red", "white", "blue")) ## for >2 colors

library(RColorBrewer) ## by far the best. Check: http://colorbrewer2.org/
colors <- rev(brewer.pal(7,"YlOrRd"))

pq + scale_fill_continuous(low=colors[7], high=colors[1])

Given the skewed distribution of tweet counts across states, we probably want to transform the color scale by taking the log:

pq + scale_fill_continuous(trans="log")

pq + scale_fill_continuous(trans="log", breaks=c(10, 100, 1000),
    labels = c("10", "100", "1K"))

The two final steps will be to remove the axes and background, and to change the orientation and location of the legend:

pq <- pq + theme(
        # removing axes
        axis.line = element_blank(), axis.text = element_blank(), 
        axis.ticks = element_blank(), axis.title = element_blank(),
        # removing background / borders
        panel.background = element_blank(), plot.background = element_blank(),
        panel.border = element_blank(), panel.grid = element_blank()
    )
pq

pq + theme(legend.direction="horizontal", 
    legend.position = c(.20, .10),
    legend.key.size = unit(c(.8), "cm"))
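
And once we are happy with the result, we can write the map to a file with ggsave(), which saves the last plot displayed (the file name below is just an example):

ggsave("shutdown_choropleth.pdf", width=8, height=5) # file name is just an example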

Producing maps of locations

The other type of map we may want to produce is one where we overlay dots on top of a map. For example, let’s produce a map of tweets in Europe, colored by language. The dataset we will use contains 80,300 tweets collected on September 15, 2013.

We will start by preparing the underlying map of Europe:

library(cshapes)
## Loading required package: plyr
## 
## Attaching package: 'plyr'
## The following object is masked from 'package:maps':
## 
##     ozone
library(scales)
library(ggplot2)
library(grid)
library(mapproj)

# preparing data frame with current country boundaries
world.data <- fortify(cshp(date=as.Date("2012-06-30")))
## Warning: use rgdal::readOGR or sf::st_read
## Regions defined for each Polygons
# base layer: map of Europe
p <- ggplot(world.data, aes(long,lat,group=group)) + 
    geom_polygon(fill="black", color="grey80", size=0.25) +
    coord_map("lagrange", xlim=c(-14.5, 35), ylim=c(32.5,61))
p

# removing axes, ticks, changing background...
pq <- p + theme(
    # dark background
        panel.background = element_rect(fill = "black"),
        plot.background = element_rect(fill="black"),
        # removing axis lines and ticks
        axis.line = element_blank(), axis.text = element_blank(), 
        axis.ticks = element_blank(), axis.title = element_blank(), 
        panel.border = element_blank(), 
        panel.grid.major = element_blank(), panel.grid.minor = element_blank()
    )
pq

Now let’s clean the tweet data:

df <- read.csv("../data/loc_lang.csv", stringsAsFactors=F)
df <- df[df$lang!="und",] ## removing "undetermined" language
df <- df[df$lat>32.5 & df$lon>(-14.5) & df$lat<61 & df$lon<35,] ## keeping only tweets within the map's bounding box
tab <- sort(table(df$lang), dec=TRUE) ## table for tweets by language
top15lang <- names(tab)[1:15] ## top 15 languages
df <- df[df$lang %in% top15lang,] ## keeping only tweets in the top 15 languages

And we’re ready to plot it!

pq <- ggplot(world.data, aes(long,lat,group=group)) + 
    geom_polygon(fill="black", color="grey80", size=0.25) +
    geom_point(data = df, 
        aes(x = lon, y=lat, color=lang, group=NA), size=0.3) +
    coord_map("lagrange", xlim=c(-14.5, 35), ylim=c(32.5,61)) +
    theme(
        panel.background = element_rect(fill = "black"),
        plot.background = element_rect(fill="black"),
        axis.line = element_blank(), axis.text = element_blank(), 
        axis.ticks = element_blank(), axis.title = element_blank(), 
        panel.border = element_blank(), 
        panel.grid = element_blank(), 
        legend.background = element_rect(colour = F, fill = "black"),
        legend.key = element_rect(fill = "black", colour = F),
        legend.title = element_text(color="white"),
        legend.text = element_text(color="white", size=10)
    ) +
    guides(color = guide_legend(override.aes = list(size=5)))
# adding human-readable language labels, in the same (alphabetical) order as the language codes
langs <- c("German", "English", "Spanish", "Estonian",
    "French", "Indonesian", "Italian", "Latvian", "Dutch",
    "Portuguese", "Russian", "Slovak", "Slovenian", 
    "Tagalog", "Turkish")
pq <- pq + scale_color_discrete("Language", labels = langs)
pq
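
As before, the finished map can be written to disk with ggsave(); with roughly 80,000 points, a raster format such as PNG keeps the file size manageable (file name and dimensions are just examples):

ggsave("tweets_by_language.png", pq, width=10, height=8, dpi=300) # file name is an example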