Let’s create the following two vectors with random values sampled from a uniform distribution:
set.seed(777)
n <- 1000000
x <- runif(n)
y <- runif(n)
Which of the following chunks of code will run faster?
loop <- function(){
z <- rep(NA, n)
for (i in 1:n){
z[i] <- x[i] + y[1]
}
return(z)
}
sapply_code <- function(){
z <- sapply(1:n, function(i) x[i] + y[i])
return(z)
}
vectorized <- function(){
z <- x + y
return(z)
}
system.time(loop())
## user system elapsed
## 0.202 0.008 0.229
system.time(sapply_code())
## user system elapsed
## 2.501 0.059 2.652
system.time(vectorized())
## user system elapsed
## 0.002 0.001 0.006
Other examples where vectorized functions can dramatically speed up our code. The first example shows the power of ifelse
and the [
(replacement) function.
set.seed(123)
n <- 10000000
ideology <- sample(1:10, n, replace=TRUE)
# vectorized code
group_ideology <- function(){
groups <- rep(NA, n)
groups[ideology>5] <- "right"
groups[ideology<5] <- "left"
return(groups)
}
# non-vectorized code
ideology_loop <- function(){
groups <- rep(NA, n)
for (i in 1:n){
if (ideology[i]>5) groups[i] <- "left"
if (ideology[i]>5) groups[i] <- "right"
}
}
# benchmarks
system.time(group_ideology())
## user system elapsed
## 0.487 0.080 0.591
system.time(ideology_loop())
## user system elapsed
## 4.273 0.078 4.745
This second example shows how matrix operations are vectorized too:
set.seed(777)
n <- 1000000
df <- data.frame(grade1=runif(n, 0, 100),
grade2=runif(n, 0, 100),
grade3=runif(n, 0, 100),
grade4=runif(n, 0, 100),
grade5=runif(n, 0, 100))
system.time(total1 <- df$grade1 + df$grade2 + df$grade3 + df$grade4 + df$grade5)
## user system elapsed
## 0.011 0.003 0.028
system.time(total2 <- rowSums(df))
## user system elapsed
## 0.047 0.030 0.163
total3 <- rep(NA, n)
system.time(for (i in 1:n){
total3[i] <- df$grade1[i] + df$grade2[i] + df$grade3[i] + df$grade4[i] + df$grade5[i]
}
)
## user system elapsed
## 33.192 0.376 37.113
# why is rowSums slower than + ?
df <- as.matrix(df)
system.time(total1 <- df[,1] + df[,2] + df[,3] + df[,4] + df[,5])
## user system elapsed
## 0.053 0.022 0.078
system.time(total2 <- rowSums(df))
## user system elapsed
## 0.021 0.008 0.030
The other way to write more efficient code is to avoid memory copy:
set.seed(123)
n <- 10000
func1 <- function(){
rnd <- c()
for (i in 1:n){
rnd <- c(rnd, rnorm(n=1))
}
return(rnd)
}
func2 <- function(){
rnd <- rep(NA, n)
for (i in 1:n){
rnd[i] <- rnorm(n=1)
}
return(rnd)
}
system.time(func1())
## user system elapsed
## 0.186 0.126 0.330
system.time(func2())
## user system elapsed
## 0.017 0.004 0.020
# of course, a WAY more efficient way to run this, with vectorized function...
func3 <- function(){
rnd <- rnorm(n=n)
return(rnd)
}
system.time(func3())
## user system elapsed
## 0.000 0.000 0.001
Memory copy applies as well to data frames. A very frequent scenario looks something like the following. You can avoid it by storing results in a list, and then using do.call(rbind, LIST)
to convert back to a single data frame.
set.seed(123)
n <- 200
## using rbind
func1 <- function(){
df <- data.frame()
for (i in 1:n){
df <- rbind(df, data.frame(
age = sample(18:100, n, replace=TRUE),
state = sample(state.abb, n, replace=TRUE),
party = sample(c("R", "D", "I"), n, replace=TRUE)
) )
}
return(df)
}
## using lists to avoid memory copy
init <- Sys.time()
func2 <- function(){
df.list <- list()
for (i in 1:n){
df.list[[i]] <- data.frame(
age = sample(18:100, n, replace=TRUE),
state = sample(state.abb, n, replace=TRUE),
party = sample(c("R", "D", "I"), n, replace=TRUE)
)
}
df <- do.call(rbind, df.list)
return(df)
}
system.time(func1())
## user system elapsed
## 0.933 0.345 1.558
system.time(func2())
## user system elapsed
## 0.260 0.036 0.654
Finding slow spots in your code using Rprof
Rprof() # start monitoring
invisible(func1()) # run your code (but don't show it!)
Rprof(NULL) # stop
summaryRprof() # see results
## $by.self
## self.time self.pct total.time total.pct
## "[<-.factor" 0.50 31.65 0.52 32.91
## "as.vector" 0.36 22.78 0.38 24.05
## "factor" 0.34 21.52 0.54 34.18
## "attr" 0.08 5.06 0.08 5.06
## "rbind" 0.06 3.80 1.58 100.00
## "as.vector.factor" 0.02 1.27 0.38 24.05
## "as.data.frame.vector" 0.02 1.27 0.04 2.53
## "force" 0.02 1.27 0.04 2.53
## "attributes" 0.02 1.27 0.02 1.27
## "is.na" 0.02 1.27 0.02 1.27
## "levels" 0.02 1.27 0.02 1.27
## "make.names" 0.02 1.27 0.02 1.27
## "mode" 0.02 1.27 0.02 1.27
## "names" 0.02 1.27 0.02 1.27
## "sample" 0.02 1.27 0.02 1.27
## "sort.list" 0.02 1.27 0.02 1.27
## "unique" 0.02 1.27 0.02 1.27
##
## $by.total
## total.time total.pct self.time self.pct
## "rbind" 1.58 100.00 0.06 3.80
## "block_exec" 1.58 100.00 0.00 0.00
## "call_block" 1.58 100.00 0.00 0.00
## "eval" 1.58 100.00 0.00 0.00
## "evaluate_call" 1.58 100.00 0.00 0.00
## "evaluate" 1.58 100.00 0.00 0.00
## "func1" 1.58 100.00 0.00 0.00
## "handle" 1.58 100.00 0.00 0.00
## "in_dir" 1.58 100.00 0.00 0.00
## "knitr::knit" 1.58 100.00 0.00 0.00
## "process_file" 1.58 100.00 0.00 0.00
## "process_group.block" 1.58 100.00 0.00 0.00
## "process_group" 1.58 100.00 0.00 0.00
## "rmarkdown::render" 1.58 100.00 0.00 0.00
## "timing_fn" 1.58 100.00 0.00 0.00
## "withCallingHandlers" 1.58 100.00 0.00 0.00
## "withVisible" 1.58 100.00 0.00 0.00
## "factor" 0.54 34.18 0.34 21.52
## "[<-.factor" 0.52 32.91 0.50 31.65
## "[<-" 0.52 32.91 0.00 0.00
## "as.vector" 0.38 24.05 0.36 22.78
## "as.vector.factor" 0.38 24.05 0.02 1.27
## "data.frame" 0.14 8.86 0.00 0.00
## "as.data.frame" 0.10 6.33 0.00 0.00
## "attr" 0.08 5.06 0.08 5.06
## "as.data.frame.character" 0.08 5.06 0.00 0.00
## "as.data.frame.vector" 0.04 2.53 0.02 1.27
## "force" 0.04 2.53 0.02 1.27
## "attributes" 0.02 1.27 0.02 1.27
## "is.na" 0.02 1.27 0.02 1.27
## "levels" 0.02 1.27 0.02 1.27
## "make.names" 0.02 1.27 0.02 1.27
## "mode" 0.02 1.27 0.02 1.27
## "names" 0.02 1.27 0.02 1.27
## "sample" 0.02 1.27 0.02 1.27
## "sort.list" 0.02 1.27 0.02 1.27
## "unique" 0.02 1.27 0.02 1.27
## "%in%" 0.02 1.27 0.00 0.00
## "as.data.frame.integer" 0.02 1.27 0.00 0.00
## "deparse" 0.02 1.27 0.00 0.00
## "paste" 0.02 1.27 0.00 0.00
## "structure" 0.02 1.27 0.00 0.00
##
## $sample.interval
## [1] 0.02
##
## $sampling.time
## [1] 1.58
How to read large files faster? There are different packages to help with this; and also being specific about the data you’re reading will help. readr
is a good option:
system.time(d <- read.csv("../data/EP-elections-tweets.csv"))
## user system elapsed
## 0.257 0.008 0.300
system.time(d <- read.csv("../data/EP-elections-tweets.csv", stringsAsFactors=F))
## user system elapsed
## 0.175 0.006 0.194
library(readr)
system.time(d <- read_csv("../data/EP-elections-tweets.csv"))
## Parsed with column specification:
## cols(
## text = col_character(),
## screen_name = col_character(),
## id = col_double(),
## polite = col_character()
## )
## user system elapsed
## 0.048 0.009 0.063