Let’s create the following two vectors with random values sampled from a uniform distribution:
set.seed(777)
n <- 1000000
x <- runif(n)
y <- runif(n)
Which of the following chunks of code will run faster?
loop <- function(){
z <- rep(NA, n)
for (i in 1:n){
z[i] <- x[i] + y[1]
}
return(z)
}
sapply_code <- function(){
z <- sapply(1:n, function(i) x[i] + y[i])
return(z)
}
vectorized <- function(){
z <- x + y
return(z)
}
system.time(loop())
## user system elapsed
## 0.200 0.010 0.275
system.time(sapply_code())
## user system elapsed
## 2.301 0.079 2.552
system.time(vectorized())
## user system elapsed
## 0.003 0.001 0.004
There are many other cases where vectorized functions can dramatically speed up our code. The first example shows the power of ifelse
and the [
(replacement) function.
set.seed(123)
n <- 10000000
ideology <- sample(1:10, n, replace=TRUE)
# vectorized code
group_ideology <- function(){
groups <- rep(NA, n)
groups[ideology>5] <- "right"
groups[ideology<5] <- "left"
return(groups)
}
# non-vectorized code
ideology_loop <- function(){
groups <- rep(NA, n)
for (i in 1:n){
if (ideology[i]>5) groups[i] <- "left"
if (ideology[i]>5) groups[i] <- "right"
}
}
# benchmarks
system.time(group_ideology())
## user system elapsed
## 0.491 0.114 0.643
system.time(ideology_loop())
## user system elapsed
## 3.968 0.215 4.475
This second example shows how matrix operations are vectorized too:
set.seed(777)
n <- 1000000
df <- data.frame(grade1=runif(n, 0, 100),
grade2=runif(n, 0, 100),
grade3=runif(n, 0, 100),
grade4=runif(n, 0, 100),
grade5=runif(n, 0, 100))
system.time(total1 <- df$grade1 + df$grade2 + df$grade3 + df$grade4 + df$grade5)
## user system elapsed
## 0.010 0.004 0.015
system.time(total2 <- rowSums(df))
## user system elapsed
## 0.036 0.014 0.053
total3 <- rep(NA, n)
system.time(for (i in 1:n){
total3[i] <- df$grade1[i] + df$grade2[i] + df$grade3[i] + df$grade4[i] + df$grade5[i]
}
)
## user system elapsed
## 30.209 0.489 31.469
# why is rowSums slower than + ?
df <- as.matrix(df)
system.time(total1 <- df[,1] + df[,2] + df[,3] + df[,4] + df[,5])
## user system elapsed
## 0.055 0.030 0.092
system.time(total2 <- rowSums(df))
## user system elapsed
## 0.019 0.004 0.024
The other important way to write more efficient code is to avoid memory copy:
set.seed(123)
n <- 10000
func1 <- function(){
rnd <- c()
for (i in 1:n){
rnd <- c(rnd, rnorm(n=1))
}
return(rnd)
}
func2 <- function(){
rnd <- rep(NA, n)
for (i in 1:n){
rnd[i] <- rnorm(n=1)
}
return(rnd)
}
system.time(func1())
## user system elapsed
## 0.195 0.139 0.341
system.time(func2())
## user system elapsed
## 0.018 0.004 0.023
# of course, a WAY more efficient way to run this, with vectorized function...
func3 <- function(){
rnd <- rnorm(n=n)
return(rnd)
}
system.time(func3())
## user system elapsed
## 0.001 0.000 0.000
Memory copy applies as well to data frames. A very frequent scenario looks something like the following. You can avoid it by storing results in a list, and then using do.call(rbind, LIST)
to convert back to a single data frame.
set.seed(123)
n <- 200
## using rbind
func1 <- function(){
df <- data.frame()
for (i in 1:n){
df <- rbind(df, data.frame(
age = sample(18:100, n, replace=TRUE),
state = sample(state.abb, n, replace=TRUE),
party = sample(c("R", "D", "I"), n, replace=TRUE)
) )
}
return(df)
}
## using lists to avoid memory copy
init <- Sys.time()
func2 <- function(){
df.list <- list()
for (i in 1:n){
df.list[[i]] <- data.frame(
age = sample(18:100, n, replace=TRUE),
state = sample(state.abb, n, replace=TRUE),
party = sample(c("R", "D", "I"), n, replace=TRUE)
)
}
df <- do.call(rbind, df.list)
return(df)
}
system.time(func1())
## user system elapsed
## 0.983 0.369 1.398
system.time(func2())
## user system elapsed
## 0.156 0.028 0.189
Finding slow spots in your code using Rprof
Rprof() # start monitoring
invisible(func1()) # run your code (but don't show it!)
Rprof(NULL) # stop
summaryRprof() # see results
## $by.self
## self.time self.pct total.time total.pct
## "as.vector" 0.34 24.29 0.34 24.29
## "factor" 0.30 21.43 0.46 32.86
## "[<-.factor" 0.30 21.43 0.38 27.14
## "rbind" 0.18 12.86 1.40 100.00
## "attr" 0.08 5.71 0.08 5.71
## "is.na" 0.08 5.71 0.08 5.71
## "[<-" 0.02 1.43 0.40 28.57
## "any" 0.02 1.43 0.02 1.43
## "levels" 0.02 1.43 0.02 1.43
## "sample.int" 0.02 1.43 0.02 1.43
## "sum" 0.02 1.43 0.02 1.43
## "unique" 0.02 1.43 0.02 1.43
##
## $by.total
## total.time total.pct self.time self.pct
## "rbind" 1.40 100.00 0.18 12.86
## "block_exec" 1.40 100.00 0.00 0.00
## "call_block" 1.40 100.00 0.00 0.00
## "eval" 1.40 100.00 0.00 0.00
## "evaluate_call" 1.40 100.00 0.00 0.00
## "evaluate" 1.40 100.00 0.00 0.00
## "func1" 1.40 100.00 0.00 0.00
## "handle" 1.40 100.00 0.00 0.00
## "in_dir" 1.40 100.00 0.00 0.00
## "knitr::knit" 1.40 100.00 0.00 0.00
## "process_file" 1.40 100.00 0.00 0.00
## "process_group.block" 1.40 100.00 0.00 0.00
## "process_group" 1.40 100.00 0.00 0.00
## "rmarkdown::render" 1.40 100.00 0.00 0.00
## "timing_fn" 1.40 100.00 0.00 0.00
## "withCallingHandlers" 1.40 100.00 0.00 0.00
## "withVisible" 1.40 100.00 0.00 0.00
## "factor" 0.46 32.86 0.30 21.43
## "[<-" 0.40 28.57 0.02 1.43
## "[<-.factor" 0.38 27.14 0.30 21.43
## "as.vector" 0.34 24.29 0.34 24.29
## "as.vector.factor" 0.32 22.86 0.00 0.00
## "attr" 0.08 5.71 0.08 5.71
## "is.na" 0.08 5.71 0.08 5.71
## "data.frame" 0.06 4.29 0.00 0.00
## "as.data.frame.character" 0.04 2.86 0.00 0.00
## "as.data.frame" 0.04 2.86 0.00 0.00
## "any" 0.02 1.43 0.02 1.43
## "levels" 0.02 1.43 0.02 1.43
## "sample.int" 0.02 1.43 0.02 1.43
## "sum" 0.02 1.43 0.02 1.43
## "unique" 0.02 1.43 0.02 1.43
## ".deparseOpts" 0.02 1.43 0.00 0.00
## "%in%" 0.02 1.43 0.00 0.00
## "deparse" 0.02 1.43 0.00 0.00
## "mode" 0.02 1.43 0.00 0.00
## "sample" 0.02 1.43 0.00 0.00
##
## $sample.interval
## [1] 0.02
##
## $sampling.time
## [1] 1.4
How to read large files faster? There are different packages to help with this; and also being specific about the data you’re reading will help. readr
is a good option:
system.time(d <- read.csv("~/data/UK-tweets.csv"))
## user system elapsed
## 0.159 0.004 0.169
system.time(d <- read.csv("~/data/UK-tweets.csv", stringsAsFactors=F))
## user system elapsed
## 0.092 0.002 0.095
library(readr)
system.time(d <- read_csv("~/data/UK-tweets.csv"))
## Parsed with column specification:
## cols(
## id = col_double(),
## screen_name = col_character(),
## text = col_character(),
## polite = col_character(),
## sentiment = col_character(),
## communication = col_character(),
## retweet_count = col_integer(),
## followers_count = col_integer(),
## favourites_count = col_integer(),
## created_at = col_character()
## )
## user system elapsed
## 0.035 0.008 0.047