Let’s create the following two vectors with random values sampled from a uniform distribution:
set.seed(777)
n <- 1000000
x <- runif(n)
y <- runif(n)
Which of the following chunks of code will run faster?
# manual loop over each pair of values
loop <- function(){
z <- rep(NA, n)
for (i in 1:n){
z[i] <- x[i] + y[i]
}
return(z)
}
# loop using sapply in base R
sapply_code <- function(){
z <- sapply(1:n, function(i) x[i] + y[i])
return(z)
}
# vectorized code with `+`
vectorized <- function(){
z <- x + y
return(z)
}
system.time(loop())
## user system elapsed
## 0.121 0.005 0.126
system.time(sapply_code())
## user system elapsed
## 1.404 0.042 1.449
system.time(vectorized())
## user system elapsed
## 0.001 0.000 0.001
There are many other cases where vectorized functions can dramatically speed up our code. For example, the ifelse
function.
set.seed(123)
n <- 10000000
ideology <- sample(1:10, n, replace=TRUE)
# vectorized code
group_ideology <- function(){
groups <- ifelse(ideology>5, "right", "left")
return(groups)
}
# non-vectorized code
ideology_loop <- function(){
groups <- rep(NA, n)
for (i in 1:n){
if (ideology[i]<5) groups[i] <- "left"
if (ideology[i]>5) groups[i] <- "right"
}
}
# benchmarks
system.time(group_ideology())
## user system elapsed
## 1.485 0.098 1.585
system.time(ideology_loop())
## user system elapsed
## 2.566 0.017 2.584
The other important way to write more efficient code is to avoid memory copy:
set.seed(123)
n <- 10000
# function to generate n random numbers
func1 <- function(){
rnd <- c()
for (i in 1:n){
rnd <- c(rnd, rnorm(n=1))
}
return(rnd)
}
func2 <- function(){
rnd <- rep(NA, n)
for (i in 1:n){
rnd[i] <- rnorm(n=1)
}
return(rnd)
}
system.time(func1())
## user system elapsed
## 0.183 0.160 0.343
system.time(func2())
## user system elapsed
## 0.019 0.000 0.019
# of course, a WAY more efficient way to run this, with vectorized function...
func3 <- function(){
rnd <- rnorm(n=n)
return(rnd)
}
system.time(func3())
## user system elapsed
## 0 0 0
Memory copy applies as well to data frames. A very frequent scenario looks something like the following..
set.seed(123)
n <- 200
## using rbind
func1 <- function(){
df <- data.frame()
for (i in 1:n){
df <- rbind(df, data.frame(
age = sample(18:100, n, replace=TRUE),
state = sample(state.abb, n, replace=TRUE),
party = sample(c("R", "D", "I"), n, replace=TRUE)
) )
}
return(df)
}
You can avoid it by storing results in a list, and then using do.call(rbind, LIST)
to convert back to a single data frame
## using lists to avoid memory copy
func2 <- function(){
df.list <- list()
for (i in 1:n){
df.list[[i]] <- data.frame(
age = sample(18:100, n, replace=TRUE),
state = sample(state.abb, n, replace=TRUE),
party = sample(c("R", "D", "I"), n, replace=TRUE)
)
}
df <- do.call(rbind, df.list)
return(df)
}
system.time(func1())
## user system elapsed
## 0.263 0.071 0.334
system.time(func2())
## user system elapsed
## 0.065 0.005 0.069
For complex script, how can you find what is slowing your code down? You can use Rprof
Rprof() # start monitoring
invisible(func1()) # run your code (but don't show it!)
Rprof(NULL) # stop
summaryRprof() # see results
## $by.self
## self.time self.pct total.time total.pct
## "rbind" 0.22 73.33 0.30 100.00
## "data.frame" 0.04 13.33 0.08 26.67
## "deparse" 0.02 6.67 0.02 6.67
## "structure" 0.02 6.67 0.02 6.67
##
## $by.total
## total.time total.pct self.time self.pct
## "rbind" 0.30 100.00 0.22 73.33
## "block_exec" 0.30 100.00 0.00 0.00
## "call_block" 0.30 100.00 0.00 0.00
## "eval" 0.30 100.00 0.00 0.00
## "evaluate_call" 0.30 100.00 0.00 0.00
## "evaluate::evaluate" 0.30 100.00 0.00 0.00
## "evaluate" 0.30 100.00 0.00 0.00
## "func1" 0.30 100.00 0.00 0.00
## "handle" 0.30 100.00 0.00 0.00
## "in_dir" 0.30 100.00 0.00 0.00
## "knitr::knit" 0.30 100.00 0.00 0.00
## "process_file" 0.30 100.00 0.00 0.00
## "process_group.block" 0.30 100.00 0.00 0.00
## "process_group" 0.30 100.00 0.00 0.00
## "rmarkdown::render" 0.30 100.00 0.00 0.00
## "timing_fn" 0.30 100.00 0.00 0.00
## "withCallingHandlers" 0.30 100.00 0.00 0.00
## "withVisible" 0.30 100.00 0.00 0.00
## "data.frame" 0.08 26.67 0.04 13.33
## "as.data.frame.character" 0.04 13.33 0.00 0.00
## "as.data.frame" 0.04 13.33 0.00 0.00
## "deparse" 0.02 6.67 0.02 6.67
## "structure" 0.02 6.67 0.02 6.67
## "%in%" 0.02 6.67 0.00 0.00
## "as.data.frame.vector" 0.02 6.67 0.00 0.00
## "deparse1" 0.02 6.67 0.00 0.00
## "mode" 0.02 6.67 0.00 0.00
## "paste" 0.02 6.67 0.00 0.00
##
## $sample.interval
## [1] 0.02
##
## $sampling.time
## [1] 0.3