# Benchmark of a group-wise "collapse" of the storms data done three ways:
# dplyr, data.table, and Stata (driven through RStata). Each approach averages
# wind and pressure and keeps the first category within every
# name/year/month/day group. The results are cross-checked with all.equal()
# and each approach is then timed with microbenchmark.
#
# NOTE: the workspace is intentionally NOT cleared with rm(list = ls());
# run the script in a fresh R session if a clean environment is needed.

# First, let's try it with dplyr, using code from
# https://lost-stats.github.io/Data_Manipulation/collapse_a_data_set.html
library(dplyr)

data("storms", package = "dplyr")

# this takes a bit of time
storms_collapsed <- storms %>%
  group_by(name, year, month, day) %>%
  summarize(
    wind = mean(wind),
    pressure = mean(pressure),
    category = first(category)
  )

# Summary of the collapsed data, used to check that all approaches agree.
output.dplyr <- list(
  avg = mean(storms_collapsed$pressure),
  tab = table(storms_collapsed$category)
)

# now let's benchmark this, running it 100 times
library(microbenchmark)

times.dplyr <- microbenchmark(
  storms_collapsed <- storms %>%
    group_by(name, year, month, day) %>%
    summarize(
      wind = mean(wind),
      pressure = mean(pressure),
      category = first(category)
    ),
  times = 100L
)

# data.table way
# data.table is attached only here, after the dplyr run, so that the dplyr
# code above is not affected by names data.table also exports (e.g. first()).
library(data.table)

setDT(storms) # convert storms to be a data.table (done in place)

# The data.table way to do a collapse resembles Stata, except for the ".()"
# and that we do it within storms[].
storms_collapsed <- storms[
  ,
  .(
    wind = mean(wind),
    pressure = mean(pressure),
    category = first(category)
  ),
  by = .(name, year, month, day)
]

# confirm it is the same
output.dt <- list(
  avg = mean(storms_collapsed$pressure),
  tab = table(storms_collapsed$category)
)
all.equal(output.dplyr, output.dt)

# benchmark it:
times.dt <- microbenchmark(
  storms_collapsed <- storms[
    ,
    .(
      wind = mean(wind),
      pressure = mean(pressure),
      category = first(category)
    ),
    by = .(name, year, month, day)
  ],
  times = 100L
)

# now let's try stata
library(RStata)

# you may need to modify these options for your configuration
options(
  "RStata.StataPath" = "/Applications/Stata/StataMP.app/Contents/MacOS/stata-mp",
  "RStata.StataVersion" = 15
)

# Stata command performing the same collapse.
stata_src <- ' collapse (mean) wind (mean) pressure (first) category, by(name year month day) fast '

storms_collapsed <- stata(
  stata_src,
  data.in = storms,
  data.out = TRUE,
  stata.echo = TRUE
)

output.stata <- list(
  avg = mean(storms_collapsed$pressure),
  tab = table(storms_collapsed$category)
)
all.equal(output.dplyr, output.stata)

times.stata <- microbenchmark(
  # turn off echo for the benchmarking
  storms_collapsed <- stata(
    stata_src,
    data.in = storms,
    data.out = TRUE,
    stata.echo = FALSE
  ),
  times = 100L
)

# compare times
# Each ratio pairs the i-th recorded timing of one approach with the i-th
# recorded timing of data.table.
time.ratios <- times.stata$time / times.dt$time
median(time.ratios) # on my macbook data.table is over 50X faster
hist(time.ratios)

time.ratios <- times.dplyr$time / times.dt$time
median(time.ratios) # on my macbook data.table is over 100X faster
hist(time.ratios)