Data Aggregation in R
Examples of using different functions for data aggregation and comparison of their performance:
> #data simulation
> set.seed(2211984)
> DF <- data.frame(x=sample(1:3, 10000000, rep=T), y=runif(10000000,1,100),
z=rnorm(10000000,10,2))
>
> #tapply - basic function
> tapply(DF$y, DF$x, mean)
1 2 3
50.50248 50.52115 50.50778
> tapply(DF$z, DF$x, sd)
1 2 3
1.999511 1.998869 1.998408
>
> #aggregate - basic function
> aggregate(DF$y, list(group=DF$x), FUN=mean)
group x
1 1 50.50248
2 2 50.52115
3 3 50.50778
> aggregate(DF$z, list(group=DF$x), FUN=sd)
group x
1 1 1.999511
2 2 1.998869
3 3 1.998408
>
> #ddply - plyr library
> library(plyr)
> ddply(DF, .(x), summarise, avg_y=mean(y), sd_z=sd(z))
x avg_y sd_z
1 1 50.50248 1.999511
2 2 50.52115 1.998869
3 3 50.50778 1.998408
>
> #sql query - sqldf library
> library(sqldf)
> sqldf("select avg(y) as avg_y, stdev(z) as sd_z from DF group by x")
avg_y sd_z
1 50.50248 1.999511
2 50.52115 1.998869
3 50.50778 1.998408
>
> #data.table objects - data.table library
> library(data.table)
data.table 1.8.2 For help type: help("data.table")
> DT <- data.table(DF)
> DT
x y z
1: 1 12.133576 11.947320
2: 2 44.485393 6.290101
3: 2 71.566670 10.280873
4: 2 88.883879 11.121398
5: 1 3.952848 8.688182
---
9999996: 3 17.317273 10.085156
9999997: 3 64.856928 8.250676
9999998: 2 6.489453 8.812301
9999999: 3 94.344257 8.203418
10000000: 3 3.267286 6.688272
> identical(DT$x,DF$x)
[1] TRUE
> identical(DT$y,DF$y)
[1] TRUE
>
> DT[, sum(y), by=x]
x V1
1: 1 168310537
2: 2 168424040
3: 3 168370154
> DT[,list(avg_y=mean(y), sd_z=sd(z)), by=x]
x avg_y sd_z
1: 1 50.50248 1.999511
2: 2 50.52115 1.998869
3: 3 50.50778 1.998408
>
> #function performance - system time
> system.time(tapply(DF$y, DF$x, mean)) + system.time(tapply(DF$z, DF$x, sd))
user system elapsed
13.18 0.52 13.68
> system.time(aggregate(DF$y, list(group=DF$x), FUN=mean)) +
system.time(aggregate(DF$z, list(group=DF$x), FUN=sd))
user system elapsed
29.65 1.03 30.76
> system.time(ddply(DF, .(x), summarise, avg_y=mean(y), sd_z=sd(z)))
user system elapsed
2.23 0.86 3.09
> system.time(sqldf("select sum(y) as avg_y, stdev(z) as sd_z from DF group
by x"))
user system elapsed
33.83 2.85 37.11
> system.time(DT[,list(avg_y=mean(y), sd_z=sd(z)), by=x])
user system elapsed
0.7 0.0 0.7
> sessionInfo()
R version 2.15.2 (2012-10-26)
Platform: x86_64-w64-mingw32/x64 (64-bit)
locale:
[1] LC_COLLATE=English_United States.1252 LC_CTYPE=English_United
States.1252
[3] LC_MONETARY=English_United States.1252 LC_NUMERIC=C
[5] LC_TIME=English_United States.1252
attached base packages:
[1] tcltk stats graphics grDevices utils datasets methods
base
other attached packages:
[1] data.table_1.8.2 sqldf_0.4-6.4 RSQLite.extfuns_0.0.1
RSQLite_0.11.2
[5] chron_2.3-42 gsubfn_0.6-5 proto_0.3-9.2
DBI_0.2-5
[9] plyr_1.7.1
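Conclusion: data.table rocks. Measured by elapsed time, it is more than 4 times faster than the ddply function, 19 times faster than the tapply
function, 44 times faster than the aggregate function and 53 times faster than the sqldf function. (Note that the timed sqldf query computes sum(y) rather than avg(y), so that comparison is not strictly like-for-like.)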