Optimize loop in R

Question

I would like to optimize the following R code. The loop takes a very long time to run; could someone help me make it faster? Thank you all!

SIN_FM5 is a data frame with about 300,000 rows and 7 columns.

# Make Combination ----
# The grouping key is currently a plain copy of the SINISTRE column.
SIN_FM5$Combination <- SIN_FM5$SINISTRE

# One row per distinct combination: Var1 (the combination, as a factor) and
# Freq (the number of rows of SIN_FM5 that carry it).
Count.Comb <- data.frame(table(SIN_FM5$Combination))

# Calculate number of combinations ----
Total.Comb <- nrow(Count.Comb)

# Calculate statistics for all combinations ----
# Partition the payment amounts by combination in ONE pass instead of calling
# subset() on the whole data frame once per combination. split() orders its
# groups by factor level, which is the same order as the rows of Count.Comb.
Montant.By.Comb <- split(SIN_FM5$MONTANT_PAIEMENT, SIN_FM5$Combination)

# Statistics is a Total.Comb x 5 numeric matrix, one row per combination:
#   column 1: position of the combination in Count.Comb (the value the
#             previous loop obtained by coercing the factor Count.Comb[i, 1])
#   column 2: mean of MONTANT_PAIEMENT
#   column 3: median of MONTANT_PAIEMENT
#   column 4: minimum of MONTANT_PAIEMENT
#   column 5: maximum of MONTANT_PAIEMENT
# seq_len() and vapply() also behave sensibly when there are no combinations.
Statistics <- unname(cbind(
  seq_len(Total.Comb),
  vapply(Montant.By.Comb, mean, numeric(1)),
  vapply(Montant.By.Comb, median, numeric(1)),
  vapply(Montant.By.Comb, min, numeric(1)),
  vapply(Montant.By.Comb, max, numeric(1))
))

# Final table: combination, frequency and the five statistics columns.
resultatN <- cbind(Count.Comb, Statistics)

# Print a reproducible description of the first rows of the data.
dput(head(SIN_FM5))

Console~/

    "TRSP-5194", "TRSP-5197", "TRSP-5201", "TRSP-5202", "TRSP-5204", 
"TRSP-5205", "TRSP-5207", "TRSP-5212", "TRSP-5214", "TRSP-5215", 
"TRSP-5218", "TRSP-5222", "TRSP-5230", "TRSP-5238", "TRSP-5243", 
"TRSP-5247", "TRSP-5248", "TRSP-5253", "TRSP-5254", "TRSP-5255", 
"TRSP-5257", "TRSP-5259", "TRSP-5262", "TRSP-5263", "TRSP-5266", 
"TRSP-5267", "TRSP-5268", "TRSP-5270", "TRSP-5271", "TRSP-5274", 
"TRSP-5277", "TRSP-5279", "TRSP-5281", "TRSP-5283", "TRSP-5288", 
"TRSP-5289", "TRSP-5293", "TRSP-5296", "TRSP-5299", "TRSP-5301", 
"TRSP-5303", "TRSP-5304", "TRSP-5306", "TRSP-5308", "TRSP-5310", 
"TRSP-5311", "TRSP-5312", "TRSP-5313", "TRSP-5335", "TRSP-5343", 
"TRSP-5348", "TRSP-5352", "TRSP-5357", "TRSP-5363", "TRSP-5366", 
"TRSP-5372", "TRSP-5373", "TRSP-5384", "TRSP-5386", "TRSP-5388", 
"TRSP-5391", "TRSP-5392", "TRSP-5428", "TRSP-5436", "VANBILSENYolanda", 
"VanLierop", "VirgaJesseZiekenhuis", "WanetGeorges", "WANETThierry", 
"WILLEMSMichel", "WUESTENBERGHSAlain", "X01", "X01CR", "X02CR", 
"X03CR", "X04CR", "X05CR", "X06CR", "X07CR", "Y01", "Y01CR", 
"Y02", "ZOPO-5344"), class = "factor"), Combination = c(73010009L, 
73010009L, 73010014L, 73010014L, 73010014L, 73010014L)), .Names = c("SINISTRE", 
"victimeid", "Nature.Injury", "LocationL", "DurationITT", "Code.Nace2008", 
"POLICE", "TYPE_DE_PAIEMENT", "MODE_DE_PAIEMENT", "CODE_NATURE_DE_PAIEMENT", 
"MONTANT_PAIEMENT", "BENEFICIARE", "Combination"), row.names = c(NA, 
6L), class = "data.frame")

class = "factor"), Combination = c(73010009L, 73010009L, 73010014L, 73010014L, 73010014L, 73010014L)), .Names = c("SINISTRE", "victimeid", "Nature.Injury", "LocationL", "DurationITT", "Code.Nace2008", "POLICE", "TYPE_DE_PAIEMENT", "MODE_DE_PAIEMENT", "CODE_NATURE_DE_PAIEMENT", "MONTANT_PAIEMENT", "BENEFICIARE", "Combination"), row.names = c(NA, 6L), class = "data.frame") — Yasmine Nouri
– Yasmine Nouri, Commented Jul 1, 2012 at 23:20
It also gives all the values of variable "BENEFICIARE" of my data frame SIN_FM5 — Yasmine Nouri
– Yasmine Nouri, Commented Jul 1, 2012 at 23:21
Can you put the complete output of dput(head(SIN_FM5)) in your question. — mnel
– mnel, Commented Jul 1, 2012 at 23:32
I couldn't put the whole output because the number of characters is limited. — Yasmine Nouri
– Yasmine Nouri, Commented Jul 1, 2012 at 23:48

IRTFM · Accepted Answer · 2012-07-02 00:08:21Z

5

# Simulated stand-in for the real data: 100 rows with a grouping key drawn
# from 1..10 (with replacement) and a standard-normal payment amount.
SIN_FM5 <- data.frame(
  Combination = sample(1:10, 100, replace = TRUE),
  MONTANT_PAIEMENT = rnorm(100)
)

# Summarise the payment amount within each Combination group.
# by() hands every group's rows to the function as a data frame; the function
# returns a one-row data frame holding the group size, mean, median, minimum
# and maximum of MONTANT_PAIEMENT.
bySIN <- by(SIN_FM5, list(SIN_FM5[["Combination"]]), FUN = function(subd) {
  montant <- subd$MONTANT_PAIEMENT
  data.frame(
    counts = nrow(subd),
    meanMont = mean(montant),
    medMont = median(montant),
    minMont = min(montant),
    maxMont = max(montant)
  )
})
> sapply(bySIN, as.vector)
         1         2           3          4          5          6          7          8         
counts   11        7           14         9          11         16         10         8         
meanMont 0.3499753 -0.188964   0.1740817  -0.1505312 -0.6335896 -0.1434513 -0.2148642 -0.2978299
medMont  0.4381513 -0.06965143 0.05762425 -0.2247187 -0.7682626 -0.2288606 -0.1467318 -0.3315809
minMont  -1.122418 -0.9638749  -1.634259  -1.336908  -2.068224  -1.974108  -2.15415   -1.295045 
maxMont  1.50662   0.1653189   1.215114   1.243138   0.4643551  1.29805    1.154282   0.7097163 
         9          10       
counts   4          10       
meanMont 0.2575141  0.146613 
medMont  0.07692888 0.1047567
minMont  -0.534418  -1.006938
maxMont  1.410617   1.4973

Here's a data.table solution. Likely to be much faster:

# library() stops with an error if data.table is not installed, whereas
# require() only warns and returns FALSE, letting the script fail later.
library(data.table)

# Convert the data frame to a data.table and key it on the grouping column.
dtb <- data.table(SIN_FM5)
setkey(dtb, "Combination")

# One row per Combination: group size (.N) plus the mean, median, minimum and
# maximum of MONTANT_PAIEMENT within the group. Left as a top-level
# expression so the result is printed.
dtb[, list(
  counts = .N,
  meanMont = mean(MONTANT_PAIEMENT),
  medMont = median(MONTANT_PAIEMENT),
  minMont = min(MONTANT_PAIEMENT),
  maxMont = max(MONTANT_PAIEMENT)
), by = "Combination"]
#-----------------------------------------------
      Combination counts   meanMont     medMont    minMont   maxMont
 [1,]           1     11  0.3499753  0.43815133 -1.1224179 1.5066198
 [2,]           2      7 -0.1889640 -0.06965143 -0.9638749 0.1653189
 [3,]           3     14  0.1740817  0.05762425 -1.6342586 1.2151136
 [4,]           4      9 -0.1505312 -0.22471868 -1.3369085 1.2431378
 [5,]           5     11 -0.6335896 -0.76826264 -2.0682244 0.4643551
 [6,]           6     16 -0.1434513 -0.22886060 -1.9741083 1.2980502
 [7,]           7     10 -0.2148642 -0.14673175 -2.1541500 1.1542819
 [8,]           8      8 -0.2978299 -0.33158086 -1.2950452 0.7097163
 [9,]           9      4  0.2575141  0.07692888 -0.5344180 1.4106168
[10,]          10     10  0.1466130  0.10475674 -1.0069382 1.4972998

edited Jul 2, 2012 at 0:08

answered Jul 1, 2012 at 23:48

IRTFM

264k22 gold badges381 silver badges503 bronze badges

Sign up to request clarification or add additional context in comments.

4 Comments

Yasmine Nouri Over a year ago

It sends me this message back : Error in aggregate.data.frame(SIN_FM5, by = SIN_FM5$Combination, FUN = function(subd) { : 'by' must be a list

IRTFM Over a year ago

I replaced the aggregate solution with one based on by

Yasmine Nouri Over a year ago

It seems to work but is it possible to have the statistical computations of "montant_paiement" for each SIN_FM5$SINISTRE (this is an ID number for each victim)?

IRTFM Over a year ago

To add classification dimensions to the by-solution you would add items to the list of by factors. To do the same for the data.table-solution you would add the extra column names to the by= argument (and, optionally, to the setkey() arguments as well).

Collectives™ on Stack Overflow

Optimize loop in R

1 Answer 1

4 Comments

Your Answer

Hot Network Questions

Collectives™ on Stack Overflow

1 Answer 1

4 Comments

Your Answer

Sign up or log in

Post as a guest

Related