2

I would like to optimize the following R code. The loop below takes a very long time to run; could someone help me make it faster? Thank you all!

SIN_FM5 is a data frame with about 300,000 rows and 7 columns.

# Make Combination
# The grouping key is a straight copy of the SINISTRE column.
SIN_FM5$Combination <- SIN_FM5$SINISTRE

# One row per distinct combination: its value (as a factor) and its frequency.
Count.Comb <- data.frame(table(SIN_FM5$Combination))

# Calculate number of combinations
Total.Comb <- nrow(Count.Comb)

# Loop through all combinations and calculate statistics
# Statistics has one row per combination and five columns:
# group code, mean, median, min and max of MONTANT_PAIEMENT.
Statistics <- array(0, dim = c(Total.Comb, 5))

# seq_len() rather than 1:Total.Comb, so the body is skipped entirely when
# there are no combinations (1:0 would iterate over c(1, 0)).
for (i in seq_len(Total.Comb)) {
  # Rows of SIN_FM5 that belong to the i-th combination.
  Subset <- subset(SIN_FM5, SIN_FM5$Combination == Count.Comb[i, 1])
  montants <- Subset$MONTANT_PAIEMENT

  Statistics[i, ] <- c(
    # Count.Comb[i, 1] is a factor; what ends up in the numeric array is its
    # integer code, made explicit here. The label itself is kept in Count.Comb.
    as.integer(Count.Comb[i, 1]),
    mean(montants),
    median(montants),
    min(montants),
    max(montants)
  )
}

# Final table: combination value, frequency, then the five statistics columns.
resultatN <- cbind(Count.Comb, Statistics)

dput(head(SIN_FM5))

Console~/

    "TRSP-5194", "TRSP-5197", "TRSP-5201", "TRSP-5202", "TRSP-5204", 
"TRSP-5205", "TRSP-5207", "TRSP-5212", "TRSP-5214", "TRSP-5215", 
"TRSP-5218", "TRSP-5222", "TRSP-5230", "TRSP-5238", "TRSP-5243", 
"TRSP-5247", "TRSP-5248", "TRSP-5253", "TRSP-5254", "TRSP-5255", 
"TRSP-5257", "TRSP-5259", "TRSP-5262", "TRSP-5263", "TRSP-5266", 
"TRSP-5267", "TRSP-5268", "TRSP-5270", "TRSP-5271", "TRSP-5274", 
"TRSP-5277", "TRSP-5279", "TRSP-5281", "TRSP-5283", "TRSP-5288", 
"TRSP-5289", "TRSP-5293", "TRSP-5296", "TRSP-5299", "TRSP-5301", 
"TRSP-5303", "TRSP-5304", "TRSP-5306", "TRSP-5308", "TRSP-5310", 
"TRSP-5311", "TRSP-5312", "TRSP-5313", "TRSP-5335", "TRSP-5343", 
"TRSP-5348", "TRSP-5352", "TRSP-5357", "TRSP-5363", "TRSP-5366", 
"TRSP-5372", "TRSP-5373", "TRSP-5384", "TRSP-5386", "TRSP-5388", 
"TRSP-5391", "TRSP-5392", "TRSP-5428", "TRSP-5436", "VANBILSENYolanda", 
"VanLierop", "VirgaJesseZiekenhuis", "WanetGeorges", "WANETThierry", 
"WILLEMSMichel", "WUESTENBERGHSAlain", "X01", "X01CR", "X02CR", 
"X03CR", "X04CR", "X05CR", "X06CR", "X07CR", "Y01", "Y01CR", 
"Y02", "ZOPO-5344"), class = "factor"), Combination = c(73010009L, 
73010009L, 73010014L, 73010014L, 73010014L, 73010014L)), .Names = c("SINISTRE", 
"victimeid", "Nature.Injury", "LocationL", "DurationITT", "Code.Nace2008", 
"POLICE", "TYPE_DE_PAIEMENT", "MODE_DE_PAIEMENT", "CODE_NATURE_DE_PAIEMENT", 
"MONTANT_PAIEMENT", "BENEFICIARE", "Combination"), row.names = c(NA, 
6L), class = "data.frame")
4
  • class = "factor"), Combination = c(73010009L, 73010009L, 73010014L, 73010014L, 73010014L, 73010014L)), .Names = c("SINISTRE", "victimeid", "Nature.Injury", "LocationL", "DurationITT", "Code.Nace2008", "POLICE", "TYPE_DE_PAIEMENT", "MODE_DE_PAIEMENT", "CODE_NATURE_DE_PAIEMENT", "MONTANT_PAIEMENT", "BENEFICIARE", "Combination"), row.names = c(NA, 6L), class = "data.frame") Commented Jul 1, 2012 at 23:20
  • It also gives all the values of variable "BENEFICIARE" of my data frame SIN_FM5 Commented Jul 1, 2012 at 23:21
  • 1
    Can you put the complete output of dput(head(SIN_FM5)) in your question. Commented Jul 1, 2012 at 23:32
  • I couldn't put the whole output because the number of characters is limited. Commented Jul 1, 2012 at 23:48

1 Answer 1

5
# Simulated stand-in for the real data: 100 rows with a grouping key drawn
# from 1..10 (with replacement) and a standard-normal payment amount.
SIN_FM5 <- data.frame(
  Combination = sample(1:10, 100, replace = TRUE),
  MONTANT_PAIEMENT = rnorm(100)
)

# Split SIN_FM5 by Combination and summarise each piece as a one-row data
# frame: row count plus mean, median, min and max of MONTANT_PAIEMENT.
# The result is a "by" object with one element per Combination value.
bySIN <- by(
  SIN_FM5,
  list(SIN_FM5[["Combination"]]),
  FUN = function(subd) {
    # subd holds the rows of SIN_FM5 for a single Combination value.
    montants <- subd$MONTANT_PAIEMENT
    data.frame(
      counts = nrow(subd),
      meanMont = mean(montants),
      medMont = median(montants),
      minMont = min(montants),
      maxMont = max(montants)
    )
  }
)
> sapply(bySIN, as.vector)
         1         2           3          4          5          6          7          8         
counts   11        7           14         9          11         16         10         8         
meanMont 0.3499753 -0.188964   0.1740817  -0.1505312 -0.6335896 -0.1434513 -0.2148642 -0.2978299
medMont  0.4381513 -0.06965143 0.05762425 -0.2247187 -0.7682626 -0.2288606 -0.1467318 -0.3315809
minMont  -1.122418 -0.9638749  -1.634259  -1.336908  -2.068224  -1.974108  -2.15415   -1.295045 
maxMont  1.50662   0.1653189   1.215114   1.243138   0.4643551  1.29805    1.154282   0.7097163 
         9          10       
counts   4          10       
meanMont 0.2575141  0.146613 
medMont  0.07692888 0.1047567
minMont  -0.534418  -1.006938
maxMont  1.410617   1.4973 

Here's a data.table solution. Likely to be much faster:

# library() stops with an error if data.table is not installed, whereas
# require() would only return FALSE and let the script fail later.
library(data.table)

# Convert the data frame to a data.table and key it on the grouping column.
dtb <- as.data.table(SIN_FM5)
setkey(dtb, "Combination")

# One output row per Combination: row count (.N) plus mean, median, min and
# max of MONTANT_PAIEMENT within that group.
dtb[
  ,
  list(
    counts = .N,
    meanMont = mean(MONTANT_PAIEMENT),
    medMont = median(MONTANT_PAIEMENT),
    minMont = min(MONTANT_PAIEMENT),
    maxMont = max(MONTANT_PAIEMENT)
  ),
  by = "Combination"
]
#-----------------------------------------------
      Combination counts   meanMont     medMont    minMont   maxMont
 [1,]           1     11  0.3499753  0.43815133 -1.1224179 1.5066198
 [2,]           2      7 -0.1889640 -0.06965143 -0.9638749 0.1653189
 [3,]           3     14  0.1740817  0.05762425 -1.6342586 1.2151136
 [4,]           4      9 -0.1505312 -0.22471868 -1.3369085 1.2431378
 [5,]           5     11 -0.6335896 -0.76826264 -2.0682244 0.4643551
 [6,]           6     16 -0.1434513 -0.22886060 -1.9741083 1.2980502
 [7,]           7     10 -0.2148642 -0.14673175 -2.1541500 1.1542819
 [8,]           8      8 -0.2978299 -0.33158086 -1.2950452 0.7097163
 [9,]           9      4  0.2575141  0.07692888 -0.5344180 1.4106168
[10,]          10     10  0.1466130  0.10475674 -1.0069382 1.4972998
Sign up to request clarification or add additional context in comments.

4 Comments

It returns this error message: Error in aggregate.data.frame(SIN_FM5, by = SIN_FM5$Combination, FUN = function(subd) { : 'by' must be a list
I replaced the aggregate solution with one based on by
It seems to work but is it possible to have the statistical computations of "montant_paiement" for each SIN_FM5$SINISTRE (this is an ID number for each victim)?
To add classification dimensions to the by-solution you would add items to the list of by factors. To do the same for the data.table-solution you would add factor names to the setkey() arguments.

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.