Store result from nested for loop with two factor variables

Question

I'd like to generate some simple calculations for each combination of two factor variables and store the results in a data frame. Here are the data:

df <- data.frame(SPECIES = as.factor(c(rep("SWAN",10), rep("DUCK",4), rep("GOOS",12),
                             rep("PASS",9), rep("FALC",10))),
                 DRAINAGE = as.factor(c(rep(c("Central", "Upper", "West"),15))),
                 CATCH_QTY = c(1,1,2,5,6,1,2,1,1,1,1,1,3,1,1,2,1,1,2,2,
                               rep(1,25)),
                 TAGGED = c(rep("T",6),NA,"T","T",NA,rep("T",10),NA,NA,
                            rep("T",9),NA,"T","T","T","T",NA,rep("T",7),NA),
                 RECAP = c(rep(NA,6),"RC",NA,NA,"RC",rep(NA,10),"RC","RC",
                            rep(NA,9),"RC",NA,NA,NA,NA,"RC",rep(NA,7),"RC"))

And here is the function:

myfunction <- function(dat, yr, spp, drain){
  dat <- dat %>% filter(SPECIES == spp, DRAINAGE == drain)

    estimatea <<-
      dat %>%
      summarise(NumCaught = sum(CATCH_QTY, na.rm = T),
            NewTags = sum(!is.na(TAGGED)), 
            Recaps = sum(!is.na(RECAP)),
            TotTags = sum(NewTags+Recaps))
    
    dataTest1 <- cbind(yr, spp, drain, estimatea$NumCaught, estimatea$NewTags,
                       estimatea$Recaps, estimatea$TotTags)
}

I've mostly been experimenting with nested for loops and have been struggling to store the output in a dataframe given that the variables over which I am iterating are factors, rather than numeric, therefore a lot of the existing answers on stack exchange aren't relevant. The answer for iterating over factors here does not show how to store the output.

Some examples of my attempts:

out <- list()

for (i in seq_along(levels(df$SPECIES))) {
  for (j in seq_along(levels(df$DRAINAGE))) {
    out[i,j] <- myfunction(df, "2023", i, j)
  }
}

Error in out[i, j] <- myfunction(df, "2023", i, j) : 
  incorrect number of subscripts on matrix

for (i in seq_along(levels(df$SPECIES))) {
  for (j in seq_along(levels(df$DRAINAGE))) {
    out[i+1,j+1] <- myfunction(df, "2023", i, j)
  }
}

Error in out[i, j] <- myfunction(df, "2023", i, j) : 
  incorrect number of subscripts on matrix

I've also considered some non-for loop options, e.g.,

combos <- expand.grid(df$SPECIES, df$DRAINAGE) %>% distinct() %>%
  drop_na() %>% rename(spp = Var1, drain = Var2)
test <- myfunction(df, "2023", combos$spp, combos$drain) #generates incorrect results

sapply(combos$spp, function(x) mapply(myfunction,x,combos$drain))
apply(combos, 2, FUN = myfunction)
Error in UseMethod("filter") : 
  no applicable method for 'filter' applied to an object of class "character"

Ideally, the output dataframe would look something like this:

desired_out <- data.frame(yr = rep("2023",3),
                          spp = c("DUCK", "DUCK", "GOOS"),
                          drain = c("West", "Central", "Upper"),
                          V4 = c(1,3,4),
                          V5 = c(1,1,3),
                          v6 = c(0,0,1),
                          V7 = c(1,1,4))

shaun_m · Accepted Answer · 2024-05-21 22:57:09Z

To get your desired output, dplyr functions do what you need without getting into loops or *apply functions:

df %>%
  group_by(SPECIES, DRAINAGE) %>%
  summarise(
    NumCaught = sum(CATCH_QTY, na.rm = T),
    NewTags = sum(!is.na(TAGGED)), 
    Recaps = sum(!is.na(RECAP)),
    TotTags = NewTags+Recaps
  )
#>    SPECIES DRAINAGE NumCaught NewTags Recaps TotTags
#> 1     SWAN  Central         9       2      2       4
#> 2     SWAN    Upper         8       3      0       3
#> 3     SWAN     West         4       3      0       3
#> 4     DUCK    Upper         2       2      0       2
#> 5     DUCK     West         1       1      0       1
#> 6     DUCK  Central         3       1      0       1
#> 7     GOOS     West         4       3      1       4
#> 8     GOOS  Central         6       3      1       4
#> 9     GOOS    Upper         5       4      0       4
#> 10    PASS     West         3       3      0       3
#> 11    PASS  Central         3       3      0       3
#> 12    PASS    Upper         3       2      1       3
#> 13    FALC     West         4       3      1       4
#> 14    FALC  Central         3       2      1       3
#> 15    FALC    Upper         3       3      0       3

If your real-life requirements are more complicated and you really need to use some kind of looping I would recommend th *apply family of functions. In your example, nested lapply()s will get to the same result:

# the function can just return the summarised 1-row data frame,
# no need to update estimatea
myfunction <- function(dat, yr, spp, drain){
  dat %>% filter(SPECIES == spp, DRAINAGE == drain) %>%
    summarise(NumCaught = sum(CATCH_QTY, na.rm = T),
              NewTags = sum(!is.na(TAGGED)), 
              Recaps = sum(!is.na(RECAP)),
              TotTags = sum(NewTags+Recaps),
              .by = c(SPECIES, DRAINAGE))
}

# nested lapply() to create lists of 1-row data frames
# (use levels(df$SPECIES) not seq_along() because we want
# the character strings, not the numeric index)
outputs <- lapply(levels(df$SPECIES),
       function(x) {
         lapply(levels(df$DRAINAGE),
                function(y) myfunction(df, "2023", x, y)
                )
       })

# bind them together into 1 data frame
do.call(bind_rows, outputs)
#>    SPECIES DRAINAGE NumCaught NewTags Recaps TotTags
#> 1     DUCK  Central         3       1      0       1
#> 2     DUCK    Upper         2       2      0       2
#> 3     DUCK     West         1       1      0       1
#> 4     FALC  Central         3       2      1       3
#> 5     FALC    Upper         3       3      0       3
#> 6     FALC     West         4       3      1       4
#> 7     GOOS  Central         6       3      1       4
#> 8     GOOS    Upper         5       4      0       4
#> 9     GOOS     West         4       3      1       4
#> 10    PASS  Central         3       3      0       3
#> 11    PASS    Upper         3       2      1       3
#> 12    PASS     West         3       3      0       3
#> 13    SWAN  Central         9       2      2       4
#> 14    SWAN    Upper         8       3      0       3
#> 15    SWAN     West         4       3      0       3

Generally caution is advised when using the <<- operator as it affects global variables outside of the function's scope. *apply functions are normally cleaner to use in R.

Edit:

As @Parfait points out in the comments by() is the more appropriate function to use here. It subsets the data frame and does the summarisation without the need for nested lapply() calls.

# the function to return the summarised 1-row data frame
# it doesn't need to filter the data frame as `by()` does that for us
myfunction <- function(dat, yr){
  dat %>%
    summarise(NumCaught = sum(CATCH_QTY, na.rm = T),
              NewTags = sum(!is.na(TAGGED)), 
              Recaps = sum(!is.na(RECAP)),
              TotTags = sum(NewTags+Recaps),
              .by = c(SPECIES, DRAINAGE))
}

# by() to create lists of 1-row data frames
outputs <- by(df, df[, c("SPECIES", "DRAINAGE")], myfunction, yr = "2023")

# bind them together into 1 data frame
do.call(bind_rows, outputs)
#>    SPECIES DRAINAGE NumCaught NewTags Recaps TotTags
#> 1     DUCK  Central         3       1      0       1
#> 2     FALC  Central         3       2      1       3
#> 3     GOOS  Central         6       3      1       4
#> 4     PASS  Central         3       3      0       3
#> 5     SWAN  Central         9       2      2       4
#> 6     DUCK    Upper         2       2      0       2
#> 7     FALC    Upper         3       3      0       3
#> 8     GOOS    Upper         5       4      0       4
#> 9     PASS    Upper         3       2      1       3
#> 10    SWAN    Upper         8       3      0       3
#> 11    DUCK     West         1       1      0       1
#> 12    FALC     West         4       3      1       4
#> 13    GOOS     West         4       3      1       4
#> 14    PASS     West         3       3      0       3
#> 15    SWAN     West         4       3      0       3

You missed an important apply family member, by (object-oriented wrapper to tapply): outputs <- by(df, df[c("SPECIES", "DRAINAGE")], FUN=funcOnEachSubset)
Thanks @Parfait. I wasn't familiar with by(). I have added this suggestion to my answer.
BTW: dplyr::bind_rows (unlike base::rbind) does not need do.call. Simply, run bind_rows(outputs).
My actual function has many more parts which make the dplyr::group_by( ) option not possible. However, your other solution worked! Thank you so much

Collectives™ on Stack Overflow

Store result from nested for loop with two factor variables

1 Answer 1

4 Comments

Your Answer

Linked

Hot Network Questions

Collectives™ on Stack Overflow

1 Answer 1

4 Comments

Your Answer

Sign up or log in

Post as a guest

Linked

Related