0 down vote favorite
I have two data frame loc_df and and city_df (city and country) now loc_df has 5 column but considering only 2 here (Organization.Location.1 and Organization.Location.2) with 35000 row and city_df has 2 column (city and country) with 1000 rows. Now I am taking one value from city cloumn and matching with organisation column using grepl (for text matching ) and for loop(for iteration). I also have to maintain a index that's why I am using for loop. But this is taking huge amount of time.
I am trying to replace each city, state, province name to their country name in organization columns.
Please help me to optimize this code. I am very new to R.
for(k in 1:2){
if(k==1){
for (i in 1:nrow(city_df)) {
x1 <- paste(" ", city_df$City[i], sep = "")
x2 <- paste(" ", city_df$City[i], " ", sep = "")
x3 <- paste(city_df$City[i], " ", sep = "")
# print(x1)
for (j in 1:nrow(loc_df)) {
#print(loc_df$Organization.Location.1[j])
if (grepl(x1, loc_df$Organization.Location.1[j]) |
grepl(x2, loc_df$Organization.Location.1[j]) |
grepl(x3, loc_df$Organization.Location.1[j])) {
loc_df$org_new1[j] <- city_df$Country[i]
break
}
}
}
}
if(k==2){
for (i in 1:nrow(city_df)) {
x1 <- paste(" ", city_df$City[i], sep = "")
x2 <- paste(" ", city_df$City[i], " ", sep = "")
x3 <- paste(city_df$City[i], " ", sep = "")
for (j in 1:nrow(loc_df)) {
if (grepl(x1, loc_df$Organization.Location.2[j]) |
grepl(x2, loc_df$Organization.Location.2[j]) |
grepl(x3, loc_df$Organization.Location.3[j])) {
loc_df$org_new1[j] <- city_df$Country[i]
break
}
}
}
}
}
this is sample data I have generated using dput of city_df
structure(list(City = c("zug", "canton of zug", "zimbabwe",
"zigong chengdu", "zhuhai guangdong china", "zaragoza spain"), Country = c("switzerland",
"switzerland", "zimbabwe", "china", "china", "spain"
)), .Names = c("City", "Country"), row.names = c(NA, 6L), class = "data.frame")
sample of loc_df
structure(list(Organization.Location.1 = c("zug switzerland",
"zug canton of zug switzerland", "zimbabwe", "zigong chengdu pr china",
"zhuhai guangdong china", "zaragoza spain"), Organization.Location.2 = c("",
"san francisco bay area", "london canada area", "beijing city china",
"greater atlanta area", "paris area france")), .Names = c("Organization.Location.1",
"Organization.Location.2"), row.names = c(NA, 6L), class = "data.frame")