Please help me figure out an efficient way to merge these two data frames without using a for loop. There are many more columns and rows, but I simplified the data for this example.
I am looking to:
- Left join: keep every row of df intact and bring over the D column from the lookup.
- Join based on two columns.
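- First, match on column x with a partial (substring) match rather than an exact one. I want to take the x from the df and see if ANY x in the lookup is a partial string match (the lookup x string is inside the df x string); for example, "Arlington VA" contains "Arlington". If there is no match (for example, "San Francisco" contains none of the lookup x strings), then I want it to use the "All Else" x value.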
- Then, after picking the x value, I want to match the y column exactly and return the corresponding D value.
Here are the two tables I start with:
# Left-hand table: one row per record, with a free-text x and a numeric y key.
# Built with structure() so the tibble classes are attached without needing
# the tibble package to be loaded.
df <- structure(
  list(
    x = c(
      "San Francisco",
      "Work at Home",
      "Arlington VA",
      "Work at Home",
      "Arlington"
    ),
    y = c(1, 5, 1, 6, 2)
  ),
  row.names = c(NA, -5L),
  class = c("tbl_df", "tbl", "data.frame")
)
# Lookup table: 11 x labels (10 named locations plus the "All Else" fallback),
# each paired with y = 1..13, giving 143 rows. D is listed one row per x label,
# in y order, so a value can be found by (label row, y position).
lookup <- structure(
  list(
    x = rep(
      c(
        "Arlington", "Chicago", "San Diego", "Lisle", "Brea", "Boston",
        "Austin", "Dallas", "Miami", "Bedford", "All Else"
      ),
      each = 13L
    ),
    # as.numeric() keeps y a double vector, matching the y column of df.
    y = rep(as.numeric(1:13), times = 11L),
    D = c(
      # Arlington
      0.88, 0.7, 0.19, 0.12, 0.26, 0.68, 0.1, 1, 0.68, 0.96, 0.75, 0.08, 0.25,
      # Chicago
      0.3, 0.64, 0.35, 0.94, 0.21, 0.15, 0.19, 0.84, 0.94, 0.03, 0.39, 0.42,
      0.76,
      # San Diego
      0.48, 0.71, 0.75, 0.87, 0.18, 0.53, 0.45, 0.1, 0.66, 0.01, 0.22, 0.11,
      0.79,
      # Lisle
      0.82, 0.11, 0.66, 0.91, 0.59, 0.55, 0.66, 0.29, 0.58, 0.26, 0.36, 0.07,
      0.47,
      # Brea
      0.47, 0.45, 0.15, 0.07, 0.49, 0.67, 0.8, 0.82, 0.89, 0.36, 0.3, 0.57,
      0.44,
      # Boston
      0.09, 0.59, 0.65, 0.12, 0.05, 0.87, 0.47, 0.24, 0.17, 0.56, 0.13, 0.84,
      0.17,
      # Austin
      0.61, 0.73, 0.31, 0.79, 0.64, 0.6, 0.63, 0.36, 0.41, 0.15, 0.79, 0.59,
      0.2,
      # Dallas
      0.59, 0.89, 0.46, 0.77, 0.79, 0.5, 0.99, 0.22, 0.77, 0.9, 0.86, 0.6,
      0.41,
      # Miami
      0.95, 0.38, 0.86, 0.82, 0.68, 0.3, 0.75, 0.29, 0.16, 0.88, 0.3, 0.53,
      0.14,
      # Bedford
      0.23, 0.16, 0.88, 0.93, 0.63, 0.41, 0.72, 0.58, 0.58, 0.63, 0.66, 0.98,
      0.25,
      # All Else
      0.68, 0.92, 0.67, 0.67, 0.11, 0.16, 0.3, 0.36, 0.32, 0.66, 0.34, 0.89,
      0.33
    )
  ),
  row.names = c(NA, -143L),
  class = c("tbl_df", "tbl", "data.frame")
)
Here is my desired output:
# Expected result: the rows of df, unchanged and in the same order, plus the
# D value picked from lookup for each (x, y) pair.
output <- structure(
  list(
    x = c(
      "San Francisco",
      "Work at Home",
      "Arlington VA",
      "Work at Home",
      "Arlington"
    ),
    y = c(1, 5, 1, 6, 2),
    # Rows 1, 2 and 4 fall back to "All Else"; rows 3 and 5 use "Arlington".
    D = c(0.68, 0.11, 0.88, 0.16, 0.7)
  ),
  row.names = c(NA, -5L),
  class = c("tbl_df", "tbl", "data.frame")
)