Вот один из способов сделать это:
#' Most frequent companion value for one level of a key vector
#'
#' Looks at the elements of `string` whose corresponding `code` equals
#' `level`, ignores missing values, and returns the value that occurs most
#' often.
#'
#' @param level A single value to look for in `code`.
#' @param code Vector of keys, parallel to `string`; may contain `NA`.
#' @param string Vector of values, parallel to `code`; may contain `NA`.
#' @return A length-one character vector holding the most frequent value.
#'   When several values share the top count, the one that sorts last among
#'   them is returned. `NA_character_` is returned when no non-missing value
#'   of `string` is paired with `level`, so the result always has length one.
get_commonest <- function(level, code, string) {
  # which() drops positions where the comparison is NA (missing keys).
  matched <- string[which(code == level)]
  matched <- matched[!is.na(matched)]

  # Without this guard the function would return NULL, which makes
  # sapply()/vapply() callers produce a list or fail outright.
  if (length(matched) == 0L) {
    return(NA_character_)
  }

  # Ascending, stable sort: the last entry has the highest count, and among
  # ties it is the name that comes last in the table's own ordering.
  ranked <- sort(table(matched))
  names(ranked)[length(ranked)]
}
# Fill missing CODE / STRING values in `df` with the most common partner:
# a missing CODE gets the code seen most often with that row's STRING, and a
# missing STRING gets the string seen most often with that row's CODE.
# Rows where both columns are missing stay NA.

# Distinct non-missing keys to build the lookup tables from.
codes <- unique(df$CODE[!is.na(df$CODE)])
strings <- unique(df$STRING[!is.na(df$STRING)])

# Wrap get_commonest() so every call yields exactly one string; an empty
# result becomes NA, which keeps vapply() type-stable.
commonest_or_na <- function(level, code, string) {
  hit <- get_commonest(level, code, string)
  if (length(hit) == 0L) NA_character_ else as.character(hit)
}

# Named character vectors: names are the keys, values are the defaults.
default_strings <- vapply(
  codes, commonest_or_na,
  FUN.VALUE = character(1), df$CODE, df$STRING
)
default_codes <- vapply(
  strings, commonest_or_na,
  FUN.VALUE = character(1), df$STRING, df$CODE
)

# Look up by name with `[` on a vector, which matches names exactly.
# Indexing data-frame rows by character would partially match row names,
# so a truncated key could silently pick up another key's default.
missing_code <- is.na(df$CODE)
df$CODE[missing_code] <- unname(default_codes[df$STRING[missing_code]])

# CODE is filled first so rows that just received a code can be used here.
missing_string <- is.na(df$STRING)
df$STRING[missing_string] <- unname(default_strings[df$CODE[missing_string]])
Это даёт следующий результат:
# Display df at the console; the `#>` comment lines below record the printed
# output (row names in the first column, then CODE and STRING).
df
#> CODE STRING
#> 2 A1 broken bones
#> 3 A1 broken bones
#> 4 A1 broken bones
#> 5 A1 bones, broken
#> 6 A1 bones, fracture
#> 7 A1 broken bones
#> 8 B1 red blood cells
#> 9 B1 red blood cells
#> 10 B1 blood cells, red
#> 11 B1 red blood cells
#> 12 B1 erythrocytes
#> 13 A1 broken bones
#> 14 C1 liver disease
#> 15 C1 liver disease
#> 16 C1 hepatic illness
#> 17 C1 liver disease
#> 18 C1 disease, liver
#> 19 C1 liver disease
Данные, приведенные в вопросе, представлены в воспроизводимом формате здесь:
# Example input: 18 rows (row names 2..19) with a CODE column and a STRING
# column, each containing a few missing values to be filled in.
df <- data.frame(
  CODE = c(
    "A1", "A1", NA, "A1", "A1", "A1",
    "B1", "B1", "B1", "B1", "B1", NA,
    "C1", "C1", "C1", "C1", "C1", NA
  ),
  STRING = c(
    "broken bones", "broken bones", "broken bones",
    "bones, broken", "bones, fracture", NA,
    "red blood cells", "red blood cells", "blood cells, red",
    NA, "erythrocytes", "broken bones",
    "liver disease", "liver disease", "hepatic illness",
    NA, "disease, liver", "liver disease"
  ),
  row.names = 2:19,
  stringsAsFactors = FALSE
)