Следующая функция обеспечивает рабочее, хотя и не элегантное, решение проблемы. Я склонен задумываться над процессами, что, скорее всего, отражено в ответе здесь.
Эта функция будет принимать начальный набор данных и, основываясь на том, что если предусмотрена функция группировки, создаст новый набор данных с дополнительными строками для различных комбинаций агрегированных уровней факторов, если эти уровни существовали в группах. Различные новые уровни могут быть предоставлены в виде списка, а дополнительный столбец позволяет легко увидеть, какие новые уровни были добавлены в дополнение к исходным строкам.
#-----------------------------------------------------------#
# Create function for multilevel labelling of factor groups #
#-----------------------------------------------------------#
# target_col is a character string for the column of interest to be adjusted
# group_col is a character string for the column to check levels that exist within groupings
# new_levels is a list that uses name and value pairs to determine how new levels should be aggregated
# collapse will ensure that only unique combinations of the new level is appended
# track will add a flag to ensure one can easily see the new combinations that were appended
create_multilevelFactor <- function(data, target_col, new_levels , group_col, collapse = T, track = T) {
#
# Do some basic checks on inputs
#
# Check if new_levels is provided as a list
if(!is.list(new_levels)) stop('The provided set of levels is not in a list format, please provide as a list')
# Check if target_col is a factor
if(!is.factor(data[[target_col]])) stop('The target column for multiple levels is not a factor, convert to a factor before proceeding.')
# Check if levels are in list
for(i in 1:length(new_levels)) {
if(length(setdiff(levels(factor(new_levels[[i]])),
levels(factor(data[[target_col]])))) > 0) { # If levels in provided list contain a level not in the column, then throw error
stop('Levels in list do not match the levels in the target column')
}
}
# State if grouping col was provided and its purpose
if(!missing(group_col)) { message(paste0('The following column is used as a grouping variable for summarizing multilevel factoring: ',
group_col, '. If you do not want labels determined by those within groupings, leave argument blank.'))
}
#
# Main
#
# Set new column for tracking if desired
if(track == T) {track_col <- rep(NA,nrow(data)); data$track_col <- 1; trackColIndex <- 1;}
OutData <- as.data.frame(NULL) # Empy data frame to fill and append later
# Loop for all new levels of interest to add
for(i in 1:length(new_levels)){
tempData <- data # Look at fresh data every pass
levelIndex <- which(levels(tempData[[target_col]]) %in% new_levels [[i]]) # Index of matches
# If grouping provided, do necessary splits and rbinds
if(!missing(group_col)) {
tempData <- split(tempData, tempData[[group_col]]) # Split if there are groupings
tempData <- lapply(tempData, function(x) {
if(!(length(setdiff(levels(factor(new_levels [[i]])), levels(factor(x[[target_col]])))) > 0)) { # If the grouping does not have all the levels for the new grouping, then do nothing
levels(x[[target_col]])[levelIndex] <- names(new_levels )[i]
x
}
})
tempData <- do.call(rbind, tempData) # If didnt match necessary group conditions, will bring back empty
rownames(tempData) <- NULL # Correct row names for tibble
} else { # If not grouping
levels(tempData[[target_col]])[levelIndex] <- names(new_levels )[i]
}
tempData <- tempData[tempData[[target_col]] %in% names(new_levels )[i],] # Only keep new factor levels (could be empty if no group matches)
if(collapse == T) tempData <- unique(tempData[(tempData[[target_col]] %in% names(new_levels )[i]),]) # Collapse to unique combinations if desired
if(track == T){track_col <- rep(NA, nrow(tempData)); tempData$track_col <- trackColIndex+1; trackColIndex <- trackColIndex+1;} # Add track column to the new rows
OutData <- suppressWarnings(dplyr::bind_rows(OutData, tempData)) # Append all the new rows
}
# Append new rows to the original rows
OutData <- suppressWarnings(dplyr::bind_rows(data, OutData)) #
return(OutData)
}
Используя изначально предоставленные данные примера, это может привести кследующие выходы:
#Original data
library(tibble)
# Example data (Repeat groups)
exampleData <- tibble(group = c(1, 1, 1, 2, 3, 3),
condition = factor(c('A', 'B', 'C', 'A', 'B', 'Q'), ordered = F))
# Original data
# A tibble: 6 x 2
group condition
<dbl> <fct>
1 1 A
2 1 B
3 1 C
4 2 A
5 3 B
6 3 Q
##################
newData <- create_multilevelFactor(exampleData,
target_col = 'condition',
group_col = 'group',
new_levels = list('AB' = c('A', 'B'), 'QB' = c('Q', 'B')),
collapse = T, track = T)
newData
# Data with grouping argument
# A tibble: 8 x 3
group condition track_col
<dbl> <chr> <dbl>
1 1 A 1
2 1 B 1
3 1 C 1
4 2 A 1
5 3 B 1
6 3 Q 1
7 1 AB 2
8 3 QB 3
addmargins(table(newData$group,newData$condition))
A AB B C Q QB Sum
1 1 1 1 1 0 0 4
2 1 0 0 0 0 0 1
3 0 0 1 0 1 1 3
Sum 2 1 2 1 1 1 8
newData <- create_multilevelFactor(exampleData,
target_col = 'condition',
new_levels = list('AB' = c('A', 'B'), 'QB' = c('Q', 'B')),
collapse = T, track = T)
newData
# Without grouping argument
# A tibble: 11 x 3
group condition track_col
<dbl> <chr> <dbl>
1 1 A 1
2 1 B 1
3 1 C 1
4 2 A 1
5 3 B 1
6 3 Q 1
7 1 AB 2
8 2 AB 2
9 3 AB 2
10 1 QB 3
11 3 QB 3
newData <- create_multilevelFactor(exampleData,
target_col = 'condition',
new_levels = list('AB' = c('A', 'B'), 'QB' = c('Q', 'B')),
collapse = F, track = T)
newData
# Without collapse and grouping argument
# A tibble: 13 x 3
group condition track_col
<dbl> <chr> <dbl>
1 1 A 1
2 1 B 1
3 1 C 1
4 2 A 1
5 3 B 1
6 3 Q 1
7 1 AB 2
8 1 AB 2
9 2 AB 2
10 3 AB 2
11 1 QB 3
12 3 QB 3
13 3 QB 3