Если я правильно понимаю ваш вопрос, и вы ищете, как часто две одинаковые категории равны 1 в одном ряду (например, попарно, как @ M-- спросил), вот как я это сделал в прошлое. Я уверен, что есть более изящный способ сделать это: D
library(dplyr)
library(tidyr)
test.df <- structure(list(Type_SunflowerSeeds = c(1L, 1L, 1L, 0L, 0L, 1L,
1L, 0L, 0L, 0L), Type_SafflowerSeeds = c(0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L), Type_Nyjer = c(0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L), Type_EconMix = c(1L, 1L, 0L, 1L, 1L, 0L, 1L, 0L,
0L, 0L), Type_PremMix = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
1L), Type_Grains = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L),
Type_Nuts = c(0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L), Type_Suet = c(1L,
0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L), Type_SugarWater = c(1L,
0L, 0L, 0L, 1L, 1L, 1L, 0L, 1L, 1L), Type_FruitOrJams = c(0L,
0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L), Type_Mealworms = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), Type_Corn = c(0L, 0L,
0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L), Type_BarkOrPeanutButter = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), Type_Scraps = c(1L,
1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L), Type_Bread = c(0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), Type_Other = c(0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L), total = c(5, 3, 3, 4, 2, 3,
3, 1, 1, 2)), row.names = c(NA, 10L), class = "data.frame")
test.df %>%
mutate(food.id = 1:n()) %>%
gather(key = "type1", value = "val", -food.id, -total) %>% #create an ID column for each row
filter(val==1) %>%
select(food.id, type1) %>% #now we have a data.frame with one column for food.id and
# one column for every food.type it is associated with
left_join( # this left join is essentially doing the same thing we did before
test.df %>%
mutate(food.id = 1:n()) %>%
gather(key = "type2", value = "val", -food.id, -total) %>%
filter(val==1) %>%
select(food.id, type2),
by = c("food.id") # now we're matching each food with all of its associated types
) %>%
mutate(type1.n = as.numeric(factor(type1)), # quick way of making sure we're not counting duplicates
# (e.g. if type1 = Type_SunflowerSeeds and type2 = Type_SafflowerSeeds, that's the same if they were switched)
type2.n = as.numeric(factor(type2))) %>%
filter(type1 > type2) %>% # this filter step takes care of the flip flopping issue
group_by(type1, type2) %>%
summarise( #finally, count the combinations/pairwise values
n.times = n()
) %>%
ungroup() %>%
arrange(desc(n.times), type1, type2)
С выводом:
type1 type2 n.times
<chr> <chr> <int>
1 Type_Scraps Type_EconMix 3
2 Type_SugarWater Type_EconMix 3
3 Type_SunflowerSeeds Type_EconMix 3
4 Type_SunflowerSeeds Type_Scraps 3
5 Type_SunflowerSeeds Type_SugarWater 3
6 Type_Scraps Type_Nuts 2
7 Type_SugarWater Type_Suet 2
8 Type_SunflowerSeeds Type_Suet 2
9 Type_FruitOrJams Type_EconMix 1
10 Type_Nuts Type_EconMix 1
11 Type_Nuts Type_FruitOrJams 1
12 Type_Scraps Type_FruitOrJams 1
13 Type_Suet Type_EconMix 1
14 Type_Suet Type_Scraps 1
15 Type_SugarWater Type_PremMix 1
16 Type_SugarWater Type_Scraps 1
17 Type_SunflowerSeeds Type_Nuts 1
Чтобы расширить это и выполнить подсчет трехсторонней комбинации Вы можете следовать этому коду. Я также добавил несколько дополнительных комментариев, чтобы просмотреть, что происходит:
# create a baseline data.frame with food.id and every food type that it matches
food.type.long.df <- test.df %>%
mutate(food.id = 1:n()) %>%
gather(key = "type1", value = "val", -food.id, -total) %>%
filter(val==1) %>%
select(food.id, type1) %>%
arrange(food.id)
# join the baseline data.frame to itself to see all possible combinations of food types
# note: this includes repeated types like type1=Type_Corn and type2=Type_Corn
# this also includes rows where the types are simply flip-flopped types
# ex. Row 2 is type1=Type_SunflowerSeeds and type2 = Type_EconMix
# but Row 6 is type1=Type_EconMix and type2 = Type_SunflowerSeeds - we don't want to count this combinations twice
food.2types.df <- food.type.long.df %>%
left_join(
select(food.type.long.df, food.id, type2 = type1),
by = "food.id"
) %>%
arrange(food.id)
# let's add the third type as well; as with before, the same issues are in this df but we'll fix the duplicates
# and flip flops later
food.3types.df <- food.2types.df %>%
left_join(
select(food.type.long.df, food.id, type3 = type1),
by = "food.id"
) %>%
arrange(food.id)
food.3types.df.fixed <- food.3types.df %>%
distinct() %>%
mutate(type1.n = as.numeric(factor(type1)), # assign each type1 a number (in alphabetical order)
type2.n = as.numeric(factor(type2)), # assign each type2 a number (in alphabetical order)
type3.n = as.numeric(factor(type3))) %>% # assign each type3 a number (in alphabetical order)
filter(type1 > type2) %>% # to remove duplicates and flip-flopped rows for types 1 and 2, use a strict inequality
filter(type2 > type3) # to remove duplicates and flip-flopped rows for types 2 and 3, use a strict inequality
food.3type.combination.count <- food.3types.df.fixed %>%
group_by(type1, type2, type3) %>% # group by all three types you want to count
summarise(
n.times = n()
) %>%
ungroup() %>%
arrange(desc(n.times), type1, type2, type3)
С выводом:
type1 type2 type3 n.times
<chr> <chr> <chr> <int>
1 Type_SunflowerSeeds Type_Scraps Type_EconMix 2
2 Type_SunflowerSeeds Type_SugarWater Type_EconMix 2
3 Type_SunflowerSeeds Type_SugarWater Type_Suet 2
4 Type_Nuts Type_FruitOrJams Type_EconMix 1
5 Type_Scraps Type_FruitOrJams Type_EconMix 1
6 Type_Scraps Type_Nuts Type_EconMix 1
7 Type_Scraps Type_Nuts Type_FruitOrJams 1
8 Type_Suet Type_Scraps Type_EconMix 1
9 Type_SugarWater Type_Scraps Type_EconMix 1
10 Type_SugarWater Type_Suet Type_EconMix 1
11 Type_SugarWater Type_Suet Type_Scraps 1
12 Type_SunflowerSeeds Type_Scraps Type_Nuts 1
13 Type_SunflowerSeeds Type_Suet Type_EconMix 1
14 Type_SunflowerSeeds Type_Suet Type_Scraps 1
15 Type_SunflowerSeeds Type_SugarWater Type_Scraps 1