В ваших данных и в ожидаемом результате порядок строк различается, поэтому я взяла данные из результата. Ниже логика разбита на отдельные вспомогательные столбцы, после чего фиктивная (dummy) переменная создаётся функцией `dummy` из пакета hablar.
library(hablar)
library(dplyr)
# Flag rows within each (sic, year) group whose percentage is the group
# maximum AND whose maximum exceeds the second-highest distinct percentage
# by more than 10. The intermediate helper columns are dropped at the end.
df %>%
  group_by(sic, year) %>%
  mutate(
    # Second-largest distinct percentage in the group; nth() yields NA when
    # the group contains only one distinct value.
    second_highest = nth(sort(unique(percentage), decreasing = TRUE), 2),
    max_value = max(percentage),
    is_max = percentage == max_value,
    is_ab_10 = (max_value - second_highest) > 10,
    # missing = 1: when there is no runner-up the condition is NA for the
    # max rows, and those rows are coded as 1 rather than 0.
    specialized = dummy(is_max & is_ab_10, missing = 1)
  ) %>%
  ungroup() %>%
  select(-c(second_highest, max_value, is_max, is_ab_10))
Результат
# A tibble: 49 x 6
gvkey auditor_fkey year sic percentage specialized
<int> <int> <int> <int> <dbl> <int>
1 10390 2 2000 100 100 1
2 10390 2 2000 100 100 1
3 3107 2 2000 1000 37.1 0
4 3107 2 2000 1000 37.1 0
5 14590 5 2000 1000 60.7 1
6 14590 5 2000 1000 60.7 1
7 25776 1 2000 1000 2.22 0
8 31626 2 2000 1000 37.1 0
9 61811 5 2000 1000 60.7 1
10 5560 1 2000 1040 100 1
# … with 39 more rows
Данные
# Example input for the pipeline: a 49-row data frame carrying the tibble
# classes ("tbl_df", "tbl", "data.frame") with integer columns gvkey,
# auditor_fkey, year, sic and a double column percentage.
# NOTE(review): this looks like dput() output — confirm before hand-editing
# any of the values.
df <- structure(list(gvkey = c(10390L, 10390L, 3107L, 3107L, 14590L,
14590L, 25776L, 31626L, 61811L, 5560L, 20488L, 65921L, 65921L,
14793L, 14793L, 15274L, 15274L, 1266L, 10390L, 10390L, 3107L,
3107L, 14590L, 14590L, 19565L, 19565L, 25776L, 31626L, 61811L,
61811L, 64134L, 64134L, 5560L, 7881L, 9728L, 9728L, 12321L, 12321L,
20488L, 65921L, 65921L, 102341L, 14793L, 14793L, 15274L, 15274L,
142460L, 142460L, 142460L), auditor_fkey = c(2L, 2L, 2L, 2L,
5L, 5L, 1L, 2L, 5L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 4L, 2L, 2L,
2L, 2L, 5L, 5L, 1L, 1L, 1L, 2L, 5L, 5L, 580L, 580L, 7L, 5L, 597L,
597L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 2L), year = c(2000L,
2000L, 2000L, 2000L, 2000L, 2000L, 2000L, 2000L, 2000L, 2000L,
2000L, 2000L, 2000L, 2000L, 2000L, 2000L, 2000L, 2001L, 2001L,
2001L, 2001L, 2001L, 2001L, 2001L, 2001L, 2001L, 2001L, 2001L,
2001L, 2001L, 2001L, 2001L, 2001L, 2001L, 2001L, 2001L, 2001L,
2001L, 2001L, 2001L, 2001L, 2001L, 2001L, 2001L, 2001L, 2001L,
2001L, 2001L, 2001L), sic = c(100L, 100L, 1000L, 1000L, 1000L,
1000L, 1000L, 1000L, 1000L, 1040L, 1040L, 1040L, 1040L, 1220L,
1220L, 1220L, 1220L, 100L, 100L, 100L, 1000L, 1000L, 1000L, 1000L,
1000L, 1000L, 1000L, 1000L, 1000L, 1000L, 1000L, 1000L, 1040L,
1040L, 1040L, 1040L, 1040L, 1040L, 1040L, 1040L, 1040L, 1040L,
1220L, 1220L, 1220L, 1220L, 1220L, 1220L, 1220L), percentage = c(100,
100, 37.0939127, 37.0939127, 60.6862904, 60.6862904, 2.2197969,
37.0939127, 60.6862904, 100, 100, 100, 100, 34.7515456, 34.7515456,
65.2484544, 65.2484544, 26.9605909, 73.0394091, 73.0394091, 9.889969,
9.889969, 18.8287898, 18.8287898, 71.1457384, 71.1457384, 71.1457384,
9.889969, 18.8287898, 18.8287898, 0.1355028, 0.1355028, 8.2959428,
71.1026743, 1.0906007, 1.0906007, 18.1873703, 18.1873703, 18.1873703,
18.1873703, 18.1873703, 1.3234119, 58.0859392, 58.0859392, 41.9140608,
41.9140608, 58.0859392, 58.0859392, 58.0859392)), row.names = c(NA,
-49L), class = c("tbl_df",
"tbl", "data.frame"))