R Расчет весов выборки и взвешенной агрегации на основе справочной таблицы - PullRequest
0 голосов
/ 20 февраля 2019

После проведения опроса я собрал результаты в виде фрейма данных.Вот воспроизводимая версия того, как выглядит фактический фрейм данных.

library(dplyr)
library(tidyr)
df=data.frame(ID=c("1101","1102","1103","1104",
               "1105","1106","1107","1108",
               "1109","1110","1111","1112",
               "1113","1114","1115","1116",
               "1117","1118","1119","1120",
               "1121","1122","1123","1124",
               "1125","1126","1127","1128",
               "1129","1130","1131","1132",
               "1133","1134","1135","1136",
               "1137","1138","1139","1140",
               "1141","1142","1143","1144",
               "1145","1146","1147","1148",
               "1149","1150","1151","1152",
               "1153","1154","1155","1156"),
          Country=c("US","UK","Canada","Mexico",
                    "India","US","Peru","China",
                    "US","UK","Canada","Mexico",
                    "Portugal","India","Portugal","Mexico",
                    "Peru","India","Canada","Mexico",
                    "India","UK","India","Canada",
                    "US","UK","China","India",
                    "US","Mexico","Canada","Mexico",
                    "Canada","China","Canada","Canada",
                    "China","China","India","Mexico",
                    "Portugal","Portugal","Portugal","Portugal",
                    "UK","UK","UK","Peru",
                    "Peru","Mexico","US","US",
                    "Peru","Mexico","Peru","Mexico"),
          Gender=c("Male","Male","Male","Female",
                    "Female","Female","Male","Female",
                    "Female","Female","Male","Female",
                    "Male","Male","Female","Female",
                    "Female","Male","Female","Female",
                    "Female","Female","Male","Female",
                    "Male","Female","Male","Female",
                    "Female","Male","Female","Female",
                    "Male","Male","Male","Female",
                    "Male","Male","Female","Female",
                    "Male","Female","Male","Female",
                    "Male","Female","Male","Female",
                    "Male","Female","Male","Female",
                    "Male","Male","Male","Male"),
          Age=c("<25","25-35","25-35","36-45",
                ">55",">55","25-35",">55",
                "<25","25-35","25-35","36-45",
                "25-35","25-35","25-35","36-45",
                ">55","36-45","46-55","36-45",
                ">55","46-55","25-35","46-55",
                "<25","46-55","25-35","46-55",
                "25-35","25-35","46-55","36-45",
                "<25","<25",">55","36-45",
                "36-45","46-55","<25","<25",
                "<25",">55","36-45","46-55",
                "<25",">55","36-45","46-55",
                "36-45",">55","36-45","46-55",
                "<25","46-55","<25","46-55"),
          Score_Q1=c(4,4,3,2,
                  1,1,4,2,
                  1,1,1,2,
                  2,1,4,3,
                  4,3,1,1,
                  1,2,1,1,
                  1,4,1,4,
                  3,4,3,3,
                  1,3,3,1,
                  1,1,2,1,
                  1,2,1,2,
                  1,1,1,1,
                  2,2,2,2,
                  1,2,3,4),
          Score_Q2=c(1,4,1,1,
                     1,2,1,1,
                     1,4,4,4,
                     2,1,1,3,
                     4,3,1,1,
                     1,3,3,3,
                     2,4,1,2,
                     4,4,4,4,
                     1,1,1,1,
                     1,2,3,4,
                     4,4,2,1,
                     1,2,3,2,
                     1,2,1,2,
                     4,3,2,1))

Фрейм данных можно разделить на следующие части:

1) ID : идентификатор респондента

2) Страна : Страна происхождения респондента

3) Пол : Пол респондента

4) Возраст : Возраст респондента

5) Score_Q1 : Оценка удовлетворенности за первый квартал по шкале от 1 (Очень доволен) до 4 (Очень недоволен).

6) Score_Q2 : оценка удовлетворенности за 2 квартал по шкале от 1 (очень доволен) до 4 (очень недоволен).

Перваянекоторая очистка данных -

#convert to factor
df$Country=as.factor(df$Country)
df$Gender=as.factor(df$Gender)
df$Age=as.factor(df$Age)

Теперь я проверяю соотношение для возраста и пола в моем наборе данных -

Пол по Country

#1) Gender by Country
split_gender=df %>% select(Country,Gender) %>%
  group_by(Gender,Country) %>%
  summarise(n=n()) %>%
  ungroup() %>%
  select(Country,Gender,n) %>%
  group_by(Country,add=TRUE) %>%
  spread(Country,n)

split_gender=data.frame(apply(split_gender, 2, as.numeric))
split_gender_sample=as.data.frame(sweep(split_gender,2,colSums(split_gender),`/`))
split_gender_sample[1,1]="Female"
split_gender_sample[2,1]="Male"

Age поCountry

#2) Age by Country
split_age=df %>% select(Country,Age) %>%
  group_by(Age,Country) %>%
  summarise(n=n()) %>%
  ungroup() %>%
  select(Country,Age,n) %>%
  group_by(Country,add=TRUE) %>%
  spread(Country,n)

split_age=data.frame(apply(split_age, 2, as.numeric))
split_age[is.na(split_age)] <- 0
split_age_sample=as.data.frame(sweep(split_age,2,colSums(split_age),`/`))
split_age_sample[1,1]="<25"
split_age_sample[2,1]=">55"
split_age_sample[3,1]="25-35"
split_age_sample[4,1]="36-45"
split_age_sample[5,1]="46-55"

#Clean up unwanted dataframes
rm(list=c('split_age','split_gender'))

Приведенные выше два шага дают мне два кадра данных - split_age_sample & split_gender_sample.Эти данные содержат выборочные соотношения по возрасту и полу по стране для моих 56 респондентов.

Моя цель: Расчет весов выборки на основе статистики населения

Для того, чтобы сделать мойкадр данных более репрезентативно для реальности , я бы хотел присвоить веса моим респондентам на основе официальных соотношений населения по возрасту и полу по стране.

Это официальные коэффициенты населения, которые я нашел для обследованных стран.

#Gender by Country
split_gender_official=data.frame(Gender=c("Female","Male"),
                                 Canada=c(0.4,0.6),
                                 China=c(0.3,0.7),
                                 India=c(0.3,0.7),
                                 Mexico=c(0.5,0.5),
                                 Peru=c(0.6,0.4),
                                 Portugal=c(0.5,0.5),
                                 UK=c(0.4,0.6),
                                 US=c(0.4,0.6))
#Age by Country
split_age_official=data.frame(Age=c("<25",">55","25-35","36-45","46-55"),
                                 Canada=c(0.1,0.3,0.3,0.2,0.1),
                                 China=c(0.3,0.05,0.35,0.1,0.2),
                                 India=c(0.5,0.05,0.35,0.05,0.05),
                                 Mexico=c(0.2,0.3,0.2,0.1,0.2),
                                 Peru=c(0.1,0.3,0.2,0.2,0.2),
                                 Portugal=c(0.2,0.1,0.05,0.05,0.6),
                                 UK=c(0.2,0.3,0.1,0.3,0.1),
                                 US=c(0.2,0.3,0.1,0.3,0.1))

Желаемый результат

На основе моих выборочных коэффициентов и официальных коэффициентов населенияпо возрасту и полу я бы хотел присвоить веса своим респондентам в отдельном столбце под названием weights.

. В настоящее время я не могу понять, как выполнить этот расчет.

Затем, как только весы будут рассчитаны, я хотел бы суммировать баллы, используя столбец weights.Агрегирование выглядело бы примерно так (кроме случаев, когда в расчете учитывались веса) -

Пример: взвешенные агрегированные баллы по Великобритании

#Calculate weighted overall scores by Country & Gender: example UK
weighted_aggregated_scores_gender=df %>%
  select(-Age) %>%
  group_by(Country,Gender) %>%
  filter(Country=='UK') %>%
  summarise(Q1_KPI=round(sum(Score_Q1 %in% c(1,2)/n()),2),
            Q2_KPI=round(sum(Score_Q2 %in% c(1,2)/n()),2))

Я был бы очень признателен за любую помощь, которую смогуперейдите к расчету веса и его использованию на следующем шаге взвешенной агрегации.

1 Ответ

0 голосов
/ 22 февраля 2019

Не уверен, что это именно то, что вы ищете, но вот что я понял.Вам необходимо объединить национальные весовые коэффициенты с вашим фреймом данных, а затем вычислить KPI.

> # Reshape national weights

> Nombres <- cbind.data.frame("Country" = colnames(split_gender_official)[colnames(split_gender_official) != "Gender"],
+                              "time" = 1:length(colnames(split_gender_official)[colnames(split_gender_official) != "Gender"]))
> Nombres$Country <- as.character(Nombres$Country)
> 
> split_gender_official_resh <- reshape(split_gender_official, direction = "long", varying = Nombres$Country, v.names = "Weights_gend")
> split_age_official_resh <- reshape(split_age_official, direction = "long", varying = Nombres$Country, v.names = "Weights_age")
> 
> split_gender_official_resh$id <- NULL
> split_age_official_resh$id <- NULL
> 
> split_gender_official_resh <- merge(split_gender_official_resh, Nombres, by = "time", all.x = TRUE)
> split_age_official_resh <- merge(split_age_official_resh, Nombres, by = "time", all.x = TRUE)
> 
> split_gender_official_resh$time <- NULL
> split_age_official_resh$time <- NULL

> # Merge weights with df

> df <- merge(df, split_gender_official_resh, by = c("Gender", "Country"), all.x = TRUE)
> df <- merge(df, split_age_official_resh, by = c("Age", "Country"), all.x = TRUE)
> 

> # Print tables
>
> # Without weights
>
> prop.table(table(df$Gender, df$Country), 2)

            Canada     China     India    Mexico      Peru  Portugal        UK        US
  Female 0.5000000 0.2000000 0.5714286 0.7000000 0.3333333 0.5000000 0.5714286 0.5714286
  Male   0.5000000 0.8000000 0.4285714 0.3000000 0.6666667 0.5000000 0.4285714 0.4285714
> prop.table(table(df$Age, df$Country), 2)

           Canada     China     India    Mexico      Peru  Portugal        UK        US
  <25   0.1250000 0.2000000 0.1428571 0.1000000 0.3333333 0.1666667 0.1428571 0.4285714
  >55   0.1250000 0.2000000 0.2857143 0.1000000 0.1666667 0.1666667 0.1428571 0.1428571
  25-35 0.2500000 0.2000000 0.2857143 0.1000000 0.1666667 0.3333333 0.2857143 0.1428571
  36-45 0.1250000 0.2000000 0.1428571 0.5000000 0.1666667 0.1666667 0.1428571 0.1428571
  46-55 0.3750000 0.2000000 0.1428571 0.2000000 0.1666667 0.1666667 0.2857143 0.1428571
> 
> # With weights
> prop.table(xtabs(Weights_gend ~ Gender + Country, df), 2)
        Country
Gender       Canada      China      India     Mexico       Peru   Portugal         UK         US
  Female 0.40000000 0.09677419 0.36363636 0.70000000 0.42857143 0.50000000 0.47058824 0.47058824
  Male   0.60000000 0.90322581 0.63636364 0.30000000 0.57142857 0.50000000 0.52941176 0.52941176
> prop.table(xtabs(Weights_age ~ Gender + Country, df), 2)
        Country
Gender      Canada     China     India    Mexico      Peru  Portugal        UK        US
  Female 0.3333333 0.0500000 0.4642857 0.6250000 0.4545455 0.7142857 0.5000000 0.5000000
  Male   0.6666667 0.9500000 0.5357143 0.3750000 0.5454545 0.2857143 0.5000000 0.5000000
> 
> #  Means with weights and scores
> tapply(df$Score_Q1 * df$Weights_gend, list(df$Gender, df$Country), mean)
       Canada China    India   Mexico Peru  Portugal  UK  US
Female    0.6  0.60 0.600000 1.000000  1.5 1.3333333 0.8 0.7
Male      1.2  1.05 1.166667 1.666667  1.0 0.6666667 1.2 1.4
> tapply(df$Score_Q1 * df$Weights_age, list(df$Age, df$Country), mean)
         Canada China India Mexico Peru Portugal   UK  US
<25   0.1000000  0.90  1.00   0.20  0.2     0.20 0.20 0.4
>55   0.9000000  0.10  0.05   0.60  1.2     0.20 0.30 0.3
25-35 0.6000000  0.35  0.35   0.80  0.8     0.15 0.25 0.3
36-45 0.2000000  0.10  0.15   0.22  0.4     0.05 0.30 0.6
46-55 0.1666667  0.20  0.20   0.60  0.2     1.20 0.30 0.2
> 

Надеюсь, это поможет.

...