Барплот с указанием статистически значимой разницы - PullRequest
1 голос
/ 11 января 2020

Мне нужно нарисовать гистограмму для значимых кодов SNP (категориальных) по отношению к соответствующему фенотипу, аналогично этим графикам:

my favorite Barplot

Я пробовал много способов в R и получил некоторые результаты, но я хочу получить мой любимый результат. Вот коды и результаты:

### DATA

SNP_code <- as.factor(c("GG","GA","AA","GA","GA","GG","GG","GG","GG","GA","GA","AA","GA","GA","GA","GG","GG","GG","GG","AA","GG","GG","GG","GG","AA","GG","GG","GA","GG","AA","GA","GG","GG","GG","GG","GG","GG","AA","GG","GA","GG","GG","GA","GG","GG","GA","GG","GG","GA","GA","GG","GA","GG","GA","GA","GA","GA","GA","GA","GG","GG","GG","AA","GA","GA","GA","GA","GG","GA","GG","GG","GG","GA","GA","GA","GG","GG","GA","GG","AA","GG","GG","GG","AA"))

EBV <- c(0.06663,-0.03031,-0.122,-0.02021,-0.1157,-0.08131,-0.02034,-0.06324,0.06699,-0.062,0.02736,-0.1201,-0.04846,-0.06934,-0.06023,-0.009244,-0.05648,-0.01908,0.06728,-0.06517,0.08534,0.07618,-0.0814,0.06113,-0.0795,0.1055,0.08305,0.1209,-0.05314,-0.09431,0.05185,0.1347,0.1591,0.08777,0.08326,0.1612,0.09528,-0.1002,0.1561,-0.09327,0.09474,0.1356,0.06384,0.1585,0.03235,0.1081,0.1462,-0.04082,-0.05042,0.01793,-0.1157,-0.1165,-0.009399,-0.02311,-0.108,-0.1143,0.07219,0.01376,-0.05059,-0.052,0.08494,-0.0388,-0.06346,0.07789,0.02961,-0.1126,0.1102,0.133,-0.09317,-0.1181,0.1584,0.122,0.1019,-0.04074,-0.01178,0.09523,-0.03266,-0.01258,-0.0231,-0.08259,0.05823,-0.02894,-0.008242,0.07981)

LS <- c(2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,2,1,2,1,1,2,1,2,2,2,1,1,1,2,2,2,2,2,1,1,2,1,2,2,2,1,2,2,2,1,1,2,1,1,1,1,1,1,1,1,1,1,2,1,1,2,1,1,2,2,1,1,2,1,2,1,1,2,1,1,1,1,1,1,1,2)

IDs <- c(1033,1081,1106,1107,1120,1194,1199,1326,1334,1340,1345,1358,1398,1404,1405,1421,1457,1459,1464,1509,1529,1542,1550,2025,2030,2095,2099,2128,2141,2153,2167,2224,2232,2238,2244,2266,2271,2280,2283,2323,2326,2337,2369,2390,2391,2396,851012,851016,851021,851055,851063,851084,851105,851109,851146,851169,851176,851198,851205,851246,851266,851292,851332,851345,851488,851489,851509,851528,851531,851547,851562,851573,851574,851578,851584,851588,851592,851622,851651,851670,851672,851684,851690,861086)

sig_snp <- data.frame(IDs, SNP_code, EBV, LS)

### Анализ отклонений и сравнение средних значений

library(dplyr)
### for LS
group_by(sig_snp, SNP_code) %>%
  summarise(
    count = n(),
    mean = mean(LS, na.rm = TRUE),
    sd = sd(LS, na.rm = TRUE))

### for EBV
group_by(sig_snp, SNP_code) %>%
  summarise(
    count = n(),
    mean = mean(EBV, na.rm = TRUE),
    sd = sd(EBV, na.rm = TRUE))

# Compute the analysis of variance
Anova.fit <- aov(EBV ~ SNP_code, data = sig_snp)
summary(Anova.fit)
# Tukey multiple pairwise-comparisons
TukeyHSD(Anova.fit) 
# or
library(multcomp)
summary(glht(Anova.fit, linfct = mcp(SNP_code = "Tukey")))

### Рамочный график для EBV (на самом деле мне нужен Barplot для LS и EBV)

library(ggplot2)
library(ggpval)

plot <- ggplot(sig_snp, aes(SNP_code, EBV)) +
geom_boxplot(fill=c("red","blue", "green"), color="black", width=.7); plot
add_pval(plot, pairs = list(c(1, 3)), test='wilcox.test')
add_pval(plot, pairs = list(c(2, 3)), test='wilcox.test')
add_pval(plot, pairs = list(c(1, 2)), test='wilcox.test')

Boxplot

только "add_pval" используйте "wilcox.test" и "t.test", но я предпочитаю Tukey. Любая помощь приветствуется.

1 Ответ

1 голос
/ 12 января 2020

Определенно, есть место для улучшения кода, который я разместил ниже, но, по крайней мере, он дает вам один пример рабочего процесса, который вы можете использовать для получения своего «любимого» барплота:

Часть A : Barchart

1) Мы реорганизуем sig_snp для получения кадра данных со средним значением каждого SNP в функции EBV или LS.

library(tidyverse)
DF1 <- sig_snp %>% 
  pivot_longer(., cols = c(EBV,LS), names_to = "Variable", values_to = "Values") %>%
  group_by(SNP_code, Variable) %>%
  summarise(Mean = mean(Values),
            SEM = sd(Values) / sqrt(n()),
            Nb = n()) %>%
  rowwise() %>%
  mutate(Labels = as.character(SNP_code)) %>%
  mutate(Labels = paste(unlist(strsplit(Labels,"")),collapse = "/")) %>%
  mutate(Labels = paste0(Labels,"\nn = ",Nb))

# A tibble: 6 x 6
  SNP_code Variable    Mean    SEM    Nb Labels       
  <fct>    <chr>      <dbl>  <dbl> <int> <chr>        
1 AA       EBV      -0.0719 0.0202     9 "A/A\nn = 9" 
2 AA       LS        1.11   0.111      9 "A/A\nn = 9" 
3 GA       EBV      -0.0141 0.0134    31 "G/A\nn = 31"
4 GA       LS        1.23   0.0763    31 "G/A\nn = 31"
5 GG       EBV       0.0422 0.0126    44 "G/G\nn = 44"
6 GG       LS        1.48   0.0762    44 "G/G\nn = 44"

Столбец labels будет повторно использован для обозначения оси X.

2) Затем мы рассчитаем общее среднее значение (которое будет зависеть чтобы нарисовать «среднее» бар), выполнив:

library(tidyverse)
DF2 <- sig_snp  %>% 
  pivot_longer(., cols = c(EBV,LS), names_to = "Variable", values_to = "Values") %>%
  group_by(Variable) %>%
  summarise(Mean = mean(Values),
            SEM = sd(Values) / sqrt(n()),
            Nb = n()) %>%
  mutate(SNP_code = "All") %>%
  select(SNP_code, Variable, Mean, SEM, Nb) %>%
  rowwise() %>%
  mutate(Labels = paste0("Mean\nn = ",Nb))

# A tibble: 2 x 6
  SNP_code Variable    Mean     SEM    Nb Labels        
  <chr>    <chr>      <dbl>   <dbl> <int> <chr>         
1 All      EBV      0.00918 0.00944    84 "Mean\nn = 84"
2 All      LS       1.35    0.0522     84 "Mean\nn = 84"

3) мы связываем DF1 и DF2 и реорганизуем уровни SNP_code, чтобы получить правильный порядок построения:

library(tidyverse)
DF <- bind_rows(DF1, DF2)
DF$Labels = factor(DF$Labels,levels= c("Mean\nn = 84",
                                       "A/A\nn = 9",
                                       "G/A\nn = 31",
                                       "G/G\nn = 44" ))

4) Теперь мы можем построить его:

library(ggplot2)
ggplot(DF, aes(x = SNP_code, y = Mean, fill = SNP_code))+
  geom_bar(stat = "identity", show.legend = FALSE)+
  geom_errorbar(aes(ymin = Mean-SEM, ymax = Mean+SEM), width = 0.2)+
  facet_wrap(.~Variable, scales = "free")+
  scale_x_discrete(name = "",labels = levels(DF$Labels))

enter image description here

Часть B: Добавление статистики c на диаграмме

Для добавления статистики c можно использовать функцию geom_signif из пакета ggsignif, которая позволяет добавлять статистику из внешнего вывода.

1) Сначала создайте информационный кадр для вывода теста Тьюки на EBV:

Anova.fit <- aov(EBV ~ SNP_code, data = sig_snp)
t <- TukeyHSD(Anova.fit)
stat <- t$SNP_code
Stat_EBV <- stat %>% as.data.frame() %>% 
  mutate(Variable = "EBV") %>% 
  mutate(Group = rownames(stat)) %>%
  rowwise() %>%
  mutate(Group1 = unlist(strsplit(Group,"-"))[1]) %>%
  mutate(Group2 = unlist(strsplit(Group,"-"))[2]) %>%
  mutate(labels = round(`p adj`,4)) 

Stat_EBV$y_pos <- c(0.06,0.08,0.1)

2) То же самое для теста Тьюки LS:

Anova.fit <- aov(LS ~ SNP_code, data = sig_snp)
t <- TukeyHSD(Anova.fit)
stat <- t$SNP_code
Stat_LS <- stat %>% as.data.frame() %>% 
  mutate(Variable = "LS") %>% 
  mutate(Group = rownames(stat)) %>%
  rowwise() %>%
  mutate(Group1 = unlist(strsplit(Group,"-"))[1]) %>%
  mutate(Group2 = unlist(strsplit(Group,"-"))[2]) %>%
  mutate(labels = round(`p adj`,4)) 
Stat_LS$y_pos = c(1.7,1.9,2.1)

3) Привязка обоих фреймов данных статистики:

library(tidyverse)
STAT <- bind_rows(Stat_EBV,Stat_LS)

# A tibble: 6 x 10
    diff      lwr   upr  `p adj` Variable Group Group1 Group2 labels y_pos
   <dbl>    <dbl> <dbl>    <dbl> <chr>    <chr> <chr>  <chr>   <dbl> <dbl>
1 0.0578 -0.0130  0.129 0.132    EBV      GA-AA GA     AA     0.132   0.06
2 0.114   0.0457  0.183 0.000431 EBV      GG-AA GG     AA     0.0004  0.08
3 0.0563  0.0125  0.100 0.00821  EBV      GG-GA GG     GA     0.0082  0.1 
4 0.115  -0.303   0.532 0.790    LS       GA-AA GA     AA     0.790   1.7 
5 0.366  -0.0373  0.770 0.0832   LS       GG-AA GG     AA     0.0832  1.9 
6 0.251  -0.00716 0.510 0.0585   LS       GG-GA GG     GA     0.0585  2.1

4) Получить диаграмму и добавить статистику c результаты:

library(ggplot2)
library(ggsignif)
ggplot(DF, aes(x = SNP_code, y = Mean, fill = SNP_code))+
  geom_bar(stat = "identity", show.legend = FALSE)+
  geom_errorbar(aes(ymin = Mean-SEM, ymax = Mean+SEM), width = 0.2)+
  geom_signif(inherit.aes = FALSE, data = STAT, 
              aes(xmin=Group1, xmax=Group2, annotations=labels, y_position=y_pos),
              manual = TRUE)+
  facet_wrap(.~Variable, scales = "free")+
  scale_x_discrete(name = "",labels = levels(DF$Labels))

enter image description here

Надеюсь, это выглядит так, как вы нг.

...