пытаясь обернуть голову вокруг индексации в листы списков - PullRequest
0 голосов
/ 25 января 2019

Я хочу индексировать значения из таблицы, которая состоит из нескольких обычных столбцов и нескольких списков столбцов. Базовая структура такова:

# A tibble: 1,200 x 13
   city    country       month data               model    modelgam  glance_lm           rsq glance_gam       aic_gam tidy_lm          augment_lm         res        
   <chr>   <chr>         <dbl> <list>             <list>   <list>    <list>            <dbl> <list>             <dbl> <list>           <list>             <list>     
 1 Abidjan Côte D'Ivoire     1 <tibble [114 × 6]> <S3: lm> <S3: Gam> <tibble [1 × 11]> 0.233 <tibble [1 × 6]>   154.  <tibble [2 × 5]> <tibble [114 × 9]> <dbl [114]>
 2 Abidjan Côte D'Ivoire     2 <tibble [114 × 6]> <S3: lm> <S3: Gam> <tibble [1 × 11]> 0.342 <tibble [1 × 6]>   164.  <tibble [2 × 5]> <tibble [114 × 9]> <dbl [114]>
 3 Abidjan Côte D'Ivoire     3 <tibble [114 × 6]> <S3: lm> <S3: Gam> <tibble [1 × 11]> 0.274 <tibble [1 × 6]>   152.  <tibble [2 × 5]> <tibble [114 × 9]> <dbl [114]>
 4 Abidjan Côte D'Ivoire     4 <tibble [114 × 6]> <S3: lm> <S3: Gam> <tibble [1 × 11]> 0.376 <tibble [1 × 6]>   145.  <tibble [2 × 5]> <tibble [114 × 9]> <dbl [114]>
 5 Abidjan Côte D'Ivoire     5 <tibble [114 × 6]> <S3: lm> <S3: Gam> <tibble [1 × 11]> 0.451 <tibble [1 × 6]>   112.  <tibble [2 × 5]> <tibble [114 × 9]> <dbl [114]>
 6 Abidjan Côte D'Ivoire     6 <tibble [114 × 6]> <S3: lm> <S3: Gam> <tibble [1 × 11]> 0.419 <tibble [1 × 6]>    87.4 <tibble [2 × 5]> <tibble [114 × 9]> <dbl [114]>
 7 Abidjan Côte D'Ivoire     7 <tibble [114 × 6]> <S3: lm> <S3: Gam> <tibble [1 × 11]> 0.249 <tibble [1 × 6]>   129.  <tibble [2 × 5]> <tibble [114 × 9]> <dbl [114]>
 8 Abidjan Côte D'Ivoire     8 <tibble [114 × 6]> <S3: lm> <S3: Gam> <tibble [1 × 11]> 0.228 <tibble [1 × 6]>   153.  <tibble [2 × 5]> <tibble [114 × 9]> <dbl [114]>
 9 Abidjan Côte D'Ivoire     9 <tibble [114 × 6]> <S3: lm> <S3: Gam> <tibble [1 × 11]> 0.230 <tibble [1 × 6]>   124.  <tibble [2 × 5]> <tibble [113 × 9]> <dbl [113]>
10 Abidjan Côte D'Ivoire    10 <tibble [113 × 6]> <S3: lm> <S3: Gam> <tibble [1 × 11]> 0.292 <tibble [1 × 6]>    98.7 <tibble [2 × 5]> <tibble [113 × 9]> <dbl [113]>
# ... with 1,190 more rows

Если я хочу вытащить значение p.value из glance_lm для первой записи (например), я могу использовать двойные скобки (поскольку это список), например:

> cmodels_details$glance_lm[[1]]$p.value
[1] 5.430896e-08

Однако, если мне нужны все значения p.value, это не будет работать:

> cmodels_details$glance_lm[[]]$p.value
NULL

В моем случае я специально хочу вывести оценки для двух моделей из их списка и построить линии регрессии на рисунке для каждой записи. Таким образом, возможность захвата перехватов и уклонов (по одной записи за раз), например:

 cmodels_details$tidy_lm[[1]]$estimate
[1] 26.379383982  0.007833912

или просто перехват:

> cmodels_details$tidy_lm[[1]]$estimate[1]
[1] 26.37938

не очень помогает. В конечном счете, я хотел бы добавить некоторый код ggplot в структуру данных, которая отображает фигуры для каждой записи, но для того, чтобы сделать линии регрессии для двух моделей (lm и gam), мне нужно получить оценки из таблицы tibble каждой записи. (или список тиблов)

Вот фрагмент данных, которые я использую, и код, чтобы получить правильную форму.

> dput(small_tb)
structure(list(dt = structure(c(14245, 14276, 14304, 14335, 14365, 
14396, 14426, 14457, 14488, 14518, 14549, 14579, 14610, 14641, 
14669, 14700, 14730, 14761, 14791, 14822, 14853, 14883, 14914, 
14944, 14975, 15006, 15034, 15065, 15095, 15126, 15156, 15187, 
15218, 15248, 15279, 15309, 15340, 15371, 15400, 15431, 15461, 
15492, 15522, 15553, 15584, 15614, 15645, 15675, 15706, 15737, 
15765, 15796, 15826, 15857, 15887, 15918, 15949, 14245, 14276, 
14304, 14335, 14365, 14396, 14426, 14457, 14488, 14518, 14549, 
14579, 14610, 14641, 14669, 14700, 14730, 14761, 14791, 14822, 
14853, 14883, 14914, 14944, 14975, 15006, 15034, 15065, 15095, 
15126, 15156, 15187, 15218, 15248, 15279, 15309, 15340, 15371, 
15400, 15431, 15461, 15492, 15522, 15553, 15584, 15614, 15645, 
15675, 15706, 15737, 15765, 15796, 15826, 15857, 15887, 15918, 
15949), class = "Date"), averagetemperature = c(-4.299, 1.454, 
4.808, 7.623, 12.627, 17.305, 19.792, 21.724, 19.502, 11.22, 
10.261, 1.563, -0.595, 0.771, 6.489, 10.935, 13.803, 19.055, 
24.106, 24.948, 19.229, 14.582, 8.539, -0.071, -1.582, 0.276, 
3.474, 7.383, 12.133, 18.011, 24.412, 23.414, 18.331, 13.837, 
9.555, 5.327, 2.67, 3.698, 12.145, 8.383, 14.956, 19.532, 25.909, 
22.778, 18.693, 12.229, 7.27, 5.592, 1.056, -0.509, 1.323, 6.644, 
13.734, 17.913, 21.914, 22.23, 19.977, -5.36, -0.372, 3.579, 
10.478, 15.447, 19.058, 21.103, 22.769, 17.043, 10.364, 8.217, 
-0.624, -2.359, -1.456, 6.715, 12.076, 17.119, 21.943, 24.789, 
22.67, 19.172, 11.911, 5.876, -2.165, -4.463, -1.244, 3.474, 
10.555, 16.917, 21.032, 24.564, 22.13, 19.301, 12.001, 8.013, 
2.987, -0.0410000000000004, 2.185, 8.734, 10.324, 17.779, 20.165, 
24.479, 22.731, 18.177, 12.436, 4.103, 2.586, -0.968, -1.365, 
2.518, 9.723, 15.544, 20.892, 24.722, 21.001, 17.408), averagetemperatureuncertainty = c(0.336, 
0.328, 0.247, 0.348, 0.396, 0.554, 0.481, 0.315, 0.225, 0.162, 
0.372, 0.348, 0.348, 0.364, 0.357, 0.538, 0.892, 0.33, 0.325, 
0.36, 0.322, 0.241, 0.307, 0.326, 0.522, 0.446, 0.279, 0.265, 
0.733, 0.773, 0.255, 0.404, 0.173, 0.154, 0.334, 0.483, 0.727, 
0.567, 0.369, 0.347, 0.835, 0.519, 0.516, 0.42, 0.329, 0.333, 
0.263, 0.537, 0.528, 0.473, 0.275, 0.462, 0.863, 0.669, 0.322, 
0.373, 1.033, 0.288, 0.214, 0.14, 0.259, 0.267, 0.452, 0.348, 
0.277, 0.22, 0.153, 0.181, 0.228, 0.314, 0.319, 0.235, 0.135, 
0.2, 0.387, 0.28, 0.257, 0.165, 0.154, 0.174, 0.436, 0.355, 0.33, 
0.167, 0.222, 0.312, 0.42, 0.438, 0.163, 0.16, 0.23, 0.298, 0.466, 
0.493, 0.253, 0.276, 0.258, 0.301, 0.39, 0.403, 0.224, 0.269, 
0.344, 0.298, 0.257, 0.29, 0.241, 0.255, 0.355, 0.281, 0.273, 
0.279, 0.323, 1.048), city = c("Chicago", "Chicago", "Chicago", 
"Chicago", "Chicago", "Chicago", "Chicago", "Chicago", "Chicago", 
"Chicago", "Chicago", "Chicago", "Chicago", "Chicago", "Chicago", 
"Chicago", "Chicago", "Chicago", "Chicago", "Chicago", "Chicago", 
"Chicago", "Chicago", "Chicago", "Chicago", "Chicago", "Chicago", 
"Chicago", "Chicago", "Chicago", "Chicago", "Chicago", "Chicago", 
"Chicago", "Chicago", "Chicago", "Chicago", "Chicago", "Chicago", 
"Chicago", "Chicago", "Chicago", "Chicago", "Chicago", "Chicago", 
"Chicago", "Chicago", "Chicago", "Chicago", "Chicago", "Chicago", 
"Chicago", "Chicago", "Chicago", "Chicago", "Chicago", "Chicago", 
"New York", "New York", "New York", "New York", "New York", "New York", 
"New York", "New York", "New York", "New York", "New York", "New York", 
"New York", "New York", "New York", "New York", "New York", "New York", 
"New York", "New York", "New York", "New York", "New York", "New York", 
"New York", "New York", "New York", "New York", "New York", "New York", 
"New York", "New York", "New York", "New York", "New York", "New York", 
"New York", "New York", "New York", "New York", "New York", "New York", 
"New York", "New York", "New York", "New York", "New York", "New York", 
"New York", "New York", "New York", "New York", "New York", "New York", 
"New York", "New York", "New York"), country = c("United States", 
"United States", "United States", "United States", "United States", 
"United States", "United States", "United States", "United States", 
"United States", "United States", "United States", "United States", 
"United States", "United States", "United States", "United States", 
"United States", "United States", "United States", "United States", 
"United States", "United States", "United States", "United States", 
"United States", "United States", "United States", "United States", 
"United States", "United States", "United States", "United States", 
"United States", "United States", "United States", "United States", 
"United States", "United States", "United States", "United States", 
"United States", "United States", "United States", "United States", 
"United States", "United States", "United States", "United States", 
"United States", "United States", "United States", "United States", 
"United States", "United States", "United States", "United States", 
"United States", "United States", "United States", "United States", 
"United States", "United States", "United States", "United States", 
"United States", "United States", "United States", "United States", 
"United States", "United States", "United States", "United States", 
"United States", "United States", "United States", "United States", 
"United States", "United States", "United States", "United States", 
"United States", "United States", "United States", "United States", 
"United States", "United States", "United States", "United States", 
"United States", "United States", "United States", "United States", 
"United States", "United States", "United States", "United States", 
"United States", "United States", "United States", "United States", 
"United States", "United States", "United States", "United States", 
"United States", "United States", "United States", "United States", 
"United States", "United States", "United States", "United States", 
"United States"), latitude = c("42.59N", "42.59N", "42.59N", 
"42.59N", "42.59N", "42.59N", "42.59N", "42.59N", "42.59N", "42.59N", 
"42.59N", "42.59N", "42.59N", "42.59N", "42.59N", "42.59N", "42.59N", 
"42.59N", "42.59N", "42.59N", "42.59N", "42.59N", "42.59N", "42.59N", 
"42.59N", "42.59N", "42.59N", "42.59N", "42.59N", "42.59N", "42.59N", 
"42.59N", "42.59N", "42.59N", "42.59N", "42.59N", "42.59N", "42.59N", 
"42.59N", "42.59N", "42.59N", "42.59N", "42.59N", "42.59N", "42.59N", 
"42.59N", "42.59N", "42.59N", "42.59N", "42.59N", "42.59N", "42.59N", 
"42.59N", "42.59N", "42.59N", "42.59N", "42.59N", "40.99N", "40.99N", 
"40.99N", "40.99N", "40.99N", "40.99N", "40.99N", "40.99N", "40.99N", 
"40.99N", "40.99N", "40.99N", "40.99N", "40.99N", "40.99N", "40.99N", 
"40.99N", "40.99N", "40.99N", "40.99N", "40.99N", "40.99N", "40.99N", 
"40.99N", "40.99N", "40.99N", "40.99N", "40.99N", "40.99N", "40.99N", 
"40.99N", "40.99N", "40.99N", "40.99N", "40.99N", "40.99N", "40.99N", 
"40.99N", "40.99N", "40.99N", "40.99N", "40.99N", "40.99N", "40.99N", 
"40.99N", "40.99N", "40.99N", "40.99N", "40.99N", "40.99N", "40.99N", 
"40.99N", "40.99N", "40.99N", "40.99N", "40.99N", "40.99N"), 
    longitude = c("87.27W", "87.27W", "87.27W", "87.27W", "87.27W", 
    "87.27W", "87.27W", "87.27W", "87.27W", "87.27W", "87.27W", 
    "87.27W", "87.27W", "87.27W", "87.27W", "87.27W", "87.27W", 
    "87.27W", "87.27W", "87.27W", "87.27W", "87.27W", "87.27W", 
    "87.27W", "87.27W", "87.27W", "87.27W", "87.27W", "87.27W", 
    "87.27W", "87.27W", "87.27W", "87.27W", "87.27W", "87.27W", 
    "87.27W", "87.27W", "87.27W", "87.27W", "87.27W", "87.27W", 
    "87.27W", "87.27W", "87.27W", "87.27W", "87.27W", "87.27W", 
    "87.27W", "87.27W", "87.27W", "87.27W", "87.27W", "87.27W", 
    "87.27W", "87.27W", "87.27W", "87.27W", "74.56W", "74.56W", 
    "74.56W", "74.56W", "74.56W", "74.56W", "74.56W", "74.56W", 
    "74.56W", "74.56W", "74.56W", "74.56W", "74.56W", "74.56W", 
    "74.56W", "74.56W", "74.56W", "74.56W", "74.56W", "74.56W", 
    "74.56W", "74.56W", "74.56W", "74.56W", "74.56W", "74.56W", 
    "74.56W", "74.56W", "74.56W", "74.56W", "74.56W", "74.56W", 
    "74.56W", "74.56W", "74.56W", "74.56W", "74.56W", "74.56W", 
    "74.56W", "74.56W", "74.56W", "74.56W", "74.56W", "74.56W", 
    "74.56W", "74.56W", "74.56W", "74.56W", "74.56W", "74.56W", 
    "74.56W", "74.56W", "74.56W", "74.56W", "74.56W", "74.56W", 
    "74.56W")), row.names = c(NA, -114L), class = c("tbl_df", 
"tbl", "data.frame"), spec = structure(list(cols = list(dt = structure(list(
    format = ""), class = c("collector_date", "collector")), 
    AverageTemperature = structure(list(), class = c("collector_double", 
    "collector")), AverageTemperatureUncertainty = structure(list(), class = c("collector_double", 
    "collector")), City = structure(list(), class = c("collector_character", 
    "collector")), Country = structure(list(), class = c("collector_character", 
    "collector")), Latitude = structure(list(), class = c("collector_character", 
    "collector")), Longitude = structure(list(), class = c("collector_character", 
    "collector"))), default = structure(list(), class = c("collector_guess", 
"collector"))), class = "col_spec"))

И некоторый код:

# load libraries
library(tidyverse)
library(lubridate)
library(gam)
# library(broom)
# library(purrr)

# Load the data from dput() and take a look
summary(gltd)
str(gltd)
names(gltd)

# make lowercasae
gltd <- rename_all(gltd, tolower)
names(gltd)

# nest data, 100 major cities
by_city_month <- gltd %>% 
  filter(year(dt) >= 1900) %>%
  mutate(month = month(dt)) %>%
  mutate(yr1900 = year(dt) - 1900) %>%
  group_by(city, country, month) %>%
  nest()

by_city_month

# define function for linear model
city_model_lm <- function(df) {
  lm(averagetemperature ~ yr1900, data = df)
}

# define function for GAM
city_model_gam <- function(df) {
  gam(averagetemperature ~ s(yr1900), data = df)
}

# create columns for the models
cmodels <- by_city_month %>%
  mutate(model = map(data, city_model_lm), 
         modelgam = map(data, city_model_gam)
  )

cmodels_details <- cmodels %>%
  mutate(
    glance_lm = model %>% map(glance),  #model summary: rsquared...
    rsq = glance_lm %>% map_dbl("r.squared"),  #extract rsquared

    glance_gam = modelgam %>% map(broom::glance), #GAM model summary
    aic_gam = glance_gam %>% map_dbl("AIC"), #extract AIC

    tidy_lm = model %>% map(tidy), #model estimate: coeff...

    augment_lm = model %>% map(augment), #observation stats: resid,hat...
    res = augment_lm %>% map(".resid") #extract resid
  )

Любой совет будет наиболее ценным.

Ответы [ 2 ]

0 голосов
/ 25 января 2019

В ответ на следующее: «В конечном итоге я хотел бы добавить некоторый код ggplot в структуру данных, которая отображает фигуру для каждой записи, но для того, чтобы сделать линии регрессии для двух моделей (lm и gam), янеобходимо получить оценки из каждой записи (или списка записей) ", попробуйте geom_smooth().Вам не нужно подбирать все модели для построения моделей LM и GAM!Надеюсь, это было полезно для вас!

by_city_month %>% unnest() %>% # your data from above
  ggplot(aes(x = yr1900, y = averagetemperature, group = interaction(city,month))) + 
  geom_smooth(method = "lm", se = F, color = "black", alpha = .7) + 
  geom_smooth(method = "gam", se = F, color = "pink", alpha = .7, size = .5, linetype = 2)

enter image description here

0 голосов
/ 25 января 2019

Вот решение tidyverse, которое должно дать вам то, что вы хотите:

library(tidyverse)

cmodels_details %>%
  transmute(
    city = city,
    month = month,
    p_val = map_dbl(glance_lm, "p.value"),
    coefs = map(tidy_lm, "estimate")
  ) %>% 
  unnest(coefs) %>% 
  mutate(x = rep(c("intercept", "x1"), nrow(.) / 2))) %>% 
  spread(x, coefs)
# A tibble: 24 x 5
#   city    month  p_val intercept      x1
#   <chr>   <dbl>  <dbl>     <dbl>   <dbl>
# 1 Chicago     1 0.0790  -156.     1.40  
# 2 Chicago     2 0.875     12.2   -0.0999
# 3 Chicago     3 0.935     20.2   -0.131 
# 4 Chicago     4 0.468     58.3   -0.451 
# 5 Chicago     5 0.411    -23.9    0.337 
# 6 Chicago     6 0.630     -0.429  0.169 
# 7 Chicago     7 0.505    -43.9    0.605 
# 8 Chicago     8 0.814     35.9   -0.116 
# 9 Chicago     9 0.872     14.6    0.0414
#10 Chicago    10 0.807    -12.2    0.228 
# ... with 14 more rows
...