Объединение нескольких фреймов данных с функцией lapply и файлами RDS RData - PullRequest
0 голосов
/ 28 мая 2019

Добрый вечер, я хочу повторить функцию, которая появляется в этом блоге https://www.r -bloggers.com / как выполнять слияния-объединения-на-двух-или-больше-фреймах данных-with-base-r-tidyverse-and-data-table / Объединение нескольких фреймов данных

flightsList <- readRDS(url(paste0(dataurl, "r006/nycflights13-list.rds")))
lapply(flightsList, function(x) c(toString(dim(x)), toString(names(x)))).

Я пытался разными способами, но не мог объединить таблицы различных значений.

atencionesList <- serialize(list("iatencion","iatencion_apo", "iatencion_dia", "iatencion_ins", "iatencion_med", "iatencion_ser",  
                    "iatencion_smi", "padron_nominado"), NULL)



atencionesList <- readRDS(paste0("~/GitHub/salud2018/Lima2019/Anelima/atenciones1.RDS"))
lapply(atencionesList, function(x) c(toString(dim(x)), toString(names(x))))

features bbdd

features bbdd1

#****************************
atencionesList <- load(paste0("~/GitHub/salud2018/Lima2019/Anelima/atenciones.RData"))
lapply(atencionesList, function(x) c(toString(dim(x)), toString(names(x))))
#**********************************************

 multiFull <- merge(merge(merge(merge(merge(merge(merge(
+ atencionesList[[1L]],
+ atencionesList[[2L]], all = TRUE),
+ atencionesList[[3L]], all = TRUE),
+ atencionesList[[4L]], all = TRUE),
+ atencionesList[[5L]], all = TRUE),
+ atencionesList[[6L]], all = TRUE),
+ atencionesList[[7L]], all = TRUE),
+ atencionesList[[8L]], all = TRUE)

В отличие от примера блога с данными о рейсах, я не оставляю ни одного кадра данных, относящегося к одномунесколько обращений к таблице с другими по dni, date и другим переменным, которые находятся в таблице.

1 Ответ

0 голосов
/ 30 мая 2019

Мне удалось внести изменения в базу данных, например, преобразовать tibble в data.table.Как вы можете видеть, я выполнил задачу, чтобы иметь возможность объединить несколько таблиц с помощью функции слияния, теперь проблема заключается в нехватке памяти, как вы можете видеть в прикрепленном коде.

setwd("~/GitHub/salud2018/Lima2019/Anelima")

iatencion <- read_delim("F:/data sis/iatencion.txt",  "|",col_names = TRUE, escape_double = TRUE, trim_ws = TRUE)
      iatencion_med <- read_delim("F:/data sis/iatencion_med.txt",  "|",col_names = TRUE, escape_double = TRUE, trim_ws = TRUE)
    iatencion_ins <- read_delim("F:/data sis/iatencion_ins.txt",  "|",col_names = TRUE, escape_double = TRUE, trim_ws = TRUE)
    iatencion_ser <- read_delim("F:/data sis/iatencion_ser.txt", "|",col_names = TRUE, escape_double = TRUE, trim_ws = TRUE)
    iatencion_apo <- read_delim("F:/data sis/iatencion_apo.txt",  "|",col_names = TRUE, escape_double = TRUE, trim_ws = TRUE)
    iatencion_dia <- read_delim("F:/data sis/iatencion_dia.txt",  "|", col_names = TRUE,escape_double = TRUE, trim_ws = TRUE)
    iatencion_smi <- read_delim("F:/data sis/iatencion_smi.txt",  "|",col_names = TRUE, escape_double = TRUE, trim_ws = TRUE)
    padron_nominado <- read_delim("F:/data sis/padron_nominado.txt",  "|",col_names = TRUE, escape_double = TRUE, trim_ws = TRUE)

save(iatencion,iatencion_med, iatencion_ins, iatencion_ser,iatencion_apo, iatencion_dia,   
     iatencion_smi, padron_nominado, file = "atenciones1.RData")


 load("~/GitHub/salud2018/Lima2019/Anelima/atenciones1.RData")

 setwd("~/GitHub/salud2018/Lima2019/Anelima")



        iatencion_med <- rename(iatencion_med, ate_idnumreg = amed_numregate) 
        iatencion_ins <- rename(iatencion_ins, ate_idnumreg = ains_NumRegate) 
        iatencion_ser <- rename(iatencion_ser, ate_idnumreg = ASER_NUMREGATE) 
        iatencion_apo <- rename(iatencion_apo, ate_idnumreg = aapo_NUMREGATE) 
        iatencion_dia <- rename(iatencion_dia, ate_idnumreg = adia_numregate) 
        iatencion_smi <- rename(iatencion_smi, ate_idnumreg = asmi_numregate) 

     iatencion <- as.data.table(iatencion); str(iatencion)
     iatencion_med <- as.data.table(iatencion_med);str(iatencion_med)
     iatencion_ins <- as.data.table(iatencion_ins);str(iatencion_ins)
     iatencion_ser <- as.data.table(iatencion_ser);str(iatencion_ser)

    iatencion_apo <- as.data.table(iatencion_apo);str(iatencion_apo )
    iatencion_dia <- as.data.table(iatencion_dia);str(iatencion_dia)
    iatencion_smi <- as.data.table(iatencion_smi);str(iatencion_smi)



    iatencion <- as.data.table(iatencion); str(iatencion)

Classes ‘data.table’ and 'data.frame':  1046739 obs. of  47 variables:
 $ dni_mod                 : chr  "82879909" "35766109" "37182709" "58953509" ...
 $ ate_disa                : chr  "350" "080" "035" "210" ...
 $ ate_ideess              : chr  "150101A109" "070103A101" "150125A304" "150118A101" ...
 $ ate_edad                : num  0 1 0 0 0 0 0 2 0 1 ...
 $ ate_idgrupoEtareo       : chr  "01" "01" "01" "01" ...
 $ ate_idsexo              : num  1 1 1 0 1 1 0 0 1 1 ...
 $ ate_codate              : num  2 3 1 3 1 1 1 1 1 1 ...
 $ ate_esgestante          : num  0 0 0 0 0 0 0 0 0 0 ...
 $ ate_historiaclinica     : chr  "1078561" "201839089" "120542" "2577739" ...
 $ ate_fecatencion         : POSIXct, format: "2018-10-24 08:05:00" "2018-07-16 16:36:00" ...
 $ ate_horatencion         : 'hms' num  NA 16:36:00 11:30:00 11:50:00 ...
  ..- attr(*, "units")= chr "secs"
 $ ate_idEESSrefirio       : chr  "150135A101" NA NA NA ...
 $ ate_nrohojareferencia   : chr  "58246" NA NA NA ...
 $ ate_idServicio          : chr  "071" "062" "056" "062" ...
 $ ate_idlugar             : num  1 1 1 1 1 1 1 1 1 1 ...
 $ ate_codDestino          : num  2 2 1 1 2 1 2 1 2 2 ...
 $ ate_fecinghosp          : POSIXct, format: NA "2018-07-16" ...
 $ ate_fecaltahosp         : POSIXct, format: NA "2018-07-16" ...
 $ ate_ideesscontrarefiere : chr  NA NA NA NA ...
 $ ate_numhojacontrarefiere: chr  NA NA NA NA ...
 $ ate_idTipoPersonalSalud : num  1 1 1 1 6 1 6 10 6 13 ...
 $ ate_fecParto            : logi  NA NA NA NA NA NA ...
 $ ate_idubigeo            : chr  NA "150101" "150125" "150728" ...
 $ ate_periodo             : num  2018 2018 2018 2018 2018 ...
 $ ate_mes                 : chr  "11" "11" "11" "11" ...
 $ ate_tipoTraslado        : logi  NA NA NA NA NA NA ...
 $ ate_tipoTrasnporte      : logi  NA NA NA NA NA NA ...
 $ ate_UE                  : chr  "0149" "1318" "1684" "1138" ...
 $ ate_tipDig              : num  1 3 3 3 3 3 3 3 3 3 ...
 $ ate_fecNac              : POSIXct, format: "2018-10-07" "2017-03-30" ...
 $ ate_idCategoriaEESS     : chr  "07" "06" "04" "06" ...
 $ ate_TipoPerSalud        : chr  "01" "01" "01" "01" ...
 $ ate_grupoRiesgo         : chr  "150117" "150101" "150125" "150728" ...
 $ ate_esNatimuerto        : chr  "N" "N" "N" "N" ...
 $ ate_capita              : chr  "N" "N" "N" "N" ...
 $ ate_estado_observacion  : num  2 2 2 2 2 2 2 2 2 2 ...
 $ ate_grupofocalizado     : chr  "038" "012" "040" "024" ...
 $ ate_codDiag1            : chr  "Z138" "R509" "D509" "R509" ...
 $ ate_tipoDocPersonalSalud: num  1 1 1 1 1 1 1 1 1 1 ...
 $ PERS_D_FECNACIMIENTO    : POSIXct, format: "2018-10-07" "2017-03-30" ...
 $ PERS_V_SEXO             : num  1 1 1 0 1 1 0 0 1 1 ...
 $ PERS_V_UBIGEORESID      : num  150117 150101 150125 150728 150132 ...
 $ PERS_V_IDEESSADSCRIP    : chr  "150117A315" "150101A207" "150125A304" "0000017816" ...
 $ PERSD_FECFALLECIMIENTO  : POSIXct, format: NA NA ...
 $ EDAD_ANIOS              : num  0 1 0 1 0 1 0 2 0 1 ...
 $ EDAD_MESES              : num  2 21 8 12 6 12 10 30 3 17 ...
 $ ate_idnumreg            : num  6.09e+08 6.17e+08 6.16e+08 6.19e+08 6.16e+08 ...
 - attr(*, "problems")=Classes ‘tbl_df’, ‘tbl’ and 'data.frame':        72 obs. of  5 variables:
  ..$ row     : int  2480 2480 2480 2480 2480 2480 2480 2480 2480 2480 ...
  ..$ col     : chr  "ate_fecatencion" "ate_horatencion" "ate_fecinghosp" "ate_fecParto" ...
  ..$ expected: chr  "date like " "time like " "date like " "1/0/T/F/TRUE/FALSE" ...
  ..$ actual  : chr  "179435" "2018-04-12 14:00:00.0000000" "2" "5" ...
  ..$ file    : chr  "'F:/data sis/iatencion.txt'" "'F:/data sis/iatencion.txt'" "'F:/data sis/iatencion.txt'" "'F:/data sis/iatencion.txt'" ...
 - attr(*, "spec")=
  .. cols(
  ..   dni_mod = col_character(),
  ..   ate_disa = col_character(),
  ..   ate_ideess = col_character(),
  ..   ate_edad = col_double(),
  ..   ate_idgrupoEtareo = col_character(),
  ..   ate_idsexo = col_double(),
  ..   ate_codate = col_double(),
  ..   ate_esgestante = col_double(),
  ..   ate_historiaclinica = col_character(),
  ..   ate_fecatencion = col_datetime(format = ""),
  ..   ate_horatencion = col_time(format = ""),
  ..   ate_idEESSrefirio = col_character(),
  ..   ate_nrohojareferencia = col_character(),
  ..   ate_idServicio = col_character(),
  ..   ate_idlugar = col_double(),
  ..   ate_codDestino = col_double(),
  ..   ate_fecinghosp = col_datetime(format = ""),
  ..   ate_fecaltahosp = col_datetime(format = ""),
  ..   ate_ideesscontrarefiere = col_character(),
  ..   ate_numhojacontrarefiere = col_character(),
  ..   ate_idTipoPersonalSalud = col_double(),
  ..   ate_fecParto = col_logical(),
  ..   ate_idubigeo = col_character(),
  ..   ate_periodo = col_double(),
  ..   ate_mes = col_character(),
  ..   ate_tipoTraslado = col_logical(),
  ..   ate_tipoTrasnporte = col_logical(),
  ..   ate_UE = col_character(),
  ..   ate_tipDig = col_double(),
  ..   ate_fecNac = col_datetime(format = ""),
  ..   ate_idCategoriaEESS = col_character(),
  ..   ate_TipoPerSalud = col_character(),
  ..   ate_grupoRiesgo = col_character(),
  ..   ate_esNatimuerto = col_character(),
  ..   ate_capita = col_character(),
  ..   ate_estado_observacion = col_double(),
  ..   ate_grupofocalizado = col_character(),
  ..   ate_codDiag1 = col_character(),
  ..   ate_tipoDocPersonalSalud = col_double(),
  ..   PERS_D_FECNACIMIENTO = col_datetime(format = ""),
  ..   PERS_V_SEXO = col_double(),
  ..   PERS_V_UBIGEORESID = col_double(),
  ..   PERS_V_IDEESSADSCRIP = col_character(),
  ..   PERSD_FECFALLECIMIENTO = col_datetime(format = ""),
  ..   EDAD_ANIOS = col_double(),
  ..   EDAD_MESES = col_double(),
  ..   ate_idnumreg = col_double()
  .. )
 - attr(*, ".internal.selfref")=<externalptr> 
    iatencion_med <- as.data.table(iatencion_med);str(iatencion_med)
Classes ‘data.table’ and 'data.frame':  833976 obs. of  6 variables:
 $ amed_CodMed            : chr  "08013" "04024" "04831" "03519" ...
 $ amed_inrodia           : num  1 1 1 1 2 1 1 2 2 1 ...
 $ amed_icantprescrita    : num  3 1 36 1 30 2 2 1 30 3 ...
 $ amed_icantentregada    : num  3 1 36 1 30 2 2 1 30 3 ...
 $ amed_icantaprobadaODSIS: num  NA NA NA NA NA NA NA NA NA NA ...
 $ ate_idnumreg           : num  5.60e+08 5.59e+08 5.57e+08 5.59e+08 5.60e+08 ...
 - attr(*, "spec")=
  .. cols(
  ..   amed_CodMed = col_character(),
  ..   amed_inrodia = col_double(),
  ..   amed_icantprescrita = col_double(),
  ..   amed_icantentregada = col_double(),
  ..   amed_icantaprobadaODSIS = col_double(),
  ..   amed_numregate = col_double()
  .. )
 - attr(*, ".internal.selfref")=<externalptr> 
     iatencion_ins <- as.data.table(iatencion_ins);str(iatencion_ins)
Classes ‘data.table’ and 'data.frame':  499411 obs. of  6 variables:
 $ ains_inrodia               : num  1 3 3 3 2 1 1 1 1 1 ...
 $ ains_CodIns                : num  10355 15287 10554 19875 11370 ...
 $ ains_icantprescrita        : num  2 1 1 21 20 20 20 2 1 1 ...
 $ ains_icantentregada        : num  2 1 1 21 20 20 20 2 1 1 ...
 $ ains_icantidadaprobadaODSIS: num  NA NA NA NA NA NA NA NA NA NA ...
 $ ate_idnumreg               : num  5.55e+08 5.55e+08 5.55e+08 5.55e+08 5.55e+08 ...
 - attr(*, "spec")=
  .. cols(
  ..   ains_inrodia = col_double(),
  ..   ains_CodIns = col_double(),
  ..   ains_icantprescrita = col_double(),
  ..   ains_icantentregada = col_double(),
  ..   ains_icantidadaprobadaODSIS = col_double(),
  ..   ains_NumRegate = col_double()
  .. )
 - attr(*, ".internal.selfref")=<externalptr> 
    iatencion_ser <- as.data.table(iatencion_ser);str(iatencion_ser)
Classes ‘data.table’ and 'data.frame':  146999 obs. of  5 variables:
 $ ASER_IDSERVICIO      : chr  "016" "007" "022" "016" ...
 $ aser_finalidad       : num  0 0 0 0 0 0 0 0 0 0 ...
 $ ASER_CAPITA          : chr  NA NA NA NA ...
 $ ASER_IDFINANCIAMIENTO: num  1 1 1 1 1 1 1 1 1 1 ...
 $ ate_idnumreg         : num  5.59e+08 5.59e+08 5.59e+08 5.59e+08 5.60e+08 ...
 - attr(*, "spec")=
  .. cols(
  ..   ASER_IDSERVICIO = col_character(),
  ..   aser_finalidad = col_double(),
  ..   ASER_CAPITA = col_character(),
  ..   ASER_IDFINANCIAMIENTO = col_double(),
  ..   ASER_NUMREGATE = col_double()
  .. )
 - attr(*, ".internal.selfref")=<externalptr> 
    iatencion_apo <- as.data.table(iatencion_apo);str(iatencion_apo )
Classes ‘data.table’ and 'data.frame':  1668345 obs. of  6 variables:
 $ aapo_CodApo        : chr  "90782" "97782" "99209" "99403" ...
 $ aapo_inrodia       : num  1 1 1 1 1 1 1 1 1 1 ...
 $ aapo_icantproced   : num  1 1 1 1 1 1 1 1 1 1 ...
 $ aapo_icantejecutada: num  1 1 1 1 1 1 1 1 1 1 ...
 $ aapo_Resultado     : num  0 0 0 0 0 0 0 0 0 0 ...
 $ ate_idnumreg       : num  6.19e+08 6.16e+08 6.21e+08 6.19e+08 6.19e+08 ...
 - attr(*, "spec")=
  .. cols(
  ..   aapo_CodApo = col_character(),
  ..   aapo_inrodia = col_double(),
  ..   aapo_icantproced = col_double(),
  ..   aapo_icantejecutada = col_double(),
  ..   aapo_Resultado = col_double(),
  ..   aapo_NUMREGATE = col_double()
  .. )
 - attr(*, ".internal.selfref")=<externalptr> 
    iatencion_dia <- as.data.table(iatencion_dia);str(iatencion_dia)
Classes ‘data.table’ and 'data.frame':  1436328 obs. of  5 variables:
 $ adia_iNroDia  : num  3 2 1 2 1 1 1 1 1 1 ...
 $ adia_CodDia   : chr  "J304" "J980" "J459" "Z108" ...
 $ adia_TipoDia  : num  1 1 1 1 4 4 1 1 1 1 ...
 $ adia_numregDia: num  NA NA NA NA NA ...
 $ ate_idnumreg  : num  5.56e+08 5.56e+08 5.56e+08 5.56e+08 5.56e+08 ...
 - attr(*, "spec")=
  .. cols(
  ..   adia_iNroDia = col_double(),
  ..   adia_CodDia = col_character(),
  ..   adia_TipoDia = col_double(),
  ..   adia_numregDia = col_double(),
  ..   adia_numregate = col_double()
  .. )
 - attr(*, ".internal.selfref")=<externalptr> 
    iatencion_smi <- as.data.table(iatencion_smi);str(iatencion_smi)
Classes ‘data.table’ and 'data.frame':  4700512 obs. of  3 variables:
 $ ate_idnumreg: num  5.70e+08 5.72e+08 5.72e+08 5.72e+08 5.94e+08 ...
 $ asmi_codsmi : chr  "307" "124" "307" "004" ...
 $ asmi_Numero : chr  "1" "2" "1" "76.0" ...
 - attr(*, "spec")=
  .. cols(
  ..   asmi_numregate = col_double(),
  ..   asmi_codsmi = col_character(),
  ..   asmi_Numero = col_character()
  .. )
 - attr(*, ".internal.selfref")=<externalptr> 


    setkey(iatencion, ate_idnumreg)
    setkey(iatencion_med, ate_idnumreg)
    setkey(iatencion_ins, ate_idnumreg)
    setkey(iatencion_ser, ate_idnumreg)
    setkey(iatencion_apo, ate_idnumreg)
    setkey(iatencion_dia, ate_idnumreg)
    setkey(iatencion_smi, ate_idnumreg)

    mylist <- list(iatencion, iatencion_med, iatencion_ins, iatencion_ser, iatencion_apo, iatencion_dia, iatencion_smi)

    lapply(mylist, function(x) c(toString(dim(x)), toString(names(x))))

      multiFull <- merge(merge(merge(merge(merge(merge(
      mylist[[1L]],
      mylist[[2L]],  by=.EACHI, allow.cartesian=TRUE, all = TRUE),
      mylist[[3L]],  by=.EACHI, allow.cartesian=TRUE, all = TRUE),
      mylist[[4L]],  by=.EACHI, allow.cartesian=TRUE, all = TRUE),
      mylist[[5L]],  by=.EACHI, allow.cartesian=TRUE, all = TRUE),
      mylist[[6L]],  by=.EACHI, allow.cartesian=TRUE, all = TRUE),
      mylist[[7L]],  by=.EACHI, allow.cartesian=TRUE, all = TRUE)

Ошибка: не удается найтивектор размером 30,4 МБ

Теперь моя проблема, по-видимому, заключается в нехватке памяти, хотя мой компьютер обладает следующими функциями:

    sessionInfo()
R version 3.6.0 (2019-04-26)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows 10 x64 (build 17763)

Matrix products: default

locale:
[1] LC_COLLATE=Spanish_Chile.1252  LC_CTYPE=Spanish_Chile.1252   
[3] LC_MONETARY=Spanish_Chile.1252 LC_NUMERIC=C                  
[5] LC_TIME=Spanish_Chile.1252    

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] data.table_1.12.2 forcats_0.4.0     stringr_1.4.0     dplyr_0.8.1      
 [5] purrr_0.3.2       readr_1.3.1       tidyr_0.8.3       tibble_2.1.1     
 [9] ggplot2_3.1.1     tidyverse_1.2.1  

loaded via a namespace (and not attached):
 [1] Rcpp_1.0.1       pillar_1.4.0     compiler_3.6.0   cellranger_1.1.0
 [5] plyr_1.8.4       tools_3.6.0      jsonlite_1.6     lubridate_1.7.4 
 [9] gtable_0.3.0     nlme_3.1-139     lattice_0.20-38  pkgconfig_2.0.2 
[13] rlang_0.3.4      cli_1.1.0        rstudioapi_0.10  haven_2.1.0     
[17] withr_2.1.2      xml2_1.2.0       httr_1.4.0       generics_0.0.2  
[21] hms_0.4.2        grid_3.6.0       tidyselect_0.2.5 glue_1.3.1      
[25] R6_2.4.0         readxl_1.3.1     modelr_0.1.4     magrittr_1.5    
[29] backports_1.1.4  scales_1.0.0     rvest_0.3.4      assertthat_0.2.1
[33] colorspace_1.4-1 stringi_1.4.3    lazyeval_0.2.2   munsell_0.5.0   
[37] broom_0.5.2      crayon_1.3.4    
...