фрейм данных sparklyr: заменить NULL-ячейку на NA - Как использовать gsub & regex для spark DF? - PullRequest
0 голосов
/ 10 декабря 2018

Как я могу использовать gsub и регулярные выражения с фреймом данных Spark? Этот воспроизводимый пример хорошо работает с обычным фреймом данных (data.frame).

## Replace empty-string cells of a vector by NA.
##
## x: an atomic vector or factor (typically one data-frame column).
## Returns a vector of the same length as x. Factors are returned as
## character vectors; every other input keeps its own type and class.
empty_as_na <- function(x) {
  if (is.factor(x)) {
    x <- as.character(x)
  }
  # Subassignment (rather than ifelse()) keeps the type and attributes of x:
  # ifelse() takes them from the test, so an all-empty or zero-length
  # character input would come back as a logical vector and classed vectors
  # such as Date would lose their class.
  is_empty <- !is.na(x) & as.character(x) == ""
  x[is_empty] <- NA
  x
}

## connect to a local Spark instance
## (spark_connect() is presumably sparklyr's; no library() call is visible here)
sc <- spark_connect(master = "local")

# load an example data frame that has empty cells (needs the cgdsr package)
# NOTE(review): `cgds` is not defined in this snippet - presumably a cgdsr
# connection object created earlier; confirm before running

clinicalData <- cgdsr::getClinicalData(cgds, "gbm_tcga_pub_all")

## copy the local data frame to Spark (overwrite = TRUE is passed, so an
## existing table of the same name may be replaced)
clinicalData_tbl <- dplyr::copy_to(sc, clinicalData, overwrite = TRUE)

 # works on a regular in-memory data frame: empty_as_na is applied to every column
 # NOTE(review): funs() is deprecated in newer dplyr releases - verify against
 # the installed version
clinicalData %>% mutate_all(funs(empty_as_na))

# reported by the author NOT to work on the Spark table
clinicalData_tbl %>% mutate_all(funs(empty_as_na))

Спасибо, Карим

1 Ответ

0 голосов
/ 10 декабря 2018

Обращаясь к этой теме , я могу заменить шаблоны столбцами.Мне нужно заранее знать, в какую колонку мне нужно перейти.здесь заголовок dataframe clinicalData <- structure(list(Patients = structure(c(187L, 63L, 167L, 88L, 173L, 139L), .Label = c("TCGA.02.0001.01", "TCGA.02.0003.01", "TCGA.02.0004.01", "TCGA.02.0006.01", "TCGA.02.0007.01", "TCGA.02.0009.01", "TCGA.02.0010.01", "TCGA.02.0011.01", "TCGA.02.0014.01", "TCGA.02.0015.01", "TCGA.02.0016.01", "TCGA.02.0021.01", "TCGA.02.0023.01", "TCGA.02.0024.01", "TCGA.02.0025.01", "TCGA.02.0026.01", "TCGA.02.0027.01", "TCGA.02.0028.01", "TCGA.02.0033.01", "TCGA.02.0034.01", "TCGA.02.0037.01", "TCGA.02.0038.01", "TCGA.02.0039.01", "TCGA.02.0043.01", "TCGA.02.0046.01", "TCGA.02.0047.01", "TCGA.02.0048.01", "TCGA.02.0052.01", "TCGA.02.0054.01", "TCGA.02.0055.01", "TCGA.02.0057.01", "TCGA.02.0058.01", "TCGA.02.0060.01", "TCGA.02.0064.01", "TCGA.02.0068.01", "TCGA.02.0069.01", "TCGA.02.0070.01", "TCGA.02.0071.01", "TCGA.02.0074.01", "TCGA.02.0075.01", "TCGA.02.0079.01", "TCGA.02.0080.01", "TCGA.02.0083.01", "TCGA.02.0084.01", "TCGA.02.0085.01", "TCGA.02.0086.01", "TCGA.02.0087.01", "TCGA.02.0089.01", "TCGA.02.0099.01", "TCGA.02.0102.01", "TCGA.02.0104.01", "TCGA.02.0106.01", "TCGA.02.0107.01", "TCGA.02.0111.01", "TCGA.02.0113.01", "TCGA.02.0114.01", "TCGA.02.0115.01", "TCGA.02.0116.01", "TCGA.02.0258.01", "TCGA.02.0260.01", "TCGA.02.0266.01", "TCGA.02.0269.01", "TCGA.02.0271.01", "TCGA.02.0281.01", "TCGA.02.0285.01", "TCGA.02.0289.01", "TCGA.02.0290.01", "TCGA.02.0317.01", "TCGA.02.0321.01", "TCGA.02.0324.01", "TCGA.02.0325.01", "TCGA.02.0326.01", "TCGA.02.0330.01", "TCGA.02.0332.01", "TCGA.02.0333.01", "TCGA.02.0337.01", "TCGA.02.0338.01", "TCGA.02.0339.01", "TCGA.02.0422.01", "TCGA.02.0430.01", "TCGA.02.0432.01", "TCGA.02.0439.01", "TCGA.02.0440.01", "TCGA.02.0446.01", "TCGA.02.0451.01", "TCGA.02.0456.01", "TCGA.06.0122.01", "TCGA.06.0124.01", "TCGA.06.0125.01", "TCGA.06.0126.01", "TCGA.06.0127.01", "TCGA.06.0128.01", 
"TCGA.06.0129.01", "TCGA.06.0130.01", "TCGA.06.0132.01", "TCGA.06.0133.01", "TCGA.06.0137.01", "TCGA.06.0138.01", "TCGA.06.0139.01", "TCGA.06.0141.01", "TCGA.06.0143.01", "TCGA.06.0145.01", "TCGA.06.0146.01", "TCGA.06.0147.01", "TCGA.06.0148.01", "TCGA.06.0149.01", "TCGA.06.0152.01", "TCGA.06.0154.01", "TCGA.06.0156.01", "TCGA.06.0157.01", "TCGA.06.0158.01", "TCGA.06.0162.01", "TCGA.06.0164.01", "TCGA.06.0166.01", "TCGA.06.0168.01", "TCGA.06.0169.01", "TCGA.06.0171.01", "TCGA.06.0173.01", "TCGA.06.0174.01", "TCGA.06.0175.01", "TCGA.06.0176.01", "TCGA.06.0177.01", "TCGA.06.0178.01", "TCGA.06.0179.01", "TCGA.06.0182.01", "TCGA.06.0184.01", "TCGA.06.0185.01", "TCGA.06.0187.01", "TCGA.06.0188.01", "TCGA.06.0189.01", "TCGA.06.0190.01", "TCGA.06.0194.01", "TCGA.06.0195.01", "TCGA.06.0197.01", "TCGA.06.0201.01", "TCGA.06.0206.01", "TCGA.06.0208.01", "TCGA.06.0209.01", "TCGA.06.0210.01", "TCGA.06.0211.01", "TCGA.06.0213.01", "TCGA.06.0214.01", "TCGA.06.0219.01", "TCGA.06.0221.01", "TCGA.06.0237.01", "TCGA.06.0238.01", "TCGA.06.0241.01", "TCGA.06.0394.01", "TCGA.06.0397.01", "TCGA.06.0402.01", "TCGA.06.0409.01", "TCGA.06.0410.01", "TCGA.06.0412.01", "TCGA.06.0413.01", "TCGA.06.0414.01", "TCGA.06.0644.01", "TCGA.06.0645.01", "TCGA.06.0646.01", "TCGA.06.0648.01", "TCGA.08.0244.01", "TCGA.08.0246.01", "TCGA.08.0344.01", "TCGA.08.0345.01", "TCGA.08.0346.01", "TCGA.08.0347.01", "TCGA.08.0348.01", "TCGA.08.0349.01", "TCGA.08.0350.01", "TCGA.08.0351.01", "TCGA.08.0352.01", "TCGA.08.0353.01", "TCGA.08.0354.01", "TCGA.08.0355.01", "TCGA.08.0356.01", "TCGA.08.0357.01", "TCGA.08.0358.01", "TCGA.08.0359.01", "TCGA.08.0360.01", "TCGA.08.0373.01", "TCGA.08.0375.01", "TCGA.08.0380.01", "TCGA.08.0385.01", "TCGA.08.0386.01", "TCGA.08.0389.01", "TCGA.08.0390.01", "TCGA.08.0392.01", "TCGA.08.0509.01", "TCGA.08.0510.01", "TCGA.08.0511.01", "TCGA.08.0512.01", "TCGA.08.0514.01", "TCGA.08.0516.01", "TCGA.08.0517.01", "TCGA.08.0518.01", "TCGA.08.0520.01", "TCGA.08.0521.01", "TCGA.08.0522.01", 
"TCGA.08.0524.01", "TCGA.08.0525.01", "TCGA.08.0529.01", "TCGA.08.0531.01", "TCGA.12.0615.01", "TCGA.12.0616.01", "TCGA.12.0618.01", "TCGA.12.0619.01", "TCGA.12.0620.01"), class = "factor"), ACGH_DATA = structure(c(2L, 2L, 2L, 2L, 2L, 2L), .Label = c("NO", "YES"), class = "factor"), CANCER_TYPE = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = "Glioma", class = "factor"), CANCER_TYPE_DETAILED = structure(c(1L, 1L, 1L, 1L, 1L, 1L ), .Label = "Glioblastoma Multiforme", class = "factor"), COMPLETE_DATA = structure(c(1L, 1L, 2L, 2L, 2L, 2L), .Label = c("NO", "YES"), class = "factor"), DFS_MONTHS = c(NA, NA, 3.024657534, NA, 17.0630137, 2.202739726), DFS_STATUS = structure(c(1L, 1L, 3L, 1L, 3L, 3L), .Label = c("", "DiseaseFree", "Recurred" ), class = "factor"), FRACTION_GENOME_ALTERED = c(0.168880099, 0.126393857, 0.254657044, 0.174506086, 0.182024694, 0.154967319 ), KARNOFSKY_PERFORMANCE_SCORE = c(60L, NA, NA, NA, 100L, NA), MRNA_DATA = structure(c(2L, 2L, 2L, 2L, 2L, 2L), .Label = c("NO", "YES"), class = "factor"), MUTATION_COUNT = c(NA, NA, NA, 4L, NA, 0L), ONCOTREE_CODE = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = "GBM", class = "factor"), OS_MONTHS = c(12.55890411, 14.46575342, 9.797260274, 20.35068493, 24.55890411, 7.397260274 ), OS_STATUS = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("DECEASED", "LIVING"), class = "factor"), PRETREATMENT_HISTORY = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("NO", "YES"), class = "factor"), PRIOR_GLIOMA = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("NO", "YES"), class = "factor"), SEQUENCED = structure(c(1L, 1L, 2L, 2L, 2L, 2L), .Label = c("NO", "YES"), class = "factor"), SEX = structure(c(2L, 2L, 2L, 2L, 1L, 1L), .Label = c("Female", "Male"), class = "factor"), TREATMENT_STATUS = structure(c(2L, 2L, 2L, 2L, 2L, 2L), .Label = c("Treated", "Untreated"), class = "factor")), row.names = c(NA, 6L), class = "data.frame")

а затем:

# Connect to a local Spark instance and copy the example data frame to it.
sc <- spark_connect(master = "local")

clinicalData_tbl <- dplyr::copy_to(sc, clinicalData, overwrite = TRUE)

# Turn blank DFS_STATUS cells ("" or a single space) into real missing values.
# regexp_replace() can only substitute text, so a replacement of "NA" stores
# the literal string "NA" instead of a missing value, and a bare NA as its
# replacement argument is not a fix either (presumably a NULL argument makes
# the whole result NULL - confirm against the Spark SQL docs).
# A conditional works: dbplyr translates the bare NA to SQL NULL, and rlike()
# is passed through to Spark SQL with the same pattern as before.
clinicalData_tbl %>%
  mutate(
    DFS_STATUS = ifelse(rlike(DFS_STATUS, "^$|^ $"), NA, DFS_STATUS)
  )

Я пробовал передать в regexp_replace значение NA без кавычек, но это не работает.

...