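Решение с использованием библиотек tm и stringr (а также dplyr для группировки).
Сначала мы собираем полный текст для каждого ID, дополнительно вставляя запятую «,» перед FINDINGS и CONCLUSION, чтобы все разделы были отделены друг от друга единообразно.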
library(tm)
library(stringr)
library(dplyr)
# Build one full text per ID: all TEXT fragments of an ID are joined with ", ".
df <- df %>%
  group_by(ID) %>%
  summarise(TEXT = paste(TEXT, collapse = ", ")) %>%
  mutate(
    # Insert a comma in front of " FINDINGS" and " CONCLUSION" so that these
    # sections are delimited like the others. The pattern only matches when
    # both keywords are present, in this order.
    TEXT = gsub("(.*)( FINDINGS.*)( CONCLUSION.*)", "\\1,\\2,\\3", TEXT)
  )
> df
# A tibble: 2 x 2
ID TEXT
<dbl> <chr>
1 1 VERIFIED DATE/TIME: 24/11/2018 16:23, VERIFIED PERSON IN CHARGE: JOHN, HISTORY aaaAAA# 111, FINDINGS Bb123, CONCLUSION 987CCC ccc654, DIAGN~
2 2 VERIFIED DATE/TIME: 25/10/2018 16:23, VERIFIED PERSON IN CHARGE: Mary, HISTORY eeeEEE@ 111, FINDINGS Bb321, CONCLUSION 987FFF ggg654
Затем определяем названия, которые нужно сохранить в качестве имён столбцов, и удаляем их из строки.
# Section labels found in the text. Each label keeps its trailing space (and
# colon where present) so that the separator is stripped together with it.
titles <- c(
  "VERIFIED DATE/TIME: ",
  "VERIFIED PERSON IN CHARGE: ",
  "HISTORY ",
  "FINDINGS ",
  "CONCLUSION ",
  "DIAGNOSIS "
)

# Strip the labels from the combined text, leaving only the section values.
df[["TEXT"]] <- removeWords(df[["TEXT"]], titles)
> df
# A tibble: 2 x 2
ID TEXT
<dbl> <chr>
1 1 24/11/2018 16:23, JOHN, aaaAAA# 111, Bb123, 987CCC ccc654, abc def hij
2 2 25/10/2018 16:23, Mary, eeeEEE@ 111, Bb321, 987FFF ggg654
Наконец, мы разбиваем строку на столбцы по разделителю «, » и задаём имена столбцов.
# Split each TEXT string on ", " into a fixed number of columns. The column
# count is taken from `titles` rather than hard-coded, so it always matches
# the number of names assigned on the next line. Rows with fewer pieces are
# padded with "".
df_fin <- str_split_fixed(df$TEXT, ", ", n = length(titles))
colnames(df_fin) <- titles
> df_fin
VERIFIED DATE/TIME: VERIFIED PERSON IN CHARGE: HISTORY FINDINGS CONCLUSION DIAGNOSIS
[1,] "24/11/2018 16:23" "JOHN" "aaaAAA# 111" "Bb123" "987CCC ccc654" "abc def hij"
[2,] "25/10/2018 16:23" "Mary" "eeeEEE@ 111" "Bb321" "987FFF ggg654" ""