Вы также можете рассмотреть пакеты tidytext и dplyr, которые отлично подходят для этой задачи:
# Sample data: two records, each with a name, a description and learning
# outcomes stored as free text.
database <- data.frame(
  Name = c(
    "Aalto Fellows II",
    "Aalto introduction to Services"
  ),
  Description = c(
    "This course is a lot of words I do not know.",
    "Service economy, whatever it does mean."
  ),
  LearningOutcomes = c(
    "Aalto Fellows, which are the smartest, learn.",
    "Knowing what does Service economy means."
  ),
  # Keep the text columns as character vectors rather than factors
  # (only matters on R < 4.0, where factors were the default).
  stringsAsFactors = FALSE
)
# cool packages
library(tidytext)
library(dplyr)
# Clean the `Name` column: split each name into words, drop stop words, and
# paste the remaining words back into a single string per original row.
# `line` carries the original row index so the result can be joined back later.
# Note: a row whose text consists only of stop words produces no output row.
title <- tibble(
  # seq_len() instead of 1:nrow(): for a zero-row `database`, 1:nrow() would
  # produce the invalid index c(1, 0).
  line = seq_len(nrow(database)),
  text = database$Name
) %>%
  # One word per row; unnest_tokens() also lower-cases and strips punctuation.
  unnest_tokens(word, text) %>%
  # Remove stop words (both tables share the key column `word`).
  anti_join(stop_words, by = "word") %>%
  group_by(line) %>%
  # Collapse the surviving words back into one space-separated string per row.
  summarise(title = paste(word, collapse = " "))
# Clean the `Description` column: split each description into words, drop stop
# words, and paste the remaining words back into one string per original row.
# `line` carries the original row index so the result can be joined back later.
# Note: a row whose text consists only of stop words produces no output row.
description <- tibble(
  # seq_len() instead of 1:nrow(): for a zero-row `database`, 1:nrow() would
  # produce the invalid index c(1, 0).
  line = seq_len(nrow(database)),
  text = database$Description
) %>%
  # One word per row; unnest_tokens() also lower-cases and strips punctuation.
  unnest_tokens(word, text) %>%
  # Remove stop words (both tables share the key column `word`).
  anti_join(stop_words, by = "word") %>%
  group_by(line) %>%
  # The result column keeps the name `title` so the shape of this object is
  # unchanged for any code that already reads it.
  summarise(title = paste(word, collapse = " "))
# Clean the `LearningOutcomes` column: split each entry into words, drop stop
# words, and paste the remaining words back into one string per original row.
# `line` carries the original row index so the result can be joined back later.
# Note: a row whose text consists only of stop words produces no output row.
learningoutcomes <- tibble(
  # seq_len() instead of 1:nrow(): for a zero-row `database`, 1:nrow() would
  # produce the invalid index c(1, 0).
  line = seq_len(nrow(database)),
  text = database$LearningOutcomes
) %>%
  # One word per row; unnest_tokens() also lower-cases and strips punctuation.
  unnest_tokens(word, text) %>%
  # Remove stop words (both tables share the key column `word`).
  anti_join(stop_words, by = "word") %>%
  group_by(line) %>%
  # The result column keeps the name `title` so the shape of this object is
  # unchanged for any code that already reads it.
  summarise(title = paste(word, collapse = " "))
# Reassemble the cleaned columns into one table keyed by the original row index.
# The joins start from the complete set of row indices rather than from
# `title`: a row whose Name consisted only of stop words has no entry in
# `title` and would otherwise be dropped from the result altogether. Cells
# with no surviving words come out as NA.
database2 <- tibble(line = seq_len(nrow(database))) %>%
  left_join(title, by = "line") %>%
  left_join(description, by = "line") %>%
  left_join(learningoutcomes, by = "line")
# Rename positionally: after the joins the text columns are in the order
# name, description, learning outcomes, whatever suffixes the joins assigned.
colnames(database2) <- c("line", "Name", "Description", "LearningOutcomes")
database2
# A tibble: 2 x 4
   line Name                        Description     LearningOutcomes
  <int> <chr>                       <chr>           <chr>
1     1 aalto fellows ii            lot words       aalto fellows smartest learn
2     2 aalto introduction services service economy knowing service economy means
При необходимости полученный tibble можно преобразовать в data.frame с помощью data.frame().