So I used the text2vec R package to build a word vectorization for feature selection. I followed Dmitry Selivanov's page http://text2vec.org/vectorization.html, which explains how to use text2vec correctly before building a classifier.
This is my code:
# Loading packages and movie review data
require(text2vec)
require(data.table)
data("movie_review")
library(tidyverse)
# Converting list of movie reviews to data table by reference
setDT(movie_review)
# Sorting the data table by ID
setkey(movie_review, id)
# Set seed for reproducible results
set.seed(2016L)
# Splitting data set into training and testing data
all_ids = movie_review$id
train_ids = sample(all_ids, 4000)
test_ids = setdiff(all_ids, train_ids)
train = movie_review[J(train_ids)]
test = movie_review[J(test_ids)]
# Vocabulary-based vectorization
# Define preprocessing function and tokenization function
# Setting up a pre-processing function
prep_fun <- function(x) {
  x %>%
    # make text lower case
    str_to_lower() %>%
    # replace non-alphabetic characters with a space
    str_replace_all("[^[:alpha:]]", " ") %>%
    # collapse multiple spaces into one
    str_replace_all("\\s+", " ")
}
# Setting up the tokenization function
tok_fun = word_tokenizer
# Using an iterator over tokens to create the vocabulary
it_train = itoken(train$review,
                  preprocessor = prep_fun,
                  tokenizer = tok_fun,
                  ids = train$id,
                  progressbar = FALSE)
vocab = create_vocabulary(it_train)
# The created vocabulary consists of 35070 unique words.
# Now that the vocabulary is set up, it is time to construct a DTM.
# The vocab_vectorizer() function creates a vectorizer that maps tokens into the vector space.
vectorizer = vocab_vectorizer(vocab)
t1 = Sys.time()
dtm_train = create_dtm(it_train, vectorizer)
print(difftime(Sys.time(), t1, units = 'sec'))
# Vectorization and DTM creation of the training data is now complete.
# Looking up the dimensions of the created DTM
dim(dtm_train)
identical(rownames(dtm_train), train$id)
Then I fitted a LASSO regression model using the glmnet R package.
# Training the model using LASSO regression to avoid high variance in coefficients
require(glmnet)
NFOLDS = 4
t1 = Sys.time()
glmnet_classifier = cv.glmnet(x = dtm_train, y = train[['sentiment']],
                              family = 'binomial',
                              # L1 penalty
                              alpha = 1,
                              # interested in the area under ROC curve
                              type.measure = "auc",
                              # 4-fold cross-validation
                              nfolds = NFOLDS,
                              # high value is less accurate, but has faster training
                              thresh = 1e-3,
                              # again lower number of iterations for faster training
                              maxit = 1e3)
print(difftime(Sys.time(), t1, units = 'sec'))
# Since the classifier is now trained, it is possible to plot the area under the curve (AUC).
plot(glmnet_classifier)
# It is also possible to compute the AUC as a figure.
print(paste("max AUC =", round(max(glmnet_classifier$cvm), 4)))
# Now that the model has been fitted to the DTM, it is time for validating the model's performance on the test data.
# Therefore, the same preprocessing and tokenization functions will be applied as well as the same vectorizer function.
# Preprocessing and tokenizing test data
it_test = itoken(test$review,
                 preprocessor = prep_fun,
                 tokenizer = tok_fun,
                 ids = test$id,
                 progressbar = FALSE)
# Creating DTM of test data
dtm_test = create_dtm(it_test, vectorizer)
# Using classifier for prediction on test data
preds = predict(glmnet_classifier, dtm_test, type = 'response')[,1]
glmnet:::auc(test$sentiment, preds)
Now I would like to use the classifier on my own data to classify its sentiment. My question is: do I need to apply the same preprocessing and tokenization functions to my own data and then vectorize its words (see the sketch below), or can I just apply the trained classifier to my data directly?
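To make the first option concrete, here is a minimal sketch of what I have in mind, assuming my own reviews sit in a data frame my_data with columns id and text (placeholder names, not my actual data):
# Hypothetical new data: a data frame `my_data` with columns `id` and `text`
it_new = itoken(my_data$text,
                preprocessor = prep_fun,  # same preprocessing as for training
                tokenizer = tok_fun,      # same tokenizer as for training
                ids = my_data$id,
                progressbar = FALSE)
# Reuse the vectorizer fitted on the training vocabulary so the
# DTM columns line up with the coefficients of the trained model
dtm_new = create_dtm(it_new, vectorizer)
# Predicted probability of positive sentiment for each new review
preds_new = predict(glmnet_classifier, dtm_new, type = 'response')[, 1]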
I really hope to find answers to these questions, thank you!