So I used the text2vec R package to build a word vectorization for feature selection. I followed Dmitry Selivanov's page http://text2vec.org/vectorization.html, which explains how to use text2vec correctly before building a classifier.
This is my code:
# Loading packages and movie review data
require(text2vec)
require(data.table)
data("movie_review")
library(tidyverse)
# Converting list of movie reviews to data table by reference
setDT(movie_review)
# Sorting the data table by ID
setkey(movie_review, id)
# Set seed for reproducible results
set.seed(2016L)
# Splitting data set into training and testing data
all_ids = movie_review$id
train_ids = sample(all_ids, 4000)
test_ids = setdiff(all_ids, train_ids)
train = movie_review[J(train_ids)]
test = movie_review[J(test_ids)]
# Vocabulary-based vectorization
# Define preprocessing function and tokenization function
# Setting up a pre-processing function
prep_fun <- function(x) {
  x %>%
    # make text lower case
    str_to_lower() %>%
    # replace non-alphabetic characters with a space
    str_replace_all("[^[:alpha:]]", " ") %>%
    # collapse multiple spaces into one
    str_replace_all("\\s+", " ")
}
# Setting up the tokenization function
tok_fun = word_tokenizer
# Using an iterator over tokens to create the vocabulary
it_train = itoken(train$review,
                  preprocessor = prep_fun,
                  tokenizer = tok_fun,
                  ids = train$id,
                  progressbar = FALSE)
vocab = create_vocabulary(it_train)
# The created vocabulary consists of 35070 unique words.
# Now that the vocabulary is set up, it is time to construct a DTM.
# The vocab_vectorizer() function creates a vectorizer that maps tokens into the vector space.
vectorizer = vocab_vectorizer(vocab)
t1 = Sys.time()
dtm_train = create_dtm(it_train, vectorizer)
print(difftime(Sys.time(), t1, units = 'sec'))
# Vectorization and DTM creation of the training data is now complete.
# Looking up the dimensions of the created DTM
dim(dtm_train)
identical(rownames(dtm_train), train$id)
Then I fitted a LASSO regression model using the glmnet R package.
# Training the model using LASSO regression to avoid high variance in coefficients
require(glmnet)
NFOLDS = 4
t1 = Sys.time()
glmnet_classifier = cv.glmnet(x = dtm_train, y = train[['sentiment']],
                              family = 'binomial',
                              # L1 penalty
                              alpha = 1,
                              # interested in the area under ROC curve
                              type.measure = "auc",
                              # 4-fold cross-validation
                              nfolds = NFOLDS,
                              # high value is less accurate, but has faster training
                              thresh = 1e-3,
                              # again lower number of iterations for faster training
                              maxit = 1e3)
print(difftime(Sys.time(), t1, units = 'sec'))
# Since the classifier is now trained, it is possible to plot the area under the curve (AUC).
plot(glmnet_classifier)
# It is also possible to compute the AUC as a figure.
print(paste("max AUC =", round(max(glmnet_classifier$cvm), 4)))
# Now that the model has been fitted to the DTM, it is time for validating the model's performance on the test data.
# Therefore, the same preprocessing and tokenization functions will be applied as well as the same vectorizer function.
# Preprocessing and tokenizing test data
it_test = itoken(test$review,
                 preprocessor = prep_fun,
                 tokenizer = tok_fun,
                 ids = test$id,
                 progressbar = FALSE)
# Creating DTM of test data
dtm_test = create_dtm(it_test, vectorizer)
# Using classifier for prediction on test data
preds = predict(glmnet_classifier, dtm_test, type = 'response')[,1]
glmnet:::auc(test$sentiment, preds)
Now I would like to use the classifier on my own data to classify its sentiment. My question is: do I need to apply the same preprocessing and tokenization functions to my own data and then vectorize its words (see the sketch below), or can I just apply the trained classifier to my data directly?
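To make the first option concrete, here is a minimal sketch of what I have in mind, assuming my own reviews sit in a data frame my_data with columns id and text (placeholder names, not my actual data):
# Hypothetical new data: a data frame `my_data` with columns `id` and `text`
it_new = itoken(my_data$text,
                preprocessor = prep_fun,  # same preprocessing as for training
                tokenizer = tok_fun,      # same tokenizer as for training
                ids = my_data$id,
                progressbar = FALSE)
# Reuse the vectorizer fitted on the training vocabulary so the
# DTM columns line up with the coefficients of the trained model
dtm_new = create_dtm(it_new, vectorizer)
# Predicted probability of positive sentiment for each new review
preds_new = predict(glmnet_classifier, dtm_new, type = 'response')[, 1]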
I really hope to find answers to these questions, thank you!