Попробуйте так:
library("quanteda")
## Package version: 1.5.2
#' Summarise token counts by category for a set of texts
#'
#' Tokenises `x`, lower-cases the tokens, then removes punctuation, symbols
#' and numbers one category at a time. Each category's count is the drop in
#' the total number of tokens caused by removing that category, so
#' `total_punctuation + total_symbols + total_numbers + total_words` equals
#' `total_tokens`.
#'
#' @param x Input handed to `tokens()`.
#' @param stop_words Patterns passed to `dfm_remove()`; defaults to
#'   `stopwords("en")`.
#' @return A named list with the elements `total_tokens`,
#'   `total_punctuation`, `total_symbols`, `total_numbers`, `total_words`
#'   and `total_stopwords`.
#'
#' NOTE(review): `total_stopwords` is computed as a difference of `nfeat()`
#' values, i.e. the number of features dropped from the document-feature
#' matrix, whereas every other element is a difference of `ntoken()` sums.
#' It is therefore not on the same scale as the other totals -- confirm that
#' this is what callers expect.
summarize_texts_extended <- function(x, stop_words = stopwords("en")) {
  # Nested call instead of `%>%`, so no pipe operator has to be in scope.
  toks <- tokens_tolower(tokens(x))

  # total tokens
  ntoks <- sum(ntoken(toks))

  # punctuation: tokens that disappear when only punctuation is removed
  toks <- tokens(toks, remove_punct = TRUE, remove_symbols = FALSE)
  npunct <- ntoks - sum(ntoken(toks))

  # symbols and emoji: further drop after removing symbols
  toks <- tokens(toks, remove_symbols = TRUE)
  nsym <- ntoks - npunct - sum(ntoken(toks))

  # numbers: further drop after removing numbers
  toks <- tokens(toks, remove_numbers = TRUE)
  nnumbers <- ntoks - npunct - nsym - sum(ntoken(toks))

  # words: whatever is left once the three categories above are subtracted
  nwords <- ntoks - npunct - nsym - nnumbers

  # stopwords: features lost from the dfm when `stop_words` are removed
  dfmat <- dfm(toks)
  nfeats <- nfeat(dfmat)
  nstopwords <- nfeats - nfeat(dfm_remove(dfmat, stop_words))

  list(
    total_tokens = ntoks,
    total_punctuation = npunct,
    total_symbols = nsym,
    total_numbers = nnumbers,
    total_words = nwords,
    total_stopwords = nstopwords
  )
}
Функция возвращает искомые количества в виде списка:
# Example call; the lines starting with `##` below are its printed result.
# `data_corpus_inaugural` is not defined in this file -- presumably a dataset
# made available by library("quanteda") (TODO confirm).
summarize_texts_extended(data_corpus_inaugural)
## $total_tokens
## [1] 149138
##
## $total_punctuation
## [1] 13852
##
## $total_symbols
## [1] 4
##
## $total_numbers
## [1] 85
##
## $total_words
## [1] 135197
##
## $total_stopwords
## [1] 136