import nltk
from nltk.corpus import PlaintextCorpusReader
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
# Source directories, one per newsgroup category (the original note says a
# directory contains 1000 files).
doc_dirname_politics = "E:/s10/SENTIMENT ANALISYS/SESI3/NLP-Data-sets/Dats sets/mini_newsgroups/mini_newsgroups/talk.politics.misc"
doc_dirname_comps = "E:/s10/SENTIMENT ANALISYS/SESI3/NLP-Data-sets/Dats sets/mini_newsgroups/mini_newsgroups/comp.os.ms-windows.misc"
# One plain-text corpus reader per directory; the '.*' pattern selects every
# file id found under the root.
politics_news_corpus = PlaintextCorpusReader(root=doc_dirname_politics, fileids='.*')
comp_news_corpus = PlaintextCorpusReader(root=doc_dirname_comps, fileids='.*')
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

# Text-normalization helpers, created once at module load.
# NOTE: `stemmer` is not referenced by the code visible in this part of the
# file.
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
# English stop words held in a set for fast membership checks.
stop_words = set(stopwords.words('english'))
def custom_preprocessor(text: str) -> list:
    """Clean one raw document and return its lemmatized tokens.

    Steps, in order:
      1. Replace runs of non-word characters, runs of digits, and
         underscores with a single space.
      2. Tokenize the result with ``nltk.word_tokenize``.
      3. Drop English stop words (matched case-insensitively).
      4. Lemmatize every remaining token with the module-level
         ``lemmatizer``.

    Args:
        text: Raw document text.

    Returns:
        A list of lemmatized token strings. Letter case of the surviving
        tokens is left as it was in the input.
    """
    # Removing numbers and punctuation. The underscore is listed explicitly
    # because ``\W`` treats it as a word character and would keep it.
    text = re.sub(r'\W+|\d+|_', ' ', text)
    tokens = nltk.word_tokenize(text)  # tokenizing
    # English stop words. NLTK's list is lower-case, so compare the
    # lower-cased token; otherwise capitalized words such as "The" survive.
    kept = [word for word in tokens if word.lower() not in stop_words]
    return [lemmatizer.lemmatize(word) for word in kept]
Это исходный код. Какой код нужен, если я хочу вычислить TF-IDF
по файлам, которые находятся в каталоге doc_dirname_politics?
Я уже искал учебники, но не нашёл такого, который решает мою проблему.