# Load the tab-separated file: column 0 is used as the label, column 1 as the text.
df = pd.read_csv("cpas.txt", header=None, sep="\t")
X, y = df[1].values, df[0].values

# Preprocessing resources, built once and passed to clean_text().
stemmer = PorterStemmer()
stop_words = stopwords.words('english')
def clean_text(text, stop_words, stemmer):
    """Tokenize *text*, drop stop words and numeric tokens, and stem the rest.

    Args:
        text: The raw string to clean.
        stop_words: Iterable of words to remove.
        stemmer: Object exposing a ``stem(word)`` method.

    Returns:
        The stemmed, surviving tokens joined by single spaces.
    """
    # Build the lookup once per call instead of scanning a list for every token.
    stop_set = frozenset(stop_words)
    kept = []
    for word in word_tokenize(text):
        if word.isnumeric():
            continue
        # Also compare the lowercased token so capitalised stop words
        # (e.g. "The" at the start of a sentence) are filtered too.
        if word in stop_set or word.lower() in stop_set:
            continue
        kept.append(stemmer.stem(word))
    return " ".join(kept)
# Clean every document once up front; the cleaned strings are what gets vectorized.
X = np.array([clean_text(doc, stop_words, stemmer) for doc in X])

# 3-fold cross-validation with shuffling and a fixed random_state.
kfold = KFold(3, shuffle=True, random_state=33)
for fold, (train_idx, test_idx) in enumerate(kfold.split(X), start=1):
    # Fit TF-IDF on the training fold only, then apply it to the held-out fold.
    vectorizer = TfidfVectorizer()
    X_train = vectorizer.fit_transform(X[train_idx])
    X_test = vectorizer.transform(X[test_idx])
    y_train, y_test = y[train_idx], y[test_idx]

    model = LinearSVC()
    model.fit(X_train, y_train)

    # Report per-fold metrics on the held-out documents.
    print("Fold : {0}".format(fold))
    print(classification_report(y_test, model.predict(X_test)))
Перекрёстная проверка нужна прежде всего для настройки параметров, когда данных мало. Для этого можно использовать поиск по сетке с перекрёстной проверкой (GridSearchCV).
# Grid search over TF-IDF settings with 3-fold cross-validation.
df = pd.read_csv("cpas.txt", header=None, delimiter="\t")
X = df[1].values
labels = df[0].values
text = np.array([clean_text(doc, stop_words, stemmer) for doc in X])

# Shuffle documents and labels with the SAME index permutation so that
# every document stays paired with its own label.
idx = np.arange(len(text))
np.random.shuffle(idx)
text = text[idx]
labels = labels[idx]

# Vectorizer and classifier are chained so the vectorizer is re-fit inside
# each CV split of the grid search.
pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('svm', LinearSVC()),
])
params = {
    'vectorizer__ngram_range': [(1, 1), (1, 2), (2, 2)],
    'vectorizer__lowercase': [True, False],
    'vectorizer__norm': ['l1', 'l2'],
}
model = GridSearchCV(pipeline, params, cv=3, verbose=1)
# Fit against the shuffled `labels`; the unshuffled `y` from the earlier
# section no longer lines up with the permuted `text`.
model.fit(text, labels)