I am working on a text-analysis example in Python with the following training/test files, but my best result is around 65% accuracy on the test data. Could you give me some suggestions on how to get a better score?
The code below reports accuracy on both the training and the test data; it also runs the prediction 5 times and averages the scores for the test classification.
CLASSIFIER
import numpy as np
np.random.seed(15)
from keras.layers import Input, Dense, Dropout
from keras.models import Model
from keras import optimizers
from keras.callbacks import EarlyStopping
from sklearn.preprocessing import LabelBinarizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk import word_tokenize
import spacy
from gensim.models import KeyedVectors as kv
from datatools import load_dataset
nlp = spacy.load('fr')
embfile = "../resources/frWac_non_lem_no_postag_no_phrase_200_skip_cut100.bin"
# wv : kv = kv.load_word2vec_format(embfile, binary=True, encoding='UTF-8', unicode_errors='ignore')
class Classifier:
"""The Classifier"""
def __init__(self):
self.labelset = None
self.label_binarizer = LabelBinarizer()
self.model = None
self.epochs = 200
self.batchsize = 64
self.max_features = 15000
# create the vectorizer
self.vectorizer = CountVectorizer(
max_features=self.max_features,
strip_accents=None,
analyzer="word",
tokenizer=self.mytokenize,
stop_words=None,
ngram_range=(1, 3),
binary=False,
preprocessor=None
)
def mytokenize(self, text):
"""Customized tokenizer.
        Here you can add other linguistic processing to generate more normalized features.
"""
doc = nlp(text)
        tokens = [t.text.lower() for sent in doc.sents for t in sent if t.pos_ != "PUNCT"]
# tokens = [t for t in tokens if t not in self.stopset]
return tokens
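    # Example (hypothetical sentence; as per the code above, the punctuation
    # token is dropped and the remaining tokens are lowercased):
    #   mytokenize("Le film est très bon !") -> ['le', 'film', 'est', 'très', 'bon']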
def feature_count(self):
return len(self.vectorizer.vocabulary_)
def create_model(self):
"""Create a neural network model and return it.
Here you can modify the architecture of the model (network type, number of layers, number of neurones)
and its parameters"""
# Define input vector, its size = number of features of the input representation
        inputs = Input((self.feature_count(),))
# Define output: its size is the number of distinct (class) labels (class probabilities from the softmax)
        layer = inputs
layer = Dense(10, activation='relu')(layer)
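        # Dropout is imported at the top but never used; an optional (untested
        # here) regularization step at this point would be, e.g.:
        #     layer = Dropout(0.3)(layer)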
output = Dense(len(self.labelset), activation='softmax')(layer)
# create model by defining the input and output layers
        model = Model(inputs=inputs, outputs=output)
        # compile the model: choose the optimizer, the loss function and the reported metrics
model.compile(optimizer=optimizers.Adam(),
loss='categorical_crossentropy',
metrics=['accuracy'])
model.summary()
return model
def vectorize(self, texts):
return self.vectorizer.transform(texts).toarray()
def train_on_data(self, texts, labels, valtexts=None, vallabels=None):
"""Train the model using the list of text examples together with their true (correct) labels"""
# create the binary output vectors from the correct labels
Y_train = self.label_binarizer.fit_transform(labels)
# get the set of labels
self.labelset = set(self.label_binarizer.classes_)
print("LABELS: %s" % self.labelset)
        # build the feature index (word unigrams, bigrams, etc.) using the training data
self.vectorizer.fit(texts)
# create a model to train
self.model = self.create_model()
# for each text example, build its vector representation
X_train = self.vectorize(texts)
        # Callbacks: stop training early once the validation loss stops improving
        my_callbacks = []
        early_stopping = EarlyStopping(monitor='val_loss', min_delta=0, patience=3, verbose=0, mode='auto', baseline=None)
        my_callbacks.append(early_stopping)
if valtexts is not None and vallabels is not None:
X_val = self.vectorize(valtexts)
Y_val = self.label_binarizer.transform(vallabels)
valdata = (X_val, Y_val)
else:
valdata = None
# Train the model!
self.model.fit(
X_train, Y_train,
epochs=self.epochs,
batch_size=self.batchsize,
callbacks=my_callbacks,
validation_data=valdata,
verbose=2)
def predict_on_X(self, X):
return self.model.predict(X)
def predict_on_data(self, texts):
"""Use this classifier model to predict class labels for a list of input texts.
Returns the list of predicted labels
"""
X = self.vectorize(texts)
# get the predicted output vectors: each vector will contain a probability for each class label
Y = self.model.predict(X)
# from the output probability vectors, get the labels that got the best probability scores
return self.label_binarizer.inverse_transform(Y)
def train(self, trainfile, valfile=None):
df = load_dataset(trainfile)
texts = df['text']
labels = df['polarity']
if valfile:
valdf = load_dataset(valfile)
valtexts = valdf['text']
vallabels = valdf['polarity']
else:
valtexts = vallabels = None
self.train_on_data(texts, labels, valtexts, vallabels)
def predict(self, datafile):
"""Use this classifier model to predict class labels for a list of input texts.
Returns the list of predicted labels
"""
items = load_dataset(datafile)
return self.predict_on_data(items['text'])
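Side note: TfidfVectorizer is imported in the classifier but never used. A drop-in variant of the vectorizer (just a sketch with the same tokenizer and n-gram settings, not something I have benchmarked) would be:

    # in Classifier.__init__, replacing the CountVectorizer above
    self.vectorizer = TfidfVectorizer(
        max_features=self.max_features,
        tokenizer=self.mytokenize,
        ngram_range=(1, 3),
        sublinear_tf=True
    )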
TESTER
import sys, time
import numpy as np
from scipy import stats
from datatools import load_dataset
from classifier_bow import Classifier
# from eval import eval_file, eval_list, load_label_output
def set_reproducible():
# The below is necessary to have reproducible behavior.
import random as rn
import os
os.environ['PYTHONHASHSEED'] = '0'
# The below is necessary for starting Numpy generated random numbers
# in a well-defined initial state.
np.random.seed(17)
# The below is necessary for starting core Python generated random numbers
# in a well-defined state.
rn.seed(12345)
def eval_list(glabels, slabels):
    if len(glabels) != len(slabels):
        print("\nWARNING: label count in system output (%d) is different from gold label count (%d)\n" % (
            len(slabels), len(glabels)))
    n = min(len(slabels), len(glabels))
    incorrect_count = 0
    for i in range(n):
        if slabels[i] != glabels[i]:
            incorrect_count += 1
    acc = 100 * (n - incorrect_count) / n
    return acc
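# Quick sanity check with made-up labels: 2 of 3 match, so
#   eval_list(['pos', 'neg', 'pos'], ['pos', 'pos', 'pos']) -> 66.67 (approx.)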
def train_and_eval_dev_test(trainfile, devfile, testfile, run_id):
classifier = Classifier()
print("\n")
# Training
print("RUN: %s" % str(run_id))
print(" %s.1. Training the classifier..." % str(run_id))
classifier.train(trainfile, devfile)
print()
print(" %s.2. Evaluation on the dev dataset..." % str(run_id))
slabels = classifier.predict(devfile)
glabels = load_dataset(devfile)
glabels = glabels['polarity']
devacc = eval_list(glabels, slabels)
print(" Acc.: %.2f" % devacc)
testacc = -1
if testfile is not None:
# Evaluation on the test data
print(" %s.3. Evaluation on the test dataset..." % str(run_id))
slabels = classifier.predict(testfile)
glabels = load_dataset(testfile)
glabels = glabels['polarity']
testacc = eval_list(glabels, slabels)
print(" Acc.: %.2f" % testacc)
print()
return (devacc, testacc)
if __name__ == "__main__":
set_reproducible()
datadir = "../data/"
trainfile = datadir + "frdataset1_train.csv"
devfile = datadir + "frdataset1_dev.csv"
# testfile = datadir + "frdataset1_test.csv"
testfile = None
    # Run the experiment n times (default 5; can be overridden on the command line)
start_time = time.perf_counter()
n = 5
if len(sys.argv) > 1:
n = int(sys.argv[1])
devaccs = []
testaccs = []
for i in range(n):
res = train_and_eval_dev_test(trainfile, devfile, testfile, i+1)
devaccs.append(res[0])
testaccs.append(res[1])
print('\nCompleted %d runs.' % n)
print("Dev accs:", devaccs)
print("Test accs:", testaccs)
print()
print("Mean Dev Acc.: %.2f (%.2f)\tMean Test Acc.: %.2f (%.2f)" % (np.mean(devaccs), np.std(devaccs), np.mean(testaccs), np.std(testaccs)))
    total_exec_time = time.perf_counter() - start_time
    print("\nExec time: %.2f s (%.2f s per run)" % (total_exec_time, total_exec_time / n))