Neural network example that gives better results in Python
0 votes
/ 19 January 2020

I am working through a text analysis example in Python with the following training/test files.

But my best result is around 65% accuracy on the test data. Could you give some suggestions on how to get a better score?

With the code below, it reports accuracy for both the training and the test data, runs the prediction 5 times, and computes the average score for the test classification.

CLASSIFIER

import numpy as np
np.random.seed(15)

from keras.layers import Input, Dense, Dropout
from keras.models import Model
from keras import optimizers
from keras.callbacks import EarlyStopping

from sklearn.preprocessing import LabelBinarizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk import word_tokenize

import spacy
from gensim.models import KeyedVectors as kv

from datatools import load_dataset


nlp = spacy.load('fr')

embfile = "../resources/frWac_non_lem_no_postag_no_phrase_200_skip_cut100.bin"
# wv : kv = kv.load_word2vec_format(embfile, binary=True, encoding='UTF-8', unicode_errors='ignore')


class Classifier:
    """The Classifier"""

    def __init__(self):
        self.labelset = None
        self.label_binarizer = LabelBinarizer()
        self.model = None
        self.epochs = 200
        self.batchsize = 64
        self.max_features = 15000
        # create the vectorizer
        self.vectorizer = CountVectorizer(
            max_features=self.max_features,
            strip_accents=None,
            analyzer="word",
            tokenizer=self.mytokenize,
            stop_words=None,
            ngram_range=(1, 3),
            binary=False,
            preprocessor=None
        )

    def mytokenize(self, text):
        """Customized tokenizer.
        Here you can add other linguistic processing and generate more normalized features
        """
        doc = nlp(text)
        tokens = [t.text.lower() for sent in doc.sents for t in sent if t.pos_ != "PUNCT" ]
        # tokens = [t for t in tokens if t not in self.stopset]
        return tokens
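
    # A possible variant (not in the original post): lemma-based tokenization.
    # spaCy already parses the text, so returning t.lemma_ instead of t.text is
    # a small change that may reduce vocabulary sparsity for French. This is
    # only a sketch of one option, not a claim that it improves the score.
    def mytokenize_lemmas(self, text):
        """Alternative tokenizer returning lowercased lemmas without punctuation."""
        doc = nlp(text)
        return [t.lemma_.lower() for t in doc if not t.is_punct and not t.is_space]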

    def feature_count(self):
        return len(self.vectorizer.vocabulary_)

    def create_model(self):
        """Create a neural network model and return it.
        Here you can modify the architecture of the model (network type, number of layers, number of neurones)
        and its parameters"""

        # Define input vector, its size = number of features of the input representation
        input = Input((self.feature_count(),))
        # Define output: its size is the number of distinct (class) labels (class probabilities from the softmax)
        layer = input
        layer = Dense(10, activation='relu')(layer)
        output = Dense(len(self.labelset), activation='softmax')(layer)
        # create model by defining the input and output layers
        model = Model(inputs=input, outputs=output)
        # compile the model (prepare it for training: optimizer, loss and metrics)
        model.compile(optimizer=optimizers.Adam(),
                      loss='categorical_crossentropy',
                      metrics=['accuracy'])
        model.summary()
        return model
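
    # A possible alternative architecture (not in the original post): a wider
    # hidden layer plus Dropout (already imported above) to reduce overfitting
    # on the bag-of-words features. This is only a sketch; the layer size and
    # dropout rate are assumptions to experiment with, not recommended values.
    def create_model_dropout(self):
        input = Input((self.feature_count(),))
        layer = Dense(64, activation='relu')(input)
        layer = Dropout(0.5)(layer)
        output = Dense(len(self.labelset), activation='softmax')(layer)
        model = Model(inputs=input, outputs=output)
        model.compile(optimizer=optimizers.Adam(),
                      loss='categorical_crossentropy',
                      metrics=['accuracy'])
        return model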



    def vectorize(self, texts):
        return self.vectorizer.transform(texts).toarray()


    def train_on_data(self, texts, labels, valtexts=None, vallabels=None):
        """Train the model using the list of text examples together with their true (correct) labels"""
        # create the binary output vectors from the correct labels
        Y_train = self.label_binarizer.fit_transform(labels)
        # get the set of labels
        self.labelset = set(self.label_binarizer.classes_)
        print("LABELS: %s" % self.labelset)
        # build the feature index (unigram of words, bi-grams etc.)  using the training data
        self.vectorizer.fit(texts)
        # create a model to train
        self.model = self.create_model()
        # for each text example, build its vector representation
        X_train = self.vectorize(texts)
        #
        my_callbacks = []
        early_stopping = EarlyStopping(monitor='val_loss', min_delta=0, patience=3, verbose=0, mode='auto', baseline=None)
        my_callbacks.append(early_stopping)
        if valtexts is not None and vallabels is not None:
            X_val = self.vectorize(valtexts)
            Y_val = self.label_binarizer.transform(vallabels)
            valdata = (X_val, Y_val)
        else:
            valdata = None
        # Train the model!
        self.model.fit(
            X_train, Y_train,
            epochs=self.epochs,
            batch_size=self.batchsize,
            callbacks=my_callbacks,
            validation_data=valdata,
            verbose=2)

    def predict_on_X(self, X):
        return self.model.predict(X)


    def predict_on_data(self, texts):
        """Use this classifier model to predict class labels for a list of input texts.
        Returns the list of predicted labels
        """
        X = self.vectorize(texts)
        # get the predicted output vectors: each vector will contain a probability for each class label
        Y = self.model.predict(X)
        # from the output probability vectors, get the labels that got the best probability scores
        return self.label_binarizer.inverse_transform(Y)

    def train(self, trainfile, valfile=None):
        df = load_dataset(trainfile)
        texts = df['text']
        labels = df['polarity']
        if valfile:
            valdf = load_dataset(valfile)
            valtexts = valdf['text']
            vallabels = valdf['polarity']
        else:
            valtexts = vallabels = None
        self.train_on_data(texts, labels, valtexts, vallabels)


    def predict(self, datafile):
        """Use this classifier model to predict class labels for a list of input texts.
        Returns the list of predicted labels
        """
        items = load_dataset(datafile)
        return self.predict_on_data(items['text'])
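
The imports at the top of the classifier (spacy, gensim KeyedVectors) and the embfile path suggest trying pre-trained frWac word vectors, but the load is commented out. Below is a minimal sketch, assuming the commented-out load works as written, of how averaged word embeddings could replace the count features; embed_texts is a hypothetical helper, not part of the post.

import numpy as np
from gensim.models import KeyedVectors as kv

embfile = "../resources/frWac_non_lem_no_postag_no_phrase_200_skip_cut100.bin"
wv = kv.load_word2vec_format(embfile, binary=True, encoding='UTF-8', unicode_errors='ignore')

def embed_texts(texts, tokenize):
    """Represent each text as the mean of its word vectors (zeros if no token is in the vocabulary)."""
    X = np.zeros((len(texts), wv.vector_size))
    for i, text in enumerate(texts):
        vecs = [wv[t] for t in tokenize(text) if t in wv]
        if vecs:
            X[i] = np.mean(vecs, axis=0)
    return X

# Hypothetical usage: call embed_texts(texts, classifier.mytokenize) instead of
# self.vectorize(texts), and set the Input size to wv.vector_size in create_model.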

TESTER

import sys, time
import numpy as np
from scipy import stats

from datatools import load_dataset
from classifier_bow import Classifier
# from eval import eval_file, eval_list, load_label_output

def set_reproducible():
    # The below is necessary to have reproducible behavior.
    import random as rn
    import os
    os.environ['PYTHONHASHSEED'] = '0'
    # The below is necessary for starting Numpy generated random numbers
    # in a well-defined initial state.
    np.random.seed(17)
    # The below is necessary for starting core Python generated random numbers
    # in a well-defined state.
    rn.seed(12345)

def eval_list(glabels, slabels):
    if (len(glabels) != len(slabels)):
        print("\nWARNING: label count in system output (%d) is different from gold label count (%d)\n" % (
        len(slabels), len(glabels)))
    n = min(len(slabels), len(glabels))
    incorrect_count = 0
    for i in range(0, n):
        if slabels[i] != glabels[i]: incorrect_count += 1
    acc = (n - incorrect_count) / n
    acc = acc * 100
    return acc


def train_and_eval_dev_test(trainfile, devfile, testfile, run_id):
    classifier = Classifier()
    print("\n")
    # Training
    print("RUN: %s" % str(run_id))
    print("  %s.1. Training the classifier..." % str(run_id))
    classifier.train(trainfile, devfile)
    print()
    print("  %s.2. Evaluation on the dev dataset..." % str(run_id))
    slabels = classifier.predict(devfile)
    glabels = load_dataset(devfile)
    glabels = glabels['polarity']
    devacc = eval_list(glabels, slabels)
    print("       Acc.: %.2f" % devacc)
    testacc = -1
    if testfile is not None:
        # Evaluation on the test data
        print("  %s.3. Evaluation on the test dataset..." % str(run_id))
        slabels = classifier.predict(testfile)
        glabels = load_dataset(testfile)
        glabels = glabels['polarity']
        testacc = eval_list(glabels, slabels)
        print("       Acc.: %.2f" % testacc)
    print()
    return (devacc, testacc)

if __name__ == "__main__":
    set_reproducible()
    datadir = "../data/"
    trainfile =  datadir + "frdataset1_train.csv"
    devfile =  datadir + "frdataset1_dev.csv"
    # testfile =  datadir + "frdataset1_test.csv"
    testfile = None
    # Basic checking
    start_time = time.perf_counter()
    n = 5
    if len(sys.argv) > 1:
        n = int(sys.argv[1])
    devaccs = []
    testaccs = []
    for i in range(n):
        res = train_and_eval_dev_test(trainfile, devfile, testfile, i+1)
        devaccs.append(res[0])
        testaccs.append(res[1])
    print('\nCompleted %d runs.' % n)
    print("Dev accs:", devaccs)
    print("Test accs:", testaccs)
    print()
    print("Mean Dev Acc.: %.2f (%.2f)\tMean Test Acc.: %.2f (%.2f)" % (np.mean(devaccs), np.std(devaccs), np.mean(testaccs), np.std(testaccs)))
    total_exec_time = (time.perf_counter()-start_time)
    print("\nExec time: %.2f s. ( %d per run )" % (total_exec_time, total_exec_time/n))