Я пытаюсь выполнить анализ тональности твитов с помощью LinearSVM, за которым следует CNN (а именно LSTM). Я новичок в мире ИИ и хочу понять, является ли это хорошим подходом. Я работаю с набором данных sentiment140. Вкратце мой код выполняет следующие действия (как я полагаю):
- помещает данные в фрейм данных pandas и выполняет некоторую предварительную обработку.
- токенизирует текст и извлекает признаки в виде униграмм
- обучает LinearSVM
- обучает модель LSTM (cnn)
- делает прогноз
При работе в Python я получаю следующую ошибку:
ValueError: Error when checking input: expected embedding_1_input to have shape (140,) but got array with shape (1,)
Должно быть, что-то не так с входными данными CNN, но сама эта ошибка — не главный вопрос. Мне просто интересно, хороший ли это способ классификации тональности (SVM, за которым следует CNN).
from keras.models import Sequential
from keras.layers import Dense
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
import pandas as pd
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import string
from spacy.lang.en import English
from sklearn.feature_extraction.text import (
    CountVectorizer,
    TfidfVectorizer,
)
from sklearn.metrics import accuracy_score
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
# The wildcard import stays at its original position: the imports below it
# must keep overriding any same-named objects it brings in.
from lineairsvm import *
from keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.base import TransformerMixin
# Read the CSV into a DataFrame.
# NOTE(review): read_csv consumes the first row as a header by default. If
# testdata.csv has no header row, pass header=None / names=[...] instead so
# the first sample is not lost -- confirm against the file.
df = pd.read_csv("testdata.csv")

# Label the six columns.
df.columns = ["target", "id", "data", "query", "user", "text"]

# Keep only "target" and "text". The keyword form is used because the
# positional axis argument (df.drop("id", 1)) was removed in pandas 2.0.
df = df.drop(columns=["id", "data", "query", "user"])
# Blank English pipeline; spacy_tokenizer calls it to split a sentence into tokens.
parser = English()
# Punctuation characters from the standard string module, used to filter tokens.
punctuations = string.punctuation
# spaCy's English stop words as a list, used to filter tokens.
stopwords = list(STOP_WORDS)
# English model loaded through the 'en' shortcut name (not referenced again
# in the code shown here).
nlp = spacy.load('en')
def spacy_tokenizer(sentence):
    """Split *sentence* into normalised tokens without stop words and punctuation.

    Every token is replaced by its lemma, lower-cased and stripped of
    surrounding whitespace. Tokens whose lemma is the placeholder
    ``"-PRON-"`` keep their lower-cased surface form instead.

    Args:
        sentence: Text handed to the module-level ``parser``.

    Returns:
        A list of token strings that are neither in ``stopwords`` nor found
        in ``punctuations``.
    """
    normalised = [
        token.lemma_.lower().strip() if token.lemma_ != "-PRON-" else token.lower_
        for token in parser(sentence)
    ]
    # `punctuations` is a plain string, so `in` is a substring test: it also
    # discards empty strings left behind by strip().
    return [
        token for token in normalised
        if token not in stopwords and token not in punctuations
    ]
class predictors(TransformerMixin):
    """Stateless pipeline step that cleans every text with ``clean_text``."""

    def transform(self, X, **transform_params):
        """Return a list holding ``clean_text(text)`` for each text in *X*."""
        return [clean_text(text) for text in X]

    def fit(self, X, y=None, **fit_params):
        """Learn nothing and return ``self``, as the step has no state."""
        return self

    def get_params(self, deep=True):
        """Return an empty dict: the step exposes no parameters."""
        return {}
# Basic function to clean the text
def clean_text(text: str) -> str:
    """Return *text* without leading/trailing whitespace, in lower case."""
    return text.strip().lower()
# Bag-of-words vectorizer: converts a collection of text documents into a
# matrix of token counts, splitting each document with spacy_tokenizer.
# ngram_range=(1, 1) limits the features to single words (unigrams). A wider
# range would add n-grams, i.e. runs of n neighbouring words, which keep some
# context and can improve positive/negative classification.
vectorizer = CountVectorizer(ngram_range=(1, 1), tokenizer=spacy_tokenizer)
# Linear Support Vector Classification.
# Similar to SVC with parameter kernel='linear', with more flexibility in the
# choice of penalties and loss functions; it should scale better to large
# numbers of samples.
# LinearSVC takes two arrays as input: an array X of size
# [n_samples, n_features] holding the training samples, and an array y of
# class labels (strings or integers) of size [n_samples].
classifier = LinearSVC()

# TF-IDF features, tokenised with the same spaCy tokenizer.
tfvectorizer = TfidfVectorizer(tokenizer=spacy_tokenizer)

# Feature column (tweet text) and label column.
X = df['text']
ylabels = df['target']
# Split the data in two stages. train_test_split parameters: feature data,
# labels, test_size (proportion assigned to the second part) and random_state
# (seed that makes the shuffling reproducible).
#   1. Put 20% of all samples aside as X_kast / y_kast; they are not used
#      again in the code shown here.
#   2. Split the remaining 80% into 60% training and 40% test data.
# The models therefore train on 48% and are tested on 32% of the full dataset.
X_tr, X_kast, y_tr, y_kast = train_test_split(
    X, ylabels, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(
    X_tr, y_tr, test_size=0.4, random_state=42)
# The vocabulary size is the number of rows in wordlist.csv.
df_wordlist = pd.read_csv("wordlist.csv")
vocabulary_size = len(df_wordlist)
print(vocabulary_size)

# Sequence length used by lstm() as the Embedding layer's input_length.
max_words = 140

# Keep at most the first 140 characters of every tweet. The previous loops
# only rebound their loop variable (and then sliced it down to an empty
# string), so X_train and X_test were never modified.
X_train = X_train.str[:140]
X_test = X_test.str[:140]
def lstm(X_train, tokenizer=None):
    """Build, train and return an Embedding + LSTM sentiment model.

    The raw texts are first converted to padded integer sequences, because
    the Embedding layer consumes word indices of shape ``(max_words,)``.
    Passing the strings straight to ``model.fit`` is what raised
    "expected embedding_1_input to have shape (140,) but got array with
    shape (1,)".

    Reads the module-level ``vocabulary_size``, ``max_words`` and ``y_train``;
    ``y_train`` must be aligned with *X_train*.

    Args:
        X_train: Iterable of raw tweet texts.
        tokenizer: Optional Keras ``Tokenizer`` to fit on *X_train*. Pass one
            (created with ``num_words=vocabulary_size``) to keep a reference
            for encoding test texts the same way. A new one is created when
            omitted.

    Returns:
        The fitted Keras ``Sequential`` model.
    """
    embedding_size = 32
    batch_size = 64
    num_epochs = 3

    # Text -> word indices below vocabulary_size -> fixed-length rows.
    if tokenizer is None:
        tokenizer = Tokenizer(num_words=vocabulary_size)
    tokenizer.fit_on_texts(X_train)
    encoded = sequence.pad_sequences(
        tokenizer.texts_to_sequences(X_train), maxlen=max_words)

    model = Sequential()
    model.add(Embedding(vocabulary_size, embedding_size,
                        input_length=max_words))
    model.add(LSTM(100))
    model.add(Dense(1, activation='sigmoid'))
    # summary() prints by itself and returns None, so it is not wrapped in print().
    model.summary()

    # NOTE(review): a sigmoid output with binary_crossentropy expects targets
    # in {0, 1}. Confirm how y_train is encoded (sentiment140 is commonly
    # distributed with 0/4 labels) and remap it if necessary.
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    # The first batch_size samples validate; the rest train.
    X_valid, y_valid = encoded[:batch_size], y_train[:batch_size]
    X_train2, y_train2 = encoded[batch_size:], y_train[batch_size:]
    model.fit(X_train2, y_train2, validation_data=(X_valid, y_valid),
              batch_size=batch_size, epochs=num_epochs)
    return model
# Train the LSTM on the training texts. It is a second, independent
# classifier and is deliberately NOT a step of the scikit-learn pipeline
# below: every step except the last must be a transformer (fit + transform),
# so nothing can follow the LinearSVC classifier, and a Keras model does not
# implement the scikit-learn estimator interface anyway.
cnn = lstm(X_train)

# Create the pipeline to clean, tokenize, vectorize, and classify.
# A pipeline chains multiple estimators into one: each stage is fed the data
# produced by the stage before it, and only the observed data (X) is
# transformed. Steps are given as (key, value) pairs, where the key is a
# string naming the step and the value is the estimator object.
# Using TF-IDF features:
pipe_tfid = Pipeline([("cleaner", predictors()),
                      ('vectorizer', tfvectorizer),
                      ('classifier', classifier)])
pipe_tfid.fit(X_train, y_train)
sample_prediction1 = pipe_tfid.predict(X_test)
# NOTE(review): this is the accuracy on the training data. Compare
# sample_prediction1 with y_test to estimate how well the model generalises.
print(pipe_tfid.score(X_train, y_train))
Заранее спасибо!