У меня возникает ошибка «ValueError: y содержит новые метки: xxxx». Я нашёл решение здесь: «sklearn.LabelEncoder с никогда ранее не встречавшимися значениями». Но я не понимаю, как перейти с LabelEncoder из sklearn на pd.get_dummies.
Вот мой код:
import json
import itertools
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from sklearn.metrics import confusion_matrix
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.preprocessing import text, sequence
from keras import utils
# Load the dataset; the code below reads its 'post' and 'tags' columns.
# NOTE(review): the CSV was reportedly generated from a query, but the query
# itself is not included in this file.
data = pd.read_csv("data.csv")
data.head()
# Inspect the per-tag counts to confirm that the dataset is balanced.
# Note: the rows were already randomly shuffled by the BigQuery query.
data['tags'].value_counts()
# Hold out the trailing 20% of the rows as the test set.
n_rows = len(data)
train_size = int(n_rows * .8)
print ("Train size: %d" % train_size)
print ("Test size: %d" % (n_rows - train_size))
# Positional slices: the first train_size rows train, the remainder tests.
train_posts = data['post'].iloc[:train_size]
test_posts = data['post'].iloc[train_size:]
train_tags = data['tags'].iloc[:train_size]
test_tags = data['tags'].iloc[train_size:]
# Size of the vocabulary passed to the tokenizer (num_words).
max_words = 1000
tokenize = text.Tokenizer(num_words=max_words, char_level=False)
tokenize.fit_on_texts(train_posts) # only fit on train
# Encode every post with the vocabulary learned from the training posts.
x_train = tokenize.texts_to_matrix(train_posts)
x_test = tokenize.texts_to_matrix(test_posts)
# Persist the tokenizer's word index so the same mapping can be reused later.
dictionary = tokenize.word_index
with open('dictionary.json', 'w') as dictionary_file:
    # The dump must be inside the `with` body; left unindented it is an
    # IndentationError and the script never runs.
    json.dump(dictionary, dictionary_file)
# Use sklearn utility to convert label strings to numbered index.
# Missing tags are replaced with the placeholder label '0' in both splits.
train_labels = train_tags.fillna('0')
test_labels = test_tags.fillna('0')
encoder = LabelEncoder()
# Fit on the labels of BOTH splits. Fitting on the training tags alone makes
# transform() raise "ValueError: y contains new labels" as soon as a tag
# occurs only in the test split. Only the label vocabulary is shared here;
# the text tokenizer is still fitted on the training posts alone.
encoder.fit(pd.concat([train_labels, test_labels]))
y_train = encoder.transform(train_labels)
y_test = encoder.transform(test_labels)
# Converts the labels to a one-hot representation.
# Take the class count from the encoder rather than from max(y_train) + 1:
# a label seen only in the test split may have an index above every
# training index, which would make the one-hot matrix too narrow.
num_classes = len(encoder.classes_)
y_train = utils.to_categorical(y_train, num_classes)
y_test = utils.to_categorical(y_test, num_classes)
А вот мои фиктивные данные:
post,tags
Congenital Adrenal Hyperplasia: Calcium Channels as Therapeutic Targets,Hyperplasia
Does Lead Burden Alter Neuropsychological Development?,Myopia
41.8 Degree Centigrade Whole Body Hyperthermia for the Treatment of Rheumatoid Diseases,Rheumatic Diseases
Body Water Content in Cyanotic Congenital Heart Disease,Heart Diseases
Effects of Training Intensity on the CHD Risk Factors in Postmenopausal Women,Cardiovascular Diseases
Intraoral Grafting of Ex Vivo Produced Oral Mucosal Composites,Mouth Diseases
Prevalence of Carbohydrate Intolerance in Lean and Obese Children,Glucose Intolerance
Correction of Myopia Evaluation Trial (COMET),Myopia
Randomized Trial of Vitamin A and Vitamin E Supplementation for Retinitis Pigmentosa,Retinitis