I'm trying to build a tweet generator in Keras using an RNN. I've run into the problem below and can't figure out where it comes from. I've also searched the internet for hours but found nothing. I'm sure it's something small, but I just can't see it...
Here is the code (from https://github.com/schuyler-jackson/RNN_tweet_generation/blob/master/final_model.ipynb):
import os
import numpy as np
import pandas as pd
import tensorflow as tf
import keras
from keras.models import Sequential
from keras.layers import Embedding, Dropout, Dense
from keras.losses import sparse_categorical_crossentropy
from keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint, EarlyStopping

data = pd.read_csv('data/election2020.csv', usecols=[0, 4], names=['id', 'text'], encoding="latin-1")
# all tweets into one string
tweet_txt = data['text'][:].str.cat(sep=' ')
print(f'total characters in our dataset: {len(tweet_txt)}')
# get unique chars and make character mapping
chars = list(set(tweet_txt))
chars.sort()
char_to_index = dict((c,i) for i,c in enumerate(chars))
index_to_char = np.array(chars)
print(f"unique characters: {len(chars)}")
maxlen = 100
tweet_int = np.array([char_to_index[char] for char in tweet_txt])
seq_length = 100
examples_per_epoch = len(tweet_txt)//seq_length
char_dataset = tf.data.Dataset.from_tensor_slices(tweet_int)
sequences = char_dataset.batch(seq_length+1, drop_remainder=True)
def split_input_target(chunk):
    input_text = chunk[:-1]
    target_text = chunk[1:]
    return input_text, target_text
dataset = sequences.map(split_input_target)
BATCH_SIZE = 64
steps_per_epoch = examples_per_epoch//BATCH_SIZE
BUFFER_SIZE = 10000
dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)
print(dataset)
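# Diagnostic I added while debugging (not from the tutorial; assumes eager
# execution is available): pull one element from the dataset to confirm it
# yields (input, target) integer sequences.
for input_example, target_example in dataset.take(1):
    print(input_example.shape, target_example.shape)  # expect: (100,) (100,)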
# Here the model is built with the Keras Sequential API.
import functools
rnn = functools.partial(keras.layers.GRU, recurrent_activation='sigmoid')
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    model = Sequential()
    model.add(Embedding(vocab_size, embedding_dim, batch_input_shape=[batch_size, None]))
    model.add(rnn(rnn_units, return_sequences=True, recurrent_initializer='glorot_uniform', stateful=True))
    model.add(Dropout(rate=0.2, noise_shape=(batch_size, 1, rnn_units)))
    model.add(rnn(rnn_units, return_sequences=True, recurrent_initializer='glorot_uniform', stateful=True))
    model.add(Dense(vocab_size))
    return model
vocab_size = len(chars)
embedding_dim = 256
rnn_units = 256
batch_size = BATCH_SIZE
model = build_model(vocab_size=vocab_size, embedding_dim=embedding_dim, rnn_units=rnn_units, batch_size=batch_size)
model.summary()
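# Quick check I added: the model itself reports the input shape it expects,
# which is 2-D (batch, timesteps).
print(model.input_shape)  # (64, None)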
def loss(labels, logits):
    return sparse_categorical_crossentropy(labels, logits, from_logits=True)
model.compile(optimizer=Adam(), loss=loss)
checkpoint_dir = "model_gen/checkpoints"
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}.hdf5")
checkpoint_callback = ModelCheckpoint(filepath=checkpoint_prefix, save_weights_only=True)
EPOCHS = 5
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
dataset2 = dataset
history = model.fit(np.array(dataset2), validation_data=dataset, validation_steps=30, epochs=EPOCHS, steps_per_epoch=steps_per_epoch, callbacks=[checkpoint_callback])
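Out of curiosity I also probed what np.array() actually does to the dataset object (a quick experiment I added, not part of the original notebook):
# np.array() can't interpret a tf.data.Dataset as an array of samples, so it
# wraps the whole object in a 0-dimensional object array.
probe = np.array(dataset2)
print(probe.shape, probe.dtype)  # () object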
The data looks like this:
id text
0 1204000574099857409 Democrats launch impeachment endgame with risi...
1 1203998807928823809 ***********************#biden2020 #Election202...
2 1203998376376832000 Any congressional representation doing this sh...
3 1203997840718086144 I"m glad to see this. #Booker deserves to be s...
4 1203997705938362368 @realDonaldTrump #AmericaFirst #KAG2020 #Trump...
and the output is:
Using TensorFlow backend.
total characters in our dataset: 4786659
unique characters: 186
<MapDataset shapes: ((100,), (100,)), types: (tf.int32, tf.int32)>
Model: "sequential_1"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
embedding_1 (Embedding) (64, None, 256) 47616
_________________________________________________________________
gru_1 (GRU) (64, None, 256) 393984
_________________________________________________________________
dropout_1 (Dropout) (64, None, 256) 0
_________________________________________________________________
gru_2 (GRU) (64, None, 256) 393984
_________________________________________________________________
dense_1 (Dense) (64, None, 186) 47802
=================================================================
Total params: 883,386
Trainable params: 883,386
Non-trainable params: 0
_________________________________________________________________
Traceback (most recent call last):
File ".../src/tweet_generator_2.py", line 97, in <module>
history = model.fit(np.array(dataset2), validation_data=dataset, validation_steps=30, epochs=EPOCHS, steps_per_epoch=steps_per_epoch, callbacks=[checkpoint_callback])
File "...\Anaconda\envs\gputest\lib\site-packages\keras\engine\training.py", line 1154, in fit
batch_size=batch_size)
File "...\Anaconda\envs\gputest\lib\site-packages\keras\engine\training.py", line 579, in _standardize_user_data
exception_prefix='input')
File "...\Anaconda\envs\gputest\lib\site-packages\keras\engine\training_utils.py", line 135, in standardize_input_data
'with shape ' + str(data_shape))
ValueError: Error when checking input: expected embedding_1_input to have 2 dimensions, but got array with shape ()
Process finished with exit code 1
Does anyone know how to solve this? I don't understand where the shape () is coming from.
Thanks!