EDIT: I am training a Keras model to generate summaries from scientific articles.
The data come from https://www-nlpir.nist.gov/related_projects/tipster_summac/cmp_lg.html, and the articles and their summaries are both part of the input data, so I do not know what the output should be. It should be binary data of the following shape: (num_articles, vocab_size)
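For concreteness, a binary target of that shape would be one multi-hot row per article. A minimal sketch of my assumption of what such a target could look like, using the numArticles, vocab_size, word2idx and abstractWords variables built in the script below:

y = np.zeros((numArticles, vocab_size), dtype=int)
for i, words in enumerate(abstractWords):
    for w in words:
        y[i, word2idx[w]] = 1  # mark each vocabulary word occurring in article i's summary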
#!/usr/bin/env python
# coding: utf-8
from __future__ import absolute_import, division, print_function, unicode_literals
import tensorflow.keras as keras, numpy as np, itertools
import random, xml, glob, os, string, re, requests, atoma, feedparser #feedparser is ridiculously slow, speedparser incompatible with Python 3.4 onwards
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'#silence TF info/warning logs (e.g. the AVX build messages)
#Data From https://www-nlpir.nist.gov/related_projects/tipster_summac/cmp_lg.html and https://machinelearningmastery.com/encoder-decoder-models-text-summarization-keras/ and https://www.tensorflow.org/beta/tutorials/text/text_generation
titles=['']
abstractWords=[[]]#only way to make append work correctly so far - means index starts at 1 - awkward
bodyWords=[[]]
vocab=[]
numArticles=0
for fileN in os.listdir("cmplg-xml"):
    if fileN.endswith(".xml"):
        print('cmplg-xml\\' + fileN)
        file = open('cmplg-xml\\' + fileN, 'r', encoding='latin-1')
        rawStr = file.read()
        file.close()
        # as we read the articles, we build the dataset and the word sequences
        rawStr = re.sub(r'<!--.*?-->|<REF/>|<P>|</P>|<DIV.*?>|</DIV>|<HEADER>|</HEADER>|<EQN>|<EQN/>|<CREF/>|\(.*?\)', '', rawStr, flags=re.S)
        # no way to remove header content so far
        cleanStr = re.sub('\n', ' ', rawStr)
        try:
            title = re.findall('<TITLE>(.*?)</TITLE>', cleanStr)[0]
            abstract = re.findall('<ABSTRACT>(.*?)</ABSTRACT>', cleanStr)[0]
            body = re.findall('<BODY>(.*?)</BODY>', cleanStr)[0]
        except IndexError:
            print(" incomplete")
            continue
        # REM: whitespace is handled by split()
        # root = xml.etree.ElementTree.fromstring(cleanStr)  # alternative: parse TITLE/ABSTRACT/BODY via ElementTree
        abstract = abstract.translate(str.maketrans('', '', string.punctuation))  # strip punctuation; str.translate returns a new string, so the result must be assigned
        abstract = [word.lower() for word in abstract.split() if word.isalpha()]  # keep alphabetic tokens only
        body = body.translate(str.maketrans('', '', string.punctuation))  # strip punctuation
        body = [word.lower() for word in body.split() if word.isalpha()]  # keep alphabetic tokens only
        titles.append(title)  # not cleaned so far; is it worth using as a feature?
        abstractWords.append(abstract)
        bodyWords.append(body)
        vocab += abstract + body
        numArticles += 1
if not abstractWords[0]:
    del abstractWords[0]  # drop the initial placeholder entries
if not bodyWords[0]:
    del bodyWords[0]
if not titles[0]:
    del titles[0]
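# Aside: the [''] / [[]] placeholders above are not actually required for append()
# to work; starting from plain empty lists behaves the same and avoids both the
# index shift and this cleanup. A minimal sketch:
#     titles, abstractWords, bodyWords = [], [], []
#     titles.append("example title")           # list grows from index 0
#     abstractWords.append(["some", "words"])  # no placeholder entry to delete later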
#with feedparser
arxivRss = r"D:\TextGeneration\Training\csrss.xml"  # raw string so the backslashes are not treated as escapes
feed = feedparser.parse(arxivRss)
item = feed['items'][0]
print(item['summary'])  # a bare expression only displays in a notebook; print also works in a script
# ## Vocabulary
# Read, then decode for py2 compat.
# text = open('training\cs.txt', 'rb').read().decode(encoding='utf-8')
vocab = sorted(set(['']+list(itertools.chain(*bodyWords))+list(itertools.chain(*abstractWords))))
# print ('{} unique characters'.format(len(vocab)))
word2idx = {u:i for i, u in enumerate(vocab)}
idx2word = np.array(vocab)
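# Quick sanity check of the lookup tables (assumes 'the' occurs in the corpus):
#     word2idx['the']              # word -> integer id
#     idx2word[word2idx['the']]    # integer id -> word, round-trips to 'the'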
padLen = 5000 #consider 10000
sumLen = 30 #length of the summary 280-33=247 characters for arxiv url
#problem: these are characters, not words
vocab_size = len(vocab)
bodies=np.zeros(shape=(numArticles,padLen),dtype=int)
abstracts=np.zeros(shape=(numArticles,sumLen),dtype=int)
#no words are coded as zeroes
#just truncating the abstract is not a perfect solution
#there should be a function to automate this cumbersome padding pattern (there is: see the pad_sequences sketch after this loop)
for i in range(len(bodyWords)):
    bodyLen = len(bodyWords[i])
    if bodyLen < padLen:
        bodies[i][:bodyLen] = [word2idx[word] for word in bodyWords[i]]
    else:
        bodies[i] = [word2idx[word] for word in bodyWords[i][:padLen]]
    absLen = len(abstractWords[i])
    if absLen < sumLen:
        abstracts[i][:absLen] = [word2idx[word] for word in abstractWords[i]]
    else:
        abstracts[i] = [word2idx[word] for word in abstractWords[i][:sumLen]]
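# The manual loop above can be replaced by Keras' built-in helper, which zero-pads
# and truncates in one call; a sketch of the equivalent usage (padding='post' and
# truncating='post' match the loop's pad-at-end / keep-the-beginning behaviour):
#     bodies = keras.preprocessing.sequence.pad_sequences(
#         [[word2idx[w] for w in words] for words in bodyWords],
#         maxlen=padLen, padding='post', truncating='post')
#     abstracts = keras.preprocessing.sequence.pad_sequences(
#         [[word2idx[w] for w in words] for words in abstractWords],
#         maxlen=sumLen, padding='post', truncating='post')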
# article input model
inputs1 = keras.Input(shape=(padLen,))
article1 = keras.layers.Embedding(vocab_size, 128)(inputs1)#encoder1
article2 = keras.layers.LSTM(64)(article1)
article3 = keras.layers.RepeatVector(sumLen)(article2)
# summary input model
inputs2 = keras.Input(shape=(sumLen,))
summ1 = keras.layers.Embedding(vocab_size, 128)(inputs2)
# decoder model
decoder1 = keras.layers.concatenate([article3, summ1])
decoder2 = keras.layers.LSTM(128)(decoder1)
outputs = keras.layers.Dense(vocab_size, activation='softmax')(decoder2)#how do we define this?
# tie it together [article, summary] [word]
model = keras.Model(inputs=[inputs1, inputs2], outputs=outputs)
#do we need keras.layers.Activation()?
model.compile(loss='categorical_crossentropy', optimizer='adam')
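# Note on the expected target: with Dense(vocab_size, activation='softmax') and
# categorical_crossentropy, model.fit() also needs a y array of shape
# (num_samples, vocab_size), e.g. a one-hot encoding. A sketch, assuming a
# hypothetical 'nextWords' array holding one target word id per sample:
#     y = keras.utils.to_categorical(nextWords, num_classes=vocab_size)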
#This model needs to be saved so it can be directly loaded for the routine
# saving the model
model_json = model.to_json()
modelFile = open("sumModel.json", "w")#model
modelFile.write(model_json)
modelFile.close()
model.save_weights("model.h5")#weights
# loading the model
modelFile = open('sumModel.json', 'r')
model = keras.models.model_from_json(modelFile.read())
modelFile.close()
# load weights into new model
model.load_weights("model.h5")
model.compile(loss='categorical_crossentropy', optimizer='adam')
#no error here
for body in bodies:
    if len(body) != 5000:
        print(len(body))
for abstract in abstracts:
    if len(abstract) != 30:
        print(len(abstract))
#Now, using the model
batch_size = 32
epochs = 4
#keras.layers.Dense(vocab_size, activation='softmax') as a parameter?
model.fit([bodies, abstracts], validation_split = 0.1, epochs=epochs, batch_size=batch_size, verbose=1)
I get

TypeError: Input 'y' of 'Equal' Op has type resource that does not match type int32 of argument 'x'.

This question has the same error message, but its suggested solution (that the problem lies in the type or contents of X_train2 or X_valid) does not seem to apply here:
bodies.shape: (181, 5000)
abstracts.shape: (30, 5000)
model.summary():
Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to
==================================================================================================
input_3 (InputLayer)            [(None, 5000)]       0
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 5000, 128)    796928      input_3[0][0]
__________________________________________________________________________________________________
lstm_2 (LSTM)                   (None, 64)           49408       embedding_2[0][0]
__________________________________________________________________________________________________
input_4 (InputLayer)            [(None, 30)]         0
__________________________________________________________________________________________________
repeat_vector_1 (RepeatVector)  (None, 30, 64)       0           lstm_2[0][0]
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, 30, 128)      796928      input_4[0][0]
__________________________________________________________________________________________________
concatenate_1 (Concatenate)     (None, 30, 192)      0           repeat_vector_1[0][0]
                                                                 embedding_3[0][0]
__________________________________________________________________________________________________
lstm_3 (LSTM)                   (None, 128)          164352      concatenate_1[0][0]
__________________________________________________________________________________________________
dense_1 (Dense)                 (None, 6226)         803154      lstm_3[0][0]
==================================================================================================