EDIT: I am training a Keras model to generate summaries from scientific articles.
The data come from https://www-nlpir.nist.gov/related_projects/tipster_summac/cmp_lg.html, and the articles and their summaries are both part of the input data, so I do not know what the output should be. It should be binary data of the following shape: (num_articles, vocab_size)
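For concreteness, a binary target of that shape would be one multi-hot row per article. A minimal sketch of my assumption of what such a target could look like, using the numArticles, vocab_size, word2idx and abstractWords variables built in the script below:

y = np.zeros((numArticles, vocab_size), dtype=int)
for i, words in enumerate(abstractWords):
    for w in words:
        y[i, word2idx[w]] = 1  # mark each vocabulary word occurring in article i's summary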
#!/usr/bin/env python
# coding: utf-8
from __future__ import absolute_import, division, print_function, unicode_literals
import tensorflow.keras as keras, numpy as np, itertools
import random, xml, glob, os, string, re, requests, atoma, feedparser #feedparser is ridiculously slow, speedparser incompatible with Python 3.4 onwards
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'#silence TF info/warning logs (e.g. the AVX build messages)
#Data From https://www-nlpir.nist.gov/related_projects/tipster_summac/cmp_lg.html and https://machinelearningmastery.com/encoder-decoder-models-text-summarization-keras/ and https://www.tensorflow.org/beta/tutorials/text/text_generation
titles=['']
abstractWords=[[]]#only way to make append work correctly so far - means index starts at 1 - awkward
bodyWords=[[]]
vocab=[]
numArticles=0
for fileN in os.listdir("cmplg-xml"):
    if fileN.endswith(".xml"):
        print('cmplg-xml\\' + fileN)
        file = open('cmplg-xml\\' + fileN, 'r', encoding='latin-1')
        rawStr = file.read()
        file.close()
        # as we read the articles, we build the dataset and the word sequences
        rawStr = re.sub(r'<!--.*?-->|<REF/>|<P>|</P>|<DIV.*?>|</DIV>|<HEADER>|</HEADER>|<EQN>|<EQN/>|<CREF/>|\(.*?\)', '', rawStr, flags=re.S)
        # no way to remove header content so far
        cleanStr = re.sub('\n', ' ', rawStr)
        try:
            title = re.findall('<TITLE>(.*?)</TITLE>', cleanStr)[0]
            abstract = re.findall('<ABSTRACT>(.*?)</ABSTRACT>', cleanStr)[0]
            body = re.findall('<BODY>(.*?)</BODY>', cleanStr)[0]
        except IndexError:
            print(" incomplete")
            continue
        # REM: whitespace is handled by split()
        # root = xml.etree.ElementTree.fromstring(cleanStr)  # alternative: parse TITLE/ABSTRACT/BODY via ElementTree
        abstract = abstract.translate(str.maketrans('', '', string.punctuation))  # strip punctuation; str.translate returns a new string, so the result must be assigned
        abstract = [word.lower() for word in abstract.split() if word.isalpha()]  # keep alphabetic tokens only
        body = body.translate(str.maketrans('', '', string.punctuation))  # strip punctuation
        body = [word.lower() for word in body.split() if word.isalpha()]  # keep alphabetic tokens only
        titles.append(title)  # not cleaned so far; is it worth using as a feature?
        abstractWords.append(abstract)
        bodyWords.append(body)
        vocab += abstract + body
        numArticles += 1
if not abstractWords[0]:
    del abstractWords[0]  # drop the initial placeholder entries
if not bodyWords[0]:
    del bodyWords[0]
if not titles[0]:
    del titles[0]
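# Aside: the [''] / [[]] placeholders above are not actually required for append()
# to work; starting from plain empty lists behaves the same and avoids both the
# index shift and this cleanup. A minimal sketch:
#     titles, abstractWords, bodyWords = [], [], []
#     titles.append("example title")           # list grows from index 0
#     abstractWords.append(["some", "words"])  # no placeholder entry to delete later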
#with feedparser
arxivRss = r"D:\TextGeneration\Training\csrss.xml"  # raw string so the backslashes are not treated as escapes
feed = feedparser.parse(arxivRss)
item = feed['items'][0]
print(item['summary'])  # a bare expression only displays in a notebook; print also works in a script
# ## Vocabulary
# Read, then decode for py2 compat.
# text = open('training\cs.txt', 'rb').read().decode(encoding='utf-8')
vocab = sorted(set(['']+list(itertools.chain(*bodyWords))+list(itertools.chain(*abstractWords))))
# print ('{} unique characters'.format(len(vocab)))
word2idx = {u:i for i, u in enumerate(vocab)}
idx2word = np.array(vocab)
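# Quick sanity check of the lookup tables (assumes 'the' occurs in the corpus):
#     word2idx['the']              # word -> integer id
#     idx2word[word2idx['the']]    # integer id -> word, round-trips to 'the'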
padLen = 5000 #consider 10000
sumLen = 30 #length of the summary 280-33=247 characters for arxiv url
#problem: these are characters, not words
vocab_size = len(vocab)
bodies=np.zeros(shape=(numArticles,padLen),dtype=int)
abstracts=np.zeros(shape=(numArticles,sumLen),dtype=int)
#no words are coded as zeroes
#just truncating the abstract is not a perfect solution
#there should be a function to automate this cumbersome padding pattern (there is: see the pad_sequences sketch after this loop)
for i in range(len(bodyWords)):
    bodyLen = len(bodyWords[i])
    if bodyLen < padLen:
        bodies[i][:bodyLen] = [word2idx[word] for word in bodyWords[i]]
    else:
        bodies[i] = [word2idx[word] for word in bodyWords[i][:padLen]]
    absLen = len(abstractWords[i])
    if absLen < sumLen:
        abstracts[i][:absLen] = [word2idx[word] for word in abstractWords[i]]
    else:
        abstracts[i] = [word2idx[word] for word in abstractWords[i][:sumLen]]
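# The manual loop above can be replaced by Keras' built-in helper, which zero-pads
# and truncates in one call; a sketch of the equivalent usage (padding='post' and
# truncating='post' match the loop's pad-at-end / keep-the-beginning behaviour):
#     bodies = keras.preprocessing.sequence.pad_sequences(
#         [[word2idx[w] for w in words] for words in bodyWords],
#         maxlen=padLen, padding='post', truncating='post')
#     abstracts = keras.preprocessing.sequence.pad_sequences(
#         [[word2idx[w] for w in words] for words in abstractWords],
#         maxlen=sumLen, padding='post', truncating='post')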
# article input model
inputs1 = keras.Input(shape=(padLen,))
article1 = keras.layers.Embedding(vocab_size, 128)(inputs1)#encoder1
article2 = keras.layers.LSTM(64)(article1)
article3 = keras.layers.RepeatVector(sumLen)(article2)
# summary input model
inputs2 = keras.Input(shape=(sumLen,))
summ1 = keras.layers.Embedding(vocab_size, 128)(inputs2)
# decoder model
decoder1 = keras.layers.concatenate([article3, summ1])
decoder2 = keras.layers.LSTM(128)(decoder1)
outputs = keras.layers.Dense(vocab_size, activation='softmax')(decoder2)#how do we define this?
# tie it together [article, summary] [word]
model = keras.Model(inputs=[inputs1, inputs2], outputs=outputs)
#do we need keras.layers.Activation()?
model.compile(loss='categorical_crossentropy', optimizer='adam')
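# Note on the expected target: with Dense(vocab_size, activation='softmax') and
# categorical_crossentropy, model.fit() also needs a y array of shape
# (num_samples, vocab_size), e.g. a one-hot encoding. A sketch, assuming a
# hypothetical 'nextWords' array holding one target word id per sample:
#     y = keras.utils.to_categorical(nextWords, num_classes=vocab_size)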
#This model needs to be saved so it can be directly loaded for the routine
# saving the model
model_json = model.to_json()
modelFile = open("sumModel.json", "w")#model
modelFile.write(model_json)
modelFile.close()
model.save_weights("model.h5")#weights
# loading the model
modelFile = open('sumModel.json', 'r')
model = keras.models.model_from_json(modelFile.read())
modelFile.close()
# load weights into new model
model.load_weights("model.h5")
model.compile(loss='categorical_crossentropy', optimizer='adam')
#no error here
for body in bodies:
    if len(body) != 5000:
        print(len(body))
for abstract in abstracts:
    if len(abstract) != 30:
        print(len(abstract))
#Now, using the model
batch_size = 32
epochs = 4
#keras.layers.Dense(vocab_size, activation='softmax') as a parameter?
model.fit([bodies, abstracts], validation_split = 0.1, epochs=epochs, batch_size=batch_size, verbose=1)
I get

TypeError: Input 'y' of 'Equal' Op has type resource that does not match type int32 of argument 'x'.

This question has the same error message, but its suggested solution (that the problem lies in the type or contents of X_train2 or X_valid) does not seem to apply here:
bodies.shape: (181, 5000)
abstracts.shape: (30, 5000)
model.summary():
Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to
==================================================================================================
input_3 (InputLayer)            [(None, 5000)]       0
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 5000, 128)    796928      input_3[0][0]
__________________________________________________________________________________________________
lstm_2 (LSTM)                   (None, 64)           49408       embedding_2[0][0]
__________________________________________________________________________________________________
input_4 (InputLayer)            [(None, 30)]         0
__________________________________________________________________________________________________
repeat_vector_1 (RepeatVector)  (None, 30, 64)       0           lstm_2[0][0]
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, 30, 128)      796928      input_4[0][0]
__________________________________________________________________________________________________
concatenate_1 (Concatenate)     (None, 30, 192)      0           repeat_vector_1[0][0]
                                                                 embedding_3[0][0]
__________________________________________________________________________________________________
lstm_3 (LSTM)                   (None, 128)          164352      concatenate_1[0][0]
__________________________________________________________________________________________________
dense_1 (Dense)                 (None, 6226)         803154      lstm_3[0][0]
==================================================================================================