Итак, у меня есть небольшая проблема. У меня есть программа, которая должна вычислять длину словарного запаса, учитывая список вопросов и список ответов.
Вот код, который вычисляет длину:
def lenVocabulary(questions,answers):
# This section returns each question and answer as a list of words.
questions_as_word_sequences = list(map(keras.preprocessing.text.text_to_word_sequence,questions))
answers_as_word_sequences = list(map(keras.preprocessing.text.text_to_word_sequence,answers))
#print(questions_as_word_sequences[0])
#print(answers_as_word_sequences[0])
from numpy import array,column_stack
#print(questions[0:3])
print("Question as word sequence: ", questions_as_word_sequences[0])
questions_as_word_sequences = array(questions_as_word_sequences)
answers_as_word_sequences = array(answers_as_word_sequences)
# This section should return a numpy array with only two axes, but it returns one with three instead. Why?
vocabulary = column_stack((questions_as_word_sequences,answers_as_word_sequences))
vocabulary = set(vocabulary.flatten())
#print(vocabulary)
lenVocabulary = len(vocabulary)
return lenVocabulary
Теперь вышеприведенный код хорошо работает, когда я жестко кодирую вопросы и ответы:
def lenVocabulary(questions,answers):
# This section returns each question and answer as a list of words.
questions_as_word_sequences = list(map(keras.preprocessing.text.text_to_word_sequence,questions))
answers_as_word_sequences = list(map(keras.preprocessing.text.text_to_word_sequence,answers))
#print(questions_as_word_sequences[0])
#print(answers_as_word_sequences[0])
from numpy import array,column_stack
#print(questions[0:3])
print("Question as word sequence: ", questions_as_word_sequences[0])
questions_as_word_sequences = array(questions_as_word_sequences)
answers_as_word_sequences = array(answers_as_word_sequences)
# This section should return a numpy array with only two axes, but it returns one with three instead. Why?
vocabulary = column_stack((questions_as_word_sequences,answers_as_word_sequences))
vocabulary = set(vocabulary.flatten())
#print(vocabulary)
lenVocabulary = len(vocabulary)
return lenVocabulary
Выход:
Montanas-MacBook-Pro:~ montana$ /var/folders/qp/1n_8zftx3c799d_h5wqj1_4c0000gn/T/com.barebones.bbedit-558742823.535-python3.sh ; exit;
Using TensorFlow backend.
Question as word sequence: ['do', 'you', 'sell', 'seashells', 'by', 'the', 'seashore']
12
Однако, когда я тестирую его в своей основной программе, используя реальные данные, я получаю следующее:
Вывод основной программы:
Parsing data.
Saint Bernadette Soubirous
Using TensorFlow backend.
Question as word sequence: ['to', 'whom', 'did', 'the', 'virgin', 'mary', 'allegedly', 'appear', 'in', '1858', 'in', 'lourdes', 'france']
Основной программный код:
import json
import sys
import time
#print("Question n-grams: " + str(questionNgrams))
from collections import Counter
# Get N-gram vocabulary.
# Iterate through all questions and answers, pulling out our inputs
from string import punctuation
file = open("train-v1.1.json")
json = json.loads(file.read().replace('\n', ''))
data = json["data"]
questions = []
answers = []
print("Parsing data.")
import string
def isEnglish(s):
try:
s.encode(encoding='utf-8').decode('ascii')
except UnicodeDecodeError:
return False
else:
return True
for article in data:
# Articles
for paragraph in article["paragraphs"]:
# Paragraphs
for qas in paragraph["qas"]:
# Questions/Answers
# Remove all punctuation and non-English characters
question = qas["question"]
#question = "".join([string for string in question if isEnglish(string)])
# Remove common words that probably won't affect the accuracy of the end result.
#stop = ["The ","the ","A ","a "]
#for word in stop:
#question = question.replace(word,"")
answer = qas["answers"][0]["text"]
#answer = " ".join([string for string in answer if string not in punctuation])
if (answer != ""):
questions.append(question)
answers.append(answer)
# By now, we have a list of questions and answers, each of which is a string.
print(answers[0])
#print(questions_as_word_sequences[0])
#print(answers_as_word_sequences[0])
import skipgramModule
lengthVocabulary = skipgramModule.lenVocabulary(questions,answers)
Проблема заключается в том, что словарный массив имеет оси 3 вместо 2. Размеры и формы последовательностей вопросов / ответов в виде слов такие же, как и в форматах массива вопросов / ответов. , поэтому я не знаю, что может вызвать эту ошибку.