Я новичок в Python. Написал функцию для указания пакета слов.
DICT_SIZE = 5000
WORDS_TO_INDEX = words_counts
"""INDEX_TO_WORDS = ####### YOUR CODE HERE #######"""
ALL_WORDS = WORDS_TO_INDEX.keys()
Это функция:
def my_bag_of_words(text, words_to_index, dict_size):
"""
text: a string
dict_size: size of the dictionary
return a vector which is a bag-of-words representation of 'text'
"""
result_vector = np.zeros(dict_size)
sentence_tokens = nltk.word_tokenize(text)
attributes = []
for i, k in words_to_index.items():
if k<dict_size:
attributes.append(i)
for i in attributes:
for k in sentence_tokens:
if i==k:
result_vector[attributes.index(i)]=+1
return result_vector
Я попытался проверить функцию, и она тоже работает
def test_my_bag_of_words():
words_to_index = {'hi': 0, 'you': 1, 'me': 2, 'are': 3}
examples = ['hi how are you']
answers = [[1, 1, 0, 1]]
for ex, ans in zip(examples, answers):
if (my_bag_of_words(ex, words_to_index, 4) != ans).any():
print(my_bag_of_words(ex, words_to_index, 4))
return "Wrong answer for the case: '%s'" % ex
return 'Basic tests are passed.'
print(test_my_bag_of_words())
Basic tests are passed.
После того, как я хочу применить его ко всему тексту в наборе данных
X_train_mybag = sp_sparse.vstack([sp_sparse.csr_matrix(my_bag_of_words(text, WORDS_TO_INDEX, DICT_SIZE)) for text in X_train])
X_val_mybag = sp_sparse.vstack([sp_sparse.csr_matrix(my_bag_of_words(text, WORDS_TO_INDEX, DICT_SIZE)) for text in X_val])
X_test_mybag = sp_sparse.vstack([sp_sparse.csr_matrix(my_bag_of_words(text, WORDS_TO_INDEX, DICT_SIZE)) for text in X_test])
print('X_train shape ', X_train_mybag.shape)
print('X_val shape ', X_val_mybag.shape)
print('X_test shape ', X_test_mybag.shape)
И в этом случае появляется ошибка:
IndexError Traceback (most recent call last)
<ipython-input-30-364e76658e6f> in <module>()
----> 1 X_train_mybag = sp_sparse.vstack([sp_sparse.csr_matrix(my_bag_of_words(text, WORDS_TO_INDEX, DICT_SIZE)) for text in X_train])
2 X_val_mybag = sp_sparse.vstack([sp_sparse.csr_matrix(my_bag_of_words(text, WORDS_TO_INDEX, DICT_SIZE)) for text in X_val])
3 X_test_mybag = sp_sparse.vstack([sp_sparse.csr_matrix(my_bag_of_words(text, WORDS_TO_INDEX, DICT_SIZE)) for text in X_test])
4 print('X_train shape ', X_train_mybag.shape)
5 print('X_val shape ', X_val_mybag.shape)
1 frames
<ipython-input-25-814e004d61c2> in my_bag_of_words(text, words_to_index, dict_size)
20 for k in sentence_tokens:
21 if i==k:
---> 22 result_vector[attributes.index(i)]=+1
23 return result_vector
IndexError: index 5000 is out of bounds for axis 0 with size 5000
Может кто-нибудь помочь мне понять, какую ошибку я сделалв коде в функции my_bag_of_words, пожалуйста?