I am trying to build a classification with a pre-made estimator to predict whether an article will be sold or not.
I tried to use LinearClassifier, because I am new to TensorFlow and Python.
I have a dataset with price, category and size, which fits numeric and categorical feature columns perfectly. But I also have the article description, only 3-6 words per article and about 6500 distinct words according to my analysis. I tried to use a shared embedding with one categorical column per word, but that did not work. And when I add all 6500 columns directly to the model, it is very slow.
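For reference, here is roughly what the shared-embedding attempt looked like (a minimal sketch; the toy vocabulary and the per-position feature names word_0 ... word_8 are placeholders, not my real data):

import tensorflow as tf

# Toy vocabulary for illustration only; my real one has about 6500 words.
vocab = ['kleid', 'jacke', 'rosa', 'blau', 'gepunktet']

# One categorical column per word position, all sharing a single embedding table.
word_cols = [
    tf.feature_column.categorical_column_with_vocabulary_list(
        key='word_%d' % i, vocabulary_list=vocab, num_oov_buckets=1)
    for i in range(9)
]
shared_emb_cols = tf.feature_column.shared_embeddings(word_cols, dimension=8)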
What is the best and simplest way to handle the description? Ideally with example code. Word order does not matter, but, for example, an article from a brand will sell better than a no-name one.
Many thanks for your answers.
Edit: I tried the approach from this post: Tensorflow Pad Sequence Feature Column
But now my problem is that tf.data.Dataset.from_tensor_slices((dict(dataframe), labels)) does not work.
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import clear_output
from six.moves import urllib
import tensorflow.compat.v2.feature_column as fc
from sklearn.feature_extraction.text import CountVectorizer
import tensorflow_hub as hub
from sklearn.model_selection import train_test_split
from tensorflow.python.framework.ops import disable_eager_execution
import itertools
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import text_to_word_sequence
dfall = pd.read_csv('./articles.csv')
# Build vocabulary
vocab_size = 6203
oov_tok = '<OOV>'
sentences = dfall['description'].to_list()
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(sentences)
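# word_index maps each word to a 1-based integer id; the oov_token gets id 1,
# e.g. {'<OOV>': 1, 'kleid': 2, ...}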
word_index = tokenizer.word_index
# if word_index is shorter than the default vocab_size, keep the actual size
vocab_size=len(word_index)
print("vocab_size = word_index = ",len(word_index))
# Split sentences into tokens; here one token = one word.
# text_to_word_sequence() has a good default filter for
# characters, including basic punctuation, tabs, and newlines.
dfall['description'] = dfall['description'].apply(text_to_word_sequence)
max_length = 9
# pad and truncate sentences
# do that directly with strings, without using tokenizer.texts_to_sequences();
# the feature_column will convert the strings into numbers.
# Appending N empty strings and slicing to N handles both padding and truncation.
dfall['description'] = dfall['description'].apply(lambda x, N=max_length: (x + N * [''])[:N])
#dfall['description']=dfall['description'].apply(np.asarray)
dfall.head()
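# Sanity check (added for illustration): every description should now be a
# list of exactly max_length strings; ragged lists are what later breaks
# tf.data.Dataset.from_tensor_slices.
assert dfall['description'].map(len).eq(max_length).all()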
# Define a helper to create a tf.data dataset from a pandas DataFrame
def df_to_dataset(dataframe, label_column, shuffle=True, batch_size=32):
    dataframe = dataframe.copy()
    #labels = dataframe.pop(label_column)
    labels = dataframe[label_column]
    ds = tf.data.Dataset.from_tensor_slices((dict(dataframe), labels))
    if shuffle:
        ds = ds.shuffle(buffer_size=len(dataframe))
    ds = ds.batch(batch_size)
    return ds
# Split dataframe into train and validation sets
train_df, val_df = train_test_split(dfall, test_size=0.2)
print(len(train_df), 'train examples')
print(len(val_df), 'validation examples')
batch_size = 32
ds = df_to_dataset(dfall, 'sold', shuffle=False, batch_size=batch_size)
train_ds = df_to_dataset(train_df, 'sold', shuffle=False, batch_size=batch_size)
val_ds = df_to_dataset(val_df, 'sold', shuffle=False, batch_size=batch_size)
# and a small batch for the demos
example_batch = next(iter(ds))[0]
example_batch
# Helper methods to print example outputs for a given feature_column
def demo(feature_column):
    feature_layer = tf.keras.layers.DenseFeatures(feature_column)
    print(feature_layer(example_batch).numpy())

def seqdemo(feature_column):
    sequence_feature_layer = tf.keras.experimental.SequenceFeatures(feature_column)
    print(sequence_feature_layer(example_batch))
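For reference, this is how I planned to inspect the description column once the dataset builds (a sketch; the embedding dimension of 8 is arbitrary):

description_ids = tf.feature_column.sequence_categorical_column_with_vocabulary_list(
    'description', vocabulary_list=list(word_index.keys()), num_oov_buckets=1)
seqdemo(tf.feature_column.embedding_column(description_ids, dimension=8))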
Output of dfall.head():
sold description category_id size_id gender price host_id lat long year month
0 1 [dünne, jacke, gepunktet, , , , , , ] 9 25 f 3.5 1 48.21534 11.29949 2019 3
1 1 [kleid, pudel, dunkelblau, gepunktet, , , , , ] 9 25 f 4.0 1 48.21534 11.29949 2019 3
2 0 [kleid, rosa, hum, hund, katze, , , , ] 9 24 f 4.0 1 48.21534 11.29949 2019 3
3 1 [kleid, hum, blau, elsa, und, anna, , , ] 9 24 f 4.0 1 48.21534 11.29949 2019 3
4 0 [kleid, blue, seven, lachsfarben, , , , , ] 9 23 f 4.5 1 48.21534 11.29949 2019 3
The result:
vocab_size = word_index = 6203
12482 train examples
3121 validation examples
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
c:\users\nibur\appdata\local\programs\python\python38\lib\site-packages\tensorflow\python\data\util\structure.py in normalize_element(element)
92 try:
---> 93 spec = type_spec_from_value(t, use_fallback=False)
94 except TypeError:
c:\users\nibur\appdata\local\programs\python\python38\lib\site-packages\tensorflow\python\data\util\structure.py in type_spec_from_value(element, use_fallback)
464
--> 465 raise TypeError("Could not build a TypeSpec for %r with type %s" %
466 (element, type(element).__name__))
TypeError: Could not build a TypeSpec for 0 [dünne, jacke, gepunktet, , , , , , ]
1 [kleid, pudel, dunkelblau, gepunktet, , , , , ]
2 [kleid, rosa, hum, hund, katze, , , , ]
3 [kleid, hum, blau, elsa, und, anna, , , ]
4 [kleid, blue, seven, lachsfarben, , , , , ]
...
15598 [gartenschuhe, pink, , , , , , , ]
15599 [sandalen, grau, blume, superfit, , , , , ]
15600 [turnschuhe, converse, grau, , , , , , ]
15601 [strickjacke, rosa, , , , , , , ]
15602 [bikinihose, schmetterling, , , , , , , ]
Name: description, Length: 15603, dtype: object with type Series
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
<ipython-input-1-420304a651bd> in <module>
71
72 batch_size = 32
---> 73 ds = df_to_dataset(dfall, 'sold',shuffle=False,batch_size=batch_size)
74
75 train_ds = df_to_dataset(train_df, 'sold', shuffle=False, batch_size=batch_size)
<ipython-input-1-420304a651bd> in df_to_dataset(dataframe, label_column, shuffle, batch_size)
58 labels = dataframe[label_column]
59
---> 60 ds = tf.data.Dataset.from_tensor_slices((dict(dataframe), labels))
61 if shuffle:
62 ds = ds.shuffle(buffer_size=len(dataframe))
c:\users\nibur\appdata\local\programs\python\python38\lib\site-packages\tensorflow\python\data\ops\dataset_ops.py in from_tensor_slices(tensors)
638 Dataset: A `Dataset`.
639 """
--> 640 return TensorSliceDataset(tensors)
641
642 class _GeneratorState(object):
c:\users\nibur\appdata\local\programs\python\python38\lib\site-packages\tensorflow\python\data\ops\dataset_ops.py in __init__(self, element)
2856 def __init__(self, element):
2857 """See `Dataset.from_tensor_slices()` for details."""
-> 2858 element = structure.normalize_element(element)
2859 batched_spec = structure.type_spec_from_value(element)
2860 self._tensors = structure.to_batched_tensor_list(batched_spec, element)
c:\users\nibur\appdata\local\programs\python\python38\lib\site-packages\tensorflow\python\data\util\structure.py in normalize_element(element)
96 # the value. As a fallback try converting the value to a tensor.
97 normalized_components.append(
---> 98 ops.convert_to_tensor(t, name="component_%d" % i))
99 else:
100 if isinstance(spec, sparse_tensor.SparseTensorSpec):
c:\users\nibur\appdata\local\programs\python\python38\lib\site-packages\tensorflow\python\framework\ops.py in convert_to_tensor(value, dtype, name, as_ref, preferred_dtype, dtype_hint, ctx, accepted_result_types)
1339
1340 if ret is None:
-> 1341 ret = conversion_func(value, dtype=dtype, name=name, as_ref=as_ref)
1342
1343 if ret is NotImplemented:
c:\users\nibur\appdata\local\programs\python\python38\lib\site-packages\tensorflow\python\framework\constant_op.py in _constant_tensor_conversion_function(v, dtype, name, as_ref)
319 as_ref=False):
320 _ = as_ref
--> 321 return constant(v, dtype=dtype, name=name)
322
323
c:\users\nibur\appdata\local\programs\python\python38\lib\site-packages\tensorflow\python\framework\constant_op.py in constant(value, dtype, shape, name)
259 ValueError: if called on a symbolic tensor.
260 """
--> 261 return _constant_impl(value, dtype, shape, name, verify_shape=False,
262 allow_broadcast=True)
263
c:\users\nibur\appdata\local\programs\python\python38\lib\site-packages\tensorflow\python\framework\constant_op.py in _constant_impl(value, dtype, shape, name, verify_shape, allow_broadcast)
268 ctx = context.context()
269 if ctx.executing_eagerly():
--> 270 t = convert_to_eager_tensor(value, ctx, dtype)
271 if shape is None:
272 return t
c:\users\nibur\appdata\local\programs\python\python38\lib\site-packages\tensorflow\python\framework\constant_op.py in convert_to_eager_tensor(value, ctx, dtype)
94 dtype = dtypes.as_dtype(dtype).as_datatype_enum
95 ctx.ensure_initialized()
---> 96 return ops.EagerTensor(value, ctx.device_name, dtype)
97
98
ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type list).
I already tried dfall['description'] = dfall['description'].apply(np.asarray), but then I get
ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type numpy.ndarray).
For everyone who has the same problem, the solution is
tf.data.Dataset.from_tensor_slices((dataframe.to_dict(orient='list'), labels))
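Here is a minimal sketch of the fixed helper (popping the label out of the features is my choice here; otherwise it matches the code above):

def df_to_dataset(dataframe, label_column, shuffle=True, batch_size=32):
    dataframe = dataframe.copy()
    labels = dataframe.pop(label_column)
    # to_dict(orient='list') yields {column: plain Python list}, so the
    # list-valued description column becomes a list of equal-length lists,
    # which from_tensor_slices can convert to a dense string tensor.
    # A pandas Series of lists cannot be converted directly.
    ds = tf.data.Dataset.from_tensor_slices(
        (dataframe.to_dict(orient='list'), labels))
    if shuffle:
        ds = ds.shuffle(buffer_size=len(dataframe))
    return ds.batch(batch_size)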