Примечание: это не ответ на сам вопрос (что лучше — Example или SequenceExample, и следует ли разбивать последовательность на каналы или хранить её в виде байтовой строки).
Скорее, когда я изучал руководства, посты, видео и т. д. по TensorFlow Records, я заметил, что большинство примеров (из тех, что мне попадались) сосредоточены на построении (Sequence)Example с конкретными данными и не показывают, как сделать это более динамично. Поэтому я инкапсулировал четыре описанных выше метода преобразования данных указанного типа в (Sequence)Example.
Хотя мы по-прежнему привязаны к данным, для которых создаём (Sequence)Example, надеюсь, что тем, кто всё ещё немного озадачен форматом, это будет полезно — в дополнение к конкретным примерам выше.
Вот код, с которым можно поэкспериментировать. Обратная связь приветствуется.
Обновление
Этот код оформлен в виде пакета под названием Feature Input / Output (FIO).
Вот Colab-блокнот, демонстрирующий, как им пользоваться.
А именно, пакет вводит понятие «схемы» (schema):
# Example schema: one entry per feature name, each entry holding that
# feature's settings.
SCHEMA = {
    'my-feature': {
        'length': 'fixed',
        'dtype': tf.string,
        'shape': [],
    },
    'seq': {
        'length': 'fixed',
        'dtype': tf.int64,
        'shape': [4, 3],
        'encode': 'channels',
        'channel_names': ['A', 'B', 'C'],
        'data_format': 'channels_last',
    },
}
Схема позволяет описать данные _один раз_, а не дважды (один раз для кодирования в Example и ещё раз для извлечения из записи).
Оригинал
Настройка
import os, sys, json
sys.path.insert(0, '../')
import tensorflow as tf
import numpy as np
Несколько лёгких вспомогательных функций
def list_like_q(value) -> bool:
    '''
    Return True when <value> is already a sequence of feature values.

    tf.train.Feature requires a list of feature values, yet many features
    consist of a single value. The feature_int64 / feature_float helpers use
    this predicate to decide whether a value still has to be wrapped in a list.

    Lists, tuples and numpy.ndarrays (subclasses included) count as list-like.
    Strings, bytes and plain scalars do not.
    '''
    # isinstance (rather than an exact type match) also accepts tuples and
    # subclasses, which would otherwise be wrapped a second time.
    return isinstance(value, (list, tuple, np.ndarray))
def take_all() -> slice:
    '''Return the slice selecting every element along an axis (the same as `:`).'''
    return slice(None)
def take_channel(sequence, channel:int, data_format:str='channels_last'):
    '''
    Return the values belonging to a single channel of <sequence>.

    sequence: array supporting numpy-style indexing (e.g. numpy.ndarray)
    channel: index of the channel to extract
    data_format: 'channels_last' means the channel axis is the LAST axis,
        the same convention number_of_channels uses (shape[-1]); any other
        value is treated as channels-first (channel axis is axis 0).
    '''
    # Ellipsis stands for "all remaining axes", so the channel index always
    # lands on the axis named by <data_format>.
    if data_format == 'channels_last':
        return sequence[..., channel]
    return sequence[channel, ...]
def number_of_channels(sequence, data_format:str='channels_last') -> int:
    '''
    Return how many channels <sequence> has.

    The size of the last axis is used for 'channels_last'; for any other
    <data_format> the size of the first axis is used.
    '''
    channel_axis = -1 if data_format == 'channels_last' else 0
    return sequence.shape[channel_axis]
def feature_int64(value):
    '''
    Wrap <value> in a tf.train.Feature holding an Int64List.

    A value that is not list-like (see list_like_q) is first placed in a
    one-element list.
    '''
    values = value if list_like_q(value) else [value]
    int64_list = tf.train.Int64List(value=values)
    return tf.train.Feature(int64_list=int64_list)
def feature_float(value):
    '''
    Wrap <value> in a tf.train.Feature holding a FloatList.

    A value that is not list-like (see list_like_q) is first placed in a
    one-element list.
    '''
    values = value if list_like_q(value) else [value]
    float_list = tf.train.FloatList(value=values)
    return tf.train.Feature(float_list=float_list)
def feature_bytes(value):
    '''
    Wrap <value> in a tf.train.Feature holding a BytesList.

    Every item is converted to bytes as follows:
        - numpy.ndarray -> its raw buffer via ndarray.tobytes()
        - bytes         -> used unchanged
        - anything else -> str(item) encoded as UTF-8

    A list is treated as several values (one BytesList entry per element);
    any other value becomes a single entry.
    '''
    def _as_bytes(item):
        # ndarray.tostring() was deprecated and later removed from NumPy;
        # tobytes() produces the same raw buffer.
        if isinstance(item, np.ndarray):
            return item.tobytes()
        if isinstance(item, bytes):
            return item
        return str(item).encode('utf-8')

    # Lists must be split up BEFORE the bytes conversion, otherwise the whole
    # list would be stringified into a single entry.
    items = value if isinstance(value, list) else [value]
    return tf.train.Feature(
        bytes_list=tf.train.BytesList(value=[_as_bytes(item) for item in items])
    )
def feature_function(dtype):
    '''
    Pick the wrapper that turns a value into the tf.train.Feature matching
    <dtype>: feature_int64 for "int64", feature_float for "float" and
    feature_bytes for everything else.
    '''
    if dtype == "int64":
        return feature_int64
    if dtype == "float":
        return feature_float
    return feature_bytes
def feature_list(iterable, dtype:str='float'):
    '''
    Given an iterable, return the tf.train.FeatureList of corresponding <dtype>.

    iterable: the items to encode; each item becomes one tf.train.Feature
    dtype: 'int64', 'float' or anything else for bytes (see feature_function)
    '''
    wrap = feature_function(dtype)
    # Protobuf message constructors accept keyword arguments only, so the
    # features have to be passed through the `feature` field explicitly.
    return tf.train.FeatureList(feature=[wrap(item) for item in iterable])
# the next three for completeness
def feature_list_int64(value):
    '''
    Return a tf.train.FeatureList in which every item of <value> is wrapped
    as an Int64List feature.
    '''
    # The `feature` field expects tf.train.Feature messages, so the features
    # are built per item here instead of nesting one FeatureList in another.
    return tf.train.FeatureList(feature=[feature_int64(item) for item in value])
def feature_list_float(value):
    '''
    Return a tf.train.FeatureList in which every item of <value> is wrapped
    as a FloatList feature.
    '''
    # The `feature` field expects tf.train.Feature messages, so the features
    # are built per item here instead of nesting one FeatureList in another.
    return tf.train.FeatureList(feature=[feature_float(item) for item in value])
def feature_list_bytes(value):
    '''
    Return a tf.train.FeatureList in which every item of <value> is wrapped
    as a BytesList feature.
    '''
    # The `feature` field expects tf.train.Feature messages, so the features
    # are built per item here instead of nesting one FeatureList in another.
    return tf.train.FeatureList(feature=[feature_bytes(item) for item in value])
def dict_to_features(values:dict, types:dict) -> dict:
    '''
    Build a name -> tf.train.Feature mapping.

    Iterates over the name:dtype pairs of <types> and wraps <values>[name]
    with the feature wrapper matching that dtype. Only names listed in
    <types> are included; a name missing from <values> raises KeyError.
    '''
    features = {}
    for name, dtype in types.items():
        wrap = feature_function(dtype)
        features[name] = wrap(values[name])
    return features
def features_from_dict(values:dict, types:dict):
    '''
    Convert <values> into a tf.train.Features message, encoding each entry
    with the dtype given for it in <types> (see dict_to_features).
    '''
    feature_map = dict_to_features(values, types)
    return tf.train.Features(feature=feature_map)
def default_channel_names(sequence, data_format:str='channels_last') -> list:
    '''
    Generate fallback names ('Channel 0', 'Channel 1', ...) so that every
    channel of <sequence> has a key when it is stored as its own feature.
    '''
    count = number_of_channels(sequence, data_format)
    return [f'Channel {index}' for index in range(count)]
def channels_to_features(sequence, dtype:str='float', data_format:str='channels_last', channel_names:list=None) -> dict:
    '''
    Map every channel of <sequence> to its own tf.train.Feature.

    sequence: the data whose channels are extracted with take_channel
    dtype: encoding of each channel ('int64', 'float', otherwise bytes)
    data_format: where the channel axis is, forwarded to take_channel
    channel_names: keys of the returned dictionary, in channel order; when
        omitted, default_channel_names supplies them

    Returns a dictionary of channel-name:tf.train.Feature pairs.
    '''
    if channel_names is None:
        channel_names = default_channel_names(sequence, data_format)

    features = {}
    for index, name in enumerate(channel_names):
        channel_values = take_channel(sequence, index, data_format)
        features[name] = feature_function(dtype)(channel_values)
    return features
def channels_to_feature_list(sequence, dtype:str='float', data_format:str='channels_last'):
    '''
    Build a tf.train.FeatureList with one feature per channel of <sequence>,
    encoded as <dtype> and read according to <data_format>. The channels keep
    their order; their names are not stored.
    '''
    per_channel = channels_to_features(sequence, dtype, data_format)
    return tf.train.FeatureList(feature=list(per_channel.values()))
SequenceRecords
class SequenceRecord:
    '''
    Builds TensorFlow (Sequence)Examples from a sequence, its class
    probabilities and its metadata.

    SequenceRecord is a supporting class built on top of the functions found in
    /model/utils/features.py. It takes data consisting of:
        - a sequence of length n,
        - n class probability vectors (referred to as pclasses), and
        - metadata (name of sequence, start site, stop site, etc)
    and converts it into a TensorFlow (Sequence)Example which can
    subsequently be written as a TensorFlow Record.

    For both Example and SequenceExample options, the channels / classes of the
    sequence / pclasses can be stored as numeric features (int64 / float) or as
    a byte string. For each of these options, the encoding can be done per
    channel / class, or for the entire sequence / pclasses matrix.

    Overwrite the following class variables to suit your needs:

    _class_var            || description
    ---------------------------------------------------------------------------
    _metadata_types:dict  || a dictionary of <feature-name>:<dtype> pairs which
                          || is referred to when the metadata is converted into
                          || tf.train.Feature (only 'int64', 'float', 'bytes' are
                          || supported for <dtype>)
    _sequence_data_format || a string specifying where the channels are (by
                          || default, this is set to 'channels_last')
    _pclasses_data_format || a string specifying where the channels are (by
                          || default, this is set to 'channels_last')
    _sequence_data_type   || a string specifying what dtype channels should be
                          || encoded as (by default 'int64')
    _pclasses_data_type   || a string specifying what dtype channels should be
                          || encoded as (by default 'float')
    _channel_names        || a list of strings specifying the name and order
                          || channels appear in <sequence> (by default set to
                          || None)
    _classes_names        || a list of strings specifying the name and order
                          || classes appear as channels in <pclasses> (by default
                          || set to None)
    '''
    # name -> dtype of every metadata entry that gets encoded
    _metadata_types = {}
    # position of the channel axis in <sequence> / <pclasses>
    _sequence_data_format = 'channels_last'
    _pclasses_data_format = 'channels_last'
    # numeric dtype used by the per-channel ('channels') encodings
    _sequence_data_type = 'int64'
    _pclasses_data_type = 'float'
    # feature names per channel / class; None falls back to generated names
    _channel_names = None
    _classes_names = None

    def make_example(self, sequence, pclasses, metadata:dict=None, form:str='example', by:str='channels'):
        '''
        The core function of SequenceRecord. Given <sequence>, <pclasses> and
        <metadata>, converts them to the corresponding <form>, encoded <by>
        the specified scheme.

        metadata: dictionary with (at least) the keys of _metadata_types;
                  omitted / None is treated as an empty dictionary.
        form: 'example' (default) yields a tf.train.Example; any other value
              (documented: 'sequence') yields a tf.train.SequenceExample.
        by:   'channels' (default) encodes each channel / class numerically,
              'bdstring' dumps the entire numpy.ndarray as one byte string,
              and any other value (documented: 'bstrings') encodes each
              channel / class as its own byte string.
        '''
        # A None default avoids sharing one mutable dict between calls.
        if metadata is None:
            metadata = {}
        wrap = self.example if form == 'example' else self.sequence_example
        return wrap(sequence, pclasses, metadata, by)

    def example(self, sequence, pclasses, metadata, by='channels'):
        '''Builds a tf.train.Example using the encoding selected with <by>.'''
        if by == 'channels':
            wrap = self.example_as_channels
        elif by == 'bdstring':
            wrap = self.example_as_bdstring
        else:
            wrap = self.example_as_bstrings
        return wrap(sequence, pclasses, metadata)

    def sequence_example(self, sequence, pclasses, metadata, by='channels'):
        '''Builds a tf.train.SequenceExample using the encoding selected with <by>.'''
        if by == 'channels':
            wrap = self.sequence_example_as_channels
        elif by == 'bdstring':
            wrap = self.sequence_example_as_bdstring
        else:
            wrap = self.sequence_example_as_bstrings
        return wrap(sequence, pclasses, metadata)

    def example_as_channels(self, sequence, pclasses, metadata):
        '''
        Encodes each channel (or class) as its own feature with the specified
        dtype (e.g. _sequence_data_type) and wraps everything in a
        tf.train.Example.

        Metadata, channel and class features share one flat dictionary, so on
        a name clash the later entry wins (metadata < channels < classes).
        '''
        features = {
            **dict_to_features(metadata, self._metadata_types),
            **channels_to_features(sequence, self._sequence_data_type, self._sequence_data_format, self._channel_names),
            **channels_to_features(pclasses, self._pclasses_data_type, self._pclasses_data_format, self._classes_names),
        }
        return tf.train.Example(features=tf.train.Features(feature=features))

    def example_as_bstrings(self, sequence, pclasses, metadata):
        '''
        Encodes each channel (or class) as its own feature, but stores every
        channel as a byte string (via feature_bytes) and wraps everything in a
        tf.train.Example.
        '''
        features = {
            **dict_to_features(metadata, self._metadata_types),
            **channels_to_features(sequence, 'bytes', self._sequence_data_format, self._channel_names),
            **channels_to_features(pclasses, 'bytes', self._pclasses_data_format, self._classes_names),
        }
        return tf.train.Example(features=tf.train.Features(feature=features))

    def example_as_bdstring(self, sequence, pclasses, metadata):
        '''
        Encodes the whole sequence and the whole pclasses matrix as one byte
        'dump' string each (via feature_bytes), stored under the keys
        'sequence' and 'pclasses' of a tf.train.Example.
        '''
        features = {
            **dict_to_features(metadata, self._metadata_types),
            'sequence': feature_bytes(sequence),
            'pclasses': feature_bytes(pclasses)
        }
        return tf.train.Example(features=tf.train.Features(feature=features))

    def sequence_example_as_channels(self, sequence, pclasses, metadata):
        '''
        Encodes each channel (or class) as its own feature with the specified
        dtype (e.g. _sequence_data_type) and wraps everything in a
        tf.train.SequenceExample: metadata becomes the context, the channels
        become the 'sequence' / 'pclasses' feature lists (channel names are
        not stored, only the channel order).
        '''
        context = features_from_dict(metadata, self._metadata_types)
        feat_list = tf.train.FeatureLists(feature_list={
            'sequence': channels_to_feature_list(sequence, self._sequence_data_type, self._sequence_data_format),
            'pclasses': channels_to_feature_list(pclasses, self._pclasses_data_type, self._pclasses_data_format)
        })
        return tf.train.SequenceExample(context=context, feature_lists=feat_list)

    def sequence_example_as_bstrings(self, sequence, pclasses, metadata):
        '''
        Encodes each channel (or class) as its own feature, but stores every
        channel as a byte string (via feature_bytes) and wraps everything in a
        tf.train.SequenceExample.
        '''
        context = features_from_dict(metadata, self._metadata_types)
        feat_list = tf.train.FeatureLists(feature_list={
            'sequence': channels_to_feature_list(sequence, 'bytes', self._sequence_data_format),
            'pclasses': channels_to_feature_list(pclasses, 'bytes', self._pclasses_data_format)
        })
        return tf.train.SequenceExample(context=context, feature_lists=feat_list)

    def sequence_example_as_bdstring(self, sequence, pclasses, metadata):
        '''
        Encodes the whole sequence and the whole pclasses matrix as one byte
        'dump' string each (via feature_bytes); each dump is the single
        element of its feature list in a tf.train.SequenceExample.
        '''
        context = features_from_dict(metadata, self._metadata_types)
        feat_list = tf.train.FeatureLists(feature_list={
            'sequence': tf.train.FeatureList(feature=[feature_bytes(sequence)]),
            'pclasses': tf.train.FeatureList(feature=[feature_bytes(pclasses)])
        })
        return tf.train.SequenceExample(context=context, feature_lists=feat_list)

    def write(self, example, to:str):
        '''
        After calling the corresponding method to construct a (Sequence)Example,
        writes the passed (Sequence)Example to the specified location
        (full path name).

        A new writer is opened for <to> on every call and exactly one
        serialized record is written to it.
        '''
        # tf.python_io is not available in TensorFlow 2.x; prefer tf.io and
        # fall back to the old location for releases that lack it.
        try:
            writer_factory = tf.io.TFRecordWriter
        except AttributeError:
            writer_factory = tf.python_io.TFRecordWriter
        with writer_factory(to) as writer:
            writer.write(example.SerializeToString())
Фиктивные данные
# Toy integer data, shape (2, 3, 3): two sequences, each with three rows
# (labelled channels 1-3) holding three elements (el1, el2, el3).
sequences = np.array([
    [  # sequence 1
        [1, 1, 1],     # channel 1
        [2, 2, 2],     # channel 2
        [3, 3, 3],     # channel 3
    ],
    [  # sequence 2
        [10, 10, 10],  # channel 1
        [20, 20, 20],  # channel 2
        [30, 30, 30],  # channel 3
    ],
])

# Class probabilities, shape (2, 3, 3): one row per sequence element, one
# column per class (cls1, cls2, cls3).
pclasses = np.array([
    [  # sequence 1
        [0.0, 0.9, 0.1],  # element 1
        [0.0, 0.1, 0.9],  # element 2
        [0.8, 0.1, 0.1],  # element 3
    ],
    [  # sequence 2 (the rows of sequence 1 in reverse order)
        [0.8, 0.1, 0.1],
        [0.0, 0.1, 0.9],
        [0.0, 0.9, 0.1],
    ],
])

# One metadata dictionary per sequence.
metadata = [
    {'Name': 'sequence 1', 'Val_1': 100, 'Val_2': 10},
    {'Name': 'sequence 2', 'Val_1': 10, 'Val_2': 100},
]

# dtype used to encode each metadata entry.
metatypes = {
    'Name': 'bytes',
    'Val_1': 'float',
    'Val_2': 'float',
}
Инициализация и запуск
# Class-level configuration: applies to every SequenceRecord instance.
SequenceRecord._metadata_types = metatypes
SequenceRecord._classes_names = ['Class A', 'Class B', 'Class C']
SequenceRecord._channel_names = ['Channel 1', 'Channel 2', 'Channel 3']

SR = SequenceRecord()

# tf.train.Example, one call per encoding.
SR.make_example(sequences[0], pclasses[0], metadata[0], form='example', by='channels')
SR.make_example(sequences[0], pclasses[0], metadata[0], form='example', by='bstrings')
SR.make_example(sequences[0], pclasses[0], metadata[0], form='example', by='bdstring')

# tf.train.SequenceExample, one call per encoding.
SR.make_example(sequences[0], pclasses[0], metadata[0], form='sequence', by='channels')
SR.make_example(sequences[0], pclasses[0], metadata[0], form='sequence', by='bstrings')
SR.make_example(sequences[0], pclasses[0], metadata[0], form='sequence', by='bdstring')