Строки файла можно прочитать двумя способами:
- Разбить каждую строку по разделителю на столько элементов, сколько в ней содержится.
- Сохранить элементы строк после разбиения в двумерном массиве numpy, дополнив короткие строки пустыми значениями (padding).
Я создал класс с необходимыми методами, чтобы сделать это в несколько строк. Таким образом, вы можете применить его как есть, не внося в него никаких изменений.
Пример
# Sample input: one record per line, comma-separated, with a varying
# number of items per line.
s = """whole milk,margarine
yogurt,brown bread,coffee
pork,yogurt,coffee
bottled water,bottled beer
whole milk
salty snack
"""
# Configure the converter: split on commas and pad short rows with ''
# so the result can be returned as a 2D numpy array.
tc = TextToColumns(file = 'source.txt',
sep = ',',
text2columns = True,
savedata = False,
usedummydata = True,
dummydata = s)
# Write the sample string to 'source.txt' so there is a file to read.
tc.make_dummydata()
# Read the file back, split every line and return the padded array.
tc.toarray()
Вывод:
array([['whole milk', 'margarine', ''],
['yogurt', 'brown bread', 'coffee'],
['pork', 'yogurt', 'coffee'],
['bottled water', 'bottled beer', ''],
['whole milk', '', ''],
['salty snack', '', '']], dtype='<U13')
Код
import numpy as np
import os
class TextToColumns(object):
    """Read delimited text from a file and convert it into columns.

    Every line of the file is split on a user-specified separator. The
    resulting rows can be returned as ragged lists, or padded with empty
    strings so that they form a rectangular 2D numpy array.

    Parameters
    ----------
    file: path to the file
    sep: (str) separator. Default: ","
    text2columns: (bool) if True, adds empty strings as a padding to create a
        2D array. Default: True
    savedata: (bool) if True, saves the data read-in after splitting with the
        separator, as a part of the object (``self.data`` and, when padding
        is applied, ``self.padded_data``). Default: False
    usedummydata: (bool) flag stored on the object; the methods of this
        class do not consult it. Default: False
    dummydata: (str) the string to use as dummy data.
        Default: ''

    Example:
        # test-data
        s = '''whole milk,margarine
        yogurt,brown bread,coffee
        pork,yogurt,coffee
        bottled water,bottled beer
        whole milk
        salty snack
        '''
        # Text-to-column transformation
        tc = TextToColumns(file = 'source.txt',
                           sep = ',',
                           text2columns = True,
                           savedata = False,
                           usedummydata = True,
                           dummydata = s)
        tc.make_dummydata()
        tc.toarray()
        # Uncomment next line to clear any dummy data created
        # tc.clear_dummydata()
    """

    def __init__(self, file,
                 sep: str = ',',
                 text2columns: bool = True,
                 savedata: bool = False,
                 usedummydata: bool = False,
                 dummydata: str = ''):
        self.file = file  # e.g. 'source.txt'
        self.sep = sep
        self.text2columns = text2columns
        self.savedata = savedata
        self.usedummydata = usedummydata
        self.dummydata = dummydata

    def __repr__(self):
        return "TextToColumns object"

    def make_dummydata(self, dummydata=''):
        """Write dummy data to ``self.file``, overwriting any existing file.

        The text written is chosen in this order: the ``dummydata``
        argument, then ``self.dummydata``, then a built-in sample. When
        ``self.dummydata`` is empty it is set to the built-in sample.
        """
        s = """whole milk,margarine
yogurt,brown bread,coffee
pork,yogurt,coffee
bottled water,bottled beer
whole milk
salty snack
"""
        if self.dummydata == '':
            self.dummydata = s
        if dummydata == '':
            dummydata = self.dummydata
        with open(self.file, 'w') as f:
            f.write(dummydata)

    def clear_dummydata(self):
        """Delete ``self.file`` if it exists; do nothing otherwise."""
        if os.path.isfile(self.file):
            os.remove(self.file)

    def readlines(self):
        """Alias for :meth:`toarray` called without arguments."""
        return self.toarray()

    def read_file(self):
        """Return the lines of ``self.file`` (line endings included).

        Raises
        ------
        ValueError: if ``self.file`` is not an existing regular file.
        """
        if not os.path.isfile(self.file):
            raise ValueError('Invalid file path.')
        with open(self.file, 'r') as f:
            return f.readlines()

    def split_lines(self, lines=None):
        """Split each line on ``self.sep`` and strip every element.

        Parameters
        ----------
        lines: iterable of str, or None to read the lines from ``self.file``.

        Returns
        -------
        list of lists of str, one inner list per non-blank line. Lines that
        are empty or whitespace-only are skipped. The longest row length is
        recorded in ``self._max_length``; when ``savedata`` is True the
        result is also stored in ``self.data``.
        """
        self._max_length = 0
        if lines is None:
            lines = self.read_file()
        data = []
        for line in lines:
            # str.split never returns an empty list, so blank lines have to
            # be filtered explicitly or they would show up as [''] rows.
            if not line.strip():
                continue
            data.append([e.strip() for e in line.split(self.sep)])
        self._max_length = max((len(row) for row in data), default=0)
        if self.savedata:
            self.data = data
        return data

    def toarray(self, data=None):
        """Return the split rows, padded into a 2D array when requested.

        Parameters
        ----------
        data: list of rows (lists of str), or None to obtain the rows from
            :meth:`split_lines`.

        Returns
        -------
        numpy.ndarray of str when ``text2columns`` is True, with every row
        right-padded with '' to the length of the longest row; otherwise
        the unpadded list of rows. When ``savedata`` is True the padded
        rows are also stored in ``self.padded_data``.
        """
        if data is None:
            data = self.split_lines()
        if not self.text2columns:
            return data
        # Measure the rows actually being padded, so that caller-supplied
        # data works even if split_lines() was never run on this object.
        width = max((len(row) for row in data), default=0)
        padded_data = [list(row) + [''] * (width - len(row)) for row in data]
        if self.savedata:
            self.padded_data = padded_data
        return np.array(padded_data)