Правка:
import glob
import functools
import operator

# Header keys we extract; any other line is treated as subject/body text.
targeted_fields = ['Newsgroup', 'document_id', 'From', 'Subject']
_article_list = []
_final_dict_list = []

# `path` is assumed to be defined earlier; the '\\*.txt' pattern is
# Windows-specific and kept as-is to preserve the original behaviour.
files = glob.glob(path + '\\*.txt')

# Iterate over the files, splitting each line once on ':' so that header
# lines become [key, value] pairs and plain text lines stay 1-element lists.
for fle in files:
    _tmp_subject_list = []
    _tmp_header_list = []
    with open(fle) as f:
        data = [x.strip("\n").split(':', 1) for x in f.readlines()]
    # BUG FIX: the original detected the last line with data.index(each),
    # which returns the FIRST occurrence of a duplicated line (flushing an
    # article too early) and costs O(n) per iteration; enumerate() is both
    # correct and O(1).
    for idx, each in enumerate(data):
        if each[0] == targeted_fields[0]:
            # A new 'Newsgroup' header starts the next article: flush the
            # one accumulated so far (the very first flush is an empty
            # group and is filtered out below).
            _article_list.append(
                [*_tmp_header_list,
                 functools.reduce(operator.concat, _tmp_subject_list, [])])
            _tmp_subject_list = []
            _tmp_header_list = [each]
        elif each[0] in targeted_fields[1:3]:
            # Only 'document_id' and 'From' here; 'Subject' deliberately
            # falls through to the subject list so that wrapped subject
            # lines are concatenated with it.
            _tmp_header_list.append(each)
        else:
            _tmp_subject_list.append(each)
        if idx == len(data) - 1:
            # End of file: flush the final article.
            _article_list.append(
                [*_tmp_header_list,
                 functools.reduce(operator.concat, _tmp_subject_list, [])])

_article_list = [x for x in _article_list if len(x) > 1]  # Removing empty lines
for x in _article_list:
    _final_dict_list.append({y[0]: ' '.join(y[1:]) for y in x})
Следующий подход работает, даже если каждый файл содержит больше статей:
import glob

# Header keys that together make up one complete article record.
targeted_fields = ['Newsgroup', 'document_id', 'From', 'Subject']
_final_list = []

# `path` is assumed to be defined earlier in the session.
files = glob.glob(path + '\\*.txt')

# Iterate over the files; each line is split once on ':' so header lines
# become [key, value] pairs.
for fle in files:
    with open(fle) as f:
        data = [x.split(':', 1) for x in f.readlines()]
    _temp_list = []
    for each in data:
        # NOTE: the original also tested `each != ''`, but `each` is always
        # a non-empty list here, so that comparison was dead code.
        if each[0] in targeted_fields:
            _temp_list.append(each)
        # A full set of targeted fields means one complete article:
        # emit it as a dict and start collecting the next one.
        # (The original wrote this as `len(_temp_list) // len(targeted_fields)`,
        # which is equivalent but needlessly obscure.)
        if len(_temp_list) == len(targeted_fields):
            _final_list.append({x[0]: x[1].strip("\n") for x in _temp_list})
            _temp_list = []

# Result: a list of dicts, one per article, across all files.
_final_list
будет списком словарей; пример формата (используются 2 статьи в 2 файлах, следовательно, получается 4 результата):
[ { 'From': ' et@teal.csn.org (Eric H. Taylor)',
'Newsgroup': ' sci.space',
'Subject': ' Re: Gravity waves, was: Predicting gravity wave quantization '
'& Cosmic Noise',
'document_id': ' 59497'},
{ 'From': ' et@teal.csn.org (Eric H. Taylor)2',
'Newsgroup': ' sci.space2',
'Subject': ' Re: Gravity waves, was: Predicting gravity wave quantization '
'& Cosmic Noise2',
'document_id': ' 594972'},
{ 'From': ' et@teal.csn.org (Eric H. Taylor)',
'Newsgroup': ' sci.space',
'Subject': ' Re: Gravity waves, was: Predicting gravity wave quantization '
'& Cosmic Noise',
'document_id': ' 59497'},
{ 'From': ' et@teal.csn.org (Eric H. Taylor)2',
'Newsgroup': ' sci.space2',
'Subject': ' Re: Gravity waves, was: Predicting gravity wave quantization '
'& Cosmic Noise2',
'document_id': ' 594972'}]
Для преобразования окончательных результатов в json:
import json
# Serialise the list of article dicts to a JSON string.
# `_final_list` comes from the parsing step above.
data = json.dumps(_final_list)
Json вывод:
[{"Newsgroup": " sci.space", "document_id": " 59497", "From": " et@teal.csn.org (Eric H. Taylor)", "Subject": " Re: Gravity waves, was: Predicting gravity wave quantization & Cosmic Noise"}, {"Newsgroup": " sci.space2", "document_id": " 594972", "From": " et@teal.csn.org (Eric H. Taylor)2", "Subject": " Re: Gravity waves, was: Predicting gravity wave quantization & Cosmic Noise2"}, {"Newsgroup": " sci.space", "document_id": " 59497", "From": " et@teal.csn.org (Eric H. Taylor)", "Subject": " Re: Gravity waves, was: Predicting gravity wave quantization & Cosmic Noise"}, {"Newsgroup": " sci.space2", "document_id": " 594972", "From": " et@teal.csn.org (Eric H. Taylor)2", "Subject": " Re: Gravity waves, was: Predicting gravity wave quantization & Cosmic Noise2"}]
Для преобразования в CSV:
import csv

# Column order for the CSV header is taken from the first record; all
# records share the same keys.
keys = _final_list[0].keys()

# newline='' is required by the csv module documentation; without it each
# row is followed by an extra blank line on Windows.
with open('people.csv', 'w', newline='') as output_file:
    dict_writer = csv.DictWriter(output_file, keys)
    dict_writer.writeheader()
    dict_writer.writerows(_final_list)
Csv вывод:
Newsgroup,document_id,From,Subject
sci.space, 59497, et@teal.csn.org (Eric H. Taylor)," Re: Gravity waves, was: Predicting gravity wave quantization & Cosmic Noise"
sci.space2, 594972, et@teal.csn.org (Eric H. Taylor)2," Re: Gravity waves, was: Predicting gravity wave quantization & Cosmic Noise2"
sci.space, 59497, et@teal.csn.org (Eric H. Taylor)," Re: Gravity waves, was: Predicting gravity wave quantization & Cosmic Noise"
sci.space2, 594972, et@teal.csn.org (Eric H. Taylor)2," Re: Gravity waves, was: Predicting gravity wave quantization & Cosmic Noise2"