Отлично — теперь учтём тот факт, что ваш исходный JSON-файл может не содержать символов новой строки (newline characters).
Надеюсь, что это сработает и, возможно, даже окажется более точным.
>>> string = '''{'attachment': [{'content_header': {'content-disposition': ['attachment; ''filename="image006.jpg"'],'content-id': ['<image006.jpg@01D35D21.756FEE10>'] 'body': [{'content': ' '' ''From: eCard Delivery [mailto:ecards@789greeting.com] ''Sent: Monday, November 13, 2017 9:14 AM''To: Zhang, Jerry (352A-Affiliate) ''Subject: Warmest Wishes! You have a Happy Thanksgiving ''ecard delivery!'' '' Dear Jerry,' 'header': {'date': '2017-11-14T08:20:42-08:00','header': {'accept-language': ['en-US'], 'content-language': ['en-US'], 'content-type': ['multipart/mixed; ''boundary="--boundary-LibPST-iamunique-1500317751_-_-"'], 'date': ['Tue, 14 Nov 2017 08:20:42 -0800'] 'subject': 'FW: Warmest Wishes! You have a Happy Thanksgiving ' 'ecard delivery!'}}'''
>>> subjects_test = re.findall('([\'|\"]*[\S]ubject[\S\s]+?[\'|\"]+)(?=\n|$|\s|\})', string)
>>> for subject in subjects_test:
print(subject)
#OUTPUT: #Kind of off I guess, but I don't know the full format of the file so this is the safest bet
''Subject: Warmest Wishes! You have a Happy Thanksgiving ''ecard delivery!''
'subject': 'FW: Warmest Wishes! You have a Happy Thanksgiving '
Правка: учитывая ваш комментарий ниже, используйте строку, указанную выше. Надеюсь, я правильно понимаю ваши требования. Я использую оба предоставленных мной примера регулярных выражений.
>>> string = '''{'attachment': [{'content_header': {'content-disposition': ['attachment; '
'filename="image006.jpg"'],
'content-id': ['<image006.jpg@01D35D21.756FEE10>']
'body': [{'content': ' \n'
' \n'
'From: eCard Delivery [mailto:ecards@789greeting.com] \n'
'Sent: Monday, November 13, 2017 9:14 AM\n'
'To: Zhang, Jerry (352A-Affiliate) '
'Subject: Warmest Wishes! You have a Happy Thanksgiving '
'ecard delivery!\n'
' \n'
' \tDear Jerry,\n'
'header': {'date': '2017-11-14T08:20:42-08:00',
'header': {'accept-language': ['en-US'],
'content-language': ['en-US'],
'content-type': ['multipart/mixed; '
'boundary="--boundary-LibPST-iamunique-1500317751_-_-"'],
'date': ['Tue, 14 Nov 2017 08:20:42 -0800']
'subject': 'FW: Warmest Wishes! You have a Happy Thanksgiving '
'ecard delivery!'}}'''
>>> subjects_test_1 = re.findall('([\'\"]*[S|s]ubject[:\s]*?(?:[\'|\"]*[\S\s]*?(?=[\'|\"])*))(?=\n|$)', string)
>>> for subject in subjects_test_1:
print(subject)
#OUTPUT:
'Subject: Warmest Wishes! You have a Happy Thanksgiving '
'subject': 'FW: Warmest Wishes! You have a Happy Thanksgiving '
########################################################
>>> subjects_test_2 = re.findall('([\'|\"]*[\S]ubject[\S\s]+?[\'|\"]*)(?=\n|$)', string)
>>> for subject in subjects_test_2:
print(subject)
#OUTPUT:
'Subject: Warmest Wishes! You have a Happy Thanksgiving '
'subject': 'FW: Warmest Wishes! You have a Happy Thanksgiving '
.
Или попробуйте эту функцию:
В строке, где вы вызываете функцию, замените 'PATH_TO_YOUR_FILE'
на... ну, вы знаете, путь к вашему файлу.
def email_subject_parse(file_path):
    """Return every subject fragment found in the text file at *file_path*.

    The file is read in full and scanned with a regular expression that
    matches an optionally quoted ``Subject``/``subject`` token together with
    everything after it, up to (but not including) the next newline or the
    end of the text.

    Args:
        file_path: Path of the file to read.

    Returns:
        A list of the matched strings in order of appearance. The list is
        empty when nothing matches or when the file cannot be opened or read.
    """
    import re

    # Raw string so that \s and \S reach the regex engine untouched instead
    # of being treated as (invalid) string-literal escape sequences.
    subject_pattern = r'([\'\"]*[S|s]ubject[:\s]*?(?:[\'|\"]*[\S\s]*?(?=[\'|\"])*))(?=\n|$)'
    # Or less complicated:
    # subject_pattern = r'([\'|\"]*[\S]ubject[\S\s]+?[\'|\"]*)(?=\n|$)'

    email_subjects = []
    # Only the file access is guarded: a failure here is what the message
    # below describes. Any other error is left to propagate.
    try:
        with open(file_path) as handle:
            text = handle.read()
    except OSError:
        print('You have likely provided a bad file path')
        # Return the empty list so callers can still iterate over the result.
        return email_subjects

    email_subjects = re.findall(subject_pattern, text)
    return email_subjects
>>> subjects = email_subject_parse('PATH_TO_YOUR_FILE')
>>> for subject in subjects:
print(subject)
#OUPUT:
'Subject: Warmest Wishes! You have a Happy Thanksgiving '
'subject': 'FW: Warmest Wishes! You have a Happy Thanksgiving '