Вот два парсера, основанные на библиотеке lepl
для построения парсеров. Оба дают одинаковый результат.
from pprint import pprint
from lepl import AnyBut, Drop, Eos, Newline, Separator, SkipTo, Space
# Grammar for a single "name: value" line:
#     field = name , ":" , value
# `name` is one or more characters other than ':' or a newline; `value` is
# the rest of the line, repeated with step 'n' (lepl's non-greedy mode).
# The `...` in each slice joins the matched characters into one string.
name, value = AnyBut(':\n')[1:,...], AnyBut('\n')[::'n',...]

# Only `field` is built under the separator, so that `&` allows optional
# (dropped) spaces between its parts.
# NOTE(review): the extent of this block was reconstructed from a listing
# that had lost its indentation - confirm that only `field` belongs here.
with Separator(~Space()[:]):
    field = name & Drop(':') & value & ~(Newline() | Eos()) > tuple

# Header: skip ahead to the 'Chat Transcript' title and the two newlines
# after it, then collect one or more fields into a dict.
header_start = SkipTo('Chat Transcript' & Newline()[2])
header = ~header_start & field[1:] > dict

# Lines that start with '* ' carry no speaker name; label them 'Server'.
server_message = Drop('* ') & AnyBut('\n')[:,...] & ~Newline() > 'Server'
conversation = (server_message | field)[1:] > list

# Footer: the 'Visitor Details' title underlined with 15 dashes, then fields.
footer_start = 'Visitor Details' & Newline() & '-'*15 & Newline()
footer = ~footer_start & field[1:] > dict

# Whole log: header, conversation and footer, each separated by one newline.
chat_log = header & ~Newline() & conversation & ~Newline() & footer

# Open the log in a `with` block so the handle is closed once parsing ends
# (the original passed open(...) inline and never closed it).
with open('chat.log') as chat_file:
    pprint(chat_log.parse_file(chat_file))
Более строгий парсер:
from functools import reduce
from pprint import pprint

from lepl import And, Drop, Newline, Or, Regexp, SkipTo
def Field(name, value=Regexp(r'\s*(.*?)\s*?\n')):
    """Return a matcher for one ``name ":" value`` line.

    Args:
        name: Matcher (or literal string) for the text before the colon.
        value: Matcher for the text after the colon. The default regexp
            skips leading whitespace, captures the rest of the line lazily
            (so trailing whitespace is left out of the group) and consumes
            the terminating newline.

    Returns:
        The matcher ``name & Drop(':') & value`` with its results passed to
        ``tuple``; the colon itself is dropped from the results.
    """
    return name & Drop(':') & value > tuple
def Fields(names):
    """Return a matcher for a fixed sequence of named fields.

    A ``Field`` matcher is built for every entry of *names* and the matchers
    are folded together, left to right, with ``And``.

    Args:
        names: Non-empty iterable of field names (``reduce`` raises
            ``TypeError`` when it is empty, as no initial value is given).
    """
    return reduce(And, map(Field, names))
# Skip ahead to the 'Chat Transcript' title line and the two newlines after it.
header_start = SkipTo(Regexp(r'^Chat Transcript$') & Newline()[2])
# Unlike a generic "any field" grammar, the header and footer fields are
# listed explicitly by name.
header_fields = Fields("Visitor Operator Company Started Finished".split())
# A line starting with '* ' has no speaker name; its text is labelled 'Server'.
server_message = Regexp(r'^\* (.*?)\n') > 'Server'
footer_fields = Fields(("Your Name, Your Question, IP Address, "
                        "Host Name, Referrer, Browser/OS").split(', '))

with open('chat.log') as chat_file:
    # parse header to find Visitor and Operator's names
    # (the trailing comma unpacks the single dict in the result list)
    headers, = (~header_start & header_fields > dict).parse_file(chat_file)

    # only Visitor, Operator and Server may take part in the conversation
    message = reduce(Or, [Field(headers[name])
                          for name in "Visitor Operator".split()])
    conversation = (message | server_message)[1:]

    # NOTE(review): this second parse reuses the same file object and is
    # presumably expected to resume right after the header - confirm against
    # lepl's stream handling.
    messages, footers = ((conversation > list)
                         & Drop('\nVisitor Details\n---------------\n')
                         & (footer_fields > dict)).parse_file(chat_file)

pprint((headers, messages, footers))
Вывод:
({'Company': 'Initech',
'Finished': '16 Oct 2008 9:45:44',
'Operator': 'Milton',
'Started': '16 Oct 2008 9:13:58',
'Visitor': 'Random Website Visitor'},
[('Random Website Visitor',
'Where do i get the cover sheet for the TPS report?'),
('Server',
'There are no operators available at the moment. If you would like to leave a message, please type it in the input field below and click "Send" button'),
('Server',
'Call accepted by operator Milton. Currently in room: Milton, Random Website Visitor.'),
('Milton', 'Y-- Excuse me. You-- I believe you have my stapler?'),
('Random Website Visitor', 'I really just need the cover sheet, okay?'),
('Milton',
"it's not okay because if they take my stapler then I'll, I'll, I'll set the building on fire..."),
('Random Website Visitor', 'oh i found it, thanks anyway.'),
('Server',
'Random Website Visitor is now off-line and may not reply. Currently in room: Milton.'),
('Milton', "Well, Ok. But… that's the last straw."),
('Server',
'Milton has left the conversation. Currently in room: room is empty.')],
{'Browser/OS': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.2; .NET CLR 1.1.4322; InfoPath.1; .NET CLR 2.0.50727)',
'Host Name': '255.255.255.255',
'IP Address': '255.255.255.255',
'Referrer': 'Unknown',
'Your Name': 'Random Website Visitor',
'Your Question': 'Where do i get the cover sheet for the TPS report?'})