Я следую этому руководству (https://towardsdatascience.com/build-your-own-whatsapp-chat-analyzer-9590acca9014), чтобы создать анализатор WhatsApp.
Это весь код из его урока -
def startsWithDateTime(s):
pattern = '^([0-2][0-9]|(3)[0-1])(\/)(((0)[0-9])|((1)[0-2]))(\/)(\d{2}|\d{4}), ([0-9][0-9]):([0-9][0-9]) -'
result = re.match(pattern, s)
if result:
return True
return False
def startsWithAuthor(s):
patterns = [
'([\w]+):', # First Name
'([\w]+[\s]+[\w]+):', # First Name + Last Name
'([\w]+[\s]+[\w]+[\s]+[\w]+):', # First Name + Middle Name + Last Name
'([+]\d{2} \d{5} \d{5}):', # Mobile Number (India)
'([+]\d{2} \d{3} \d{3} \d{4}):', # Mobile Number (US)
'([+]\d{2} \d{4} \d{7})' # Mobile Number (Europe)
]
pattern = '^' + '|'.join(patterns)
result = re.match(pattern, s)
if result:
return True
return False
def getDataPoint(line):
# line = 18/06/17, 22:47 - Loki: Why do you have 2 numbers, Banner?
splitLine = line.split(' - ') # splitLine = ['18/06/17, 22:47', 'Loki: Why do you have 2 numbers, Banner?']
dateTime = splitLine[0] # dateTime = '18/06/17, 22:47'
date, time = dateTime.split(', ') # date = '18/06/17'; time = '22:47'
message = ' '.join(splitLine[1:]) # message = 'Loki: Why do you have 2 numbers, Banner?'
if startsWithAuthor(message): # True
splitMessage = message.split(': ') # splitMessage = ['Loki', 'Why do you have 2 numbers, Banner?']
author = splitMessage[0] # author = 'Loki'
message = ' '.join(splitMessage[1:]) # message = 'Why do you have 2 numbers, Banner?'
else:
author = None
return date, time, author, message
parsedData = [] # List to keep track of data so it can be used by a Pandas dataframe
conversationPath = 'chat.txt'
with open(conversationPath, encoding="utf-8") as fp:
fp.readline() # Skipping first line of the file (usually contains information about end-to-end encryption)
messageBuffer = [] # Buffer to capture intermediate output for multi-line messages
date, time, author = None, None, None # Intermediate variables to keep track of the current message being processed
while True:
line = fp.readline()
if not line: # Stop reading further if end of file has been reached
break
line = line.strip() # Guarding against erroneous leading and trailing whitespaces
if startsWithDateTime(line): # If a line starts with a Date Time pattern, then this indicates the beginning of a new message
print('true')
if len(messageBuffer) > 0: # Check if the message buffer contains characters from previous iterations
parsedData.append([date, time, author, ' '.join(messageBuffer)]) # Save the tokens from the previous message in parsedData
messageBuffer.clear() # Clear the message buffer so that it can be used for the next message
date, time, author, message = getDataPoint(line) # Identify and extract tokens from the line
messageBuffer.append(message) # Append message to buffer
else:
messageBuffer.append(line) # If a line doesn't start with a Date Time pattern, then it is part of a multi-line message. So, just append to buffer
Когда я подключаю свой файл чата - chat.txt
, я заметил, что мой список parsedData
пуст. Пройдя по его коду, я заметил, что может отвечать за пустой список. Из его урока его чаты в этом формате (24 часа) - 18/06/17, 22:47 - Loki: Why do you have 2 numbers, Banner?
, но мои чаты в этом формате (12 часов) - [4/19/20, 8:10:57 PM] Joe: How are you doing
.
Причина, по которой функция startsWithDateTime
не может соответствовать ни одной дате и времени.
Пожалуйста, как мне изменить формат регулярного выражения в функции startsWithDateTime
, чтобы он соответствовал формату моего чата?