Это регулярное выражение проверяет диапазон дат в формате: MONTH YEAR, разделитель, затем (MONTH YEAR | PRESENT).
import re
# Sample input, one candidate per line. The first line deliberately contains
# two valid ranges to exercise the "several ranges on one line" case.
text = """
February 2016 - March 2019 February 2017 - March 2019
September 2015 to August 2019
April 2015 to present
September 2018 - present
George Mason University august 2019
Stratusburg university February 2018
Some text and month followed by year
"""
# The pattern is assembled from small named pieces; written as one literal it
# would be very hard to read and maintain.

# Every month, either abbreviated to its first three letters or spelled out
# in full. November and December share the optional "ember" suffix.
MONTHS_RE = [
    'Jan(?:uary)?', 'Feb(?:ruary)?', 'Mar(?:ch)?', 'Apr(?:il)?', 'May',
    'Jun(?:e)?', 'Jul(?:y)?', 'Aug(?:ust)?', 'Sep(?:tember)?', 'Oct(?:ober)?',
    '(?:Nov|Dec)(?:ember)?',
]
# Match a month name and capture it:
# (Jan(?:uary)?|Feb(?:ruary)?|...|(?:Nov|Dec)(?:ember)?)
RE_MONTH = '({})'.format('|'.join(MONTHS_RE))
# A month followed by a year of exactly 2 or 4 digits (two capturing groups).
# The word boundaries keep the month from matching inside a longer word and
# stop the year from matching a prefix of a longer number. This piece is used
# twice in the final pattern.
RE_DATE = r'\b{RE_MONTH}\s+(\d{{4}}|\d{{2}})\b'.format(RE_MONTH=RE_MONTH)
# Final pattern: a start date, any separator text, then either an end date or
# the word "present". Five capturing groups in total:
# (start month, start year, end month, end year, present).
RE_VALID_RANGE = re.compile(
    '{RE_DATE}.+?(?:{RE_DATE}|(present))'.format(RE_DATE=RE_DATE),
    flags=re.IGNORECASE,
)
# Sort every non-empty line of `text` into "valid" (contains at least one
# date range) or "invalid" (contains none).
valid_ranges = []
invalid_ranges = []
for line in text.split('\n'):
    if not line:
        continue
    groups = RE_VALID_RANGE.findall(line)
    if groups:
        # `groups` holds one tuple per range found on the line, so it may
        # contain more than one entry.
        # Each tuple has 5 items, one per capturing group. Either M2/Y2 are
        # filled and `present` is empty, or the other way round, because the
        # end of the range is the alternation (?:{RE_DATE}|(present)).
        M1, Y1, M2, Y2, present = groups[0]  # loop over `groups` to check every range
        valid_ranges.append(line)
    else:
        invalid_ranges.append(line)
print('VALID: ', valid_ranges)
print('INVALID:', invalid_ranges)
# Scan the whole text at once instead of line by line. This yields every
# valid range separately, so a line holding two ranges contributes two
# entries. Use the per-line approach if whole lines are what you need.
valid_ranges = []
for match in RE_VALID_RANGE.finditer(text):
    # The individual parts are available for further checks on the range.
    M1, Y1, M2, Y2, present = match.groups()
    valid_ranges.append(match.group(0))  # group(0) is the full matched text
print('VALID USING <finditer>: ', valid_ranges)
OUTPUT:
VALID: ['February 2016 - March 2019 February 2017 - March 2019', 'September 2015 to August 2019', 'April 2015 to present', 'September 2018 - present']
INVALID: ['George Mason University august 2019', 'Stratusburg university February 2018', 'Some text and month followed by year']
VALID USING <finditer>: ['February 2016 - March 2019', 'February 2017 - March 2019', 'September 2015 to August 2019', 'April 2015 to present', 'September 2018 - present']
Я не люблю писать длинные регулярные выражения в одной строковой переменной: я предпочитаю разбивать их на части, чтобы понимать, что они делают, когда перечитываю свой код спустя полгода.
Обратите внимание, что при использовании finditer первая строка разбивается на два допустимых диапазона.
Если вы хотите просто извлечь диапазоны, вы можете использовать это:
# One tuple of captured groups per range found in the whole text.
valid_ranges = RE_VALID_RANGE.findall(text)
Но это возвращает группы [(M1, Y1, M2, Y2, present), ...],
а не текст:
[('February', '2016', 'March', '2019', ''), ('February', '2017', 'March', '2019', ''), ('September', '2015', 'August', '2019', ''), ('April', '2015', '', '', 'present'), ('September', '2018', '', '', 'present')]