'''
Trying to split string 'text' below so that all segments are found and split.
There are three flavors of segments:
A: <tag attr1="one" attr2>some text</tag>
B: <tag attr1="one" attr2 text="some text"/>
C: some tagless text
I can't use an HTML parser, because C segments are in plain English.
I would like to be able to accept '/' as part of the text.
So '/' is just a character unless it is immediately followed by '>' or by the tag name
(as in '/>' or '</tag>').
In a perfect world, I would also like to accept '<' and '>' as part of the text.
So '<' is just a character unless it is immediately followed by 't' or '/'.
Currently I use patt[1] to split text with A and C segments, but now I want to be
able to split strings with B segments, too. Also, patt[1] does not allow '/' to be
passed in as a regular char without breaking the routine. Although it seems
to work, I don't like that the regexp limits matching according to '/'.
The goal is to properly split the text string into the expected output strings. Can someone
tell me what I'm doing wrong? Running the regexp on https://pythex.org/ looks okay, but it
still doesn't run properly under CPython 3.6 - 3.9.
'''
import pprint, re, sys, unittest
class TestClass(unittest.TestCase):
    """Compare candidate regexes for splitting ``text`` into its segments.

    Every pattern in ``patt`` is handed to ``re.split`` as a single capturing
    group, so the matched tags are kept in the result next to the plain text
    between them.  Empty strings produced by adjacent matches are dropped
    before comparing with ``expected_output``.

    ``patt[1]`` .. ``patt[6]`` are kept as a record of attempts that cannot
    produce ``expected_output`` for ``text``; their tests are marked as
    expected failures.  ``patt[7]`` handles all three segment flavors.
    """

    # When True, self-closing '<t .../>' tags in ``text`` are rewritten to
    # '<t ...></t>' before splitting (and ``expected_output`` is adjusted).
    sub = False

    patt = {}
    # Original pattern.  It has no alternative for self-closing '/>' tags, and
    # '[^>]*' forbids '>' inside the text.
    patt[1] = r'(<t(?:ag)?\s*[^>]*>[^>]*</t(?:ag)?>)'  # orig, works for A and C but not B
    # '.' does not match '\n' without re.DOTALL, so segments that contain a
    # newline are never matched.
    patt[2] = r'(<t(?:ag)?.*?(?:/t(?:ag)?>|/>))'  # misses one
    #           1  2    2    3    4    4     31
    # '[^/]*' stops at the first '/', so text such as 'red/and/yellow' breaks it.
    patt[3] = r'(<t(?:ag)?[^/]*/t?>)'  # misses one
    #           1  2    2          1
    # The lookaround variants below still rely on '.' without re.DOTALL.
    patt[4] = r'(<t(?:ag)?.*?(?<=/)(?:t(?:ag)?)?>)'  # with lookarounds. still misses one.
    patt[5] = r'(<t(?:ag)?(?!/>)(?!/t).*?(?<=/)(?:t(?:ag)?)?>)'  # with lookarounds. tried adding negs
    patt[6] = r'(<t(?:ag)?.*?(?!/>)(?!/t)(?<=/)(?:t(?:ag)?)?>)'
    # Working pattern, two alternatives tried in order:
    #   B: '<t ... />'       - attributes may contain '/', but no '>'
    #   A: '<t ...>text</t>' - text is matched lazily up to the first closing
    #                          tag, so it may contain '/', '<', '>' and, thanks
    #                          to the inline (?s) flag, newlines.
    # '\b' keeps other names that merely start with 't' (e.g. '<table>') from
    # being taken for the tag.
    patt[7] = r'(?s)(<t(?:ag)?\b[^>]*?/>|<t(?:ag)?\b[^>]*>.*?</t(?:ag)?>)'

    text = "start: <t s=10 B=1>Size 10 bold</t><t siz=6>Size 6\nNew Line </t>" \
           "<t u w=bold>Underlined and Bolded\n</t><t it>Italics</t><t>default</t>" \
           '<t fam="Courier New" siz=18>Courier 18</t>' \
           "<t bitmap=question/><t bitmap=info/> and <t fg=red>red/and/yellow</t>" \
           "<t fg=red>lesser < or greater > or both <></t><t>bye</t>"
    if sub:
        text = re.sub(r"/>", "></t>", text)

    expected_output = ['start: ', '<t s=10 B=1>Size 10 bold</t>', '<t siz=6>Size 6\nNew Line </t>',
                       '<t u w=bold>Underlined and Bolded\n</t>', '<t it>Italics</t>', '<t>default</t>',
                       '<t fam="Courier New" siz=18>Courier 18</t>',
                       '<t bitmap=question/>', '<t bitmap=info/>', ' and ', '<t fg=red>red/and/yellow</t>',
                       "<t fg=red>lesser < or greater > or both <></t>", '<t>bye</t>', ]
    if sub:
        # Entries 7 and 8 are the two self-closing tags rewritten by re.sub above.
        expected_output = (expected_output[:7]
                           + ['<t bitmap=question></t>', '<t bitmap=info></t>']
                           + expected_output[9:])
    exp_len = len(expected_output)

    @classmethod
    def split_fields(cls, i):
        """Return the non-empty fields from splitting ``text`` with ``patt[i]``."""
        return [f for f in re.split(cls.patt[i], cls.text) if f]

    def _check_pattern(self, i):
        """Split with ``patt[i]``, print the fields, and compare with ``expected_output``."""
        cls = type(self)
        fields = cls.split_fields(i)
        print(f'\nPATT[{i}] {cls.patt[i]!r} gives {len(fields)} of the {cls.exp_len} expected fields:')
        print(pprint.pformat(fields), '\n\n')
        self.assertEqual(fields, cls.expected_output)

    @unittest.expectedFailure
    def test_patt1(self, i=1):
        """Known-bad: patt[1] cannot match self-closing tags or '>' in text."""
        self._check_pattern(i)

    @unittest.expectedFailure
    def test_patt2(self, i=2):
        """Known-bad: patt[2] cannot match segments containing a newline."""
        self._check_pattern(i)

    @unittest.expectedFailure
    def test_patt3(self, i=3):
        """Known-bad: patt[3] cannot match text containing '/'."""
        self._check_pattern(i)

    @unittest.expectedFailure
    def test_patt4(self, i=4):
        """Known-bad: patt[4] cannot match segments containing a newline."""
        self._check_pattern(i)

    @unittest.expectedFailure
    def test_patt5(self, i=5):
        """Known-bad: patt[5] cannot match segments containing a newline."""
        self._check_pattern(i)

    @unittest.expectedFailure
    def test_patt6(self, i=6):
        """Known-bad: patt[6] cannot match segments containing a newline."""
        self._check_pattern(i)

    def test_patt7(self, i=7):
        """patt[7] splits ``text`` into exactly ``expected_output``."""
        self._check_pattern(i)
# Run the tests through unittest's command-line entry point when this file is
# executed as a script.
if __name__ == '__main__':
    unittest.main()