sample.html
<A>
<B>
<C>Name1</C>
<D>Name2</D>
</B>
</A>
parse.py
from bs4 import BeautifulSoup
def get_root_elements(path_to_file):
    """Return the tags starting at the first most deeply nested element.

    The file is parsed with BeautifulSoup's ``lxml`` parser, every tag in
    the document is collected in document order, and each tag's depth is
    measured as its number of parents.

    Args:
        path_to_file: Path of the markup file to parse.

    Returns:
        A list containing the first tag found at the maximum nesting depth
        followed by every tag that comes after it in document order. An
        empty list is returned when the document contains no tags.
    """
    # Close the file handle deterministically instead of leaking it.
    with open(path_to_file) as markup:
        soup = BeautifulSoup(markup, 'lxml')

    all_elements = soup.find_all()
    if not all_elements:
        # Nothing was parsed, so max()/min() below would have nothing to scan.
        return []

    # Depth of a tag == how many ancestors it has.
    depths = [sum(1 for _ in tag.parents) for tag in all_elements]

    # list.index returns the first match, i.e. the smallest index holding
    # the maximum depth; max() is evaluated once rather than per element.
    first_deepest_index = depths.index(max(depths))

    # NOTE(review): tags after the first deepest one are returned whatever
    # their own depth is -- confirm this is the intended selection.
    return all_elements[first_deepest_index:]
def get_path(element):
    """Build the ' / '-separated path of tag names leading to ``element``.

    Ancestors named '[document]', 'body' or 'html' are left out of the
    path; the element's own name is always included as the last segment.

    Args:
        element: Object exposing ``name`` and an iterable ``parents``
            ordered from the nearest ancestor outwards.

    Returns:
        A string such as ``'a / b / c'``, outermost ancestor first.
    """
    skipped_names = ('[document]', 'body', 'html')
    segments = [
        ancestor.name
        for ancestor in element.parents
        if ancestor.name not in skipped_names
    ]
    # parents runs innermost -> outermost; flip it so the path reads top-down.
    segments.reverse()
    segments.append(element.name)
    return ' / '.join(segments)
Python Shell
In [1]: file = 'path/to/sample.html'
In [2]: run parse.py
In [3]: roots = get_root_elements(file)
In [4]: print(roots)
[<c>Name1</c>, <d>Name2</d>]
In [5]: for root in roots:
...: print(get_path(root))
a / b / c
a / b / d