Вы можете использовать этот метод:
from urllib.parse import urljoin, urlparse
from lxml import html as lh
class Crawler:
    """Collect same-site links from HTML pages, starting at ``start_url``.

    Attributes:
        start_url: The URL the crawl starts from, stored as given.
        base_url: ``scheme://netloc`` of ``start_url``; links are resolved
            against it and must belong to it to be returned.
        visited_urls: URLs to exclude from results. This class only reads
            the set in ``fetch_urls``; callers are expected to fill it.
    """

    def __init__(self, start_url):
        """Store ``start_url`` and derive the site root from it."""
        self.start_url = start_url
        # Parse once instead of once per component.
        parsed = urlparse(start_url)
        self.base_url = f'{parsed.scheme}://{parsed.netloc}'
        self.visited_urls = set()

    def _is_same_site(self, url):
        """Return True if ``url`` has exactly the scheme and netloc of ``base_url``."""
        # A plain ``startswith(base_url)`` test would also accept foreign
        # hosts such as ``http://example.com.evil.org`` or
        # ``http://example.com@evil.org``, so compare parsed components.
        target = urlparse(url)
        base = urlparse(self.base_url)
        return (target.scheme, target.netloc) == (base.scheme, base.netloc)

    def fetch_urls(self, html):
        """Return the unvisited same-site links found in ``html``.

        Args:
            html: HTML markup of a page. Empty, whitespace-only or ``None``
                input yields an empty list instead of reaching the parser.

        Returns:
            Absolute URLs in document order, without duplicates, that are
            not in ``visited_urls`` and belong to the same site as
            ``base_url``.
        """
        if not html or not html.strip():
            return []

        dom = lh.fromstring(html)
        urls = []
        seen = set()  # de-duplicate links repeated within the same page
        for href in dom.xpath('//a/@href'):
            # NOTE: relative links are resolved against the site root, not
            # the URL of the page the markup came from (it is not passed in).
            url = urljoin(self.base_url, href)
            if url in seen or url in self.visited_urls:
                continue
            if not self._is_same_site(url):
                continue
            seen.add(url)
            urls.append(url)
        return urls