Я пытаюсь получить HTML с веб-страниц.Однако не все URL написаны правильно.Большинство недействительных URL-адресов в списке включают http, но теперь они используют https.Некоторые пропускают "www.", А с другими "www."должен быть добавлен.
def repl_www_http(url):
x = url.replace("www.", "")
y = x.replace("http", "https")
return y
def repl_www(url):
y = url.replace("www.", "")
return y
def repl_http(url):
y = url.replace("http", "https")
return y
def repl_no_www(url):
y = url.replace("//", "//www.")
return y
def get_html(urllist):
for i in urllist:
html = ""
try:
html = requests.get(i)
html = html.text
return html
except requests.exceptions.ConnectionError:
try:
html = requests.get(repl_http(i))
html = html.text
print("replaced // with //www.")
except requests.exceptions.ConnectionError:
try:
html = requests.get(repl_http(i))
html = html.text
print("replaced http with https")
return html
except requests.exceptions.ConnectionError:
try:
html = requests.get(repl_www(i))
html = html.text
print("replaced www. with .")
return html
except requests.exceptions.ConnectionError:
try:
html = requests.get(repl_www_http(i))
html = html.text
print("replaced www with . and http with https")
return html
except requests.exceptions.ConnectionError:
return "no HTML found on this URL"
print("gethtml finished", html)
Это ошибка, которую я получаю:
Traceback (most recent call last): File "C:\replacer.py", line 76, in <module> html = get_html(i)
File "C:\replacer.py", line 37, in get_html html = requests.get(repl_http(i))
File "C:\Users\LorenzKort\AppData\Local\Programs\Python\Python37\lib\site-packages\requests-2.19.1-py3.7.egg\requests\api.py", line 72, in get
return request('get', url, params=params, **kwargs) File "C:\Users\LorenzKort\AppData\Local\Programs\Python\Python37\lib\site-packages\requests-2.19.1-py3.7.egg\requests\api.py", line 58, in request
return session.request(method=method, url=url, **kwargs) File "C:\Users\LorenzKort\AppData\Local\Programs\Python\Python37\lib\site-packages\requests-2.19.1-py3.7.egg\requests\sessions.py", line 498, in request
prep = self.prepare_request(req) File "C:\Users\LorenzKort\AppData\Local\Programs\Python\Python37\lib\site-packages\requests-2.19.1-py3.7.egg\requests\sessions.py", line 441, in prepare_request
hooks=merge_hooks(request.hooks, self.hooks),
File "C:\Users\LorenzKort\AppData\Local\Programs\Python\Python37\lib\site-packages\requests-2.19.1-py3.7.egg\requests\models.py",line 309, in prepare
self.prepare_url(url, params) File "C:\Users\LorenzKort\AppData\Local\Programs\Python\Python37\lib\site-packages\requests-2.19.1-py3.7.egg\requests\models.py",
line 383, in prepare_url
raise MissingSchema(error)requests.exceptions.MissingSchema: Invalid URL 'h': No schema supplied. Perhaps you meant http://h?
Как решить эту проблему, чтобы исправить неправильные URL-адреса?