Я пытаюсь построить граф на основе ссылок, которые собираю при парсинге страниц Википедии. Всё работает нормально, если я ищу только один тег, но если я пытаюсь искать сразу несколько тегов, я получаю следующую ошибку:
File "c:\Users\qnour\Desktop\Programming\Python\GettingStarted\Wiki_Scraping.py", line 89, in <module>
main()
File "c:\Users\qnour\Desktop\Programming\Python\GettingStarted\Wiki_Scraping.py", line 32, in main
drawGraph(graph)
File "c:\Users\qnour\Desktop\Programming\Python\GettingStarted\Wiki_Scraping.py", line 85, in drawGraph
graph.write_png('wiki_graph.png', prog='dot')
File "C:\Users\qnour\AppData\Local\Programs\Python\Python36\lib\site-packages\pydot\__init__.py", line 1807, in <lambda>
lambda path, f=frmt, prog=self.prog : self.write(path, format=f, prog=prog))
File "C:\Users\qnour\AppData\Local\Programs\Python\Python36\lib\site-packages\pydot\__init__.py", line 1909, in write
dot_fd.write(self.create(prog, format))
File "C:\Users\qnour\AppData\Local\Programs\Python\Python36\lib\site-packages\pydot\__init__.py", line 2013, in create
stderr_output = ''.join(stderr_output)
TypeError: sequence item 0: expected str instance, bytes found
Вот код:
import bs4 as bs
import urllib.request
import pydot
import graphviz
from IPython.display import Image, display
import os
def viewPydot(pdot):
    """Show a pydot graph inline through IPython.

    pdot: any object exposing ``create_png()``; its result is wrapped in an
    IPython ``Image`` and passed to ``display``.
    """
    display(Image(pdot.create_png()))
# NOTE: a `global` statement at module level binds nothing; `sauce` and `soup`
# only come into existence once main() assigns them.
global sauce, soup
def main():
    """Prompt for a Wikipedia page title and draw a graph of its "See also" links.

    Steps:
      1. Read the page title from standard input.
      2. Find the index of the "See also" heading with calculateSection().
      3. Request that section's links from the MediaWiki ``action=parse`` API.
      4. Store the raw response in the module-global ``sauce`` and its parsed
         form in the module-global ``soup`` (gatherLinks() reads ``soup``).
      5. Build the graph with createGraph() and render it with drawGraph().

    Exits the process with status 0 when the page has no "See also" section.
    """
    global sauce, soup

    firstElement = input("Please select the first element : ")
    bareLink = "https://en.wikipedia.org/wiki/"
    sectionNumber = calculateSection(bareLink + firstElement)
    if sectionNumber == -1:
        print("no see also section ! ")
        # Same effect as exit(0) but does not rely on the `site` module
        # having injected the `exit` helper.
        raise SystemExit(0)

    # The separator before the section parameter must be "&section=".  The
    # previous literal contained the single character "§" in place of "&sect",
    # so the section number never reached the API as the `section` parameter.
    url = (
        "https://en.wikipedia.org/w/api.php"
        "?action=parse&prop=links&page={}&section={}"
    ).format(firstElement, sectionNumber)
    sauce = urllib.request.urlopen(url).read()
    soup = bs.BeautifulSoup(sauce, 'lxml')

    listUrl = gatherLinks()
    # fullUrl is not consumed yet; it is kept for the TODO below.
    fullUrl = createNewLinks(listUrl)
    graph = createGraph(listUrl, firstElement)
    drawGraph(graph)
    # TODO
    # ADD THE NEW LINKS TO THE GRAPH
def createNewLinks(listUrl):
    """Return the full Wikipedia article URL for every page title in listUrl.

    listUrl: iterable of page-title strings.
    Returns a new list with "https://en.wikipedia.org/wiki/" prepended to
    each title, in the same order.
    """
    wiki_prefix = "https://en.wikipedia.org/wiki/"
    return [wiki_prefix + title for title in listUrl]
def gatherLinks():
    """Collect link titles from the module-global ``soup``.

    Scans every ``<span class="s2">`` element in order.  A span whose text is
    exactly ``"*"`` (including the double quotes) marks the *next* span as a
    value: that span's text is printed, and the part between its first pair
    of double quotes is appended to the result.
    (Presumably these spans are the string tokens of the API's highlighted
    JSON output -- confirm against the actual response.)

    Returns the list of collected titles.
    """
    spans = iter(soup.find_all("span", class_="s2"))
    titles = []
    for span in spans:
        if span.text != '"*"':
            continue
        # The span right after the "*" key holds the value; it is consumed
        # here so it is never itself examined as a key.
        value_span = next(spans, None)
        if value_span is None:
            break
        print(value_span.text)
        titles.append(value_span.text.split('"')[1])
    return titles
def _quoteDotId(name):
    """Return name as a double-quoted DOT identifier, escaping embedded quotes."""
    return '"{}"'.format(str(name).replace('"', '\\"'))


def createGraph(listUrl, firstElement):
    """Build an undirected pydot graph linking firstElement to each title.

    listUrl: iterable of page titles that become the neighbour nodes.
    firstElement: title of the central node.
    Returns the populated ``pydot.Dot`` graph.

    Every node name is explicitly double-quoted before it is handed to pydot.
    Unquoted titles containing characters such as ':' (read by DOT as a port
    separator), or titles equal to a DOT keyword such as "Graph" or "Node",
    can produce DOT source that Graphviz rejects.  That is the likely cause
    of the failure inside pydot's ``create()`` when rendering.
    """
    graph = pydot.Dot(graph_type='graph')
    root = _quoteDotId(firstElement)
    for graphEdge in listUrl:
        graph.add_edge(pydot.Edge(root, _quoteDotId(graphEdge)))
    return graph
def calculateSection(url):
    """Return the position of the "See also" heading on the page at url.

    Downloads the page, collects all ``h2``/``h3``/``h4`` elements in
    document order and prints each heading's text as it is examined.  When a
    heading's text is "See also" or "See also[edit]", its zero-based position
    among those headings is printed and returned.

    Returns -1 when no such heading exists.
    """
    page_html = urllib.request.urlopen(url).read()
    headings = bs.BeautifulSoup(page_html, 'lxml').find_all(["h2", "h3", "h4"])
    see_also_titles = ("See also", "See also[edit]")
    for position, heading in enumerate(headings):
        title = heading.text
        print(title)
        if title in see_also_titles:
            print(position)
            return position
    return -1
def drawGraph(graph):
    """Render graph to 'wiki_graph.png', passing prog='dot' to pydot.

    graph: object exposing ``write_png(path, prog=...)`` (here a pydot graph).
    """
    png_path = 'wiki_graph.png'
    graph.write_png(png_path, prog='dot')
    # NOTE(review): the Image object is built and then discarded, so nothing
    # is shown when this runs as a plain script -- confirm whether display()
    # or a return value was intended.
    Image(png_path)
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
Меня смущает то, что замена строки:
sections = sectionSoup.findAll(["h2", "h3", "h4"])
на:
sections = sectionSoup.findAll("h2")
приводит к тому, что всё работает, но мне нужно проверять все три тега.