Я создал файл GraphML с помощью networkx. Когда я пытаюсь прочитать его:
graph = nx.read_graphml(graph_path)
Я получаю эту ошибку:
File "/usr/src/app/application/datasources/preprocess_sources.py", line 96, in create_similarities_dictionary
backend_1 | graph = nx.read_graphml(graph_path)
backend_1 | File "<decorator-gen-622>", line 2, in read_graphml
backend_1 | File "/usr/local/lib/python3.6/site-packages/networkx/utils/decorators.py", line 240, in _open_file
backend_1 | result = func_to_be_decorated(*new_args, **kwargs)
backend_1 | File "/usr/local/lib/python3.6/site-packages/networkx/readwrite/graphml.py", line 239, in read_graphml
backend_1 | glist = list(reader(path=path))
backend_1 | File "/usr/local/lib/python3.6/site-packages/networkx/readwrite/graphml.py", line 728, in __call__
backend_1 | self.xml = ElementTree(file=path)
backend_1 | File "/usr/local/lib/python3.6/xml/etree/ElementTree.py", line 557, in __init__
backend_1 | self.parse(file)
backend_1 | File "/usr/local/lib/python3.6/xml/etree/ElementTree.py", line 597, in parse
backend_1 | self._root = parser._parse_whole(source)
backend_1 | UnicodeDecodeError: 'utf-8' codec can't decode byte 0xd5 in position 1: invalid continuation byte
Я уверен, что в файле есть какие-то странные символы, но всё же это не работает. Нужно ли декодировать файл перед чтением? Возможно ли это вообще?
Спасибо за вашу помощь!
Правка: вот как строится граф:
def skill_graph_from_df(self, sx_dataframe, path_of_existing=""):
    """Build a directed keyword graph from a data frame.

    Edge weights are the confidence of the rule "keyword 1 => keyword 2",
    as used in association analysis: the edge's ``paircount`` divided by
    the ``count`` of the edge's source node.

    :param sx_dataframe: Pandas Dataframe - columns: tags, postid, page, alltext.
        The first column is tokenized with ``nltk.word_tokenize``.
    :param path_of_existing: str - path of an existing skill graph in GraphML format.
        New data is added to this graph. A new graph is built if the string
        is empty (any falsy value, e.g. ``None``, is treated the same way).
    :return: None
    """
    self.df_all = sx_dataframe
    self.pagelist = self.df_all.page.unique()

    # Compare by value/truthiness, not identity: ``is not ""`` depends on
    # string interning and raises a SyntaxWarning on Python 3.8+.
    if path_of_existing:
        # Import the existing GraphML graph and append the new pages to it.
        # NOTE(review): presumably read_graph populates self.keywords_di,
        # including its 'pages' graph attribute - confirm.
        self.read_graph(path_of_existing)
        self.keywords_di.graph['pages'] = (
            self.keywords_di.graph['pages'] + ", " + ", ".join(self.pagelist)
        )
    else:
        self.keywords_di.graph['pages'] = ", ".join(self.pagelist)

    # Bound only after read_graph, in case that call rebinds the attribute.
    graph = self.keywords_di

    # One pass per row: the first column holds the tag string of a post.
    for tags in self.df_all.iloc[:, 0]:
        taglist = nltk.word_tokenize(tags)
        pairs = findsubsets(taglist, 2)  # pairs of keywords

        for word in taglist:  # adds nodes
            if word in graph.nodes:
                graph.nodes[word]['count'] += 1
            else:
                graph.add_node(word, count=1)

        for pair in pairs:  # adds edges, always in both directions
            reverse = pair[::-1]
            if pair in graph.edges:
                graph.edges[pair]['paircount'] += 1
                graph.edges[reverse]['paircount'] += 1
            else:
                graph.add_edge(*pair, paircount=1)
                graph.add_edge(*reverse, paircount=1)

    # confidence(node => neighbour) = paircount(edge) / count(node)
    for node in graph:
        for edge in graph.out_edges([node]):
            graph.edges[edge]['confidence'] = (
                graph.edges[edge]['paircount'] / graph.nodes[node]['count']
            )