Каждое предложение в doc2 отображается в виде графика. Теперь края были добавлены в виде s-o-v из соответствующего subject_list, object_list и verb_list.
Я попытался отобразить подключенные компоненты. Но порядок, в котором оно отображает предложение, не соответствует порядку, в котором были добавлены ребра.
# This Python file uses the following encoding: utf-8
%matplotlib notebook
import codecs
import itertools
import re
import networkx as nx
import matplotlib.pyplot as pl
from matplotlib.font_manager import FontProperties
prop = FontProperties()
graph = nx.Graph()
labels = {}
each_one = []
list_of_sentences = []
subject_list = []
object_list = []
verb_list = []
newDict = {}
with codecs.open('doc2.txt', encoding='utf-8') as f:
text = f.read()
sentences = re.split(r' *[\.\?!][\'"\)\]]* *', text)
for stuff in sentences:
list_of_sentences.append(stuff)
new_list_of_sentences = []
for d in list_of_sentences:
s = d.replace(u'वतीन', '').replace(u'आनी', '').replace(u'हिणें', '').replace(',', '')
new_list_of_sentences.append(s)
f = open('doc2_tag.txt', 'r')
for line in f:
k, v = line.strip().split('/')
newDict[k.strip().decode('utf-8')] = v.strip()
f.close()
for sentence in new_list_of_sentences:
a = b = c = ""
sentence_word_list = sentence.split()
for word in sentence_word_list:
if newDict[word] == 'N-NNP':
a += word + " "
if newDict[word] == 'N-NN':
b += word + " "
if newDict[word] == 'JJ':
b += word + " "
if newDict[word] == 'QT-QTC':
b += word + " "
if newDict[word] == 'RB':
b += word + " "
if newDict[word] == 'N-NST':
b += word + " "
if newDict[word] == 'PR-PRP':
b += word + " "
if newDict[word] == 'PSP':
b += word + " "
if newDict[word] == 'CC-CCD':
b += word + " "
if newDict[word] == 'V-VM-VF':
c += word + " "
subject_list.append(a)
object_list.append(b)
verb_list.append(c)
konkani_dict = {u'सनरायझर्साक': u'सनरायझर्स', u'सनरायझर्सान': u'सनरायझर्स', u'सनरायझर्साच्या': u'सनरायझर्स'}
for idx, sub in enumerate(subject_list):
temp_list = sub.split(" ")
for i in temp_list:
if i in konkani_dict:
new_sub = sub.replace(i, konkani_dict[i])
subject_list[idx] = new_sub
for s in subject_list:
if s is not "":
graph.add_node(s)
labels[s] = s
for o in object_list:
if o is not "":
graph.add_node(o)
labels[b] = b
for v in verb_list:
if v is not "":
graph.add_node(v)
labels[v] = v
for (s, o, v) in zip(subject_list, object_list, verb_list):
if s and o is not "":
graph.add_edge(s, o)
if o and v is not "":
graph.add_edge(o, v)
pos=nx.spring_layout(graph,k=0.15,iterations=20)
nx.draw(graph, with_labels = True, font_family = "Nirmala UI", node_size = 40, font_size = 9 ,node_color = "darkblue")
pl.show()
sentences=[]
for component in nx.connected_components(graph):
g=(
filter(
lambda x: x[0] in component and x[1] in component,
graph.edges
)
)
p=[]
p= ''.join(item for tuple_ in g for item in tuple_)
print p
sentences.append(p)
print sentences
output=[]
for i in sentences:
inputWords = i.split(" ")
inputWords=inputWords[-1::-1]
output = ' '.join(inputWords)
print output
Ожидаемый результат выглядит примерно так:
शिखर धवनान सगळ्यांत चड ४५ धांवड्यो केल्यो ,
सनरायझर्स दीपर हुडा जैतांत पर्जळ्ळो
Это вывод, который я получаю: отображаемые предложения