Я пытаюсь разобрать несколько XML-файлов и извлечь из них текст, однако при попытке вывести извлечённый текст на экран возникает следующая ошибка:
UnicodeEncodeError: 'charmap' codec can't encode character '\u2033' in position 884722: character maps to <undefined>
Пример файла XML:
<?xml version="1.0" encoding="ISO-8859-1"?>
<bncDoc xml:id="AHH">
<teiHeader>
<fileDesc>
<titleStmt>
<title> Daily Telegraph, electronic edition of 1992-04-05: Religious affairs stories. Sample containing about 97 words from a periodical (domain: belief and thought) </title>
<respStmt>
<resp> Data capture and transcription </resp>
<name> Oxford University Press </name>
</respStmt>
</titleStmt>
<editionStmt>
<edition>BNC XML Edition, December 2006</edition>
</editionStmt>
<extent> 97 tokens; 97 w-units; 8 s-units </extent>
<publicationStmt>
<distributor>Distributed under licence by Oxford University Computing Services on behalf of the BNC Consortium.</distributor>
<availability> This material is protected by international copyright laws and may not be copied or redistributed in any way. Consult the BNC Web Site at http://www.natcorp.ox.ac.uk for full licencing and distribution conditions.</availability>
<idno type="bnc">AHH</idno>
<idno type="old"> DbBeli </idno>
</publicationStmt>
<sourceDesc>
<bibl>
<title>Daily Telegraph, electronic edition of 1992-04-05: Religious affairs stories.</title>
<imprint n="DAILYT1">
<publisher>The Daily Telegraph plc</publisher>
<pubPlace>London</pubPlace>
<date value="1992">1992</date>
</imprint>
</bibl>
</sourceDesc>
</fileDesc>
<encodingDesc>
<tagsDecl>
<namespace name="">
<tagUsage occurs="13" gi="c"/>
<tagUsage occurs="1" gi="div"/>
<tagUsage occurs="1" gi="head"/>
<tagUsage occurs="5" gi="p"/>
<tagUsage occurs="8" gi="s"/>
<tagUsage occurs="97" gi="w"/>
</namespace>
</tagsDecl>
</encodingDesc>
<profileDesc>
<creation date="1992"> </creation>
<textClass>
<catRef targets="alltim3 news wriase3 wridom8 wrista3 "/>
<classCode scheme="DLEE">W newsp brdsht nat: social</classCode>
<keywords>
<term> (none) </term>
</keywords>
</textClass>
</profileDesc>
<revisionDesc>
<change date="2006-10-21" who="#OUCS">Tag usage updated for BNC-XML</change>
<change date="2000-12-13" who="#OUCS">Last check for BNC World first release</change>
<change date="2000-09-06" who="#OUCS">Redo tagusage tables</change>
<change date="2000-09-01" who="#OUCS">Check all tagcounts</change>
<change date="2000-06-23" who="#OUCS">Resequenced s-units and added headers</change>
<change date="2000-01-21" who="#OUCS">Added date info</change>
<change date="2000-01-09" who="#OUCS">Updated all catrefs</change>
<change date="2000-01-08" who="#OUCS">Updated source title</change>
<change date="2000-01-08" who="#OUCS">Updated titles</change>
<change date="1999-12-25" who="#OUCS">corrected tagUsage</change>
<change date="1999-09-13" who="#UCREL">POS codes revised for BNC-2; header updated</change>
<change date="1994-11-24" who="#dominic">Initial accession to corpus</change>
</revisionDesc>
</teiHeader>
<wtext type="NEWS">
<div level="1">
<head>
<s n="1">
<w pos="SUBST" hw="letter" c5="NN1">Letter </w>
<w pos="PREP" hw="to" c5="PRP">to </w>
<w pos="ART" hw="the" c5="AT0">the </w>
<w pos="SUBST" hw="editor" c5="NN1">Editor</w>
<c c5="PUN">: </c>
<w pos="ADJ" hw="gay" c5="AJ0">Gay </w>
<w pos="SUBST" hw="sex" c5="NN1">sex </w>
<w pos="VERB" hw="be" c5="VBZ">is </w>
<w pos="ADJ" hw="subversive" c5="AJ0">subversive</w>
</s>
</head>
<p>
<s n="2">
<w pos="SUBST" hw="sir" c5="NP0">SIR </w>
<w pos="SUBST" hw="ian" c5="NP0">Ian </w>
<w pos="SUBST" hw="mckellen" c5="NP0">McKellen </w>
<c c5="PUL">(</c>
<w pos="SUBST" hw="letter" c5="NN2">Letters</w>
<c c5="PUN">, </c>
<w pos="SUBST" hw="march" c5="NP0">March </w>
<w pos="ADJ" hw="29" c5="CRD">29</w>
<c c5="PUR">) </c>
<w pos="VERB" hw="must" c5="VM0">must </w>
<w pos="ADV" hw="not" c5="XX0">not </w>
<w pos="VERB" hw="be" c5="VBI">be </w>
<w pos="VERB" hw="allow" c5="VVN">allowed </w>
<w pos="PREP" hw="to" c5="TO0">to </w>
<w pos="VERB" hw="get" c5="VVI">get </w>
<w pos="ADV" hw="away" c5="AV0">away </w>
<w pos="PREP" hw="with" c5="PRP">with </w>
<w pos="PRON" hw="he" c5="DPS">his </w>
<w pos="SUBST" hw="blanket" c5="NN1">blanket </w>
<w pos="SUBST" hw="assertion" c5="NN1">assertion </w>
<w pos="CONJ" hw="that" c5="CJT-DT0">that </w>
<w pos="SUBST" hw="equality" c5="NN1">equality </w>
<w pos="PREP" hw="for" c5="PRP">for </w>
<w pos="SUBST" hw="homosexual" c5="NN2">homosexuals </w>
<w pos="VERB" hw="have" c5="VHZ">has </w>
<w pos="VERB" hw="be" c5="VBN">been </w>
<w pos="VERB" hw="establish" c5="VVN">established </w>
<w pos="PREP" hw="throughout" c5="PRP">throughout </w>
<w pos="SUBST" hw="europe" c5="NP0">Europe</w>
<c c5="PUN">.</c>
</s>
</p>
<p>
<s n="3">
<w pos="PRON" hw="one" c5="PNI">One </w>
<w pos="VERB" hw="have" c5="VHZ">has </w>
<w pos="PREP" hw="to" c5="TO0">to </w>
<w pos="VERB" hw="distinguish" c5="VVI">distinguish </w>
<w pos="PREP" hw="between" c5="PRP">between </w>
<w pos="ART" hw="the" c5="AT0">the </w>
<w pos="SUBST" hw="individual" c5="NN1-AJ0">individual </w>
<w pos="CONJ" hw="and" c5="CJC">and </w>
<w pos="ART" hw="the" c5="AT0">the </w>
<w pos="SUBST" hw="practice" c5="NN1">practice</w>
<c c5="PUN">.</c>
</s>
<s n="4">
<w pos="SUBST" hw="european" c5="NN2">Europeans </w>
<w pos="VERB" hw="recognise" c5="VVB">recognise </w>
<w pos="CONJ" hw="that" c5="CJT">that </w>
<w pos="ADJ" hw="some" c5="DT0">some </w>
<w pos="SUBST" hw="individual" c5="NN2">individuals </w>
<w pos="VERB" hw="be" c5="VBB">are </w>
<w pos="PREP" hw="by" c5="PRP">by </w>
<w pos="SUBST" hw="nature" c5="NN1">nature </w>
<w pos="SUBST" hw="homosexual" c5="NN1">homosexual</w>
<c c5="PUN">; </c>
<w pos="PREP" hw="as" c5="PRP">as </w>
<w pos="SUBST" hw="individual" c5="NN2">individuals</w>
<c c5="PUN">, </c>
<w pos="PRON" hw="they" c5="PNP">they </w>
<w pos="VERB" hw="must" c5="VM0">must </w>
<w pos="VERB" hw="be" c5="VBI">be </w>
<w pos="VERB" hw="respect" c5="VVN-AJ0">respected </w>
<w pos="CONJ" hw="and" c5="CJC">and </w>
<w pos="PRON" hw="they" c5="DPS">their </w>
<w pos="SUBST" hw="right" c5="NN2">rights </w>
<w pos="VERB" hw="must" c5="VM0">must </w>
<w pos="VERB" hw="be" c5="VBI">be </w>
<w pos="VERB" hw="protect" c5="VVN">protected</w>
<c c5="PUN">.</c>
</s>
</p>
<p>
<s n="5">
<w pos="CONJ" hw="but" c5="CJC">But </w>
<w pos="VERB" hw="let" c5="VVB">let </w>
<w pos="PRON" hw="we" c5="PNP">us </w>
<w pos="ADV" hw="not" c5="XX0">not </w>
<w pos="VERB" hw="be" c5="VBI">be </w>
<w pos="VERB" hw="lead" c5="VVN">led </w>
<w pos="ADV" hw="astray" c5="AV0-AJ0">astray</w>
<c c5="PUN">: </c>
<w pos="ART" hw="the" c5="AT0">the </w>
<w pos="SUBST" hw="practice" c5="NN1">practice </w>
<w pos="PREP" hw="of" c5="PRF">of </w>
<w pos="SUBST" hw="homosexuality" c5="NN1">homosexuality </w>
<w pos="VERB" hw="be" c5="VBZ">is </w>
<w pos="SUBST" hw="anathema" c5="NN1">anathema </w>
<w pos="PREP" hw="to" c5="PRP">to </w>
<w pos="ADJ" hw="all" c5="DT0">all </w>
<w pos="ADJ" hw="christian" c5="AJ0">Christian </w>
<w pos="SUBST" hw="people" c5="NN0">people </w>
<w pos="CONJ" hw="and" c5="CJC">and </w>
<w pos="ADV" hw="not" c5="XX0">not </w>
<w pos="ADV" hw="just" c5="AV0">just </w>
<w pos="PREP" hw="in" c5="PRP">in </w>
<w pos="SUBST" hw="europe" c5="NP0">Europe</w>
<c c5="PUN">.</c>
</s>
<s n="6">
<w pos="PRON" hw="it" c5="PNP">It </w>
<w pos="VERB" hw="can" c5="VM0">can </w>
<w pos="ADV" hw="only" c5="AV0">only </w>
<w pos="VERB" hw="subvert" c5="VVI">subvert </w>
<w pos="ART" hw="the" c5="AT0">the </w>
<w pos="ADJ" hw="young" c5="AJ0">young</w>
<c c5="PUN">.</c>
</s>
</p>
<p>
<s n="7">
<w pos="SUBST" hw="richard" c5="NP0">Richard </w>
<w pos="SUBST" hw="t." c5="NP0">T. </w>
<w pos="SUBST" hw="jones" c5="NP0">Jones</w>
</s>
</p>
<p>
<s n="8">
<w pos="SUBST" hw="southport" c5="NP0">Southport</w>
<c c5="PUN">,</c>
</s>
</p>
</div>
</wtext>
</bncDoc>
Мой код для синтаксического анализа и печати выглядит следующим образом:
# Iterate through every file in the XML folder.
# NOTE(review): the original indentation was lost; this layout assumes the
# ". " terminator is appended once per sentence and that the train/test split
# happens once per <s> element -- confirm against the real script.
# os.listdir() yields bytes names when given a bytes path, so both the
# directory and each entry are normalised to str before joining.
xml_dir = os.fsdecode(xmlFilesPath)
for file in os.listdir(xmlFilesPath):
    filename = os.fsdecode(file)
    # Parse from the directory that was actually listed instead of a
    # hard-coded 'XML Files/' prefix, so the two locations cannot diverge.
    tree = ET.parse(os.path.join(xml_dir, filename))
    root = tree.getroot()
    # Get all <s> elements in each file.
    for s in root.iter('s'):
        # Keep the stripped text of every <w> element that has text.
        sentence = [w.text.strip() for w in s.iter('w') if w.text is not None]
        sentence.append(". ")
        # random() > 0.75 sends the sentence to the test list,
        # anything else goes to the training list.
        if random() > 0.75:
            testSentences.append(" ".join(sentence))
        else:
            trainingSentences.append(" ".join(sentence))
def unigram(sentencesIn):
    """Split each sentence on whitespace and print the list of word lists.

    Characters that the console encoding cannot represent (for example
    U+2033 on a cp1252 "charmap" console) are printed as backslash escapes
    such as ``\\u2033`` instead of raising UnicodeEncodeError. Text that the
    console can encode is printed exactly as ``print(words)`` would print it.

    Args:
        sentencesIn: iterable of sentence strings.

    Returns:
        None.
    """
    import sys

    words = [x.split() for x in sentencesIn]
    # print() of a list writes its repr, so build that text explicitly.
    text = repr(words)
    # sys.stdout may be None or lack an encoding; fall back to UTF-8 then.
    encoding = getattr(sys.stdout, "encoding", None) or "utf-8"
    # Round-trip through the console encoding so that only the characters it
    # cannot represent are replaced by escapes.
    safe_text = text.encode(encoding, errors="backslashreplace").decode(encoding)
    print(safe_text)
    return
# Split the collected training sentences into words and print them.
unigram(trainingSentences)
Я пробовал кодировать текст перед печатью, но тогда перед каждым словом появляется префикс 'b' (выводятся байтовые строки). Я также пробовал записывать вывод в текстовый файл с кодировкой utf-8, и это работает, однако моя цель — вывести массив слов на экран, а не в текстовый файл.