Я работаю с XML, который может иметь несколько дочерних элементов для каждого родительского узла. В качестве части вывода я хочу дублировать теги из разных частей документа.
Пример кода
import csv
import time
import pprint
import StringIO
from lxml import etree
# Handle Namespaces
ns = {'x': 'urn:hl7-org:v3', 'sdtc': 'urn:hl7-org:sdtc', 'xsi': 'http://www.w3.org/2001/XMLSchema-instance'}
# Input XML
input_xml = \
'<Document xmlns="urn:hl7-org:v3" xmlns:sdtc="urn:hl7-org:sdtc" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" classCode="DOCCLIN" moodCode="EVN"> \
<code code="34133-9" displayName="Summarization of Episode Note"/> \
<recordTarget contextControlCode="OP" typeCode="RCT"> \
<patientRole classCode="PAT"> \
<id assigningAuthorityName="MRN" extension="MRN123456"/> \
<id assigningAuthorityName="SSN" extension="SSN123456"/> \
</patientRole> \
</recordTarget> \
<component typeCode="COMP"> \
<component contextConductionInd="true" typeCode="COMP"> \
<section> \
<code code="11348-0" displayName="HISTORY OF PAST ILLNESS"/> \
<entry> \
<observation classCode="OBS" moodCode="EVN"> \
<value code="401.9" displayName="Hypertension, unspecified" xsi:type="CD" /> \
</observation> \
</entry> \
<entry> \
<observation classCode="OBS" moodCode="EVN"> \
<value code="250.00" displayName="Diabetes Mellitus" xsi:type="CD" /> \
</observation> \
</entry> \
</section> \
</component> \
<component contextConductionInd="true" typeCode="COMP"> \
<section> \
<code code="10160-0" codeSystem="2.16.840.1.113883.6.1"/> \
<entry typeCode="DRIV"> \
<substanceAdministration classCode="SBADM" moodCode="INT"> \
<consumable> \
<manufacturedMaterial> \
<code code="636676" displayName="Chantix Continuing Month Box 1 mg tablet"/> \
</manufacturedMaterial> \
</consumable> \
</substanceAdministration> \
</entry> \
<entry typeCode="DRIV"> \
<substanceAdministration classCode="SBADM" moodCode="INT"> \
<consumable> \
<manufacturedMaterial> \
<code code="1651272" displayName="Stiolto Respimat 2.5 mcg"/> \
</manufacturedMaterial> \
</consumable> \
</substanceAdministration> \
</entry> \
</section> \
</component> \
</component> \
</Document> '
dict_header = {
"id_type": "",
"id": "",
"section_code": "",
"section_name": "",
"code_code": "",
"code_name": ""
}
# Parse XML; get tree and root
tree = etree.parse(StringIO.StringIO(input_xml))
root = tree.getroot()
# --------------------------------------
# Document -> recordTarget
# --------------------------------------
record_target = tree.xpath("//x:Document/x:recordTarget", namespaces=ns)
for record in record_target:
ids = record.xpath(".//x:id", namespaces=ns)
for id in ids:
dict_header["id_type"] = id.xpath(".//@assigningAuthorityName", namespaces=ns)
dict_header["id"] = id.xpath(".//@extension", namespaces=ns)
# --------------------------------------
# Document -> Component
# --------------------------------------
document_components = tree.xpath("//x:Document/x:component/x:component", namespaces=ns)
for component in document_components:
# Component -> Section
# --------------------------------------
component_sections = component.xpath(".//x:section", namespaces=ns)
for section in component_sections:
dict_header["section_code"] = section.xpath("./x:code/@code", namespaces=ns)
dict_header["section_name"] = section.xpath("./x:code/@displayName", namespaces=ns)
# Section -> Entries
# --------------------------------------
section_entries = section.xpath(".//x:entry", namespaces=ns)
for entry in section_entries:
dict_header["code_code"] = entry.xpath(".//x:code/@code | .//x:value/@code", namespaces=ns)
dict_header["code_name"] = entry.xpath(".//x:code/@displayName | .//x:value/@displayName", namespaces=ns)
# Test Printing
pprint.pprint(dict_header)
Ожидаемый результат
id_type|id|section_code|section_name|code_code|code_name
SSN|SSN123456|11348-0|HISTORY OF PAST ILLNESS|401.9|Hypertension, unspecified
SSN|SSN123456|11348-0|HISTORY OF PAST ILLNESS|250.00|Diabetes Mellitus
SSN|SSN123456|10160-0||636676|Chantix Continuing Month Box 1 mg tablet
SSN|SSN123456|10160-0||1651272|Stiolto Respimat 2.5 mcg
MRN|MRN123456|11348-0|HISTORY OF PAST ILLNESS|401.9|Hypertension, unspecified
MRN|MRN123457|11348-0|HISTORY OF PAST ILLNESS|250.00|Diabetes Mellitus
MRN|MRN123458|10160-0||636676|Chantix Continuing Month Box 1 mg tablet
MRN|MRN123459|10160-0||1651272|Stiolto Respimat 2.5 mcg
Я могу продублировать строки для последних дочерних узлов, по которым я перебираю, но мне нужна помощь в создании вида перекрестного соединения / вложенного цикла для дублирования строк, когда в любом узле есть несколько дочерних узлов.