Как конвертировать XML-файл в HashMap, используя Apache Tika - PullRequest
0 голосов
/ 12 февраля 2020

В моем случае я могу прочитать XML-файл и разобрать его, чтобы получить содержимое, однако метаданные содержат только тип файла — "application/xml".

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;

import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.xml.XMLParser;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.SAXException;  
/**
 * Minimal Apache Tika example: parses {@code example.xml} with {@link XMLParser} and prints
 * the text collected by the content handler, followed by every metadata entry recorded
 * for the document.
 */
public class XmlParserExample {

    /**
     * Parses {@code example.xml} (resolved against the working directory) and prints its
     * extracted text and metadata to standard output.
     *
     * @param args unused
     * @throws IOException   if the file cannot be opened or read
     * @throws SAXException  propagated from {@code parser.parse}
     * @throws TikaException propagated from {@code parser.parse}
     */
    public static void main(String[] args) throws IOException, SAXException, TikaException {
        BodyContentHandler handler = new BodyContentHandler();
        XMLParser parser = new XMLParser();
        Metadata metadata = new Metadata();
        ParseContext pcontext = new ParseContext();

        // The parser consumes the stream but does not close it, so the caller must.
        // try-with-resources guarantees the file handle is released even if parsing fails.
        try (FileInputStream inputstream = new FileInputStream(new File("example.xml"))) {
            parser.parse(inputstream, handler, metadata, pcontext);
        }

        System.out.println("Contents of the document:" + handler.toString());
        System.out.println("Metadata of the document:");

        for (String name : metadata.names()) {
            System.out.println(name + ": " + metadata.get(name));
        }
    }
}

Приведенный выше фрагмент кода печатает все содержимое XML и тип содержимого (в виде метаданных). Но я также хочу получить теги XML, чтобы создать HashMap, — это требование в моем случае. Ниже приведен мой тестовый пример XML:

<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE PubmedArticleSet SYSTEM "http://dtd.nlm.nih.gov/ncbi/pubmed/out/pubmed_190101.dtd">
<PubmedArticleSet>
  <PubmedArticle>
    <MedlineCitation Status="MEDLINE" Owner="NLM">
      <PMID Version="1">27483086</PMID>
      <DateCompleted>
        <Year>2018</Year>
        <Month>05</Month>
        <Day>02</Day>
      </DateCompleted>
      <DateRevised>
        <Year>2018</Year>
        <Month>05</Month>
        <Day>02</Day>
      </DateRevised>
      <Article PubModel="Print-Electronic">
        <Journal>
          <ISSN IssnType="Electronic">1532-849X</ISSN>
          <JournalIssue CitedMedium="Internet">
            <Volume>26</Volume>
            <Issue>4</Issue>
            <PubDate>
              <Year>2017</Year>
              <Month>Jun</Month>
            </PubDate>
          </JournalIssue>
          <Title>Journal of prosthodontics : official journal of the American College of Prosthodontists</Title>
          <ISOAbbreviation>J Prosthodont</ISOAbbreviation>
        </Journal>
        <ArticleTitle>The Use of CADCAM Technology for Fabricating Cast Gold Survey Crowns under Existing Partial Removable Dental Prosthesis. A Clinical Report.</ArticleTitle>
        <Pagination>
          <MedlinePgn>321-326</MedlinePgn>
        </Pagination>
        <ELocationID EIdType="doi" ValidYN="Y">10.1111jopr.12525</ELocationID>
        <Abstract>
          <AbstractText>The fabrication of a survey crown under an existing partial removable dental prosthesis (PRDP) has always been a challenge to many dental practitioners. This clinical report presents a technique for fabricating accurate cast gold survey crowns to fit existing PRDPs using CAD/CAM technology. The report describes a technique that would digitally scan the coronal anatomy of a cast gold survey crown and an abutment tooth under existing PRDPs planned for restoration, prior to any preparation. The information is stored in the digital software where all the coronal anatomical details are preserved without any modifications. The scanned designs are then applied to the scanned teeth preparations, sent to the milling machine and milled into full-contour clear acrylic resin burn-out patterns. The acrylic resin patterns are tried in the patient's mouth the same day to verify the full seating of the PRDP components. The patterns are then invested and cast into gold crowns and cemented in the conventional manner.</AbstractText>
          <CopyrightInformation>© 2016 by the American College of Prosthodontists.</CopyrightInformation>
        </Abstract>
        <AuthorList CompleteYN="Y">
          <Author ValidYN="Y">
            <LastName>El Kerdani</LastName>
            <ForeName>Tarek</ForeName>
            <Initials>T</Initials>
            <AffiliationInfo>
              <Affiliation>Department of Restorative Dental Sciences, Division of Prosthodontics, University of Florida College of Dentistry, Gainesville, FL.</Affiliation>
            </AffiliationInfo>
          </Author>
          <Author ValidYN="Y">
            <LastName>Roushdy</LastName>
            <ForeName>Sally</ForeName>
            <Initials>S</Initials>
            <AffiliationInfo>
              <Affiliation>Department of Restorative Dental Sciences, Division of Prosthodontics, University of Florida College of Dentistry, Gainesville, FL.</Affiliation>
            </AffiliationInfo>
          </Author>
        </AuthorList>
        <Language>eng</Language>
        <PublicationTypeList>
          <PublicationType UI="D002363">Case Reports</PublicationType>
          <PublicationType UI="D016428">Journal Article</PublicationType>
        </PublicationTypeList>
        <ArticleDate DateType="Electronic">
          <Year>2016</Year>
          <Month>08</Month>
          <Day>02</Day>
        </ArticleDate>
      </Article>
      <MedlineJournalInfo>
        <Country>United States</Country>
        <MedlineTA>J Prosthodont</MedlineTA>
        <NlmUniqueID>9301275</NlmUniqueID>
        <ISSNLinking>1059-941X</ISSNLinking>
      </MedlineJournalInfo>
      <ChemicalList>
        <Chemical>
          <RegistryNumber>7440-57-5</RegistryNumber>
          <NameOfSubstance UI="D006046">Gold</NameOfSubstance>
        </Chemical>
      </ChemicalList>
      <CitationSubset>D</CitationSubset>
      <MeshHeadingList>
        <MeshHeading>
          <DescriptorName UI="D000368" MajorTopicYN="N">Aged</DescriptorName>
        </MeshHeading>
        <MeshHeading>
          <DescriptorName UI="D017076" MajorTopicYN="Y">Computer-Aided Design</DescriptorName>
        </MeshHeading>
        <MeshHeading>
          <DescriptorName UI="D003442" MajorTopicYN="Y">Crowns</DescriptorName>
        </MeshHeading>
        <MeshHeading>
          <DescriptorName UI="D000044" MajorTopicYN="N">Dental Abutments</DescriptorName>
        </MeshHeading>
        <MeshHeading>
          <DescriptorName UI="D017267" MajorTopicYN="Y">Dental Prosthesis Design</DescriptorName>
        </MeshHeading>
        <MeshHeading>
          <DescriptorName UI="D003832" MajorTopicYN="Y">Denture, Partial, Removable</DescriptorName>
        </MeshHeading>
        <MeshHeading>
          <DescriptorName UI="D006046" MajorTopicYN="N">Gold</DescriptorName>
        </MeshHeading>
        <MeshHeading>
          <DescriptorName UI="D006801" MajorTopicYN="N">Humans</DescriptorName>
        </MeshHeading>
        <MeshHeading>
          <DescriptorName UI="D008297" MajorTopicYN="N">Male</DescriptorName>
        </MeshHeading>
      </MeshHeadingList>
      <KeywordList Owner="NOTNLM">
        <Keyword MajorTopicYN="N">CADM</Keyword>
        <Keyword MajorTopicYN="N">cast gold</Keyword>
        <Keyword MajorTopicYN="N">milled acrylic resin patterns</Keyword>
      </KeywordList>
    </MedlineCitation>
    <PubmedData>
      <History>
        <PubMedPubDate PubStatus="accepted">
          <Year>2016</Year>
          <Month>06</Month>
          <Day>13</Day>
        </PubMedPubDate>
        <PubMedPubDate PubStatus="pubmed">
          <Year>2016</Year>
          <Month>8</Month>
          <Day>3</Day>
          <Hour>6</Hour>
          <Minute>0</Minute>
        </PubMedPubDate>
        <PubMedPubDate PubStatus="medline">
          <Year>2018</Year>
          <Month>5</Month>
          <Day>3</Day>
          <Hour>6</Hour>
          <Minute>0</Minute>
        </PubMedPubDate>
        <PubMedPubDate PubStatus="entrez">
          <Year>2016</Year>
          <Month>8</Month>
          <Day>3</Day>
          <Hour>6</Hour>
          <Minute>0</Minute>
        </PubMedPubDate>
      </History>
      <PublicationStatus>ppublish</PublicationStatus>
      <ArticleIdList>
        <ArticleId IdType="pubmed">27483086</ArticleId>
        <ArticleId IdType="doi">10.111pr.12525</ArticleId>
      </ArticleIdList>
    </PubmedData>
  </PubmedArticle>
  <PubmedArticle>
    <MedlineCitation Status="PubMed-not-MEDLINE" Owner="NLM">
      <PMID Version="1">27483087</PMID>
      <DateCompleted>
        <Year>2018</Year>
        <Month>08</Month>
        <Day>07</Day>
      </DateCompleted>
      <DateRevised>
        <Year>2018</Year>
        <Month>08</Month>
        <Day>07</Day>
      </DateRevised>
      <Article PubModel="Print-Electronic">
        <Journal>
          <ISSN IssnType="Electronic">2326-5205</ISSN>
          <JournalIssue CitedMedium="Internet">
            <Volume>68</Volume>
            <Issue>11</Issue>
            <PubDate>
              <Year>2016</Year>
              <Month>11</Month>
            </PubDate>
          </JournalIssue>
          <Title>Arthritis &amp; rheumatology (Hoboken, N.J.)</Title>
        </Journal>
        <ArticleTitle>Reply.</ArticleTitle>
        <Pagination>
          <MedlinePgn>2826-2827</MedlinePgn>
        </Pagination>
        <ELocationID EIdType="doi" ValidYN="Y">10t.39831</ELocationID>
        <AuthorList CompleteYN="Y">
          <Author ValidYN="Y">
            <LastName>Hitchon</LastName>
            <ForeName>Carol Ann</ForeName>
            <Initials>CA</Initials>
            <AffiliationInfo>
              <Affiliation>University of Manitoba, Winnipeg, Manitoba, Canada.</Affiliation>
            </AffiliationInfo>
          </Author>
          <Author ValidYN="Y">
            <LastName>Koppejan</LastName>
            <ForeName>Hester</ForeName>
            <Initials>H</Initials>
            <AffiliationInfo>
              <Affiliation>Leiden University Medical Center, Leiden, The Netherlands.</Affiliation>
            </AffiliationInfo>
          </Author>
          <Author ValidYN="Y">
            <LastName>Trouw</LastName>
            <ForeName>Leendert A</ForeName>
            <Initials>LA</Initials>
            <AffiliationInfo>
              <Affiliation>Leiden University Medical Center, Leiden, The Netherlands.</Affiliation>
            </AffiliationInfo>
          </Author>
          <Author ValidYN="Y">
            <LastName>Huizinga</LastName>
            <ForeName>Tom J W</ForeName>
            <Initials>TJ</Initials>
            <AffiliationInfo>
              <Affiliation>Leiden University Medical Center, Leiden, The Netherlands.</Affiliation>
            </AffiliationInfo>
          </Author>
          <Author ValidYN="Y">
            <LastName>Toes</LastName>
            <ForeName>René E M</ForeName>
            <Initials>RE</Initials>
            <AffiliationInfo>
              <Affiliation>Leiden University Medical Center, Leiden, The Netherlands.</Affiliation>
            </AffiliationInfo>
          </Author>
          <Author ValidYN="Y">
            <LastName>El-Gabalawy</LastName>
            <ForeName>Hani S</ForeName>
            <Initials>HS</Initials>
            <AffiliationInfo>
              <Affiliation>University of Manitoba, Winnipeg, Manitoba, Canada.</Affiliation>
            </AffiliationInfo>
          </Author>
        </AuthorList>
        <Language>eng</Language>
        <GrantList CompleteYN="Y">
          <Grant>
            <GrantID>MOP‐77700</GrantID>
            <Agency>CIHR</Agency>
            <Country>Canada</Country>
          </Grant>
        </GrantList>
        <PublicationTypeList>
          <PublicationType UI="D016422">Letter</PublicationType>
          <PublicationType UI="D013485">Research Sup</PublicationType>
          <PublicationType UI="D016420">Comment</PublicationType>
        </PublicationTypeList>
        <ArticleDate DateType="Electronic">
          <Year>2016</Year>
          <Month>10</Month>
          <Day>09</Day>
        </ArticleDate>
      </Article>
      <MedlineJournalInfo>
        <Country>United States</Country>
        <MedlineTA>Arthritis Rheumatol</MedlineTA>
        <NlmUniqueID>101623795</NlmUniqueID>
        <ISSNLinking>2326-5191</ISSNLinking>
      </MedlineJournalInfo>
      <CommentsCorrectionsList>
        <CommentsCorrections RefType="CommentOn">
          <RefSource>dff</RefSource>
          <PMID Version="1">27483211</PMID>
        </CommentsCorrections>
        <CommentsCorrections RefType="CommentOn">
          <RefSource>Arthritis Rheumato</RefSource>
          <PMID Version="1">26946484</PMID>
        </CommentsCorrections>
      </CommentsCorrectionsList>
    </MedlineCitation>
    <PubmedData>
      <History>
        <PubMedPubDate PubStatus="received">
          <Year>2016</Year>
          <Month>07</Month>
          <Day>26</Day>
        </PubMedPubDate>
        <PubMedPubDate PubStatus="accepted">
          <Year>2016</Year>
          <Month>07</Month>
          <Day>28</Day>
        </PubMedPubDate>
        <PubMedPubDate PubStatus="pubmed">
          <Year>2016</Year>
          <Month>10</Month>
          <Day>28</Day>
          <Hour>6</Hour>
          <Minute>0</Minute>
        </PubMedPubDate>
        <PubMedPubDate PubStatus="medline">
          <Year>2016</Year>
          <Month>10</Month>
          <Day>28</Day>
          <Hour>6</Hour>
          <Minute>1</Minute>
        </PubMedPubDate>
        <PubMedPubDate PubStatus="entrez">
          <Year>2016</Year>
          <Month>8</Month>
          <Day>3</Day>
          <Hour>6</Hour>
          <Minute>0</Minute>
        </PubMedPubDate>
      </History>
      <PublicationStatus>ppublish</PublicationStatus>
      <ArticleIdList>
        <ArticleId IdType="pubmed">27483087</ArticleId>
        <ArticleId IdType="doi">efre</ArticleId>
      </ArticleIdList>
    </PubmedData>
  </PubmedArticle>
</PubmedArticleSet>

Пожалуйста, помогите мне с этим. Спасибо

1 Ответ

1 голос
/ 17 марта 2020

Мое предложение: если вы хотите прочитать файл XML, а затем проанализировать его содержимое, вам, вероятно, лучше использовать специализированный синтаксический анализатор XML, а не Tika.

Есть различные варианты - каждый со своими плюсами и минусами (например, скорость, потребление памяти).

Вот один из подходов - он считывает весь файл в память, но вы уже делаете это с вашим подходом Tika, поэтому я предполагаю, что размер файла не является проблемой.

Код предполагает, что есть файл с именем pubmed.xml, содержащий XML, представленный в вопросе.

Он читает XML из файла и обрабатывает каждый элемент как узел DOM.

import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.DocumentBuilder;
import org.w3c.dom.Document;
import org.w3c.dom.NodeList;
import org.w3c.dom.Node;
import org.w3c.dom.Element;
import java.io.File;

...

/**
 * Parses {@code C:/tmp/pubmed.xml} into a DOM tree and prints, for every {@code <Article>}
 * element, a blank line, the article title, and one line per {@code <Author>}.
 *
 * <p>The parser is hardened before use: external DTDs and external entities are never
 * loaded. The document's {@code <!DOCTYPE ... SYSTEM "http://...">} declaration therefore
 * does not trigger a network fetch, and the parser is not exposed to XXE. The DOCTYPE
 * declaration itself is still accepted, so the sample document parses.
 *
 * <p>Missing {@code ArticleTitle}, {@code ForeName} or {@code LastName} elements are treated
 * as empty text instead of raising a {@link NullPointerException}.
 *
 * <p>Any failure (configuration, I/O, malformed XML) is reported on standard error; nothing
 * is thrown to the caller.
 */
public void parseUsingDom() {
    try {
        File xmlFile = new File("C:/tmp/pubmed.xml");

        DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
        dbFactory.setFeature(javax.xml.XMLConstants.FEATURE_SECURE_PROCESSING, true);
        dbFactory.setFeature("http://xml.org/sax/features/external-general-entities", false);
        dbFactory.setFeature("http://xml.org/sax/features/external-parameter-entities", false);
        // Without this the non-validating parser still downloads the DTD named in the DOCTYPE.
        dbFactory.setFeature(
                "http://apache.org/xml/features/nonvalidating/load-external-dtd", false);

        DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
        Document doc = dBuilder.parse(xmlFile);
        doc.getDocumentElement().normalize();

        NodeList articles = doc.getElementsByTagName("Article");
        for (int i = 0; i < articles.getLength(); i++) {
            // getElementsByTagName only ever returns element nodes, so the cast is safe.
            Element articleElement = (Element) articles.item(i);

            System.out.println("");
            System.out.println("Title  : " + firstText(articleElement, "ArticleTitle"));

            NodeList authors = articleElement.getElementsByTagName("Author");
            for (int j = 0; j < authors.getLength(); j++) {
                Element authorElement = (Element) authors.item(j);

                String foreName = firstText(authorElement, "ForeName");
                String lastName = firstText(authorElement, "LastName");
                // Avoid a dangling ", " when the author entry carries no fore name.
                String displayName = foreName.isEmpty()
                        ? lastName
                        : lastName + ", " + foreName;
                System.out.println("Author : " + displayName);
            }
        }
    } catch (Exception e) {
        // Top-level boundary of the demo: report the failure and return normally.
        System.err.println(e);
    }
}

/**
 * Returns the text content of the first descendant of {@code parent} named {@code tagName},
 * or an empty string when no such element exists.
 */
private String firstText(Element parent, String tagName) {
    Node first = parent.getElementsByTagName(tagName).item(0);
    return first == null ? "" : first.getTextContent();
}

Программа выводит следующий результат — как демонстрацию того, что возможно:

Title  : The Use of CADCAM Technology for Fabricating Cast Gold Survey Crowns under Existing Partial Removable Dental Prosthesis. A Clinical Report.
Author : El Kerdani, Tarek
Author : Roushdy, Sally

Title  : Reply.
Author : Hitchon, Carol Ann
Author : Koppejan, Hester
Author : Trouw, Leendert A
Author : Huizinga, Tom J W
Author : Toes, René E M
Author : El-Gabalawy, Hani S

В вашем случае вы, конечно, сохраняли бы соответствующие значения в хеш-карту (HashMap).

...