Как извлечь текст из одного элемента XML? - PullRequest
0 голосов
/ 29 сентября 2019

Учитывая следующий XML, как я могу присвоить текст элемента переменной в Python?

<StudyFieldsResponse>
  <APIVrs>1.01.01</APIVrs>
  <DataVrs>2019:09:26 22:04:24.774</DataVrs>
  <Expression>(AREA[OverallStatus]Recruiting AND cholangiocarcinoma OR AREA[OverallStatus]\&quot;Not Yet Recruiting\&quot; AND cholangiocarcinoma) AND NOT AREA[OverallStatus]Active, not recruiting</Expression>
  <NStudiesAvail>317735</NStudiesAvail>
  <NStudiesFound>182</NStudiesFound>
  <MinRank>1</MinRank>
  <MaxRank>1000</MaxRank>
  <NStudiesReturned>182</NStudiesReturned>
  <StudyFieldsList>
    <StudyFields Rank="1"/>
    <StudyFields Rank="2"/>
    <StudyFields Rank="3"/>
    <StudyFields Rank="4"/>
    <StudyFields Rank="5"/>
    <StudyFields Rank="6"/>
    <StudyFields Rank="7"/>
    <StudyFields Rank="8"/>
    <StudyFields Rank="9"/>
    <StudyFields Rank="10"/>
    <StudyFields Rank="11"/>
    <StudyFields Rank="12"/>
    <StudyFields Rank="13"/>
    <StudyFields Rank="14"/>
    <StudyFields Rank="15"/>
    <StudyFields Rank="16"/>
    <StudyFields Rank="17"/>
    <StudyFields Rank="18"/>
    <StudyFields Rank="19"/>
    <StudyFields Rank="20"/>
    <StudyFields Rank="21"/>
    <StudyFields Rank="22"/>
    <StudyFields Rank="23"/>
    <StudyFields Rank="24"/>
    <StudyFields Rank="25"/>
    <StudyFields Rank="26"/>
    <StudyFields Rank="27"/>
    <StudyFields Rank="28"/>
    <StudyFields Rank="29"/>
    <StudyFields Rank="30"/>
    <StudyFields Rank="31"/>
    <StudyFields Rank="32"/>
    <StudyFields Rank="33"/>
    <StudyFields Rank="34"/>
    <StudyFields Rank="35"/>
    <StudyFields Rank="36"/>
    <StudyFields Rank="37"/>
    <StudyFields Rank="38"/>
    <StudyFields Rank="39"/>
    <StudyFields Rank="40"/>
    <StudyFields Rank="41"/>
    <StudyFields Rank="42"/>
    <StudyFields Rank="43"/>
    <StudyFields Rank="44"/>
    <StudyFields Rank="45"/>
    <StudyFields Rank="46"/>
    <StudyFields Rank="47"/>
    <StudyFields Rank="48"/>
    <StudyFields Rank="49"/>
    <StudyFields Rank="50"/>
    <StudyFields Rank="51"/>
    <StudyFields Rank="52"/>
    <StudyFields Rank="53"/>
    <StudyFields Rank="54"/>
    <StudyFields Rank="55"/>
    <StudyFields Rank="56"/>
    <StudyFields Rank="57"/>
    <StudyFields Rank="58"/>
    <StudyFields Rank="59"/>
    <StudyFields Rank="60"/>
    <StudyFields Rank="61"/>
    <StudyFields Rank="62"/>
    <StudyFields Rank="63"/>
    <StudyFields Rank="64"/>
    <StudyFields Rank="65"/>
    <StudyFields Rank="66"/>
    <StudyFields Rank="67"/>
    <StudyFields Rank="68"/>
    <StudyFields Rank="69"/>
    <StudyFields Rank="70"/>
    <StudyFields Rank="71"/>
    <StudyFields Rank="72"/>
    <StudyFields Rank="73"/>
    <StudyFields Rank="74"/>
    <StudyFields Rank="75"/>
    <StudyFields Rank="76"/>
    <StudyFields Rank="77"/>
    <StudyFields Rank="78"/>
    <StudyFields Rank="79"/>
    <StudyFields Rank="80"/>
    <StudyFields Rank="81"/>
    <StudyFields Rank="82"/>
    <StudyFields Rank="83"/>
    <StudyFields Rank="84"/>
    <StudyFields Rank="85"/>
    <StudyFields Rank="86"/>
    <StudyFields Rank="87"/>
    <StudyFields Rank="88"/>
    <StudyFields Rank="89"/>
    <StudyFields Rank="90"/>
    <StudyFields Rank="91"/>
    <StudyFields Rank="92"/>
    <StudyFields Rank="93"/>
    <StudyFields Rank="94"/>
    <StudyFields Rank="95"/>
    <StudyFields Rank="96"/>
    <StudyFields Rank="97"/>
    <StudyFields Rank="98"/>
    <StudyFields Rank="99"/>
    <StudyFields Rank="100"/>
    <StudyFields Rank="101"/>
    <StudyFields Rank="102"/>
    <StudyFields Rank="103"/>
    <StudyFields Rank="104"/>
    <StudyFields Rank="105"/>
    <StudyFields Rank="106"/>
    <StudyFields Rank="107"/>
    <StudyFields Rank="108"/>
    <StudyFields Rank="109"/>
    <StudyFields Rank="110"/>
    <StudyFields Rank="111"/>
    <StudyFields Rank="112"/>
    <StudyFields Rank="113"/>
    <StudyFields Rank="114"/>
    <StudyFields Rank="115"/>
    <StudyFields Rank="116"/>
    <StudyFields Rank="117"/>
    <StudyFields Rank="118"/>
    <StudyFields Rank="119"/>
    <StudyFields Rank="120"/>
    <StudyFields Rank="121"/>
    <StudyFields Rank="122"/>
    <StudyFields Rank="123"/>
    <StudyFields Rank="124"/>
    <StudyFields Rank="125"/>
    <StudyFields Rank="126"/>
    <StudyFields Rank="127"/>
    <StudyFields Rank="128"/>
    <StudyFields Rank="129"/>
    <StudyFields Rank="130"/>
    <StudyFields Rank="131"/>
    <StudyFields Rank="132"/>
    <StudyFields Rank="133"/>
    <StudyFields Rank="134"/>
    <StudyFields Rank="135"/>
    <StudyFields Rank="136"/>
    <StudyFields Rank="137"/>
    <StudyFields Rank="138"/>
    <StudyFields Rank="139"/>
    <StudyFields Rank="140"/>
    <StudyFields Rank="141"/>
    <StudyFields Rank="142"/>
    <StudyFields Rank="143"/>
    <StudyFields Rank="144"/>
    <StudyFields Rank="145"/>
    <StudyFields Rank="146"/>
    <StudyFields Rank="147"/>
    <StudyFields Rank="148"/>
    <StudyFields Rank="149"/>
    <StudyFields Rank="150"/>
    <StudyFields Rank="151"/>
    <StudyFields Rank="152"/>
    <StudyFields Rank="153"/>
    <StudyFields Rank="154"/>
    <StudyFields Rank="155"/>
    <StudyFields Rank="156"/>
    <StudyFields Rank="157"/>
    <StudyFields Rank="158"/>
    <StudyFields Rank="159"/>
    <StudyFields Rank="160"/>
    <StudyFields Rank="161"/>
    <StudyFields Rank="162"/>
    <StudyFields Rank="163"/>
    <StudyFields Rank="164"/>
    <StudyFields Rank="165"/>
    <StudyFields Rank="166"/>
    <StudyFields Rank="167"/>
    <StudyFields Rank="168"/>
    <StudyFields Rank="169"/>
    <StudyFields Rank="170"/>
    <StudyFields Rank="171"/>
    <StudyFields Rank="172"/>
    <StudyFields Rank="173"/>
    <StudyFields Rank="174"/>
    <StudyFields Rank="175"/>
    <StudyFields Rank="176"/>
    <StudyFields Rank="177"/>
    <StudyFields Rank="178"/>
    <StudyFields Rank="179"/>
    <StudyFields Rank="180"/>
    <StudyFields Rank="181"/>
    <StudyFields Rank="182"/>
  </StudyFieldsList>
</StudyFieldsResponse>

Я пытаюсь вести ежедневный журнал числа клинических испытаний, доступных для конкретного вида рака, чтобы отслеживать, как это число меняется со временем.

Я попробовал несколько библиотек XML, но не могу понять, что происходит ... Я уверен, что это очень просто для тех, кто имеет опыт... TIA

import requests

# Download the study_fields query result and keep the XML document as text
# in `response`. The timeout stops a stalled connection from hanging the
# daily logging job indefinitely.
response = requests.get('https://clinicaltrials.gov/api/query/study_fields?max_rnk=1000&expr=(AREA[OverallStatus]Recruiting+AND+cholangiocarcinoma+OR+AREA[OverallStatus]%5C%22Not+Yet+Recruiting%5C%22+AND+cholangiocarcinoma)+AND+NOT+AREA[OverallStatus]Active,+not+recruiting', timeout=30).text

Ответы [ 3 ]

0 голосов
/ 29 сентября 2019

Вы можете использовать lxml или BeautifulSoup для получения текста или атрибута, но имена тегов и атрибутов нужно писать в нижнем регистре: HTML-парсер приводит их к нижнему регистру.

lxml:

import requests
import lxml.html

# The timeout keeps the script from blocking forever on a stalled connection.
r = requests.get('https://clinicaltrials.gov/api/query/study_fields?max_rnk=1000&expr=(AREA[OverallStatus]Recruiting+AND+cholangiocarcinoma+OR+AREA[OverallStatus]%5C%22Not+Yet+Recruiting%5C%22+AND+cholangiocarcinoma)+AND+NOT+AREA[OverallStatus]Active,+not+recruiting', timeout=30)

# lxml.html parses the document with an HTML parser, which lowercases tag and
# attribute names -- hence the lowercase names in the XPath queries below.
soup = lxml.html.fromstring(r.text)

# Text of the first <NStudiesAvail> element.
print(soup.xpath('//nstudiesavail')[0].text)

# Rank attribute of every <StudyFields> element.
for item in soup.xpath('//studyfields'):
    print(item.attrib['rank'])

BeautifulSoup:

import requests
from bs4 import BeautifulSoup as BS

# The timeout keeps the script from blocking forever on a stalled connection.
r = requests.get('https://clinicaltrials.gov/api/query/study_fields?max_rnk=1000&expr=(AREA[OverallStatus]Recruiting+AND+cholangiocarcinoma+OR+AREA[OverallStatus]%5C%22Not+Yet+Recruiting%5C%22+AND+cholangiocarcinoma)+AND+NOT+AREA[OverallStatus]Active,+not+recruiting', timeout=30)

# Name the parser explicitly: without it BeautifulSoup guesses one from what
# is installed (and warns), so results can differ between machines.
# 'html.parser' ships with Python and lowercases tag and attribute names,
# which is why the lookups below use lowercase names.
soup = BS(r.text, 'html.parser')

# Text of the first <NStudiesAvail> element.
print(soup.find('nstudiesavail').text)

# Rank attribute of every <StudyFields> element.
for item in soup.find_all('studyfields'):
    print(item['rank'])
0 голосов
/ 29 сентября 2019

Ниже — вариант с использованием стандартного модуля xml.etree.ElementTree:

import xml.etree.ElementTree as ET

import requests

# The timeout keeps the script from blocking forever on a stalled connection.
response = requests.get('https://clinicaltrials.gov/api/query/study_fields?max_rnk=1000&expr=(AREA[OverallStatus]Recruiting+AND+cholangiocarcinoma+OR+AREA[OverallStatus]%5C%22Not+Yet+Recruiting%5C%22+AND+cholangiocarcinoma)+AND+NOT+AREA[OverallStatus]Active,+not+recruiting', timeout=30)
# Only parse the body when the server answered 200 OK.
if response.status_code == 200:
    root = ET.fromstring(response.text)
    # ElementTree is an XML parser, so the tag name keeps its original case.
    studies_avail_elem = root.find('.//NStudiesAvail')
    # find() returns None when no matching element exists.
    if studies_avail_elem is not None:
        studies_avail = studies_avail_elem.text
        print('Num of available studies: {}'.format(studies_avail))

Вывод:

Num of available studies: 317735
0 голосов
/ 29 сентября 2019

Вот основы создания одного элемента:

>>> from xml.etree.ElementTree import Element
>>> single_element = Element('study', rank=130)
>>> single_element = Element('study', rank=130, source='NEJM')
>>> single_element.text = 'Find cure for common cold'

Вот основы извлечения данных из одного элемента:

>>> print(single_element.tag)
study
>>> print(single_element.attrib)
{'rank': 130, 'source': 'NEJM'}
>>> print(single_element.text)
Find cure for common cold

Для обхода вложенных элементов используются методы find() и findall():

>>> from xml.etree.ElementTree import fromstring
>>> xml = fromstring(xml_text)
>>> for study_fields in xml.findall('.//StudyFields'):
        print(study_fields.get('Rank'))
        print(study_fields.text)
Добро пожаловать на сайт PullRequest, где вы можете задавать вопросы и получать ответы от других членов сообщества.
...