I want to scrape the URLs of all the titles using Python
0 votes
16 March 2020

I wrote code to collect the URLs of all the titles, but I have a problem: it prints None values. Could you help me?

Here is my code:

import requests
from bs4 import BeautifulSoup
import csv

def get_page(url):
    response = requests.get(url)
    if not response.ok:
        print('server responded:', response.status_code)
        return None
    soup = BeautifulSoup(response.text, 'html.parser')  # parse the page HTML
    return soup

def get_index_data(soup):
    try:
        titles_link = soup.find_all('div',class_="marginTopTextAdjuster")
    except:
        titles_link = []
    urls = [item.get('href') for item in titles_link]
    print(urls)

def main():
    #url = "http://cgsc.cdmhost.com/cdm/singleitem/collection/p4013coll8/id/2653/rec/1"
    mainurl = "http://cgsc.cdmhost.com/cdm/search/collection/p4013coll8/searchterm/1/field/all/mode/all/conn/and/order/nosort/page/1"
    #get_page(url)
    get_index_data(get_page(mainurl))
    #write_csv(data,url)


if __name__ == '__main__':
    main()

Answers [4]

1 vote
16 March 2020

You are trying to read the href attribute of a div tag. Instead, select all the a tags; they appear to share the class attribute body_link_11.

Use titles_link = soup.find_all('a', class_="body_link_11") instead of titles_link = soup.find_all('div', class_="marginTopTextAdjuster").
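
Applied to the script in the question, a minimal sketch of get_index_data using that selector (and filtering out anchors without an href, so no None values show up) could look like this:

def get_index_data(soup):
    if soup is None:
        return []
    # the title links are <a> tags with class "body_link_11"
    titles_link = soup.find_all('a', class_="body_link_11")
    # keep only anchors that actually carry an href
    urls = [item.get('href') for item in titles_link if item.get('href')]
    print(urls)
    return urls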

0 votes
16 March 2020
url = "http://cgsc.cdmhost.com/cdm/search/collection/p4013coll8/searchterm/1/field/all/mode/all/conn/and/order/nosort/page/1"
req = requests.get(url) 
soup = BeautifulSoup(req.text, "lxml") 

titles_link = []
titles_div = soup.find_all('div', attrs={'class': 'marginTopTextAdjuster'})
for link in titles_div:
    tag = link.find_all('a', href=True)
    try:
        if tag[0].attrs.get('item_id', None):
            titles_link.append({tag[0].text: tag[0].attrs.get('href', None)})
    except IndexError:
        continue

print(titles_link)

Output:

[{'Civil Affairs Handbook, Japan, section 1a: population statistics.': '/cdm/singleitem/collection/p4013coll8/id/2653/rec/1'}, {'Army Air Forces Program 1943.': '/cdm/singleitem/collection/p4013coll8/id/2385/rec/2'}, {'Casualty report number II.': '/cdm/singleitem/collection/p4013coll8/id/3309/rec/3'}, {'Light armored division, proposed March 1943.': '/cdm/singleitem/collection/p4013coll8/id/2425/rec/4'}, {'Tentative troop list by type units for Blacklist operations.': '/cdm/singleitem/collection/p4013coll8/id/150/rec/5'}, {'Chemical Warfare Service: history of training, part 2, schooling of commissioned officers.': '/cdm/compoundobject/collection/p4013coll8/id/2501/rec/6'}, {'Horses in the German Army (1941-1945).': '/cdm/compoundobject/collection/p4013coll8/id/2495/rec/7'}, {'Unit history: 38 (MECZ) cavalry rcn. sq.': '/cdm/singleitem/collection/p4013coll8/id/3672/rec/8'}, {'Operations in France: December 1944, 714th Tank Battalion.': '/cdm/singleitem/collection/p4013coll8/id/3407/rec/9'}, {'G-3 Reports : Third Infantry Division. (22 Jan- 30 Mar 44)': '/cdm/singleitem/collection/p4013coll8/id/4393/rec/10'}, {'Summary of operations, 1 July thru 31 July 1944.': '/cdm/singleitem/collection/p4013coll8/id/3445/rec/11'}, {'After action report 36th Armored Infantry Regiment, 3rd Armored Division, Nov 1944 thru April 1945.': '/cdm/singleitem/collection/p4013coll8/id/3668/rec/12'}, {'Unit history, 38th Mechanized Cavalry Reconnaissance Squadron, 9604 thru 9665.': '/cdm/singleitem/collection/p4013coll8/id/3703/rec/13'}, {'Redeployment: occupation forces in Europe series, 1945-1946.': '/cdm/singleitem/collection/p4013coll8/id/2952/rec/14'}, {'Twelfth US Army group directives. Annex no. 1.': '/cdm/singleitem/collection/p4013coll8/id/2898/rec/15'}, {'After action report, 749th Tank Battalion: Jan, Feb, Apr - 8 May 45.': '/cdm/singleitem/collection/p4013coll8/id/3502/rec/16'}, {'743rd Tank Battalion, S3 journal history.': '/cdm/singleitem/collection/p4013coll8/id/3553/rec/17'}, {'History of military training, WAAC / WAC training.': '/cdm/singleitem/collection/p4013coll8/id/4052/rec/18'}, {'After action report, 756th Tank Battalion.': '/cdm/singleitem/collection/p4013coll8/id/3440/rec/19'}, {'After action report 92nd Cavalry Recon Squadron Mechanized 12th Armored Division, Jan thru May 45.': '/cdm/singleitem/collection/p4013coll8/id/3583/rec/20'}]
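
The href values in that output are relative paths. If absolute URLs are needed, one possible follow-up (a sketch using urllib.parse.urljoin and the site's base URL) is:

from urllib.parse import urljoin

base = "http://cgsc.cdmhost.com"
# turn each {title: relative_href} entry into {title: absolute_url}
absolute = [{title: urljoin(base, href)} for item in titles_link for title, href in item.items()]
print(absolute)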
0 votes
16 March 2020

Try this.

from simplified_scrapy import SimplifiedDoc,req,utils
url = 'http://cgsc.cdmhost.com/cdm/search/collection/p4013coll8/searchterm/1/field/all/mode/all/conn/and/order/nosort/page/1'
html = req.get(url)
doc  = SimplifiedDoc(html)
lst = doc.selects('div.marginTopTextAdjuster').select('a')
titles_link = [(utils.absoluteUrl(url,a.href),a.text) for a in lst if a]
print (titles_link)

Result:

[('http://cgsc.cdmhost.com/cdm/singleitem/collection/p4013coll8/id/2653/rec/1', 'Civil Affairs Handbook, Japan, section 1a: population statistics.'), ('http://cgsc.cdmhost.com/cdm/landingpage/collection/p4013coll8', 'World War II Operational Documents'), ('http://cgsc.cdmhost.com/cdm/singleitem/collection/p4013coll8/id/2385/rec/2', 'Army Air Forces Program 1943.'),...
0 votes
16 March 2020

A simple way to do this with requests and BeautifulSoup:

import requests
from bs4 import BeautifulSoup

url = "http://cgsc.cdmhost.com/cdm/search/collection/p4013coll8/searchterm/1/field/all/mode/all/conn/and/order/nosort/page/1"  # the page you want to scrape
req = requests.get(url)
soup = BeautifulSoup(req.text, "html.parser")  # req.text is the complete HTML of the page

print(soup.title.string)  # soup.title includes the <title> tags; .string strips them
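
Note that soup.title.string only returns the page's own title. To get the document title links the question asks about, the same soup could be combined with the selector from the first answer, for example (a sketch, assuming the body_link_11 class applies here):

for a in soup.find_all('a', class_="body_link_11"):
    if a.get('href'):
        print(a.text.strip(), a['href'])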