Python: не удаётся получить ссылку на PDF-файл с помощью BeautifulSoup и requests - PullRequest
0 голосов
/ 02 мая 2020

Я пытаюсь скачать PDF-файл, расположенный по этому URL: https://www.groupe-psa.com/en/publication/monthly-world-sales-december-2019/

Инспектор элементов для кнопки загрузки показывает следующий тег:

<path d="M16.2542045,37.9409195 L74.0703409,37.9409195 L74.0703409,38.6987315 L16.2542045,38.6987315 L16.2542045,37.9409195 L16.2542045,37.9409195 Z M22.8381818,47.7883691 C22.8381818,48.6830939 22.5630682,49.3724497 21.9456818,49.9224967 C21.3286364,50.4739127 20.5056818,50.7491074 19.4768182,50.7491074 C18.5175,50.7491074 17.7627273,50.4739127 17.1453409,49.9923221 C16.3905682,49.4409061 16.0486364,48.7529195 16.0486364,47.8568255 C16.0486364,47.0305571 16.3905682,46.3422282 17.1453409,45.7911544 C17.7627273,45.3095638 18.5856818,45.0343691 19.5463637,45.0343691 C20.71125,45.0343691 21.6719318,45.4475033 22.4267045,46.3422282 L21.60375,46.7553624 C21.0545455,46.1358323 20.3693182,45.8606376 19.4768182,45.8606376 C18.7915909,45.8606376 18.2423864,45.9975503 17.7627273,46.3422282 C17.2135227,46.7553624 16.9397727,47.236953 16.9397727,47.8568255 C16.9397727,48.476698 17.2135227,48.9593154 17.625,49.3724497 C18.1060227,49.7161007 18.7234091,49.9224967 19.4768182,49.9224967 C20.6430682,49.9224967 21.3981818,49.4409061 21.7401137,48.5461812 L19.2709091,48.5461812 L19.2709091,47.7199127 L22.8381818,47.7199127 L22.8381818,47.7883691 L22.8381818,47.7883691 Z M33.1939773,50.5423691 L32.1651137,50.5423691 L30.7251136,48.5461812 L28.0503409,48.5461812 L28.0503409,50.5423691 L27.1592045,50.5423691 L27.1592045,45.2411074 L31.3421591,45.2411074 C31.8913636,45.2411074 32.3710227,45.3780201 32.6447727,45.6542416 C32.9884091,45.9975503 33.1257955,46.4106846 33.1257955,46.8936443 C33.1257955,47.3752349 32.9884091,47.7199127 32.7825,47.9951074 C32.5084091,48.2699597 32.1651137,48.4082416 31.7536364,48.476698 L33.1939773,50.5423691 L33.1939773,50.5423691 Z M31.2047727,47.7199127 C31.8913636,47.7199127 32.2332955,47.4436913 32.2332955,46.8936443 C32.2332955,46.2737718 31.8913636,45.9975503 31.1365909,45.9975503 L28.0503409,45.9975503 L28.0503409,47.7199127 L31.2047727,47.7199127 L31.2047727,47.7199127 Z M43.8930682,47.8568255 C43.8930682,48.7529195 43.5497727,49.4409061 
42.7960227,49.9923221 C42.1789773,50.4739127 41.4242045,50.7491074 40.4635227,50.7491074 C39.5723864,50.7491074 38.7494318,50.4739127 38.1320455,49.9923221 C37.4464773,49.4409061 37.035,48.7529195 37.035,47.8568255 C37.035,47.0305571 37.4464773,46.3422282 38.1320455,45.7911544 C38.8176136,45.3095638 39.5723864,45.0343691 40.4635227,45.0343691 C41.4242045,45.0343691 42.1789773,45.3095638 42.7960227,45.7911544 C43.5497727,46.3422282 43.8930682,47.0305571 43.8930682,47.8568255 L43.8930682,47.8568255 L43.8930682,47.8568255 Z M43.0019318,47.8568255 C43.0019318,47.236953 42.7268182,46.7553624 42.1789773,46.3422282 C41.6979545,45.9975503 41.1501137,45.8606376 40.4635227,45.8606376 C39.7779545,45.8606376 39.1609091,46.0673759 38.68125,46.4106846 C38.2002273,46.8238188 37.9943182,47.3067785 37.9943182,47.8568255 C37.9943182,48.476698 38.2002273,48.9593154 38.68125,49.3724497 C39.0927273,49.7161007 39.7097727,49.9224967 40.4635227,49.9224967 C41.2182955,49.9224967 41.8356818,49.7161007 42.3153409,49.3724497 C42.7268182,48.9593154 43.0019318,48.476698 43.0019318,47.8568255 L43.0019318,47.8568255 L43.0019318,47.8568255 Z M53.8373863,48.8213759 C53.8373863,49.3724497 53.5632955,49.8540403 53.0140909,50.1990604 C52.4665909,50.5423691 51.7114773,50.7491074 50.8203409,50.7491074 C49.9278409,50.7491074 49.1744318,50.5423691 48.6252273,50.1990604 C48.0763636,49.8540403 47.8022727,49.3724497 47.8022727,48.8213759 L47.8022727,45.2411074 L48.6934091,45.2411074 L48.6934091,48.6146376 C48.6934091,49.0277718 48.8311363,49.3724497 49.2426137,49.5791879 C49.5859091,49.7855839 50.13375,49.9224967 50.8203409,49.9224967 C51.5059091,49.9224967 52.0551137,49.7855839 52.3970455,49.5791879 C52.7403409,49.3724497 52.94625,49.0277718 52.94625,48.6146376 L52.94625,45.2411074 L53.8373863,45.2411074 L53.8373863,48.8213759 L53.8373863,48.8213759 Z M62.5482955,45.2411074 C63.0961363,45.2411074 63.5076136,45.3780201 63.8509091,45.6542416 C64.1942045,45.9975503 64.3305682,46.3422282 64.3305682,46.8936443 
C64.3305682,47.4436913 64.1942045,47.7883691 63.8509091,48.133047 C63.5076136,48.4082416 63.0279545,48.5461812 62.4105682,48.5461812 L59.1865909,48.5461812 L59.1865909,50.5423691 L58.3636364,50.5423691 L58.3636364,45.2411074 L62.5482955,45.2411074 L62.5482955,45.2411074 Z M62.4105682,47.7199127 C63.0961363,47.7199127 63.4394318,47.4436913 63.4394318,46.8936443 C63.4394318,46.2737718 63.0961363,45.9975503 62.3423863,45.9975503 L59.1865909,45.9975503 L59.1865909,47.7199127 L62.4105682,47.7199127 L62.4105682,47.7199127 Z M74.0703409,50.5423691 L68.5148863,50.5423691 L68.5148863,45.2411074 L73.8644318,45.2411074 L73.8644318,45.9975503 L69.4060227,45.9975503 L69.4060227,47.3752349 L73.2473863,47.3752349 L73.2473863,48.2015033 L69.4060227,48.2015033 L69.4060227,49.7855839 L74.0703409,49.7855839 L74.0703409,50.5423691 L74.0703409,50.5423691 Z M75.1673865,1.16991947 C75.0296587,0.89472483 74.7555682,0.756785235 74.4818182,0.756785235 L69.0627273,0.756785235 C68.7889773,0.756785235 68.5148863,0.89472483 68.3775,1.16991947 L58.8446591,19.280396 C58.7072727,17.6963154 57.7465909,16.1132618 56.4439773,15.1487114 C54.3180682,13.6344564 50.6147727,12.6014497 46.4301136,12.2567718 C44.0304545,12.0500336 36.9657955,11.7053557 34.7717045,10.9489127 C33.1257955,10.3290403 32.3028409,9.43294628 32.3028409,8.12508728 C32.3028409,4.54481879 39.0231818,4.26859732 41.2864773,4.26859732 C46.2245455,4.26859732 50.0655682,5.30126174 53.2895455,6.95414094 C53.6314773,7.16053691 54.0429545,7.02362416 54.2488637,6.67894631 L55.5528409,4.20014094 C55.7584091,3.78700671 55.6892045,3.30541611 55.2092045,3.02919463 C51.4377273,0.89472483 46.9793182,0 41.7675,0 C37.8579545,0 34.0851136,0.55004698 30.9988637,2.40932215 C30.3814773,2.754 29.9018182,3.16713423 29.4221591,3.58026846 C28.0503409,1.72099329 25.1696591,0.756785235 22.2211364,0.756785235 L0.822954548,0.756785235 C0.341931818,0.756785235 0,1.10146309 0,1.51459732 L0,28.7831678 C0,29.2647584 0.341931818,29.6094362 0.822954548,29.6094362 
L4.04556818,29.6094362 C4.45704545,29.6094362 4.80034091,29.2647584 4.80034091,28.7831678 L4.80034091,5.78422148 C4.80034091,5.37108725 5.14363636,5.02640939 5.55511363,5.02640939 L20.9867045,5.02640939 C24.8263636,5.02640939 26.8155682,6.61048994 26.8155682,9.84608055 C26.8155682,12.8081879 24.9640909,14.322443 21.2604545,14.322443 L10.0121591,14.322443 L7.74920452,18.5924094 L21.1922727,18.5924094 C24.6204545,18.5924094 27.2270455,17.834255 29.0106818,16.32 C29.6280682,15.7685839 30.1077273,15.1487114 30.5192045,14.4607248 C32.4392045,15.3554497 35.7310227,16.0434362 40.4635227,16.32 C45.8130682,16.6633088 49.1048864,17.0079866 50.3396591,17.3526644 C52.6721591,17.972537 53.8373863,19.1421141 53.8373863,21.0013892 C53.8373863,25.4092953 46.9793182,26.0976242 43.4134091,26.0976242 C38.5435227,26.0976242 33.7431818,25.1330738 29.69625,22.4475302 C29.2847727,22.1723356 28.9414773,22.240792 28.7355682,22.5858121 L27.2952273,25.2713557 C27.0214773,25.7529463 27.2270455,26.0976242 27.5011363,26.3040201 C32.5084091,29.5409799 38.2697727,30.2974229 43.6871591,30.2974229 C47.3226136,30.2974229 52.5344318,29.3342416 54.2488637,28.0948389 L53.8373863,28.8516242 C53.7010227,29.1278456 53.8373863,29.6094362 54.38625,29.6094362 L58.1580682,29.6094362 C58.4331818,29.6094362 58.7072727,29.402698 58.8446591,29.196302 L70.9840909,6.12787248 C71.1204545,5.85267785 71.2578409,5.71473826 71.6011363,5.71473826 L72.0126137,5.71473826 C72.3548864,5.71473826 72.4922727,5.85267785 72.63,6.12787248 L78.8706817,17.972537 C79.144773,18.5225839 78.9388635,18.8672618 78.391023,18.8672618 L68.1034091,18.8672618 L65.8401136,23.1368859 L81.133977,23.1368859 C81.407727,23.1368859 81.6831818,23.2737986 81.8195452,23.5500201 L84.6998865,29.196302 C84.8376135,29.402698 85.1113635,29.6094362 85.386477,29.6094362 L89.1582952,29.6094362 C89.7756817,29.6094362 89.7756817,29.0580201 89.6379548,28.7147114 L75.1673865,1.16991947 L75.1673865,1.16991947 Z"></path>

Предполагаю, что это связано с JavaScript. На вкладке «Сеть» (Network) видно, что выполняются 2 запроса: первый возвращает статус 302, второй — сам файл. Я попытался отправить запрос и получить cookie, но cookie не сохраняются.

import requests

# Fetch the publication page with a plain GET request.
web_request = requests.get(url='https://www.groupe-psa.com/en/publication/monthly-world-sales-december-2019/')
# Cookie jar attached to the response object returned above.
cookie = web_request.cookies
# For this page the jar prints as empty (see the output line that follows).
print(cookie)
<RequestsCookieJar[]>

Как получить файл без использования Selenium? С уважением / Спасибо!

1 Ответ

0 голосов
/ 02 мая 2020

Что касается элемента path, показанного в HTML-инспекторе: это просто директивы SVG, см. https://www.w3schools.com/graphics/svg_path.asp

Что касается структуры сайта, вы можете автоматизировать ежемесячные загрузки. Базовый вариант ниже работает на Python 3 с urllib и BeautifulSoup, без использования Selenium, как и требовалось.

import re
import urllib
import urllib.request

from bs4 import BeautifulSoup


# Month (lowercase English name) and year of the publication to fetch.
month = "march"
year = "2020"
# Address of the monthly world sales page for the period chosen above.
url = "https://www.groupe-psa.com/en/publication/monthly-world-sales-{}-{}".format(
    month, year
)


def getLinks(url):
    """Return the document links found on a publication page.

    Several links may point into the site's document space, so all of them
    are collected.

    Args:
        url: Address of the page to download and parse.

    Returns:
        A list with the ``href`` of every ``<a>`` tag whose target starts
        with ``https://www.groupe-psa.com/en/document/``, in page order.

    Raises:
        urllib.error.URLError: If the page cannot be fetched.
    """
    # Close the HTTP response as soon as the markup has been parsed.
    with urllib.request.urlopen(url) as html_page:
        soup = BeautifulSoup(html_page, "html.parser")
    # Raw string with escaped dots so "." only matches a literal dot.
    document_href = re.compile(r"^https://www\.groupe-psa\.com/en/document/")
    return [
        link.get("href")
        for link in soup.find_all("a", attrs={"href": document_href})
    ]

# Download every document linked from the publication page.
anchors = getLinks(url)
for index, anchor in enumerate(anchors, start=1):
    # The first document keeps the plain "<month><year>.pdf" name; later ones
    # get a numeric suffix so they no longer overwrite each other.
    suffix = "" if index == 1 else "_" + str(index)
    file_name = month + year + suffix + ".pdf"
    # Context managers close both the HTTP response and the output file,
    # even if the transfer fails part-way.
    with urllib.request.urlopen(anchor) as pdfFile, open(file_name, "wb") as file:
        file.write(pdfFile.read())
    print("Printing " + file_name)
...