Beautiful Soup 4: как найти все теги input внутри div с конкретным ID - PullRequest
0 голосов
/ 05 апреля 2020

Я пытаюсь извлечь значение выделенного тега input. Сейчас я могу получить элемент <div id="WopiDocWACContainer" style="overflow: hidden;">, запустив код ниже:

import requests
from bs4 import BeautifulSoup
# Download the shared document page. The timeout keeps the script from
# hanging indefinitely, and raise_for_status() aborts on an HTTP error
# instead of silently parsing an error page.
Page = requests.get('https://editproj.sharepoint.com/:x:/g/Ea32XJl_g9VBreFAia_zMmEBY6FW2ZWh8F4VeJ1Rt5Z4YA?e=rpUKYv', timeout=30)
Page.raise_for_status()

# Parse the HTML exactly as the server returned it.
soup = BeautifulSoup(Page.text, 'html.parser')

# find_all() returns a list-like ResultSet (possibly empty), so code that
# uses `fromtag` afterwards has to iterate over it.
fromtag = soup.find_all("div", {"id" : "WopiDocWACContainer"})
print(fromtag)

enter image description here

Я попробовал код ниже, чтобы сузить поиск и получить все теги input, но получить их не удаётся:

# `fromtag` is the ResultSet produced by find_all(), so the search has to run
# on each matched element (the loop variable); calling .find() on the
# ResultSet itself raises AttributeError.
for tag in fromtag:
    # find_all() returns every hidden <input> inside this element, or an
    # empty list when there are none, so the inner loop is always safe.
    for inputtag in tag.find_all("input", {"type" : "hidden"}):
        # An <input> element has no text content; its data is stored in the
        # "value" attribute. .get() yields None if the attribute is missing.
        print (inputtag.get("value"))

Как извлечь значение тега input, выделенного на картинке выше?

1 Ответ

1 голос
/ 05 апреля 2020

Страница формируется с помощью JavaScript, а модуль requests не умеет выполнять JS.

В вашем случае нужные данные на самом деле находятся внутри HTML-тега script, поэтому я просто извлёк их с помощью регулярного выражения.

import requests
import re

# Fetch the page; fail fast on a stalled connection or an HTTP error.
r = requests.get(
    "https://editproj.sharepoint.com/:x:/g/Ea32XJl_g9VBreFAia_zMmEBY6FW2ZWh8F4VeJ1Rt5Z4YA?rtime=CJOEsjTZ10g",
    timeout=30)
r.raise_for_status()

# Capture the value that follows the "FileGetUrl" key, up to the next double
# quote. The captured text is taken verbatim from the page source, so JSON
# escapes such as \u0026 are NOT decoded here.
match = re.search(r'FileGetUrl":"(.*?)"', r.text)

# re.search() returns None when the key is absent; report that explicitly
# instead of crashing with AttributeError on match.group().
if match is None:
    raise SystemExit("FileGetUrl was not found in the response")

print(match.group(1))

Вывод:

https://editproj.sharepoint.com/_layouts/15/download.aspx?UniqueId=995cf6ad-837f-41d5-ade1-4089aff33261\u0026Translate=false\u0026tempauth=eyJ0eXAiOiJKV1QiLCJhbGciOiJub25lIn0.eyJhdWQiOiIwMDAwMDAwMy0wMDAwLTBmZjEtY2UwMC0wMDAwMDAwMDAwMDAvZWRpdHByb2ouc2hhcmVwb2ludC5jb21AZDJjZTI4MGQtYWExMi00ODQxLWFjYjMtOWYxZDNlMDYzYjhkIiwiaXNzIjoiMDAwMDAwMDMtMDAwMC0wZmYxLWNlMDAtMDAwMDAwMDAwMDAwIiwibmJmIjoiMTU4NjA3MzQwNCIsImV4cCI6IjE1ODYxMDk0MDQiLCJlbmRwb2ludHVybCI6Imtsc0lNb1NtVDQyejBXY085ZGQ2bHovUUJ3ZUVuZzZRd0MxcmdkTGxsVEU9IiwiZW5kcG9pbnR1cmxMZW5ndGgiOiIxMTkiLCJpc2xvb3BiYWNrIjoiVHJ1ZSIsImNpZCI6Ik5EVmlPRFExT1dZdE9UQmhNaTFoTURBd0xXTmlOemN0TTJSaE4yVTBZMlF6WWpWaiIsInZlciI6Imhhc2hlZHByb29mdG9rZW4iLCJzaXRlaWQiOiJNVFEwWmpsbU0yRXRNakV5TnkwME16RmhMV0ppTmpBdE1EY3dNbUV5TXpnNVpqQTMiLCJuYW1laWQiOiIwIy5mfG1lbWJlcnNoaXB8dXJuJTNhc3BvJTNhYW5vbiNkNWI2NDYyODQwYjk1MTVlNzcwYWE4MTViNDljNjNiZjk2OWY4MmQwNTdmMDhhZTljYjMwNjQwNTQ5YmMzYmQ2IiwibmlpIjoibWljcm9zb2Z0LnNoYXJlcG9pbnQiLCJpc3VzZXIiOiJ0cnVlIiwiY2FjaGVrZXkiOiIwaC5mfG1lbWJlcnNoaXB8dXJuJTNhc3BvJTNhYW5vbiNkNWI2NDYyODQwYjk1MTVlNzcwYWE4MTViNDljNjNiZjk2OWY4MmQwNTdmMDhhZTljYjMwNjQwNTQ5YmMzYmQ2Iiwic2hhcmluZ2lkIjoiNGtZU0VmaDFaMGlObWM3NnV1bkl6dyIsInR0IjoiMCIsInVzZVBlcnNpc3RlbnRDb29raWUiOiIyIn0.R3NjZWhxKzZobmI2bVhSK1JvZzNqUFl0QUw4SDhiTHlETkdQUWQ5MFZjVT0

Чтобы получить все данные целиком:

import requests
import re
import json

# Fetch the page; fail fast on a stalled connection or an HTTP error.
r = requests.get(
    "https://editproj.sharepoint.com/:x:/g/Ea32XJl_g9VBreFAia_zMmEBY6FW2ZWh8F4VeJ1Rt5Z4YA?rtime=CJOEsjTZ10g",
    timeout=30)
r.raise_for_status()

# Locate the assignment of the embedded JSON object. The lookahead checks for
# the opening brace without consuming it, so match.end() points at the "{".
match = re.search(r"var _wopiContextJson\s*=\s*(?=\{)", r.text)

# re.search() returns None when the variable is absent; report that
# explicitly instead of crashing with AttributeError.
if match is None:
    raise SystemExit("_wopiContextJson was not found in the response")

# raw_decode() parses exactly one JSON value from the start of the string and
# ignores whatever follows it, so trailing script code on the same line
# cannot break the parse (a greedy "{.+}" capture would include it).
data, _ = json.JSONDecoder().raw_decode(r.text[match.end():])

print(data.keys())

# print(json.dumps(data, indent=4)) # to see it in human readable format.

Вывод:

dict_keys(['HostName', 'SessionId', 'UserId', 'WebAppUrl', 'FileName', 'FileSize', 'FileGetUrl', 'BundleMajorVersion', 'BundleUrl', 'ReadOnly', 'IrmEnabled', 'LabelIrmed', 'LastModified', 'ServerStartTime', 'ServerCompleteTime', 'DocUniqueId', 'CTag', 'ETag', 'RumOneUpdate', 'OpenWacInPlace', 'TemplateInfo', 'BundleStaleness', 'IsAsyncBundleStale', 'IsActivatedAsyncPreviewKillSwich', 'ViewOnly', 'DelayLoadResources', 'DocAspxSingleFlush', 'Origin', 'Slrid', 'InteractiveReadonlyExperiment', 'ClickTime', 'UniqueClick', 'HostGeo', 'PredictedOfficeAppEndPoint', 'PreseededSessionKey', 'PreseededWacSessionId', 'ParentFolderFullUrl', 'DocAgeBucketAtViewTimeBasedOnLastModifiedTime', 'DocAgeBucketAtViewTimeBasedOnLastWrittenTime', 'DocCategoryBasedOnLastModifiedTime', 'DocCategoryBasedOnLastWrittenTime', 'SSRGenerationReason', 'RecordAgeBucketsAndCategoryForRumOneUsingWopicontext', 'ListItemId', 'ListId', 'AllowedOrigins', 'IsPragueDocument', 'PragueSocketStorageDiscovery', 'ResetUriToAddressBarLink', 'IsEduUser']) 
{
    "HostName": "SharePoint Online",
    "SessionId": "ABB8459F-300E-A000-C5E7-EB06D6377846",        
    "UserId": "",
    "WebAppUrl": "https://excel.officeapps.live.com/x/_layouts/xlviewerinternal.aspx?unified=1&ui=en%2DUS&rs=en%2DUS&WOPISrc=https%3A%2F%2Feditproj%2Esharepoint%2Ecom%2F%5Fvti%5Fbin%2Fwopi%2Eashx%2Ffiles%2F995cf6ad837f41d5ade14089aff33261&wdEnableRoaming=1&mscc=0&hid=abb8459f-300e-a000-c5e7-eb06d6377846",
    "FileName": "coronavirus-school-closures-data.xlsx",        
    "FileSize": 22439,
    "FileGetUrl": "https://editproj.sharepoint.com/_layouts/15/download.aspx?UniqueId=995cf6ad-837f-41d5-ade1-4089aff33261&Translate=false&tempauth=eyJ0eXAiOiJKV1QiLCJhbGciOiJub25lIn0.eyJhdWQiOiIwMDAwMDAwMy0wMDAwLTBmZjEtY2UwMC0wMDAwMDAwMDAwMDAvZWRpdHByb2ouc2hhcmVwb2ludC5jb21AZDJjZTI4MGQtYWExMi00ODQxLWFjYjMtOWYxZDNlMDYzYjhkIiwiaXNzIjoiMDAwMDAwMDMtMDAwMC0wZmYxLWNlMDAtMDAwMDAwMDAwMDAwIiwibmJmIjoiMTU4NjA3MzgxOSIsImV4cCI6IjE1ODYxMDk4MTkiLCJlbmRwb2ludHVybCI6Imtsc0lNb1NtVDQyejBXY085ZGQ2bHovUUJ3ZUVuZzZRd0MxcmdkTGxsVEU9IiwiZW5kcG9pbnR1cmxMZW5ndGgiOiIxMTkiLCJpc2xvb3BiYWNrIjoiVHJ1ZSIsImNpZCI6IllXSmlPRFExT1dZdE16QXdaUzFoTURBd0xXTTFaVGN0WldJd05tUTJNemMzT0RRMiIsInZlciI6Imhhc2hlZHByb29mdG9rZW4iLCJzaXRlaWQiOiJNVFEwWmpsbU0yRXRNakV5TnkwME16RmhMV0ppTmpBdE1EY3dNbUV5TXpnNVpqQTMiLCJuYW1laWQiOiIwIy5mfG1lbWJlcnNoaXB8dXJuJTNhc3BvJTNhYW5vbiNkNWI2NDYyODQwYjk1MTVlNzcwYWE4MTViNDljNjNiZjk2OWY4MmQwNTdmMDhhZTljYjMwNjQwNTQ5YmMzYmQ2IiwibmlpIjoibWljcm9zb2Z0LnNoYXJlcG9pbnQiLCJpc3VzZXIiOiJ0cnVlIiwiY2FjaGVrZXkiOiIwaC5mfG1lbWJlcnNoaXB8dXJuJTNhc3BvJTNhYW5vbiNkNWI2NDYyODQwYjk1MTVlNzcwYWE4MTViNDljNjNiZjk2OWY4MmQwNTdmMDhhZTljYjMwNjQwNTQ5YmMzYmQ2Iiwic2hhcmluZ2lkIjoiS0dISHljQ05Ca0s2Mlk4Z24zLzNSQSIsInR0IjoiMCIsInVzZVBlcnNpc3RlbnRDb29raWUiOiIyIn0.R2k1TU9kWFpKajJ5WHdiOFhtTWNFNmV2cERaTnNMQXlMM1plaDZIc1drOD0",
    "BundleMajorVersion": 1,
    "BundleUrl": "https://editproj.sharepoint.com/_api/v2.0/drives/b!Op9PFCchGkO7YAcCojifB1FMdGKT1DNGuc0VZ45Ny28Z17W52UlFQ5ThVO31JqPP/items/01RTHF455N6ZOJS74D2VA23YKARGX7GMTB/versions/Published/streams/content_preview_O{0}/streamContent?tempauth=eyJ0eXAiOiJKV1QiLCJhbGciOiJub25lIn0.eyJhdWQiOiIwMDAwMDAwMy0wMDAwLTBmZjEtY2UwMC0wMDAwMDAwMDAwMDAvZWRpdHByb2ouc2hhcmVwb2ludC5jb21AZDJjZTI4MGQtYWExMi00ODQxLWFjYjMtOWYxZDNlMDYzYjhkIiwiaXNzIjoiMDAwMDAwMDMtMDAwMC0wZmYxLWNlMDAtMDAwMDAwMDAwMDAwIiwibmJmIjoiMTU4NjA3MzgxOSIsImV4cCI6IjE1ODYwOTU0MTkiLCJlbmRwb2ludHVybCI6IjdxVmtxdVZ6bVlaN1MwQXQyUWR4dytLWktRclpHazlnVkxhVW43TGkyaUE9IiwiZW5kcG9pbnR1cmxMZW5ndGgiOiIxNTYiLCJpc2xvb3BiYWNrIjoiVHJ1ZSIsImNpZCI6IllXSmlPRFExT1dZdE16QXdaUzFoTURBd0xXTTFaVGN0WldJd05tUTJNemMzT0RRMiIsInZlciI6Imhhc2hlZHByb29mdG9rZW4iLCJzaXRlaWQiOiJNVFEwWmpsbU0yRXRNakV5TnkwME16RmhMV0ppTmpBdE1EY3dNbUV5TXpnNVpqQTMiLCJuYW1laWQiOiIwIy5mfG1lbWJlcnNoaXB8dXJuJTNhc3BvJTNhYW5vbiNkNWI2NDYyODQwYjk1MTVlNzcwYWE4MTViNDljNjNiZjk2OWY4MmQwNTdmMDhhZTljYjMwNjQwNTQ5YmMzYmQ2IiwibmlpIjoibWljcm9zb2Z0LnNoYXJlcG9pbnQiLCJpc3VzZXIiOiJ0cnVlIiwiY2FjaGVrZXkiOiIwaC5mfG1lbWJlcnNoaXB8dXJuJTNhc3BvJTNhYW5vbiNkNWI2NDYyODQwYjk1MTVlNzcwYWE4MTViNDljNjNiZjk2OWY4MmQwNTdmMDhhZTljYjMwNjQwNTQ5YmMzYmQ2Iiwic2hhcmluZ2lkIjoiS0dISHljQ05Ca0s2Mlk4Z24zLzNSQSIsInR0IjoiMCIsInVzZVBlcnNpc3RlbnRDb29raWUiOiIyIn0.Z3BlU3hhWmNSRFI0YitvNkErWFkxU01uZVptZVVpZEgwUHBwNDBuVWhzMD0&usecachedssr=1&prefetchSSRCorrelationId=abb8459f-300e-a000-c5e7-eb06d6377846",
    "ReadOnly": true,
    "IrmEnabled": false,
    "LabelIrmed": false,
    "LastModified": 1585952003000,
    "ServerStartTime": 1586073819363.505,
    "ServerCompleteTime": 0,
    "DocUniqueId": "editproj.sharepoint.com_144f9f3a-2127-431a-bb60-0702a2389f07_995cf6ad-837f-41d5-ade1-4089aff33261",
    "CTag": "{995CF6AD-837F-41D5-ADE1-4089AFF33261},53,73",     
    "ETag": "\"{995CF6AD-837F-41D5-ADE1-4089AFF33261},53\"",    
    "RumOneUpdate": true,
    "OpenWacInPlace": false,
    "TemplateInfo": null,
    "BundleStaleness": 0,
    "IsAsyncBundleStale": false,
    "IsActivatedAsyncPreviewKillSwich": false,
    "ViewOnly": false,
    "DelayLoadResources": true,
    "DocAspxSingleFlush": true,
    "Origin": "Sharing",
    "Slrid": "aab8459f-d0f2-a000-c5e7-e3229e5f2252",
    "InteractiveReadonlyExperiment": "Control",
    "ClickTime": 0,
    "UniqueClick": "504924ca-7e0e-49d9-a411-0779c638ce68",      
    "HostGeo": "US",
    "PredictedOfficeAppEndPoint": "",
    "PreseededSessionKey": null,
    "PreseededWacSessionId": null,
    "ParentFolderFullUrl": "https://editproj.sharepoint.com/Shared Documents/cartomapcoronavirus",
    "DocAgeBucketAtViewTimeBasedOnLastModifiedTime": "1-2 days",
    "DocAgeBucketAtViewTimeBasedOnLastWrittenTime": "1-2 days", 
    "DocCategoryBasedOnLastModifiedTime": "Valid",
    "DocCategoryBasedOnLastWrittenTime": "Valid",
    "SSRGenerationReason": "AnalysisPipeline",
    "RecordAgeBucketsAndCategoryForRumOneUsingWopicontext": true,
    "ListItemId": 37,
    "ListId": "b9b5d719-49d9-4345-94e1-54edf526a3cf",
    "AllowedOrigins": [
        "https://excel.officeapps.live.com"
    ],
    "IsPragueDocument": false,
    "PragueSocketStorageDiscovery": null,
    "ResetUriToAddressBarLink": false,
    "IsEduUser": false
}
...