Как получить поле "hiRes" из скрипта? Python, веб-скрейпинг, BS4 - PullRequest
1 голос
/ 14 июля 2020

Я хочу получить URL-адреса всех изображений продукта в высоком разрешении со страницы Amazon. С помощью этого кода я получаю содержимое тега script, но мне не удалось извлечь из него URL-адреса hiRes. Как разобрать этот скрипт и получить нужные URL? Можно ли сделать это с помощью JSON?

from bs4 import BeautifulSoup
import time
import pandas as pd
import random
import requests
import re
from lxml import etree
from random import choice
import json

# Draw random version components so the User-Agent string varies per run.
nt_version = random.randrange(7, 10, 1)
chrome_major = random.randrange(70, 79, 1)
chrome_patch = random.randrange(100, 130, 1)
chrome_build = random.randrange(5000, 9999, 1)

# Assemble the browser signature from the random components drawn above.
user_agent = (
    f"Mozilla/5.0 (Windows NT {nt_version}.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    f"Chrome/{chrome_major}.0.{chrome_build}.{chrome_patch} Safari/537.36"
)

# Request headers sent with the page request.
headers = {
    "User-Agent": user_agent,
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.5",
    "Accept-Encoding": "gzip, deflate, br",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    "TE": "Trailers",
}

url = "https://www.amazon.com/dp/B079JQ9XXV/?th=1&psc=1"

# Download the product page and parse it with the html5lib parser.
req = requests.get(url, headers=headers)
soup = BeautifulSoup(req.content, "html5lib")

# Select the second child <script> of the image block and print the matches.
image_script_selector = "#imageBlock_feature_div > script:nth-child(2)"
data = soup.select(image_script_selector)
print(data)

Вывод кода выглядит следующим образом:

[<script type="text/javascript">
P.when('A').register("ImageBlockATF", function(A){
var data = {
'colorImages': { 'initial': [{"hiRes":"https://images-na.ssl-images-amazon.com/images/I/61vyQkBBbLL._AC_SL1500_.jpg","thumb":"https://images-na.ssl-images-amazon.com/images/I/41EiPhPDK5L._AC_US40_.jpg","large":"https://images-na.ssl-images-amazon.com/images/I/41EiPhPDK5L._AC_.jpg","main":{"https://images-na.ssl-images-amazon.com/images/I/61vyQkBBbLL._AC_SY355_.jpg":[355,355],"https://images-na.ssl-images-amazon.com/images/I/61vyQkBBbLL._AC_SY450_.jpg":[450,450],"https://images-na.ssl-images-amazon.com/images/I/61vyQkBBbLL._AC_SX425_.jpg":[425,425],"https://images-na.ssl-images-amazon.com/images/I/61vyQkBBbLL._AC_SX466_.jpg":[466,466],"https://images-na.ssl-images-amazon.com/images/I/61vyQkBBbLL._AC_SX522_.jpg":[522,522],"https://images-na.ssl-images-amazon.com/images/I/61vyQkBBbLL._AC_SX569_.jpg":[569,569],"https://images-na.ssl-images-amazon.com/images/I/61vyQkBBbLL._AC_SX679_.jpg":[679,679]},"variant":"MAIN","lowRes":null},{"hiRes":"https://images-na.ssl-images-amazon.com/images/I/61zCQSTDE-L._AC_SL1500_.jpg","thumb":"https://images-na.ssl-images-amazon.com/images/I/41N9KFoYKoL._AC_US40_.jpg","large":"https://images-na.ssl-images-amazon.com/images/I/41N9KFoYKoL._AC_.jpg","main":{"https://images-na.ssl-images-amazon.com/images/I/61zCQSTDE-L._AC_SY355_.jpg":[355,355],"https://images-na.ssl-images-amazon.com/images/I/61zCQSTDE-L._AC_SY450_.jpg":[450,450],"https://images-na.ssl-images-amazon.com/images/I/61zCQSTDE-L._AC_SX425_.jpg":[425,425],"https://images-na.ssl-images-amazon.com/images/I/61zCQSTDE-L._AC_SX466_.jpg":[466,466],"https://images-na.ssl-images-amazon.com/images/I/61zCQSTDE-L._AC_SX522_.jpg":[522,522],"https://images-na.ssl-images-amazon.com/images/I/61zCQSTDE-L._AC_SX569_.jpg":[569,569],"https://images-na.ssl-images-amazon.com/images/I/61zCQSTDE-L._AC_SX679_.jpg":[679,679]},"variant":"PT01","lowRes":null},{"hiRes":"https://images-na.ssl-images-amazon.com/images/I/61i%2BYShtPDL._AC_SL1500_.jpg","thumb":"https://images-na.ssl-images-amazon.com/images/I/418
2H8952zL._AC_US40_.jpg","large":"https://images-na.ssl-images-amazon.com/images/I/4182H8952zL._AC_.jpg","main":{"https://images-na.ssl-images-amazon.com/images/I/61i%2BYShtPDL._AC_SY355_.jpg":[355,355],"https://images-na.ssl-images-amazon.com/images/I/61i%2BYShtPDL._AC_SY450_.jpg":[450,450],"https://images-na.ssl-images-amazon.com/images/I/61i%2BYShtPDL._AC_SX425_.jpg":[425,425],"https://images-na.ssl-images-amazon.com/images/I/61i%2BYShtPDL._AC_SX466_.jpg":[466,466],"https://images-na.ssl-images-amazon.com/images/I/61i%2BYShtPDL._AC_SX522_.jpg":[522,522],"https://images-na.ssl-images-amazon.com/images/I/61i%2BYShtPDL._AC_SX569_.jpg":[569,569],"https://images-na.ssl-images-amazon.com/images/I/61i%2BYShtPDL._AC_SX679_.jpg":[679,679]},"variant":"PT02","lowRes":null},{"hiRes":"https://images-na.ssl-images-amazon.com/images/I/61DWDTdsasL._AC_SL1348_.jpg","thumb":"https://images-na.ssl-images-amazon.com/images/I/412bXbalmtL._AC_US40_.jpg","large":"https://images-na.ssl-images-amazon.com/images/I/412bXbalmtL._AC_.jpg","main":{"https://images-na.ssl-images-amazon.com/images/I/61DWDTdsasL._AC_SY355_.jpg":[355,355],"https://images-na.ssl-images-amazon.com/images/I/61DWDTdsasL._AC_SY450_.jpg":[450,450],"https://images-na.ssl-images-amazon.com/images/I/61DWDTdsasL._AC_SX425_.jpg":[425,425],"https://images-na.ssl-images-amazon.com/images/I/61DWDTdsasL._AC_SX466_.jpg":[466,466],"https://images-na.ssl-images-amazon.com/images/I/61DWDTdsasL._AC_SX522_.jpg":[522,522],"https://images-na.ssl-images-amazon.com/images/I/61DWDTdsasL._AC_SX569_.jpg":[569,569],"https://images-na.ssl-images-amazon.com/images/I/61DWDTdsasL._AC_SX679_.jpg":[679,679]},"variant":"PT03","lowRes":null},{"hiRes":"https://images-na.ssl-images-amazon.com/images/I/615s8yG33VL._AC_SL1500_.jpg","thumb":"https://images-na.ssl-images-amazon.com/images/I/41%2BX1BNNQsL._AC_US40_.jpg","large":"https://images-na.ssl-images-amazon.com/images/I/41%2BX1BNNQsL._AC_.jpg","main":{"https://images-na.ssl-images-amazon.com/images/I/
615s8yG33VL._AC_SY355_.jpg":[355,355],"https://images-na.ssl-images-amazon.com/images/I/615s8yG33VL._AC_SY450_.jpg":[450,450],"https://images-na.ssl-images-amazon.com/images/I/615s8yG33VL._AC_SX425_.jpg":[425,425],"https://images-na.ssl-images-amazon.com/images/I/615s8yG33VL._AC_SX466_.jpg":[466,466],"https://images-na.ssl-images-amazon.com/images/I/615s8yG33VL._AC_SX522_.jpg":[522,522],"https://images-na.ssl-images-amazon.com/images/I/615s8yG33VL._AC_SX569_.jpg":[569,569],"https://images-na.ssl-images-amazon.com/images/I/615s8yG33VL._AC_SX679_.jpg":[679,679]},"variant":"PT04","lowRes":null},{"hiRes":"https://images-na.ssl-images-amazon.com/images/I/71IDxOU0AAL._AC_SL1500_.jpg","thumb":"https://images-na.ssl-images-amazon.com/images/I/41Y4Vo4qWPL._AC_US40_.jpg","large":"https://images-na.ssl-images-amazon.com/images/I/41Y4Vo4qWPL._AC_.jpg","main":{"https://images-na.ssl-images-amazon.com/images/I/71IDxOU0AAL._AC_SY355_.jpg":[355,355],"https://images-na.ssl-images-amazon.com/images/I/71IDxOU0AAL._AC_SY450_.jpg":[450,450],"https://images-na.ssl-images-amazon.com/images/I/71IDxOU0AAL._AC_SX425_.jpg":[425,425],"https://images-na.ssl-images-amazon.com/images/I/71IDxOU0AAL._AC_SX466_.jpg":[466,466],"https://images-na.ssl-images-amazon.com/images/I/71IDxOU0AAL._AC_SX522_.jpg":[522,522],"https://images-na.ssl-images-amazon.com/images/I/71IDxOU0AAL._AC_SX569_.jpg":[569,569],"https://images-na.ssl-images-amazon.com/images/I/71IDxOU0AAL._AC_SX679_.jpg":[679,679]},"variant":"PT05","lowRes":null},{"hiRes":"https://images-na.ssl-images-amazon.com/images/I/61%2BkttmqLiL._AC_SL1500_.jpg","thumb":"https://images-na.ssl-images-amazon.com/images/I/31gMtEgw1aL._AC_US40_.jpg","large":"https://images-na.ssl-images-amazon.com/images/I/31gMtEgw1aL._AC_.jpg","main":{"https://images-na.ssl-images-amazon.com/images/I/61%2BkttmqLiL._AC_SY355_.jpg":[355,355],"https://images-na.ssl-images-amazon.com/images/I/61%2BkttmqLiL._AC_SY450_.jpg":[450,450],"https://images-na.ssl-images-amazon.com/images/
I/61%2BkttmqLiL._AC_SX425_.jpg":[425,425],"https://images-na.ssl-images-amazon.com/images/I/61%2BkttmqLiL._AC_SX466_.jpg":[466,466],"https://images-na.ssl-images-amazon.com/images/I/61%2BkttmqLiL._AC_SX522_.jpg":[522,522],"https://images-na.ssl-images-amazon.com/images/I/61%2BkttmqLiL._AC_SX569_.jpg":[569,569],"https://images-na.ssl-images-amazon.com/images/I/61%2BkttmqLiL._AC_SX679_.jpg":[679,679]},"variant":"PT06","lowRes":null}]},
'colorToAsin': {'initial': {}},
'holderRatio': 1.0,
'holderMaxHeight': 700,
'heroImage': {'initial': []},
'heroVideo': {'initial': []},
'spin360ColorData': {'initial': {}},
'spin360ColorEnabled': {'initial': 0},
'spin360ConfigEnabled': false,
'spin360LazyLoadEnabled': false,
'showroomEnabled': false,
'showroomViewModel': {'initial': {}},
'playVideoInImmersiveView':true,
'useTabbedImmersiveView':true,
'totalVideoCount':'0',
'videoIngressATFSlateThumbURL':'',
'mediaTypeCount':'0',
'atfEnhancedHoverOverlay' : false,
'winningAsin': 'B079JQ9XXV',
'weblabs' : {},
'aibExp3Layout' : 0,
'aibRuleName' : 'frank-powered',
'acEnabled' : true,
'dp60VideoPosition': 0,
'dp60VariantList': '',
'dp60VideoThumb': '',
'dp60MainImage': 'https://images-na.ssl-images-amazon.com/images/I/61vyQkBBbLL._AC_SY355_.jpg',

'airyConfig' :A.$.parseJSON('{"jsUrl":"https://images-na.ssl-images-amazon.com/images/G/01/vap/video/airy2/prod/2.0.1460.0/js/airy.skin._CB485981857_.js","cssUrl":"https://images-na.ssl-images-amazon.com/images/G/01/vap/video/airy2/prod/2.0.1460.0/css/beacon._CB485971591_.css","swfUrl":"https://images-na.ssl-images-amazon.com/images/G/01/vap/video/airy2/prod/2.0.1460.0/flash/AiryBasicRenderer._CB485925577_.swf","foresterMetadataParams":{"marketplaceId":"ATVPDKIKX0DER","method":"Ce.ImageBlock","requestId":"03QTEGPJ7069REHEVVR7","session":"144-8953114-9602366","client":"Dpx"}}')

};
A.trigger('P.AboveTheFold'); // trigger ATF event.
return data;
});
</script>]

Я хочу получить это поле.

"hiRes":"https://images-na.ssl-images-amazon.com/images/I/61vyQkBBbLL._AC_SL1500_.jpg
"hiRes":"https://images-na.ssl-images-amazon.com/images/I/54BBbLL._AC_SL1123_.jpg
"hiRes":"https://images-na.ssl-images-amazon.com/images/I/61124BBbLL._AC_SL12340_.jpg
"hiRes":"https://images-na.ssl-images-amazon.com/images/I/61v654bLL._AC_SL1430_.jpg

1 Ответ

2 голоса
/ 14 июля 2020

Вам нужно использовать регулярное выражение, чтобы извлечь соответствующую JSON-строку.

from bs4 import BeautifulSoup
import time
import pandas as pd
import random
import requests
import re
from lxml import etree
from random import choice
import json

# Randomise the version components of the User-Agent so repeated runs do not
# all send exactly the same browser signature.
random1 = random.randrange(7, 10, 1)
random2 = random.randrange(70, 79, 1)
random3 = random.randrange(100, 130, 1)
random4 = random.randrange(5000, 9999, 1)

# NOTE(review): 'br' is advertised in Accept-Encoding; presumably requests can
# only decode Brotli responses when a brotli package is installed - confirm.
headers = {
        'User-Agent': "Mozilla/5.0 (Windows NT "+str(random1)+".0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/"+str(random2)+".0."+str(random4)+"."+str(random3)+" Safari/537.36",
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate, br',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'TE': 'Trailers'}

url = "https://www.amazon.com/dp/B079JQ9XXV/?th=1&psc=1"

# Matches the prefix that introduces the image list inside the inline script,
# i.e.  'colorImages': { 'initial':   (the match ends right before the "[").
_INITIAL_IMAGES_RE = re.compile(
    r"""['"]colorImages['"]\s*:\s*\{\s*['"]initial['"]\s*:\s*"""
)


def extract_hi_res_urls(script_text):
    """Return the "hiRes" image URLs listed in the image-block script text.

    The surrounding JavaScript object literal is not valid JSON (single-quoted
    keys, a function call as a value), so only the array that follows
    ``'colorImages': {'initial':`` is parsed. No quote or ``null`` rewriting
    is done on the text, so apostrophes or the word "null" inside values are
    left intact.

    Args:
        script_text: Text content of the inline ``<script>`` element.

    Returns:
        The non-empty "hiRes" values in the order they appear. Entries whose
        "hiRes" is missing or ``null`` are skipped.

    Raises:
        ValueError: If the ``colorImages``/``initial`` prefix is not found, or
            what follows it is not a valid JSON array.
    """
    marker = _INITIAL_IMAGES_RE.search(script_text)
    if marker is None:
        raise ValueError("'colorImages' -> 'initial' not found in script text")
    try:
        # raw_decode parses one JSON value and ignores whatever follows it,
        # so the JavaScript after the array does not have to be stripped.
        images, _ = json.JSONDecoder().raw_decode(script_text[marker.end():])
    except json.JSONDecodeError as err:
        raise ValueError("image list is not valid JSON") from err
    if not isinstance(images, list):
        raise ValueError("image list is not a JSON array")
    return [
        image["hiRes"]
        for image in images
        if isinstance(image, dict) and image.get("hiRes")
    ]


if __name__ == "__main__":
    # A timeout keeps the script from hanging on a stalled connection, and
    # raise_for_status surfaces HTTP errors instead of parsing an error page.
    req = requests.get(url, headers=headers, timeout=30)
    req.raise_for_status()

    soup = BeautifulSoup(req.content,"lxml")

    data = soup.select("#imageBlock_feature_div > script:nth-child(2)")
    if not data:
        # Nothing matched the selector: fail with a clear message rather than
        # an IndexError on data[0].
        raise SystemExit("image block <script> not found in the response")
    script_text = data[0].text
    high_res = extract_hi_res_urls(script_text)
    print(high_res)

Вывод:

['https://images-na.ssl-images-amazon.com/images/I/61vyQkBBbLL._AC_SL1500_.jpg', 'https://images-na.ssl-images-amazon.com/images/I/61zCQSTDE-L._AC_SL1500_.jpg', 'https://images-na.ssl-images-amazon.com/images/I/61i%2BYShtPDL._AC_SL1500_.jpg', 'https://images-na.ssl-images-amazon.com/images/I/61DWDTdsasL._AC_SL1348_.jpg', 'https://images-na.ssl-images-amazon.com/images/I/615s8yG33VL._AC_SL1500_.jpg', 'https://images-na.ssl-images-amazon.com/images/I/71IDxOU0AAL._AC_SL1500_.jpg', 'https://images-na.ssl-images-amazon.com/images/I/61%2BkttmqLiL._AC_SL1500_.jpg']
...