В настоящее время я делаю приложение для скребка, но прежде чем полностью использовать приложение, используя другие фреймворки, такие как Discord.py, мне сначала нужно было сначала почистить сайт. Оказалось довольно сложно поцарапать сайт. Сайт, который я пытаюсь почистить, это Fiverr. В любом случае, короче говоря, мне нужно было получить несколько файлов cookie для входа в систему с помощью запросов Python. Большая проблема сейчас заключается в том, что данные, которые мне нужно обработать, представлены в виде JSON, о котором я не знаю много. Мне удалось выбрать рассматриваемый javascript, но после загрузки он выдает ошибку: «TypeError: объект JSON должен быть str, bytes или bytearray, а не Tag». Мне особенно нужна часть "строк", которая является частью данных JSON.
Я не совсем уверен, как это исправить, и прочитал и попробовал некоторые подобные вопросы здесь. Я буду признателен за любую помощь.
import requests
from bs4 import BeautifulSoup
import re
import json
# Irrelevant to the question
class JobClass:
def __init__(self, date=None, buyer=None, request=None, duration=None, budget=None, link="https://www.fiverr.com/users/myusername/requests", id=None):
self.date = date
self.buyer = buyer
self.request = request
self.duration = duration
self.budget = budget
self.link = link
self.id = id
# Irrelevant to the question
duplicateSet = set()
scrapedSet = set()
jobObjArr = []
headers = {
# Some private cookies. To get them you just need to use a site like https://curl.trillworks.com/ it is really a life saver
# This is used to tell the site who you are to be logged in (which is why I deleted this part out of the code)
}
# Please note that I used "myusername" in the URL. This is going to be different depending on user
# Using the requests module, we use the "get" function
# provided to access the webpage provided as an
# argument to this function:
result = requests.get(
'https://www.fiverr.com/users/myusername/requests', headers=headers)
# Now, let us store the page content of the website accessed
# from requests to a variable:
src = result.content
# Now that we have the page source stored, we will use the
# BeautifulSoup module to parse and process the source.
# To do so, we create a BeautifulSoup object based on the
# source variable we created above:
soup = BeautifulSoup(src, "lxml")
data = soup.select("[type='text/javascript']")[1]
print(data)
# TypeError: the JSON object must be str, bytes or bytearray, not Tag
jsonObject = json.loads(data)
# Here is the output of print(data):
<script type="text/javascript">
document.viewData = {
"dds": {
"subCats": {
"current": {
"text": "All Subcategories",
"val": "-1"
},
"options": [{
"text": "Web \u0026 Mobile Design",
"val": 151
}, {
"text": "Web Programming",
"val": 140
}]
}
},
"results": {
"rows": [{
"type": "none",
"identifier": "5cf132b55e08360011efe633",
"cells": [{
"text": "May 31, 2019",
"type": "date",
"withText": true
}, {
"userPict": "\u003cspan class=\"missing-image-user \"\u003ec\u003c/span\u003e",
"type": "profile-40",
"cssClass": "height95"
}, {
"hintBottom": false,
"text": "My website was hacked and deleted. Need to have it recreated ",
"type": "text-wide",
"tags": [],
"attachment": false
}, {
"text": 1,
"type": "applications",
"alignCenter": true
}, {
"text": "3 days",
"type": "hidden-action",
"actionVisible": false,
"alignCenter": true,
"withText": true,
"buttons": [{
"type": "span",
"text": "3 days",
"class": "duration"
}, {
"type": "button",
"text": "Remove Request",
"class": "remove-request js-remove-request",
"meta": {
"requestId": "5cf132b55e08360011efe633",
"isProfessional": false
}
}]
}, {
"text": "---",
"type": "hidden-action",
"actionVisible": false,
"alignCenter": true,
"withText": true,
"buttons": [{
"type": "span",
"text": "---",
"class": "budget"
}, {
"type": "button",
"text": "Send Offer",
"class": "btn-standard btn-green-grad js-send-offer",
"meta": {
"username": "conto217",
"category": 3,
"subCategory": 151,
"requestId": "5cf132b55e08360011efe633",
"requestText": "My website was hacked and deleted. Need to have it recreated ",
"userPict": "\u003cspan class=\"missing-image-user \"\u003ec\u003c/span\u003e",
"isProfessional": false,
"buyerId": 32969684
}
}]
}]
}, {
"type": "none",
"identifier": "5cf12f641b6e99000edf1b60",
"cells": [{
"text": "May 31, 2019",
"type": "date",
"withText": true
}, {
"userPict": "\u003cimg src=\"https://fiverr-res.cloudinary.com/t_profile_small,q_auto,f_auto/attachments/profile/photo/648ceb417a85844b25e8bf070a70d9a0-254781561534997516.9743/MyFileName\" alt=\"muazamkhokher\" width=\"40\" height=\"40\"\u003e",
"type": "profile-40",
"cssClass": "height95"
}, {
"hintBottom": false,
"text": "Need mobile ui/ux designer from marvel wireframes",
"type": "text-wide",
"tags": [],
"attachment": false
}, {
"text": 4,
"type": "applications",
"alignCenter": true
}, {
"text": "5 days",
"type": "hidden-action",
"actionVisible": false,
"alignCenter": true,
"withText": true,
"buttons": [{
"type": "span",
"text": "5 days",
"class": "duration"
}, {
"type": "button",
"text": "Remove Request",
"class": "remove-request js-remove-request",
"meta": {
"requestId": "5cf12f641b6e99000edf1b60",
"isProfessional": false
}
}]
}, {
"text": "$50",
"type": "hidden-action",
"actionVisible": false,
"alignCenter": true,
"withText": true,
"buttons": [{
"type": "span",
"text": "$50",
"class": "budget"
}, {
"type": "button",
"text": "Send Offer",
"class": "btn-standard btn-green-grad js-send-offer",
"meta": {
"username": "muazamkhokher",
"category": 3,
"subCategory": 151,
"requestId": "5cf12f641b6e99000edf1b60",
"requestText": "Need mobile ui/ux designer from marvel wireframes",
"userPict": "\u003cimg src=\"https://fiverr-res.cloudinary.com/t_profile_small,q_auto,f_auto/attachments/profile/photo/648ceb417a85844b25e8bf070a70d9a0-254781561534997516.9743/MyFileName\" alt=\"muazamkhokher\" width=\"100\" height=\"100\"\u003e",
"isProfessional": false,
"buyerId": 25478156
}
}]
}]
....
Я ожидаю, что JSON будет загружен в jsonObject, но я получаю сообщение об ошибке: "TypeError: объект JSON должен быть str, байтами или байтовым массивом, а не тегом"
Edit: Вот некоторый код в конце оператора print. По какой-то причине он случайно отключается без конечного тега script:
}, {
"type": "none",
"identifier": "5cf1236a959aa5000f1ce094",
"cells": [{
"text": "May 31, 2019",
"type": "date",
"withText": true
}, {
"userPict": "\u003cimg src=\"https://fiverr-res.cloudinary.com/t_profile_small,q_auto,f_auto/profile/photos/30069758/original/Universalco_2a_Cloud.png\" alt=\"clarky2000\" width=\"40\" height=\"40\"\u003e",
"type": "profile-40",
"cssClass": "height95"
}, {
"hintBottom": false,
"text": "Slider revolution slider. 3 slides for a music festival. I can supply a copy what each slide should look like (see attached) and all the individual objects. Anyone can create basic RS slides, but I want this to be dynamic as its for a music festival. We are using the free version of RS if were are required to use the paid version of SL for addons please let us know. Bottom line this must be 3 dynamic slides (using the same background) for a music festival audience. Unlimited revisions is a must.",
"type": "see-more",
"tags": [{
"text": "Graphic UI"
}, {
"text": "Landing Pages"
}],
"attachment": {
"url": "/download/file/1559260800%2Fgig_requests%2Fattachment_f2a5f51b9fb473e8fc7f498929f39e3f",
"name": "Outwith Rotator_1920x1080_1.jpg",
"size": "2.68 MB"
}
}, {
"text": 2,
"type": "applications",
"alignCenter": true
}, {
"text": "24 hours",
"type": "hidden-action",
"actionVisible": false,
"alignCenter": true,
"withText": true,
"buttons": [{
"type": "span",
"text": "24 hours",
"class": "duration"
}, {
"type": "button",
"text": "Remove Request",
"class": "remove-request js-remove-request",
"meta": {
"requestId": "5cf1236a959aa5000f1ce094",
"isProfessional": false
}
}]
}, {
"text": "$23",
"type": "hidden-action",
"actionVisible": false,
"alignCenter": true,
"withText": true,
"buttons": [{
"type": "span",
"text": "$23",
"class": "budget"
}, {
"type": "button",
"text": "Send Of