I am trying to log in to a site with requests. I have already tried sending all of the request headers the browser sends, and I have also tried issuing the POST request first, but still no luck. This is the page I would like to log in to: https://app.textmaster.com/sign_in
My full code is the following:
import bs4
import csv
import requests
import io
class Scraper:
    USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'
BASE_URL = 'https://app.textmaster.com/'
HEADERS = {
'authority': 'app.textmaster.com',
'method': 'GET',
'path': '/sign_in',
'scheme': 'https',
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
# 'accept-encoding': 'gzip, deflate, br',
'accept-language': 'pt-BR,pt;q=0.9,en-US;q=0.8,en;q=0.7,de-DE;q=0.6,de;q=0.5,es;q=0.4',
'cache-control': 'max-age=0',
# 'cookie': '__cfduid=d7f44a0609d068787e651f2b02361e98f1559677645; tm_user_subdomain=eu; _ga=GA1.2.750647803.1559677650; _fbp=fb.1.1559677650541.1238343741; __adroll_fpc=a0b53df5cec0fdb37a2333eb600de462-s2-1559677651144; _ga=GA1.3.750647803.1559677650; first_page_url=%252F; referrer_url=https%253A%252F%252Fwww.google.com%252F; hubspotutk=c83690f9decc2a43ad5dbf93dc65d7f0; _gid=GA1.2.1337788018.1578269194; __hssrc=1; __hstc=16116521.c83690f9decc2a43ad5dbf93dc65d7f0.1561982007244.1578677034723.1578744030017.61; __hssc=16116521.1.1578744030017; locale=pt-PT; arp_scroll_position=0; tm_login_status=logged+in; _TextMaster.com_session=NW56OVNXZjlXSHE1TXIrQXNGUWJTWWtrQnFUaW5oZjFNWGcxYUQ3dDJuTXAzelo2c0VzWDVTTW0yNXVwNkNzeTRJSko1WHBuSUdrc1ZBeC9BQzZpYko5RC8rT1BBT3pOL3k0Yk5RMFFLZXFQQjZrSUtCUjF4U21vWElKemRBdWp0ZUpRUElmeVdVUzBWdVBKbjhJMjJHeGx0Z1BHaFFCT25IeThXejFnVUs3aDhWRFlCZDIzT091TGVwTUo4Rno2WlFETFhPelNvUC84Tk5MRmxWcm05ZGN4VXozaFhhdUQrOC90VVNSSUk2SFF4MFVoUEo4dFNPSmwzRGNqK3dIc3hkSnN6dm9RYTFDc3RWckJwczlIc3Y2RTg5L1BwN2s3eXV4d2xYZXpiZzlVREJybldVYUNZaFZYME93Wm9qaFNQQ1d0c2hXR0IrbVJuOTFIWW1NQVdLR1h6RlNkT3I4ZDlNWVE4K2lRMmV3PS0tR2pzYjk0TlZxaTZHcVlKdGxBN3BEdz09--5d82d7d0acc92d74c20225b2f7aac55d0beaba13; __ar_v4=FZ7ISAP6XFCWVCHSI4YDBZ%3A20200030%3A39012%7CXGGEFNKZ6JE3BKC22JQJST%3A20200030%3A39012%7CGPAMRTN4ONFDDBPFKNYYUL%3A20200030%3A39012',
'referer': 'https://app.textmaster.com/authors/dashboard',
'sec-fetch-mode': 'navigate',
'sec-fetch-site': 'same-origin',
'sec-fetch-user': '?1',
'upgrade-insecure-requests': '1',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36'
}
def __init__(self):
        # any errors encountered will be stored here
self.errors = []
self.session = requests.session()
self.authenticated = self.authenticate()
def request(self, url: str) -> bs4.BeautifulSoup:
'''
        get a page and return a BeautifulSoup object
'''
res = self.session.get(url, headers=self.HEADERS)
html = Scraper.make_soup(res.text)
return html
def authenticate(self) -> bool:
LOGIN_URL = self.BASE_URL + 'sign_in'
        # open the authentication page
soup = self.request(LOGIN_URL)
# gets the authenticity token from the login page
        # it is necessary to perform the login
input_token = soup.find(
'input', {'name': 'authenticity_token', 'type': 'hidden'}
)
# if the token was not found, add the error
        # and abort the authentication
if not input_token:
            self.errors.append('Couldn\'t get the authentication token')
return False
# get the token from the page
token = input_token.attrs['value']
email = '********'
passw = '*******'
        # using the e-mail, password and token, perform the authentication
payload = {
'utf8': '',
'authenticity_token': token,
'user[email]': email,
'user[password]': passw,
'commit': 'Log in'
}
# send the payload
        rest = self.session.get(LOGIN_URL, headers=self.HEADERS, data=payload, stream=True)  # testing whether a GET before the POST fixes it
res = self.session.post(LOGIN_URL, headers=self.HEADERS, data=payload)
print(res.url)
        # a successful authentication will redirect to /authors/dashboard
authenticated = '/authors/dashboard' in res.url
return authenticated
def extract_data(self, soup: bs4.BeautifulSoup) -> dict:
question_items = soup.find_all('div', {'class': 'q-question-item'})
results = []
for question_item in question_items:
question_info = question_item.find(
'div', {'class': 'q-question-info'})
question_ano = question_info.find_all('span')[0]
question_banca = question_info.find_all('span')[1]
question_enunciation = question_item.find(
'div', {'class': 'q-question-enunciation'})
question_ano_text = question_ano.text.replace('Ano:', '').strip()
question_banca_text = question_banca.text.replace(
'Banca:', '').strip()
question_enunciation_text = question_enunciation.text
row_data = {
'ano': question_ano_text,
'banca': question_banca_text,
'enunciation': question_enunciation_text,
}
letters = ('', 'a', 'b', 'c', 'd', 'e')
for i in range(5):
row_data['choice_' + letters[i + 1]] = ''
question_options = question_item.find(
'ul', {'class': 'q-question-options'}).find_all('li')
if len(question_options) == 5:
for i in range(5):
enum = question_options[i].find(
'div', {'class': 'q-item-enum'})
row_data['choice_' + letters[i + 1]
] = enum.text if enum is not None else ''
row_data['type'] = 'multiple'
elif len(question_options) == 2:
row_data['type'] = 'true-false'
else:
row_data['type'] = 'N/A'
results.append(row_data)
return results
@staticmethod
def make_soup(html: str) -> bs4.BeautifulSoup:
return bs4.BeautifulSoup(html, 'html.parser')
if __name__ == "__main__":
scraper = Scraper()
If this turns out to be a basic question, could you tell me how I should go about investigating this problem further?
Thank you!
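
One direction I was planning to try next, in case I am missing a hidden form field or misreading the redirect, is to copy every input of the login form instead of hard-coding the payload, and then inspect the response in detail. This is only a rough sketch, not verified against the site; the form selector and the error CSS classes are assumptions on my part:

import bs4
import requests

session = requests.Session()
login_url = 'https://app.textmaster.com/sign_in'

# load the sign-in page and copy every field of the login form,
# instead of hard-coding 'utf8', the token and 'commit'
page = session.get(login_url)
soup = bs4.BeautifulSoup(page.text, 'html.parser')
form = soup.find('form', {'action': '/sign_in'})  # assumption: the form posts back to /sign_in
payload = {
    field['name']: field.get('value', '')
    for field in form.find_all('input')
    if field.get('name')
}
payload['user[email]'] = 'me@example.com'    # placeholder
payload['user[password]'] = 'my-password'    # placeholder

res = session.post(login_url, data=payload)

# inspect what actually came back instead of only checking the URL
print(res.status_code)   # 200 with the login form rendered again usually means the login was rejected
print(res.history)       # the redirect chain, if any
print(res.url)           # should end in /authors/dashboard on success
for msg in bs4.BeautifulSoup(res.text, 'html.parser').select('.alert, .error'):
    # assumption: error messages are rendered with one of these classes
    print(msg.get_text(strip=True))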