Saving scraped data to MySQL - PullRequest
0 votes
/ 19 June 2020

I wrote a Scrapy bot to scrape some data, and now I want to save it to MySQL. For that I wrote a pipeline script, but it doesn't work and I can't tell what the problem is. Can anyone point out what exactly is going wrong? The scraping code itself runs fine; I think it just fails to connect to MySQL.

The Scrapy bot code:

import scrapy

from ..items import NdtvItem

class NdtvReviewSpider(scrapy.Spider):
    name = 'ndtv_review'
    page_number = 2
    start_urls = ['https://www.ndtv.com/movie-reviews/page-1']

    def parse(self, response):

        items = {}

        i = 1
        count = response.xpath('//*[@id="ins_storylist"]/ul/li[{}]/div[2]/div[1]/a/i/text()'.format("*")).getall()
        count = len(count)
        while i<=count:
            outputs = NdtvItem()
            outputs['page_title'] = response.xpath('//*[@id="ins_storylist"]/ul/li[{}]/div[2]/div[1]/a/i/text()'.format(i)).get()
            outputs['review_content'] = response.xpath("//*[@id='ins_storylist']/ul/li[{}]/div[2]/div[3]/text()".format(i)).get()
            outputs['review_link'] = response.xpath("//*[@id='ins_storylist']/ul/li[{}]/div[2]/div[1]/a/@href".format(i)).get()
            i+=1
            fl = 0
            if outputs['page_title'] == []:
                outputs['page_title'] = ''
                fl+=1
            if outputs['review_content'] == []:
                outputs['review_content'] = ''
                fl+=1
            if outputs['review_link'] == []:
                outputs['review_link'] = ''
                fl += 1
            else:
                yield outputs
                #if outputs['review_content'] != []:
                #    if 'Review:' in outputs['review_content'].split(" ") or 'review:' in outputs['review_content'].split(" "):
                #        yield outputs


        next_page = 'https://www.ndtv.com/movie-reviews/page-' + str(NdtvReviewSpider.page_number) +''
        if NdtvReviewSpider.page_number <= 15:
            NdtvReviewSpider.page_number += 1  
            yield response.follow(next_page, callback = self.parse)
        
        pass
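The NdtvItem class imported at the top is not shown in the question; judging by the three fields the spider fills in, a minimal items.py would presumably look like this (a sketch inferred from the code above, not the asker's actual file):

import scrapy

class NdtvItem(scrapy.Item):
    # Fields inferred from what the spider populates
    page_title = scrapy.Field()
    review_content = scrapy.Field()
    review_link = scrapy.Field()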

The pipeline script:

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

import mysql.connector

class ReviewBotPipeline(object):

    def __init__(self):
        self.create_connection()
        self.create_table()

    def create_connection(self):
        self.conn = mysql.connector.connect(
            host = 'localhost',
            user = 'root',
            passwd = 'xxxxxx',
            database = 'review'
        )
        self.curr = self.conn.cursor()

    def create_table(self):
        self.curr.execute(""" DROP TABLE IF EXISTS review_tb """)
        self.curr.execute(""" create table review_tb(
            page_title text,
            review_content text,
            review_link text
            ) """)

    def process_item(self, outputs, spider):
        self.store_db(outputs)
        return outputs

    def store_db(self, outputs):
        self.curr.execute(""" insert into review_tb values(%s,%s,%s)""",(
            outputs['page_title'][0],
            outputs['review_content'][0],
            outputs['review_link'][0]
        ))
        self.conn.commit()
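
As the comment at the top of the file notes, the pipeline only runs if it is registered in ITEM_PIPELINES in settings.py. Assuming the module path seen in the traceback below (review_bot/review_bot/pipelines.py), the entry would look roughly like this:

# settings.py (dotted path assumed from the traceback paths below)
ITEM_PIPELINES = {
    'review_bot.pipelines.ReviewBotPipeline': 300,
}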



And this is the error I'm getting:

  File "/Users/divyanshu/env/lib/python3.7/site-packages/twisted/internet/defer.py", line 654, in _runCallbacks
    current.result = callback(current.result, *args, **kw)
  File "/Users/divyanshu/env/lib/python3.7/site-packages/scrapy/utils/defer.py", line 154, in f
    return deferred_from_coro(coro_f(*coro_args, **coro_kwargs))
  File "/Users/divyanshu/review_bot/review_bot/pipelines.py", line 33, in process_item
    self.store_db(outputs)
  File "/Users/divyanshu/review_bot/review_bot/pipelines.py", line 40, in store_db
    outputs['review_link']
  File "/Users/divyanshu/env/lib/python3.7/site-packages/mysql/connector/cursor_cext.py", line 248, in execute
    prepared = self._cnx.prepare_for_mysql(params)
  File "/Users/divyanshu/env/lib/python3.7/site-packages/mysql/connector/connection_cext.py", line 626, in prepare_for_mysql
    result = self._cmysql.convert_to_mysql(*params)
_mysql_connector.MySQLInterfaceError: Python type list cannot be converted
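
For context, this MySQL Connector error means that at least one of the bound parameters is a Python list (for example, the result of extract() or getall()) rather than a string or None, which the driver cannot convert. A minimal sketch of a store_db that flattens such values before the INSERT (the table and field names are taken from the pipeline above; the as_text helper is hypothetical) could look like:

    def store_db(self, outputs):
        def as_text(value):
            # MySQL cannot bind a Python list; join list values into one string
            # and fall back to an empty string for missing fields.
            if isinstance(value, (list, tuple)):
                return ' '.join(str(v) for v in value)
            return value if value is not None else ''

        self.curr.execute(
            """insert into review_tb values (%s, %s, %s)""",
            (
                as_text(outputs['page_title']),
                as_text(outputs['review_content']),
                as_text(outputs['review_link']),
            ),
        )
        self.conn.commit()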

1 Answer

0 votes
/ 11 July 2020

Here is another way. The code for extracting and storing the data is simpler and easier to read.

from simplified_scrapy import Spider, SimplifiedDoc, SimplifiedMain, utils
from simplified_scrapy.core.mysql_objstore import MysqlObjStore
from simplified_html.request_render import RequestRender

class ReviewBotPipeline(MysqlObjStore):
    def saveObj(self, data):
        conn = None
        cur = None
        try:
            conn = self.connect()
            cur = conn.cursor()
            try:
                cur.execute("insert into review_tb(page_title,review_content,review_link) values(%s,%s,%s)",
                            (data['page_title'], data['review_content'],
                             data['review_link']))
                return conn.commit()
            except Exception as err:
                conn.rollback()
                print(err)
        except Exception as err:
            print(err)
        finally:
            if (cur): cur.close()
            if (conn): conn.close()

class MySpider(Spider):
    name = 'ndtv_review'
    start_urls = [{
        'url': 'https://www.ndtv.com/movie-reviews/page-1',
        'requestMethod': 'render' # The download mode is 'render'
    }]  # Entry page
    concurrencyPer1s = 1
    refresh_urls = True
    obj_store = ReviewBotPipeline(
        name, {
            'host': '127.0.0.1',
            'port': 3306,
            'user': 'root',
            'pwd': 'root',
            'dbName': 'review',
            'tbName': 'review_tb'
        })

    # Download page using headless browser. start
    # pyppeteer needs to be installed
    req = RequestRender({
        'executablePath':
        '/Applications/chrome.app/Contents/MacOS/Google Chrome'
    })

    def renderUrl(self, url, callback):
        self.req.get(url['url'], self._callback, extr_data=(url, callback))

    def _callback(self, html, url, extr_data):
        extr_data[1](html, extr_data[0], self)
    # Download page using headless browser. end

    def extract(self, url, html, models, modelNames):
        doc = SimplifiedDoc(html)
        datas = []
        lis = doc.selects('div#ins_storylist>li')
        for li in lis:
            outputs = {}
            a = li.select('div.nstory_header>a')
            if not a: continue
            outputs['page_title'] = a.text
            outputs['review_content'] = li.select('div.nstory_intro').text
            outputs['review_link'] = a.href
            datas.append(outputs)
        lstA = doc.selects('div.new_pagination>a')
        return {"Urls": lstA, "Data": datas}


SimplifiedMain.startThread(MySpider())  # Start download

Here are more examples: https://github.com/yiyedata/simplified-scrapy-demo/tree/master/spider_examples

...