Проблемы кодирования в python3 - PullRequest
0 голосов
/ 12 января 2020

Я следую руководству sentdex по созданию чат-бота с использованием комментариев reddit и загрузил месяц комментариев в виде файла .bz2.

Когда я запускаю код, указанный ниже, я получаю сообщение об ошибке

 Traceback (most recent call last):
  File "/Users/my_user/Documents/ok.py", line 94, in <module>
for row in x:
 File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/codecs.py", line 322, 
in decode
    (result, consumed) = self._buffer_decode(data, self.errors, final)
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xd9 in position 13: invalid continuation 
byte

Мой код выглядит следующим образом:

import sqlite3
import json
import datetime

timeframe = '2015-01'
sql_transaction = []

connection = sqlite3.connect('{}.db'.format(timeframe))
c = connection.cursor()

def create_table():
    c.execute("CREATE TABLE IF NOT EXISTS parent_reply(parent_id TEXT PRIMARY KEY, comment_id TEXT UNIQUE, parent TEXT, comment TEXT, subreddit TEXT, unix INT, score INT)")

def format_data(data):
    data = data.replace('\n',' newlinechar ').replace('\r',' newlinechar ').replace('"',"'")
    return data

def transaction_bldr(sql):
    global sql_transaction
    sql_transaction.append(sql)
    if len(sql_transaction) > 1000:
        c.execute('BEGIN TRANSACTION')
        for s in sql_transaction:
            try:
                c.execute(s)
            except:
                pass
        connection.commit()
        sql_transaction = []

def sql_insert_replace_comment(commentid,parentid,parent,comment,subreddit,time,score):
    try:
        sql = """UPDATE parent_reply SET parent_id = ?, comment_id = ?, parent = ?, comment = ?, subreddit = ?, unix = ?, score = ? WHERE parent_id =?;""".format(parentid, commentid, parent, comment, subreddit, int(time), score, parentid)
        transaction_bldr(sql)
    except Exception as e:
        print('s0 insertion',e)

def sql_insert_has_parent(commentid,parentid,parent,comment,subreddit,time,score):
    try:
        sql = """INSERT INTO parent_reply (parent_id, comment_id, parent, comment, subreddit, unix, score) VALUES ("{}","{}","{}","{}","{}",{},{});""".format(parentid, commentid, parent, comment, subreddit, int(time), score)
        transaction_bldr(sql)
    except Exception as e:
        print('s0 insertion',e)

def sql_insert_no_parent(commentid,parentid,comment,subreddit,time,score):
    try:
        sql = """INSERT INTO parent_reply (parent_id, comment_id, comment, subreddit, unix, score) VALUES ("{}","{}","{}","{}",{},{});""".format(parentid, commentid, comment, subreddit, int(time), score)
        transaction_bldr(sql)
    except Exception as e:
        print('s0 insertion',e)

def acceptable(data):
    if len(data.split(' ')) > 50 or len(data) < 1:
        return False
    elif len(data) > 1000:
        return False
    elif data == '[deleted]':
        return False
    elif data == '[removed]':
        return False
    else:
        return True

def find_parent(pid):
    try:
        sql = "SELECT comment FROM parent_reply WHERE comment_id = '{}' LIMIT 1".format(pid)
        c.execute(sql)
        result = c.fetchone()
        if result != None:
            return result[0]
        else: return False
    except Exception as e:
        #print(str(e))
        return False

def find_existing_score(pid):
    try:
        sql = "SELECT score FROM parent_reply WHERE parent_id = '{}' LIMIT 1".format(pid)
        c.execute(sql)
        result = c.fetchone()
        if result != None:
            return result[0]
        else: return False
    except Exception as e:
        #print(str(e))
        return False

if __name__ == '__main__':
    create_table()
    row_counter = 0
    paired_rows = 0

    with open("/Users/my_user/Downloads/reddit_data/{}/RC_{}.bz2".format(timeframe.split('-')[0],timeframe), buffering=1000) as x:
        for row in x:
            row_counter += 1
            row = json.loads(row)
            parent_id = row['parent_id']
            body = format_data(row['body'])
            created_utc = row['created_utc']
            score = row['score']
            comment_id = row['name']
            subreddit = row['subreddit']
            parent_data = find_parent(parent_id)
            if score >= 2:
                existing_comment_score = find_existing_score(parent_id)
                if existing_comment_score:
                    if score > existing_comment_score:
                        if acceptable(body):
                            sql_insert_replace_comment(comment_id,parent_id,parent_data,body,subreddit,created_utc,score)

                else:
                    if acceptable(body):
                        if parent_data:
                            sql_insert_has_parent(comment_id,parent_id,parent_data,body,subreddit,created_utc,score)
                            paired_rows += 1
                        else:
                            sql_insert_no_parent(comment_id,parent_id,body,subreddit,created_utc,score)

            if row_counter % 100000 == 0:
                print('Total Rows Read: {}, Paired Rows: {}, Time: {}'.format(row_counter, paired_rows, str(datetime.now())))

Спасибо за ваше время!

PS Извините, если я задаю глупый вопрос, я довольно новичок в sqlite и json

1 Ответ

0 голосов
/ 12 января 2020

Вы пытаетесь прочитать сжатый bz2 файл как текстовый файл.

import bz2
if __name__ == '__main__':
    create_table()
    paired_rows = 0
    with bz2.open("/Users/my_user/Downloads/reddit_data/{}/RC_{}.bz2".format(timeframe.split('-')[0],timeframe)) as rows:
        for row_counter, row in enumerate(rows, 1):
            row = json.loads(row)
            ... 

Но это не единственная проблема. Никогда не форматируйте параметры в SQL -Statements. Например:

def find_parent(pid):
    sql = "SELECT comment FROM parent_reply WHERE comment_id = ? LIMIT 1"
    c.execute(sql, [pid])
    result = c.fetchone()
    if result is not None:
        return result[0]
    return False

Но у функции должно быть лучшее имя, например fetch_comment, и она не должна возвращать False, если нет комментариев, None, что было бы лучше. comment_id или parent_id правильно?

Все вместе, это должно выглядеть так:

import sqlite3
import json
import datetime
import bz2

TIMEFRAME = '2015-01'

def create_table(cursor):
    cursor.execute("CREATE TABLE IF NOT EXISTS parent_reply(parent_id TEXT PRIMARY KEY, comment_id TEXT UNIQUE, parent TEXT, comment TEXT, subreddit TEXT, unix INT, score INT)")

def format_data(data):
    return data.replace('\n',' newlinechar ').replace('\r',' newlinechar ').replace('"',"'")

def sql_insert_replace_comment(cursor, commentid,parentid,parent,comment,subreddit,time,score):
    cursor.execute("""UPDATE parent_reply SET comment_id = ?, parent = ?, comment = ?, subreddit = ?, unix = ?, score = ? WHERE parent_id =?;""",
            [commentid, parent, comment, subreddit, int(time), score, parentid])

def sql_insert(cursor, commentid,parentid,parent,comment,subreddit,time,score):
    cursor.execute("""INSERT INTO parent_reply (parent_id, comment_id, parent, comment, subreddit, unix, score) VALUES (?,?,?,?,?,?,?);""",
        [parentid, commentid, parent, comment, subreddit, int(time), score])

def acceptable(data):
    return (
        len(data.split(' ')) <= 50
        and 1 <= len(data) <= 1000
        and data not in ('[deleted]','[removed]')
    )

def fetch_comment(cursor, pid):
    # TODO: is this comment or parent data?
    cursor.execute("SELECT comment FROM parent_reply WHERE comment_id = ? LIMIT 1", [pid])
    result = cursor.fetchone()
    return result and result[0]

def fetch_score(cursor, pid):
    cursor.execute("SELECT score FROM parent_reply WHERE parent_id = ? LIMIT 1", [pid])
    result = cursor.fetchone()
    return result and result[0]

def main():
    connection = sqlite3.connect('{}.db'.format(TIMEFRAME))
    cursor = connection.cursor()
    create_table(cursor)
    paired_rows = 0
    with bz2.open("/Users/my_user/Downloads/reddit_data/{}/RC_{}.bz2".format(timeframe.split('-')[0],timeframe)) as rows:
        for row_counter, row in enumerate(rows, 1):
            row = json.loads(row)
            parent_id = row['parent_id']
            body = format_data(row['body'])
            created_utc = row['created_utc']
            score = row['score']
            comment_id = row['name']
            subreddit = row['subreddit']
            parent_data = fetch_comment(cursor, parent_id) # or is this comment?
            if score >= 2 and acceptable(body):
                existing_comment_score = fetch_score(cursor, parent_id)
                if existing_comment_score is not None:
                    if score > existing_comment_score:
                        sql_insert_replace_comment(cursor, comment_id,parent_id,parent_data,body,subreddit,created_utc,score)
                 else:
                    if parent_data:
                        paired_rows += 1
                    sql_insert(cursor, comment_id,parent_id,parent_data,body,subreddit,created_utc,score)

            if row_counter % 100000 == 0:
                print('Total Rows Read: {}, Paired Rows: {}, Time: {}'.format(row_counter, paired_rows, str(datetime.now())))

if __name__ == '__main__':
    main()
...