jupyter notebook: _csv.Error: итератор должен возвращать строки, а не байты (вы открывали файл в текстовом режиме?) - PullRequest
0 голосов
/ 27 января 2020

Я пытаюсь запустить следующий код в Jupyter Notebook, используя Python 3. У меня следующая ошибка

_csv.Error: итератор должен возвращать строки, а не байты (вы открывали файл в текстовом режиме?)

Не могли бы вы помочь?

import os
from datetime import datetime, timedelta, date
import csv
import threading

from tqdm import tqdm
import requests

DATA_DIRECTORY = 'data'


class Collector(threading.Thread):
    def __init__(self, region, start_date, end_date):
        super(Collector, self).__init__()
        self.region = region
        self.start_date = start_date
        self.end_date = end_date
        self.base_headers = ['Position', 'Track Name', 'Artist', 'Streams', 'URL']

    def date_range(self):
        one_day = timedelta(days=1)
        current_date = self.start_date
        while current_date <= self.end_date:
            yield current_date
            current_date += one_day

    def is_csv_ok(self, download_content):
        csv_reader = csv.reader(download_content.splitlines(), delimiter=',')
        headers = csv_reader.__next__()
        return set(headers) == set(self.base_headers)

    def download_csv_file(self, url):
        with requests.Session() as session:
            session.headers.update({'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                                    'Accept-Encoding': 'gzip, deflate',
                                    'Accept-Language': 'pt-BR,pt;q=0.8,en-US;q=0.5,en;q=0.3',
                                    'Connection': 'keep-alive',
                                    'Host': 'spotifycharts.com',
                                    'Referer': 'https://spotifycharts.com/regional/ad/weekly/latest',
                                    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0'})
            retry = 3
            while True:
                download = session.get(url, verify=False)
                if self.is_csv_ok(download.content):
                    return download.content
                print ("Retrying for '%s'" % url)
                retry -= 1
                if retry <= 0:
                    print ("Retry failed for '%s'" % url)
                    return None

    def extract_csv_rows(self, csv_file):
        csv_reader = csv.reader(csv_file.splitlines(), delimiter=',')
        # Skip headers
        csv_reader.__next__()
        for row in csv_reader:
            yield row

    def run(self):
        if not os.path.exists(DATA_DIRECTORY):
            os.makedirs(DATA_DIRECTORY)

        headers = self.base_headers + ["Date", "Region"]

        file_path = os.path.join(DATA_DIRECTORY, "%s.csv" % self.region)
        if os.path.exists(file_path):
            print ("File '%s' already exists, skipping" % file_path)

        with open(file_path, 'w', 1) as out_csv_file:
            writer = csv.writer(out_csv_file)
            writer.writerow(headers)

            for current_date in tqdm(self.date_range(), desc="Collecting from '%s'" % self.region):
                url = "https://spotifycharts.com/regional/%s/daily/%s/download" % (self.region, current_date)
                csv_file = self.download_csv_file(url)
                if csv_file is None:
                    continue

                for row in self.extract_csv_rows(csv_file):
                    row.extend([current_date, self.region])
                    writer.writerow(row)

    @staticmethod
    def generate_final_file():
        final_filename = 'data.csv'

        with open(final_filename, 'w') as outfile:
            csv_writer = csv.writer(outfile)
            csv_writer.writerow(['Position', 'Track Name', 'Artist', 'Streams', 'URL'])
            for filename in tqdm(os.listdir(DATA_DIRECTORY), desc="Generating final file: %s" % final_filename):
                if filename.endswith(".csv"):
                    with open(os.path.join(DATA_DIRECTORY, filename)) as infile:
                        csv.reader(infile)
                        csv_reader.__next__()
                        for row in csv_reader:
                            csv_writer.writerow(row)


if __name__ == "__main__":

    one_day = timedelta(days=1)
    start_date = date(2017, 1, 1)
    end_date = datetime.now().date() - (2 * one_day)

    regions = ["global", "us", "gb", "ad", "ar", "at", "au", "be", "bg",
               "bo", "br", "ca", "ch", "cl", "co", "cr", "cy", "cz", "de",
               "dk", "do", "ec", "ee", "es", "fi", "fr", "gr", "gt", "hk",
               "hn", "hu", "id", "ie", "is", "it", "jp", "lt", "lu", "lv",
               "mc", "mt", "mx", "my", "ni", "nl", "no", "nz", "pa", "pe",
               "ph", "pl", "pt", "py", "se", "sg", "sk", "sv", "tr", "tw", "uy"]

    for region in regions:
        collector = Collector(region, start_date, end_date)
        collector.start()
    Collector.generate_final_file()
Добро пожаловать на сайт PullRequest, где вы можете задавать вопросы и получать ответы от других членов сообщества.
...