Python извлекает и добавляет данные во фрейм данных - PullRequest
0 голосов
/ 02 июля 2018

Я просмотрел веб-сайт для своих исследований, но не смог найти правильный способ извлечь его во фрейм данных. Я считаю, что моя проблема связана с объектами списка, которые находятся между строками 36 и 38.

Строка печати работает очень хорошо, и я вижу окончательную версию фрейма данных в консоли Python.

Решение может быть очень простым, но я не мог понять это. Заранее спасибо за помощь.

from time import sleep
from bs4 import BeautifulSoup, SoupStrainer
import requests
import pandas as pd

# Highest page number available on the website; pages 1..highest_number
# are scraped.
highest_number = 12


def total_page_number(url):
    """Build the list of paginated search-result URLs.

    :param url: base search URL (page 1, carries no ``&page=`` suffix)
    :return: list of URLs covering pages 1 through ``highest_number``
    """
    all_webpage_links = [url]  # page 1 is the bare URL itself
    # FIX: range(2, highest_number) silently dropped the last page;
    # ``highest_number + 1`` makes the range inclusive of it.
    for page in range(2, highest_number + 1):
        all_webpage_links.append(url + '&page=' + str(page))
    return all_webpage_links


# Base IMDb search URL (Turkish titles, 250 results per page, simple view).
BASE_SEARCH_URL = ('https://www.imdb.com/search/title?countries=tr&languages=tr'
                   '&locations=Turkey&count=250&view=simple')

# Build the full list of result-page URLs to scrape.
All_page = total_page_number(BASE_SEARCH_URL)


def clean_text(text):
    """Collapse every run of whitespace in *text* to a single space.

    Leading and trailing whitespace is removed as well.

    :param text: the string to normalise
    :return: the cleaned string with single spaces between words
    """
    words = text.split()
    return ' '.join(words)


# Module-level accumulators filled by get_cast_from_link() during the
# scraping loop and read once at the end to build the final data frame.
actor_names, titles, dates = [], [], []


def get_cast_from_link(movie_link):
    """Fetch an IMDb movie page and record its cast overview.

    For each actor found, appends to the module-level ``actor_names``,
    ``titles`` and ``dates`` lists (``movie_title`` and ``movie_date``
    are read from module scope, set by the page-scraping loop) and
    prints one tab-separated ``title / actor / date`` line to stdout.

    :param movie_link: URL of the IMDb movie page (http://imdb.com...)
    :return: None
    """
    response = requests.get(movie_link)

    # Restrict parsing to the cast_list table; saves time on large pages.
    strainer = SoupStrainer('table', class_='cast_list')
    soup = BeautifulSoup(response.content, 'html.parser', parse_only=strainer)

    # Some <tr> rows are layout-only spacers without an itemprop="name"
    # cell; ``.find(...)`` returns None there and ``.text`` raises
    # AttributeError, which we use to skip those rows.
    for cast_row in soup.find_all('tr'):
        try:
            actor = clean_text(cast_row.find(itemprop='name').text)
        except AttributeError:
            continue
        actor_names.append(actor)
        titles.append(movie_title)
        dates.append(movie_date)
        print('\t'.join([movie_title, actor, movie_date]))


# FIX: the DataFrame was previously built and written to disk *before*
# the scraping loop ran, so the exported CSV only ever contained the
# three empty lists (the problem the surrounding question describes).
# The export now happens after every page has been processed.
for each in All_page:
    # Load one page of search results.
    web_page = requests.get(each)
    # https://www.imdb.com/search/title?countries=tr&languages=tr&count=250&view=simple&page=2

    # Strain just the lister-list div that holds the movie rows.
    list_strainer = SoupStrainer('div', class_='lister-list')

    # Parse the html content of the web page with BeautifulSoup.
    soup = BeautifulSoup(web_page.content, 'html.parser', parse_only=list_strainer)

    # One span per movie in the "Rank & Title" column of each row.
    movie_list = soup.find_all('span', class_='lister-item-header')
    for movie in movie_list:
        movie_title = movie.a.text
        movie_date = movie.find('span', class_='lister-item-year text-muted unbold').text

        # Follow the link to the movie's own IMDb page for the cast list.
        link = 'http://imdb.com' + movie.a.get('href')

        get_cast_from_link(link)
        # Be nice to the server: pause between requests.
        sleep(15)

# Export the accumulated lists as a data frame — this now runs only
# after scraping, so the lists are fully populated.
tsd_df = pd.DataFrame({'Actor_Names': actor_names,
                       'Movie_Title': titles,
                       'Movie_Date': dates})
tsd_df.to_csv('/Users/ea/Desktop/movie_df.tsv', encoding='utf-8')
Добро пожаловать на сайт PullRequest, где вы можете задавать вопросы и получать ответы от других членов сообщества.
...