Модуль Python Pandas не может получить название фильма - PullRequest
0 голосов
/ 06 декабря 2018

У меня есть тестовый код для веб-скрейпинга, с которым я экспериментирую, но мне не удаётся получить названия всех фильмов с сайта. Вот код:

 from requests import get
 from bs4 import BeautifulSoup
 import pandas as pd

 # Fetch the first page of IMDb titles released in 2017, sorted by vote count.
 # The URL must be a single string literal; splitting it across lines is a
 # syntax error.
 url = 'http://www.imdb.com/search/title?release_date=2017&sort=num_votes,desc&page=1'

 response = get(url)
 # Fail on an HTTP error status instead of parsing an error page.
 response.raise_for_status()
 print(response.text[:500])

 html_soup = BeautifulSoup(response.text, 'html.parser')

 # A multi-class string given to class_ is matched against the exact value of
 # the element's class attribute, so it must not contain stray spaces or
 # line breaks.
 movie_containers = html_soup.find_all('div', class_='lister-item mode-advanced')
 print(type(movie_containers))
 print(len(movie_containers))
 if not movie_containers:
     raise SystemExit('No movie containers found on the page')

 # Inspect the first container to work out where each field lives.
 first_movie = movie_containers[0]
 first_name = first_movie.h3.a.text
 first_year = first_movie.h3.find('span', class_='lister-item-year text-muted unbold')

 print(first_movie.strong)

 first_imdb = float(first_movie.strong.text)
 print("IMDB= ", first_imdb)

 # Search by the single class 'metascore' (as the extraction loop does) so the
 # lookup does not depend on the score being rated 'favorable'.
 first_mscore_tag = first_movie.find('span', class_='metascore')
 first_mscore = int(first_mscore_tag.text) if first_mscore_tag is not None else None
 print("First MetaScore", first_mscore)

 first_votes = first_movie.find('span', attrs={'name': 'nv'})
 first_votes = int(first_votes['data-value'])
 print("First_Votes=", first_votes)

 # Metascore block of the eighth movie; None when the page lists fewer than
 # eight movies or that movie has no Metascore.
 eighth_movie_mscore = (
     movie_containers[7].find('div', class_='ratings-metascore')
     if len(movie_containers) > 7
     else None
 )


 # Lists to store the scraped data in
 names = []
 years = []
 imdb_ratings = []
 metascores = []
 votes = []

 # Extract data from individual movie container
 for container in movie_containers:
     # Skip movies without a Metascore. Everything below must run only for
     # movies that have one, so that all five lists stay the same length.
     if container.find('div', class_='ratings-metascore') is None:
         continue

     # Read every field first, then append, so a failure part-way through
     # cannot leave the lists with different lengths.
     name = container.h3.a.text
     year = container.h3.find('span', class_='lister-item-year').text
     imdb = float(container.strong.text)
     m_score = container.find('span', class_='metascore').text
     vote = container.find('span', attrs={'name': 'nv'})['data-value']

     names.append(name)
     years.append(year)
     imdb_ratings.append(imdb)
     metascores.append(int(m_score))
     votes.append(int(vote))

 test_df = pd.DataFrame({
     'movie': names,
     'year': years,
     'imdb': imdb_ratings,
     'metascore': metascores,
     'votes': votes,
 })
 # DataFrame.info() prints its summary itself and returns None; wrapping it
 # in print() only adds a stray "None" line.
 test_df.info()
 print(test_df)

В выводе не отображаются только названия фильмов, всё остальное отображается правильно, без каких-либо проблем. Вывод test_df.info():

RangeIndex: 46 entries, 0 to 45
Data columns (total 5 columns):
imdb         46 non-null float64
metascore    46 non-null int64
movie        46 non-null object
votes        46 non-null int64
year         46 non-null object
dtypes: float64(1), int64(2), object(2)
memory usage: 1.9+ KB

1 Ответ

0 голосов
/ 06 декабря 2018
from requests import get
from bs4 import BeautifulSoup
import pandas as pd

# Fetch the first page of IMDb titles released in 2017, sorted by vote count.
url = 'http://www.imdb.com/search/title?release_date=2017&sort=num_votes,desc&page=1'
response = get(url)
# Fail on an HTTP error status instead of parsing an error page.
response.raise_for_status()
print(response.text[:500])

html_soup = BeautifulSoup(response.text, 'html.parser')

# A multi-class string given to class_ is matched against the exact value of
# the element's class attribute.
movie_containers = html_soup.find_all('div', class_='lister-item mode-advanced')
print(type(movie_containers))
print(len(movie_containers))
if not movie_containers:
    raise SystemExit('No movie containers found on the page')

# Inspect the first container to work out where each field lives.
first_movie = movie_containers[0]
first_name = first_movie.h3.a.text
# The class string must not contain stray spaces ('text- muted unbold '),
# otherwise the exact-string comparison never matches and find() returns None.
first_year = first_movie.h3.find('span', class_='lister-item-year text-muted unbold')

print(first_movie.strong)

first_imdb = float(first_movie.strong.text)
print("IMDB= ", first_imdb)

# Search by the single class 'metascore' (as the extraction loop does) so the
# lookup does not depend on the score being rated 'favorable'.
first_mscore_tag = first_movie.find('span', class_='metascore')
first_mscore = int(first_mscore_tag.text) if first_mscore_tag is not None else None
print("First MetaScore", first_mscore)

first_votes = first_movie.find('span', attrs={'name': 'nv'})
first_votes = int(first_votes['data-value'])
print("First_Votes=", first_votes)

# Metascore block of the eighth movie; None when the page lists fewer than
# eight movies or that movie has no Metascore.
eighth_movie_mscore = (
    movie_containers[7].find('div', class_='ratings-metascore')
    if len(movie_containers) > 7
    else None
)


# Lists to store the scraped data in
names = []
years = []
imdb_ratings = []
metascores = []
votes = []

# Extract data from individual movie container
for container in movie_containers:
    # Skip movies without a Metascore so all five lists stay the same length.
    if container.find('div', class_='ratings-metascore') is None:
        continue

    # Read every field first, then append, so a failure part-way through
    # cannot leave the lists with different lengths.
    name = container.h3.a.text
    year = container.h3.find('span', class_='lister-item-year').text
    imdb = float(container.strong.text)
    m_score = container.find('span', class_='metascore').text
    vote = container.find('span', attrs={'name': 'nv'})['data-value']

    names.append(name)
    years.append(year)
    imdb_ratings.append(imdb)
    metascores.append(int(m_score))
    votes.append(int(vote))

test_df = pd.DataFrame({
    'movie': names,
    'year': years,
    'imdb': imdb_ratings,
    'metascore': metascores,
    'votes': votes,
})
# DataFrame.info() prints its summary itself and returns None; wrapping it
# in print() only adds a stray "None" line.
test_df.info()
print(test_df)

У меня это работает. [enter image description here]

Добро пожаловать на сайт PullRequest, где вы можете задавать вопросы и получать ответы от других членов сообщества.
...