# Scrape IMDb search-result pages for every year in `years_url` and every page
# in `pages`. For each movie that shows a Metascore, one entry is appended to
# each result list: names, genres, runtimes, years, imdb_ratings, metascores,
# votes and grossmill.
for year_url in years_url:
    # For every page in `pages`
    for page in pages:
        # Make a get request
        response = requests.get('http://www.imdb.com/search/title?release_date=' + year_url +
                                '&sort=num_votes,desc&page=' + page)
        # Parse the content of the request with BeautifulSoup
        page_html = BeautifulSoup(response.text, 'lxml')
        # Select all the movie containers from a single page
        mv_containers = page_html.find_all('div', class_='lister-item mode-advanced')
        # For every movie on the page
        for container in mv_containers:
            # Skip movies that do not show a Metascore
            if container.find('div', class_='ratings-metascore') is None:
                continue

            # Scrape the name
            name = container.h3.a.text
            names.append(name)
            # Scrape the genre as a list of comma-separated entries
            genre = container.p.find('span', class_='genre').text.rstrip().replace("\n", "").split(",")
            genres.append(genre)
            # Scrape the runtime
            runtime = container.p.find('span', class_='runtime').text
            runtimes.append(runtime)
            # Scrape the year
            year = container.h3.find('span', class_='lister-item-year').text
            years.append(year)
            # Scrape the IMDB rating
            imdb = float(container.strong.text)
            imdb_ratings.append(imdb)
            # Scrape the Metascore
            m_score = container.find('span', class_='metascore').text
            metascores.append(int(m_score))
            # Scrape the number of votes
            vote = container.find('span', attrs={'name': 'nv'})['data-value']
            votes.append(int(vote))
            # Scrape the gross. Some movies have a Metascore but no "Gross:"
            # label; find() then returns None, so the label must be checked
            # before calling find_next() on it.
            gross_label = container.find('span', text='Gross:')
            gross_span = gross_label.find_next('span') if gross_label is not None else None
            if gross_span is not None:
                gross = int(gross_span['data-value'].replace(',', ''))
            else:
                # Placeholder keeps grossmill the same length as the other lists.
                gross = None
            print(gross)
            grossmill.append(gross)
我无法从上述 URL 中抓取票房(gross)数据。vote 和 gross 所在的 span 具有相同的属性,因此我很难从下面的链接中单独提取 gross 数据;上面的代码展示了我是如何抓取票数(vote)的。链接:https://www.imdb.com/search/title/?release_date=2019&sort=num_votes,desc&page=1
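您的报错说明:对于某些电影,使用 `text='Gross:'` 没有找到对应的 span(`find()` 返回了 None)。
从您正在抓取的 IMDb 页面可以看到,有些电影虽然有 Metascore,却没有显示票房(gross)数据。
因此,在对查找结果调用 `find_next()` 之前,应该先检查该 span 是否存在。请将提取 gross 的那一行替换为:

    gross_label = container.find('span', text='Gross:')
    gross = int(gross_label.find_next('span')['data-value'].replace(',', '')) if gross_label is not None else None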
作者:
相关问题 更多 >
编程相关推荐