我想從代碼中給出的html頁面中獲得五部電影的電影評分,年份,評分,流派和運行時間。這些位於稱爲結果的表格行中。Scraping htmlpage
from bs4 import BeautifulSoup
import urllib2
def read_from_url(url, num_m=5):
html_string = urllib2.urlopen(url)
soup = BeautifulSoup(html_string)
movie_table = soup.find('table', 'results') # table of movie
list_movies = []
count = 0
for row in movie_table.find_all("tr"):
dict_each_movie = {}
title = title.encode("ascii", "ignore") # getting title
dict_each_movie["title"] = title
year = year.encode("ascii","ignore") # getting year
dict_each_movie["year"] = year
rank = rank.encode("ascii","ignore") # getting rank
dict_each_movie["rank"] = rank
# genres = [] # getting genres of a movie
runtime = runtime.encode("ascii","ignore") # getting rank
dict_each_movie["runtime"] = runtime
list_movies.append(dict_each_movie)
count+=1
if count==num_of_m:
break
return list_movies
print read_from_url('http://www.imdb.com/search/title?at=0&sort=user_rating&start=1&title_type=feature&year=2005,2015',2)
預期輸出:
[{'rating': '10.0', 'genres': ['Comedy', 'Family'], 'title': 'How to Beat a Bully', 'rank': '1', 'year': '2014', 'runtime': '90'},..........]
title = title.encode(「ascii」,「ignore」)是什麼標題? – 2015-02-05 14:50:12