1
這段代碼用XPath
表達式將IMDb網站(獲得電影的標題,年份,等級等)和返回結果,但其selector()
函數只能正確運行一次然後中斷。我如何解決它?爲什麼此函數只能正確運行一次,然後返回一個空列表?
#!/usr/bin/env python3
import lxml.html
import requests as rq
IMDB_HTML = "http://www.imdb.com/filmosearch"
IMDB_JSON = "http://www.imdb.com/xml/find"
class IMDBParser(object):
def __init__(self, role_type=None, sort_type='user_rating, desc',
job_type="actor", title_type="movie"):
self.job_type = job_type
self.sort_type = sort_type
self.title_type = title_type
self.role_type = role_type
self.params = {
'page': 0,
'sort': sort_type,
'role': role_type,
'job_type': job_type,
'title_type': title_type
}
def identity(self):
"""gets actor's name and extracts its id from
imdb website."""
response = rq.get(IMDB_JSON, params={'json': 1, 'nm': 'one',
'q': rq.compat.quote_plus(self.role_type)})
movie_dicts = response.json()
return movie_dicts.get('name_popular', 'name_approx')[0]['id']
def selector(self, expr):
"""gets an expression and extracts all matched then
returns a generator of each matching value."""
self.params['role'] = self.identity()
while True:
self.params['page'] += 1
response = rq.get(IMDB_HTML, params=self.params)
elements = lxml.html.fromstring(response.text).xpath(expr)
if not elements:
break
yield from (element.text for element in elements)
class IMDBApplication(IMDBParser):
def __init__(self, role_type=None, sort_type='user_rating, desc',
job_type="actor", title_type="movie"):
IMDBParser.__init__(self)
self.job_type = job_type
self.sort_type = sort_type
self.title_type = title_type
self.role_type = role_type
def get_titles(self):
"""passes the xpath expression to the function and gets
its return."""
expr = "//*/div/div[2]/div[3]/div/div[2]/h3/a[1]"
return self.selector(expr)
def get_scores(self):
"""passes the xpath expression to the function and gets
its return."""
expr = "//*/div[2]/div[3]/div/div[2]/div/div[1]/strong"
return self.selector(expr)
def get_years(self):
"""passes the xpath expression to the function and gets
its return."""
expr = "//*/div/div[2]/div[3]/div/div[2]/h3/span[2]"
return self.selector(expr)
if __name__ == "__main__":
ia1 = IMDBApplication("Daniel Craig")
print([i for i in ia1.get_titles()])
print([i for i in ia1.get_scores()])
究竟它是如何突破?請提供更多細節。 – Necreaux
@peterh - 代碼審查是爲工作代碼,而不是調試問題 – rolfl
@rolfl我可以理解,爲什麼。謝謝! – peterh