2015-04-23 37 views
1

這段代碼用XPath表達式將IMDb網站(獲得電影的標題,年份,等級等)和返回結果,但其selector()函數只能正確運行一次然後中斷。我如何解決它?爲什麼此函數只能正確運行一次,然後返回一個空列表?

#!/usr/bin/env python3 
import lxml.html 
import requests as rq 


IMDB_HTML = "http://www.imdb.com/filmosearch" 
IMDB_JSON = "http://www.imdb.com/xml/find" 


class IMDBParser(object): 
    def __init__(self, role_type=None, sort_type='user_rating, desc', 
       job_type="actor", title_type="movie"): 
     self.job_type = job_type 
     self.sort_type = sort_type 
     self.title_type = title_type 
     self.role_type = role_type 
     self.params = { 
      'page': 0, 
      'sort': sort_type, 
      'role': role_type, 
      'job_type': job_type, 
      'title_type': title_type 
     } 

    def identity(self): 
     """gets actor's name and extracts its id from 
     imdb website.""" 
     response = rq.get(IMDB_JSON, params={'json': 1, 'nm': 'one', 
          'q': rq.compat.quote_plus(self.role_type)}) 
     movie_dicts = response.json() 
     return movie_dicts.get('name_popular', 'name_approx')[0]['id'] 

    def selector(self, expr): 
     """gets an expression and extracts all matched then 
     returns a generator of each matching value.""" 
     self.params['role'] = self.identity() 
     while True: 
      self.params['page'] += 1 
      response = rq.get(IMDB_HTML, params=self.params) 
      elements = lxml.html.fromstring(response.text).xpath(expr) 
      if not elements: 
       break 
      yield from (element.text for element in elements) 


class IMDBApplication(IMDBParser): 
    def __init__(self, role_type=None, sort_type='user_rating, desc', 
       job_type="actor", title_type="movie"): 
     IMDBParser.__init__(self) 
     self.job_type = job_type 
     self.sort_type = sort_type 
     self.title_type = title_type 
     self.role_type = role_type 

    def get_titles(self): 
     """passes the xpath expression to the function and gets 
     its return.""" 
     expr = "//*/div/div[2]/div[3]/div/div[2]/h3/a[1]" 
     return self.selector(expr) 

    def get_scores(self): 
     """passes the xpath expression to the function and gets 
     its return.""" 
     expr = "//*/div[2]/div[3]/div/div[2]/div/div[1]/strong" 
     return self.selector(expr) 

    def get_years(self): 
     """passes the xpath expression to the function and gets 
     its return.""" 
     expr = "//*/div/div[2]/div[3]/div/div[2]/h3/span[2]" 
     return self.selector(expr) 

if __name__ == "__main__": 
    ia1 = IMDBApplication("Daniel Craig") 
    print([i for i in ia1.get_titles()]) 
    print([i for i in ia1.get_scores()]) 
+1

究竟它是如何突破?請提供更多細節。 – Necreaux

+0

@peterh - 代碼審查是爲工作代碼,而不是調試問題 – rolfl

+0

@rolfl我可以理解,爲什麼。謝謝! – peterh

回答

1

的問題是:

self.params['page'] += 1 

其中第一個要求,你增加的頁面數量,直到你沒有得到任何結果。但是,你永遠不會重置它。如果你改變你的selector功能:

def selector(self, expr): 
    """gets an expression and extracts all matched then 
    returns a generator of each matching value.""" 
    self.params['role'] = self.identity() 
    while True: 
     self.params['page'] += 1 
     response = rq.get(IMDB_HTML, params=self.params) 
     elements = lxml.html.fromstring(response.text).xpath(expr) 
     if not elements: 
      break 
     yield from (element.text for element in elements) 
    self.params['page'] = 0 

它工作正常,賦予:

['Casino Royale', 'The Girl with the Dragon Tattoo', 'One Life', 'Skyfall', 'Road to Perdition', 'Munich', 'Elizabeth', 'Layer Cake', 'The Adventures of Tintin: The Secret of the Unicorn', 'Defiance', 'The Power of One', 'The Jacket', 'Infamous', 'Sorstalanság', 'The Mother', 'Flashbacks of a Fool', 'Renaissance', 'Ten Minutes Older: The Cello', 'Quantum of Solace', 'Some Voices', 'Love Is the Devil: Study for a Portrait of Francis Bacon', 'Hotel Splendide', 'Enduring Love', 'Sylvia', 'The Golden Compass', 'Cowboys & Aliens', 'The Trench', 'Dream House', 'The Invasion', 'Lara Croft: Tomb Raider', 'I Dreamed of Africa', 'Obsession', 'Love & Rage', 'Saint-Ex', "A Kid in King Arthur's Court", 'Spectre', 'The Girl Who Played with Fire', 'Bond 25', "The Girl Who Kicked the Hornets' Nest"] 
['8.0', '7.9', '7.9', '7.8', '7.7', '7.6', '7.5', '7.4', '7.4', '7.2', '7.2', '7.1', '7.1', '7.1', '6.9', '6.8', '6.8', '6.8', '6.7', '6.7', '6.6', '6.5', '6.4', '6.3', '6.1', '6.1', '6.1', '5.9', '5.9', '5.7', '5.5', '5.3', '5.3', '5.1', '4.7'] 
相關問題