2011-11-20 64 views

回答

0

使用 BaseSpider 代替 CrawlSpider,然後透過 start_requests() 方法或 start_urls 列表來設定起始請求:

class MySpider(BaseSpider):
    """Example spider that supplies its initial request explicitly."""

    name = "myspider"

    def start_requests(self):
        """Return a one-element list holding the request for the example URL.

        The request is created with ``self.parse`` as its callback.
        """
        first_request = Request("https://www.example.com", callback=self.parse)
        return [first_request]

    def parse(self, response):
        """Wrap *response* in an ``HtmlXPathSelector``; the rest is elided."""
        hxs = HtmlXPathSelector(response)
        ...
0

class ThemenHubSpider(CrawlSpider):
    """Crawl spider limited to themen.t-online.de, started from one URL.

    NOTE(review): the ``parse_news`` callback named in ``rules`` is not
    defined in the visible lines -- confirm it exists on this class.
    """

    name = 'themenHub'
    allowed_domains = ['themen.t-online.de']
    start_urls = ["http://themen.t-online.de/themen-a-z/a"]
    # Raw string: the regex escape \d must reach the link extractor as-is,
    # and a plain literal would trigger Python's invalid-escape warning.
    rules = [Rule(SgmlLinkExtractor(allow=[r'id_\d+']), 'parse_news')]