2013-07-12 59 views




我認爲我代碼中的 Rules/LinkExtractor 部分存在問題。Python 代碼可以運行,也會抓取起始 URL,但既沒有解析頁面,也沒有執行任何後續任務。


  1. 有些法案沒有多個版本(因此頁面正文中也就沒有指向各版本的鏈接)。
  2. 有些法案沒有帶鏈接的章節:它們太短了,頁面正文中沒有任何鏈接;而另一些法案的頁面則只有指向各章節的鏈接。
  3. 有些章節鏈接指向的頁面並不只包含該章節的內容,其中大部分只是多餘地夾帶了前一章節或後一章節的內容。


from scrapy.item import Item, Field 
from scrapy.contrib.spiders import CrawlSpider, Rule 
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor 
from scrapy.selector import HtmlXPathSelector 

class BillItem(Item): 
    """Scrapy item holding the data scraped from one bill page."""
    # Text of the <p> elements found under the page's div#content (set in parse_bills).
    title = Field() 
    # Raw body of the response the item was built from (set in parse_bills).
    body = Field() 

class VersionItem(Item): 
    """Scrapy item holding the data scraped from one bill-version page."""
    # Text of the center/b elements found under div#content (set in parse_versions).
    title = Field() 
    # Raw body of the response the item was built from (set in parse_versions).
    body = Field() 

class SectionItem(Item): 
    """Scrapy item holding the data scraped from one section page."""
    # Raw body of the response the item was built from (set in parse_sections).
    body = Field() 

class Lrn2CrawlSpider(CrawlSpider):
    """Spider that starts from the thomas.loc.gov query pages of bills H.R.1 to H.R.40."""

    name = "lrn2crawl"
    allowed_domains = ["thomas.loc.gov"]
    # Sample of 40 bills (H.R.1 through H.R.40); total range of bills is 1-5767.
    # The bounds are plain decimal integers on purpose: zero-padded literals
    # such as 00040 are octal in Python 2 (00040 == 32) and a syntax error in
    # Python 3, so they would not produce the intended 40 URLs.
    start_urls = [
        "http://thomas.loc.gov/cgi-bin/query/z?c107:H.R.%s:" % bill
        for bill in range(1, 41)
    ]


# NOTE(review): at this indentation `rules` is a module-level name rather than
# an attribute of the spider class; CrawlSpider looks rules up on the class, so
# confirm that this tuple is meant to live inside the class body.
rules = (
    # Extract links matching the /query/ fragment (restricted to those inside
    # the content body of the page), hand each fetched page to parse_bills and
    # keep following links from it (follow=True is set explicitly because a
    # callback is given).
    # Desired result: scrape all bill text & in the event that there are
    # multiple versions, follow them & parse.
    Rule(
        SgmlLinkExtractor(allow=r'/query/', restrict_xpaths='//div[@id="content"]'),
        callback='parse_bills',
        follow=True,
    ),

    # Extract links in the body of a bill-version & follow them.
    # Desired result: scrape all version text & in the event that there are
    # multiple sections, follow them & parse.
    Rule(
        SgmlLinkExtractor(restrict_xpaths='//div/a[2]'),
        callback='parse_versions',
        follow=True,
    ),
)

def parse_bills(self, response):
    """Build one BillItem per ``div#content`` element of the given page.

    Args:
        response: the fetched page; its raw body is stored on every item.

    Returns:
        A list of BillItem objects, empty when the page has no
        ``div#content`` element.
    """
    hxs = HtmlXPathSelector(response)
    bills = hxs.select('//div[@id="content"]')
    scraped_bills = []
    for bill in bills:
        scraped_bill = BillItem()
        scraped_bill['title'] = bill.select('p/text()').extract()
        scraped_bill['body'] = response.body
        # Collect the item; without this append the function always
        # returned an empty list and nothing was ever scraped.
        scraped_bills.append(scraped_bill)
    return scraped_bills

def parse_versions(self, response):
    """Build one VersionItem per ``div#content`` element of the given page.

    Args:
        response: the fetched page; its raw body is stored on every item.

    Returns:
        A list of VersionItem objects, empty when the page has no
        ``div#content`` element.
    """
    hxs = HtmlXPathSelector(response)
    versions = hxs.select('//div[@id="content"]')
    scraped_versions = []
    for version in versions:
        scraped_version = VersionItem()
        scraped_version['title'] = version.select('center/b/text()').extract()
        scraped_version['body'] = response.body
        # Collect the item; without this append the function always
        # returned an empty list and nothing was ever scraped.
        scraped_versions.append(scraped_version)
    return scraped_versions

def parse_sections(self, response):
    """Build one SectionItem per ``div#content`` element of the given page.

    NOTE(review): none of the rules shown in this file names this function
    as a callback, so it is currently never invoked by the crawl.

    Args:
        response: the fetched page; its raw body is stored on every item.

    Returns:
        A list of SectionItem objects, empty when the page has no
        ``div#content`` element.
    """
    hxs = HtmlXPathSelector(response)
    sections = hxs.select('//div[@id="content"]')
    scraped_sections = []
    # The matched elements are only counted: every item carries the whole
    # response body rather than the content of an individual element.
    for _ in sections:
        scraped_section = SectionItem()
        scraped_section['body'] = response.body
        # Collect the item; without this append the function always
        # returned an empty list and nothing was ever scraped.
        scraped_sections.append(scraped_section)
    return scraped_sections

# Instantiates the spider as soon as this module is imported.
# NOTE(review): the answer that follows reports that the spider runs with
# `scrapy runspider lrn2crawl.py` once this line is removed.
spider = Lrn2CrawlSpider() 



我只是修正了縮進,刪除了腳本末尾的 spider = Lrn2CrawlSpider() 這一行,然後通過 scrapy runspider lrn2crawl.py 運行爬蟲,它就能抓取頁面、跟進鏈接並返回項目——你的規則是有效的。


from scrapy.item import Item, Field 
from scrapy.contrib.spiders import CrawlSpider, Rule 
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor 
from scrapy.selector import HtmlXPathSelector 

class BillItem(Item): 
    """Scrapy item holding the data scraped from one bill page."""
    # Text of the <p> elements found under the page's div#content (set in parse_bills).
    title = Field() 
    # Raw body of the response the item was built from (set in parse_bills).
    body = Field() 

class VersionItem(Item): 
    """Scrapy item holding the data scraped from one bill-version page."""
    # Text of the center/b elements found under div#content (set in parse_versions).
    title = Field() 
    # Raw body of the response the item was built from (set in parse_versions).
    body = Field() 

class SectionItem(Item): 
    """Scrapy item holding the data scraped from one section page."""
    # Raw body of the response the item was built from (set in parse_sections).
    body = Field() 

class Lrn2CrawlSpider(CrawlSpider):
    """Crawl thomas.loc.gov bill pages and emit bill and version items.

    The crawl starts at the query pages of bills H.R.1 through H.R.40 and
    follows links according to ``rules``.
    """

    name = "lrn2crawl"
    allowed_domains = ["thomas.loc.gov"]
    # Sample of 40 bills (H.R.1 through H.R.40); total range of bills is 1-5767.
    # The bounds are plain decimal integers on purpose: zero-padded literals
    # such as 00040 are octal in Python 2 (00040 == 32) and a syntax error in
    # Python 3, so they would not produce the intended 40 URLs.
    start_urls = [
        "http://thomas.loc.gov/cgi-bin/query/z?c107:H.R.%s:" % bill
        for bill in range(1, 41)
    ]

    rules = (
        # Extract links matching the /query/ fragment (restricted to those
        # inside the content body of the page), hand each fetched page to
        # parse_bills and keep following links from it (follow=True is set
        # explicitly because a callback is given).
        # Desired result: scrape all bill text & in the event that there are
        # multiple versions, follow them & parse.
        Rule(
            SgmlLinkExtractor(allow=r'/query/', restrict_xpaths='//div[@id="content"]'),
            callback='parse_bills',
            follow=True,
        ),

        # Extract links in the body of a bill-version & follow them.
        # Desired result: scrape all version text & in the event that there
        # are multiple sections, follow them & parse.
        Rule(
            SgmlLinkExtractor(restrict_xpaths='//div/a[2]'),
            callback='parse_versions',
            follow=True,
        ),
    )

    def parse_bills(self, response):
        """Build one BillItem per ``div#content`` element of the given page.

        Args:
            response: page fetched for a link matched by the first rule;
                its raw body is stored on every item.

        Returns:
            A list of BillItem objects, empty when the page has no
            ``div#content`` element.
        """
        hxs = HtmlXPathSelector(response)
        bills = hxs.select('//div[@id="content"]')
        scraped_bills = []
        for bill in bills:
            scraped_bill = BillItem()
            scraped_bill['title'] = bill.select('p/text()').extract()
            scraped_bill['body'] = response.body
            # Collect the item; without this append the method always
            # returned an empty list and nothing was ever scraped.
            scraped_bills.append(scraped_bill)
        return scraped_bills

    def parse_versions(self, response):
        """Build one VersionItem per ``div#content`` element of the given page.

        Args:
            response: page fetched for a link matched by the second rule;
                its raw body is stored on every item.

        Returns:
            A list of VersionItem objects, empty when the page has no
            ``div#content`` element.
        """
        hxs = HtmlXPathSelector(response)
        versions = hxs.select('//div[@id="content"]')
        scraped_versions = []
        for version in versions:
            scraped_version = VersionItem()
            scraped_version['title'] = version.select('center/b/text()').extract()
            scraped_version['body'] = response.body
            # Collect the item; without this append the method always
            # returned an empty list and nothing was ever scraped.
            scraped_versions.append(scraped_version)
        return scraped_versions

    def parse_sections(self, response):
        """Build one SectionItem per ``div#content`` element of the given page.

        NOTE(review): no entry in ``rules`` names this method as a callback,
        so it is currently never invoked by the crawl.

        Args:
            response: the fetched page; its raw body is stored on every item.

        Returns:
            A list of SectionItem objects, empty when the page has no
            ``div#content`` element.
        """
        hxs = HtmlXPathSelector(response)
        sections = hxs.select('//div[@id="content"]')
        scraped_sections = []
        # The matched elements are only counted: every item carries the whole
        # response body rather than the content of an individual element.
        for _ in sections:
            scraped_section = SectionItem()
            scraped_section['body'] = response.body
            # Collect the item; without this append the method always
            # returned an empty list and nothing was ever scraped.
            scraped_sections.append(scraped_section)
        return scraped_sections



是的,這確實有幫助,基本上只要刪除最後一行「spider = [...]」,腳本就能運行了。但我仍然不明白爲什麼。當我在調試模式下運行腳本時,它提示在「rules = ([...]」處有語法錯誤,所以我才認爲問題出在那裏。奇怪的是,腳本能運行卻不執行任務;是調試信息把我引向了錯誤的方向嗎?也許是我弄錯了。無論如何,這對我幫助很大。 –



def __init__(self, *a, **kw): 
    """Forward all positional and keyword arguments to the next ``__init__``.

    NOTE(review): this reads like a method excerpt (it takes ``self`` and
    calls ``super``), but the enclosing class is not visible here -- confirm
    where it belongs.
    """
    # super(CrawlSpider, self) starts the lookup *after* CrawlSpider in the
    # method resolution order of type(self), so CrawlSpider's own __init__ is
    # not the one invoked by this call.
    super(CrawlSpider, self).__init__(*a, **kw) 
