Scrapy: crawl offsite links 1 level deep

In Scrapy, how can I make the crawler follow links outside the allowed domains to a depth of only 1? During the crawl I want to verify that all of the site's outbound links work and aren't 404'd, but I don't want it to crawl the entire website of a non-allowed domain. I'm already handling 404s on the allowed domains. I know I could set DEPTH_LIMIT to 1, but that would affect the allowed domains as well.
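For reference, a minimal sketch of the setting mentioned above, assuming a standard settings.py: Scrapy's DepthMiddleware enforces DEPTH_LIMIT on every request, on-site and off-site alike, which is exactly why it can't solve this on its own.

# settings.py
# DepthMiddleware applies this limit to all requests, so a value of 1
# also stops the crawl on the allowed domains after one level.
DEPTH_LIMIT = 1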

My code:

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

from smcrawl.items import Website 
import smcrawl.util 

def iterate(lists):
    """Return the first element of a list, or None if it is empty."""
    for a in lists:
        return a

class WalmartSpider(CrawlSpider): 
    handle_httpstatus_list = [200, 302, 404, 500, 502] 
    name = "surveymonkeycouk" 
    allowed_domains = ["surveymonkey.co.uk", "surveymonkey.com"]  

    start_urls = ['https://www.surveymonkey.co.uk/']  

    rules = (
        Rule(
            LinkExtractor(
                allow=(),
                deny=(),
                process_value=smcrawl.util.trim),
            callback="parse_items",
            follow=True),
    )
    # process_links=lambda links: [link for link in links if not link.nofollow]
    # would filter out the nofollow links

    # CrawlSpider does not apply the rule callback to the start URLs by
    # default, so run them through parse_items here and return the items.
    def parse_start_url(self, response):
        return self.parse_items(response)

    def parse_items(self, response):
        sites = response.selector.xpath('//html')
        items = []

        for site in sites:
            # Fields recorded for every status in handle_httpstatus_list.
            item = Website()
            item['url'] = response.url
            item['referer'] = response.request.headers.get('Referer')
            item['canonical'] = site.xpath('//head/link[@rel="canonical"]/@href').extract()
            item['robots'] = site.xpath('//meta[@name="robots"]/@content').extract()
            item['original_url'] = response.meta.get('redirect_urls', [response.url])[0]
            item['description'] = site.xpath('//meta[@name="description"]/@content').extract()
            item['redirect'] = response.status

            # Title and h1 are skipped for 404 pages; every other status
            # (200, 302, 500, 502, anything else) extracts them identically.
            if response.status != 404:
                titles = site.xpath('/html/head/title/text()').extract()
                try:
                    titles = iterate(titles)
                    titles = titles.strip()
                except AttributeError:  # iterate() returned None: no <title>
                    pass
                item['title'] = titles

                h1 = site.xpath('//h1/text()').extract()
                try:
                    h1 = iterate(h1)
                    h1 = h1.strip()
                except AttributeError:  # iterate() returned None: no <h1>
                    pass
                item['h1'] = h1

            items.append(item)

        return items

Could you share your code so far? – eLRuLL


@eLRuLL Added the code – MMonkey

Answer

Well, one thing you could do is avoid using allowed_domains, so that you don't filter out any offsite requests.

Instead, to make it interesting, you can create your own OffsiteMiddleware, something like this:

from scrapy.spidermiddlewares.offsite import OffsiteMiddleware
from scrapy.utils.httpobj import urlparse_cached

class MyOffsiteMiddleware(OffsiteMiddleware):
    # Offsite hosts that have already been let through once.
    offsite_domains = set()

    def should_follow(self, request, spider):
        regex = self.host_regex
        host = urlparse_cached(request).hostname or ''
        if host in self.offsite_domains:
            return False
        if not bool(regex.search(host)):
            self.offsite_domains.add(host)
        return True
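The idea behind the set: the first request to each offsite host is allowed through, so its response can still be checked for a 404, and the host is added to offsite_domains so that every later request to it is dropped. That stops the crawl from descending any further into the external site.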

I haven't tested it, but it should work. Keep in mind that you should disable the default middleware and enable yours in your settings:

SPIDER_MIDDLEWARES = { 
    'myproject.middlewares.MyOffsiteMiddleware': 543, 
    'scrapy.spidermiddlewares.offsite.OffsiteMiddleware': None, 
} 
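Setting the built-in scrapy.spidermiddlewares.offsite.OffsiteMiddleware entry to None disables it; that is how Scrapy lets you swap a stock middleware for a subclass without running both. The number (543 here) only determines where the middleware sits in the processing chain.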

Can you define 'offsite_requests'? – MMonkey


Sorry, that was a typo – eLRuLL


'self.offsite_domains'. How would I go about crawling all of the start domains in full, but only going 1 level deep on the external domains? – MMonkey