2017-07-04

I want to follow all the internal links of a website while also recording both its internal and external links. I have just started using Scrapy and I cannot figure out how to crawl all the internal links of a site: Scrapy is not following internal links while crawling the website.

It just fetches the links but does not follow them.

class BRS(CrawlSpider):
    name = "brs"
    rules = (Rule(SgmlLinkExtractor(allow=()), callback='parse_obj', follow=True),)

    def __init__(self):
        global start_urls
        #settings.overrides['DEPTH_LIMIT'] = 10
        path = os.path.dirname(os.path.abspath(__file__))
        with open(os.path.join(path, "urls.txt"), "rt") as f:
            self.start_urls = filter(None, [url.strip() for url in f.readlines()])
        start_urls = self.start_urls

    def parse(self, response):
        brsitem = BrsItem()
        brsitem['url'] = response.url
        internal = LinkExtractor(allow_domains=[response.url])
        external = LinkExtractor(deny_domains=[response.url])
        links = internal.extract_links(response)
        internal = []
        fd = open('output.txt', 'a+')
        for link in links:
            internal.append(link.url)

        links = external.extract_links(response)
        external = []
        for link in links:
            external.append(link.url)
        for link in internal:
            fd.write(link + "\tinternal\n")

        for link in external:
            fd.write(link + "\texternal\n")

        return brsitem

My urls.txt currently contains: http://www.stackoverflow.com

Any help is appreciated.

Answer


Got it working using the reference from this link, and my IP got blocked on stackoverflow when I forgot to set the DEPTH_LIMIT parameter. Some things are learned the hard way.

import os
import scrapy
from scrapy.conf import settings
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.spiders import Rule, CrawlSpider
from scrapy.linkextractors import LinkExtractor
from urlparse import urlparse
from brs.items import BrsItem


class BRS(CrawlSpider):
    name = "brs"

    def __init__(self):
        settings.overrides['DEPTH_LIMIT'] = 10
        # Read the start URLs from urls.txt next to this file.
        path = os.path.dirname(os.path.abspath(__file__))
        with open(os.path.join(path, "urls.txt"), "r+") as f:
            self.start_urls = filter(None, [url.strip() for url in f.readlines()])

        # CrawlSpider.__init__ is not called here, so it never compiles
        # self.rules into self._rules; assign _rules directly with a bound
        # method as the callback. The callback must not be named 'parse',
        # because CrawlSpider uses parse() internally to apply the rules.
        self.rules = (Rule(SgmlLinkExtractor(allow=()), callback=self.parse_items, follow=True),)
        self._rules = self.rules

    def extract_domain(self, url):
        # allow_domains/deny_domains expect domain names, not full URLs.
        return urlparse(url).netloc

    def parse_items(self, response):
        internal = LinkExtractor(allow_domains=[self.extract_domain(response.url)])
        external = LinkExtractor(deny_domains=[self.extract_domain(response.url)])

        internal_links = [link.url for link in internal.extract_links(response)]
        external_links = [link.url for link in external.extract_links(response)]

        with open('output.txt', 'a+') as fd:
            for link in internal_links:
                fd.write(link + "\tinternal\n")
            for link in external_links:
                fd.write(link + "\texternal\n")

        # Explicitly follow the internal links as well.
        for link in internal_links:
            yield scrapy.Request(link.strip(), callback=self.parse_attr)

    def parse_attr(self, response):
        brsitem = BrsItem()
        brsitem['url'] = response.url.strip()
        return brsitem
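
A note for readers on current Scrapy releases: the spider above relies on APIs that have since been deprecated and removed (scrapy.conf, settings.overrides, SgmlLinkExtractor), and the reason the original spider did not follow links is that it overrides parse(), which CrawlSpider needs internally to drive its rules. What follows is a minimal sketch, assuming a recent Scrapy version; the spider and callback names are illustrative placeholders, not from the post. It sets the DEPTH_LIMIT mentioned above through the spider's custom_settings instead of settings.overrides and reads start URLs from the same urls.txt.

import os

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class BrsSketchSpider(CrawlSpider):
    # Name and callback are illustrative placeholders, not from the original post.
    name = "brs_sketch"
    # Replaces settings.overrides['DEPTH_LIMIT'] = 10 from the answer above.
    custom_settings = {"DEPTH_LIMIT": 10}
    # The callback must not be called 'parse'; CrawlSpider uses parse() itself.
    rules = (Rule(LinkExtractor(), callback="parse_page", follow=True),)

    def __init__(self, *args, **kwargs):
        super(BrsSketchSpider, self).__init__(*args, **kwargs)
        # Same urls.txt convention as the spiders above.
        path = os.path.dirname(os.path.abspath(__file__))
        with open(os.path.join(path, "urls.txt")) as f:
            self.start_urls = [url.strip() for url in f if url.strip()]

    def parse_page(self, response):
        # Record the crawled URL; following links is handled by the rule above.
        yield {"url": response.url}

Assuming the project is otherwise set up, it can be run with scrapy crawl brs_sketch -o links.json to dump the collected URLs.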