
Scrapy does not crawl all of the start_urls

I have a list of ~2211 start URLs, and Scrapy crawls some of them but not all. If I set start_urls to a single URL, it gets crawled; if the same URL is part of the large list, Scrapy does not crawl it.

Is there a limit on how many start_urls can be set?

My code:

from pymongo import MongoClient 
import re 
from scrapy.selector import Selector 
#from scrapy.contrib.spiders import CrawlSpider, Rule 
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor 

from mongo.items import MongoItem 
import scrapy 
import json 
from scrapy.http import Request 
from bs4 import BeautifulSoup as BS 


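# Pull previously stored page content from MongoDB and extract every
# href from it; these hrefs become the spider's start URLs.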
uri = "mongodb://[email protected]:23423423/" 
client = MongoClient(uri) 
db = client['page_content'] 
collection3 = db['category_page_content'] 
copyblocks3 = collection3.distinct('cwc') 
copyblockss = str(copyblocks3) 

hrefs = re.findall(r'href=[\'"]?([^\'" >]+)', copyblockss) 

class MongoSpider(scrapy.Spider): 
    name = "collections3" 
    allowed_domains = ["www.ecommerce.com"] 
    handle_httpstatus_list = [502, 503, 504, 400, 408, 404] 
    start_urls = hrefs 

    def parse(self, response):
        hxs = Selector(response)
        sites = response.selector.xpath('//html')
        items = []

        if response.status == 404:
            for site in sites:
                item = MongoItem()
                item['url'] = response.url
                item['status'] = response.status
                item['original_url'] = response.meta.get('redirect_urls', [response.url])[0]
                items.append(item)

                htmlvar = item['original_url']
                change_list = list(collection3.find({"cwc":{"$regex":htmlvar}}))

                alldata = dict()
                cwcblockdic = ""
                for a in change_list:
                    alldata.update(a)
                ids = alldata['_id']
                cwcblock = alldata['cwc']
                cwcblockdic = cwcblockdic + cwcblock

                soup = BS(cwcblockdic)
                wholehref = soup.find(href=htmlvar)
                try:
                    anchortext = soup.findAll(href=htmlvar)[0].text
                except:
                    anchortext = wholehref.get_text()
                soup.find(href=htmlvar).replaceWith(anchortext)
                soup = str(soup)
                newlist = soup.replace('<html><body>', '').replace('</body></html>','')

                print "this is the anchor:", anchortext
                print "this is the href:", wholehref
                print "this is newlist:", newlist
                print "this is the id:", ids
                print "this is pagetype: CP"

                for item in change_list:
                    item['cwc'] = newlist
                    collection3.update({'_id':ids}, {"$set":{"cwc":item['cwc']}}, upsert=False)
            return items

        elif hxs.xpath('/html/head/title/text()[contains(.,"invalid")]'):
            for site in sites:
                item = MongoItem()
                item['url'] = response.url
                item['status'] = response.status
                item['original_url'] = response.meta.get('redirect_urls', [response.url])[0]
                items.append(item)

                htmlvar = item['original_url']
                change_list = list(collection3.find({"cwc":{"$regex":htmlvar}}))

                alldata = dict()
                cwcblockdic = ""
                for a in change_list:
                    alldata.update(a)
                ids = alldata['_id']
                cwcblock = alldata['cwc']
                cwcblockdic = cwcblockdic + cwcblock

                soup = BS(cwcblockdic)
                wholehref = soup.find(href=htmlvar)
                try:
                    anchortext = soup.findAll(href=htmlvar)[0].text
                except:
                    anchortext = wholehref.get_text()
                soup.find(href=htmlvar).replaceWith(anchortext)
                soup = str(soup)
                newlist = soup.replace('<html><body>', '').replace('</body></html>','')

                print "this is the anchor:", anchortext
                print "this is the href:", wholehref
                print "this is newlist:", newlist
                print "this is the id:", ids
                print "this is pagetype: CP"

                for item in change_list:
                    item['cwc'] = newlist
                    collection3.update({'_id':ids}, {"$set":{"cwc":item['cwc']}}, upsert=False)
            return items

        elif hxs.xpath('//head/link[@rel="canonical"]/@href[contains(.,"invalid-category-id")]'):
            for site in sites:
                item = MongoItem()
                item['url'] = response.url
                item['status'] = response.status
                item['original_url'] = response.meta.get('redirect_urls', [response.url])[0]
                items.append(item)

                htmlvar = item['original_url']
                change_list = list(collection3.find({"cwc":{"$regex":htmlvar}}))

                alldata = dict()
                cwcblockdic = ""
                for a in change_list:
                    alldata.update(a)
                ids = alldata['_id']
                cwcblock = alldata['cwc']
                cwcblockdic = cwcblockdic + cwcblock

                soup = BS(cwcblockdic)
                wholehref = soup.find(href=htmlvar)
                try:
                    anchortext = soup.findAll(href=htmlvar)[0].text
                except:
                    anchortext = wholehref.get_text()
                soup.find(href=htmlvar).replaceWith(anchortext)
                soup = str(soup)
                newlist = soup.replace('<html><body>', '').replace('</body></html>','')

                print "this is the anchor:", anchortext
                print "this is the href:", wholehref
                print "this is newlist:", newlist
                print "this is the id:", ids
                print "this is pagetype: CP"

                for item in change_list:
                    item['cwc'] = newlist
                    collection3.update({'_id':ids}, {"$set":{"cwc":item['cwc']}}, upsert=False)
            return items

        else:
            if hxs.xpath('//*[@class="result-summary-container"]/text()[contains(.,"Showing 0 of")]'):
                for site in sites:
                    item = MongoItem()
                    item['url'] = response.url
                    item['status'] = response.status
                    item['original_url'] = response.meta.get('redirect_urls', [response.url])[0]
                    items.append(item)

                    htmlvar = item['original_url']
                    change_list = list(collection3.find({"cwc":{"$regex":htmlvar}}))

                    alldata = dict()
                    cwcblockdic = ""
                    for a in change_list:
                        alldata.update(a)
                    ids = alldata['_id']
                    cwcblock = alldata['cwc']
                    cwcblockdic = cwcblockdic + cwcblock

                    soup = BS(cwcblockdic)
                    wholehref = soup.find(href=htmlvar)
                    try:
                        anchortext = soup.findAll(href=htmlvar)[0].text
                    except:
                        anchortext = wholehref.get_text()
                    soup.find(href=htmlvar).replaceWith(anchortext)
                    soup = str(soup)
                    newlist = soup.replace('<html><body>', '').replace('</body></html>','')

                    print "this is the anchor:", anchortext
                    print "this is the href:", wholehref
                    print "this is newlist:", newlist
                    print "this is the id:", ids
                    print "this is pagetype: CP"

                    for item in change_list:
                        item['cwc'] = newlist
                        collection3.update({'_id':ids}, {"$set":{"cwc":item['cwc']}}, upsert=False)
                return items

Please make the question more specific: provide your spider's code and a sample list of URLs to test with. Otherwise it is too broad. Thanks. – alecxe


@alecxe True, thanks, updated. –


Could you provide us with a list of URLs in which some URLs are not being crawled? Thanks. – alecxe

Answer


This may be only one of the reasons, but it is still a valid one: there are duplicate URLs in the URL list:

>>> urls = [...] # list of urls you've posted 
>>> len(urls) 
2221 
>>> len(set(urls)) 
1177 

And Scrapy filters out duplicate requests by default.
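For illustration, a minimal sketch of the two usual ways to handle this, reusing the hrefs list and spider names from the question; dont_filter=True on scrapy.Request is only needed if you actually want duplicate URLs to be fetched again:

import scrapy

class MongoSpider(scrapy.Spider):
    name = "collections3"
    allowed_domains = ["www.ecommerce.com"]

    # Either: drop duplicates before handing the list to Scrapy...
    start_urls = list(set(hrefs))

    # ...or: keep the full list and tell Scrapy not to filter duplicate
    # requests (defining start_requests overrides start_urls above).
    def start_requests(self):
        for url in hrefs:
            yield scrapy.Request(url, callback=self.parse, dont_filter=True)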


That is a very valid point, ty. Unfortunately, the URLs that are not being crawled are not duplicates. I will use a set to clean up the URL list before passing it to start_urls, though. TY! –


@EliquidVape Good, thanks. Are you still seeing that not all of the URLs are crawled? Please show the report Scrapy prints at the end of the crawl. – alecxe
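In case it helps, a minimal sketch of how to log those numbers yourself, assuming a Scrapy version where the spider has a crawler attribute; the same counters also appear in the stats dump Scrapy writes to the log when the crawl finishes (exact stat keys can vary between versions):

class MongoSpider(scrapy.Spider):
    # ... same spider as in the question ...

    def closed(self, reason):
        # Called automatically when the spider finishes.
        stats = self.crawler.stats.get_stats()
        self.log("finish reason: %s" % reason)
        self.log("requests enqueued: %s" % stats.get('scheduler/enqueued'))
        self.log("duplicates filtered: %s" % stats.get('dupefilter/filtered'))
        self.log("responses received: %s" % stats.get('response_received_count'))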


Thanks for the follow-up. Scrapy still does not crawl all of the URLs, but the mongo database is no longer there. –