2015-10-06 21 views


pip install Twisted --update 


python python27\scripts\pywin32_postinstall.py -install 


G:\Job_vacancies\Python\vacancies>scrapy crawl jobs 
2015-10-06 09:12:53 [scrapy] INFO: Scrapy 1.0.3 started (bot: vacancies) 
2015-10-06 09:12:53 [scrapy] INFO: Optional features available: ssl, http11 
2015-10-06 09:12:53 [scrapy] INFO: Overridden settings: {'NEWSPIDER_MODULE': 'va 
cancies.spiders', 'SPIDER_MODULES': ['vacancies.spiders'], 'DEPTH_LIMIT': 3, 'BO 
T_NAME': 'vacancies'} 
2015-10-06 09:12:53 [scrapy] INFO: Enabled extensions: CloseSpider, TelnetConsol 
e, LogStats, CoreStats, SpiderState 
Unhandled error in Deferred: 
2015-10-06 09:12:53 [twisted] CRITICAL: Unhandled error in Deferred: 

Traceback (most recent call last): 
    File "c:\python27\lib\site-packages\scrapy\cmdline.py", line 150, in _run_comm 
    cmd.run(args, opts) 
    File "c:\python27\lib\site-packages\scrapy\commands\crawl.py", line 57, in run 

    self.crawler_process.crawl(spname, **opts.spargs) 
    File "c:\python27\lib\site-packages\scrapy\crawler.py", line 153, in crawl 
    d = crawler.crawl(*args, **kwargs) 
    File "c:\python27\lib\site-packages\twisted\internet\defer.py", line 1274, in 
    return _inlineCallbacks(None, gen, Deferred()) 
--- <exception caught here> --- 
    File "c:\python27\lib\site-packages\twisted\internet\defer.py", line 1128, in 
    result = g.send(result) 
    File "c:\python27\lib\site-packages\scrapy\crawler.py", line 71, in crawl 
    self.engine = self._create_engine() 
    File "c:\python27\lib\site-packages\scrapy\crawler.py", line 83, in _create_en 
    return ExecutionEngine(self, lambda _: self.stop()) 
    File "c:\python27\lib\site-packages\scrapy\core\engine.py", line 66, in __init 
    self.downloader = downloader_cls(crawler) 
    File "c:\python27\lib\site-packages\scrapy\core\downloader\__init__.py", line 
65, in __init__ 
    self.handlers = DownloadHandlers(crawler) 
    File "c:\python27\lib\site-packages\scrapy\core\downloader\handlers\__init__.p 
y", line 23, in __init__ 
    cls = load_object(clspath) 
    File "c:\python27\lib\site-packages\scrapy\utils\misc.py", line 44, in load_ob 
    mod = import_module(module) 
    File "c:\python27\lib\importlib\__init__.py", line 37, in import_module 
    File "c:\python27\lib\site-packages\scrapy\core\downloader\handlers\s3.py", li 
ne 6, in <module> 
    from .http import HTTPDownloadHandler 
    File "c:\python27\lib\site-packages\scrapy\core\downloader\handlers\http.py", 
line 5, in <module> 
    from .http11 import HTTP11DownloadHandler as HTTPDownloadHandler 
    File "c:\python27\lib\site-packages\scrapy\core\downloader\handlers\http11.py" 
, line 15, in <module> 
    from scrapy.xlib.tx import Agent, ProxyAgent, ResponseDone, \ 
    File "c:\python27\lib\site-packages\scrapy\xlib\tx\__init__.py", line 3, in <m 
    from twisted.web import client 
    File "c:\python27\lib\site-packages\twisted\web\client.py", line 42, in <modul 
    from twisted.internet.endpoints import TCP4ClientEndpoint, SSL4ClientEndpoin 
    File "c:\python27\lib\site-packages\twisted\internet\endpoints.py", line 34, i 
n <module> 
    from twisted.internet.stdio import StandardIO, PipeAddress 
    File "c:\python27\lib\site-packages\twisted\internet\stdio.py", line 30, in <m 
    from twisted.internet import _win32stdio 
    File "c:\python27\lib\site-packages\twisted\internet\_win32stdio.py", line 7, 
in <module> 
    import win32api 
exceptions.ImportError: DLL load failed: The specified module could not be found 
2015-10-06 09:12:53 [twisted] CRITICAL: 


# -*- coding: utf-8 -*- 
# encoding=UTF-8 
import scrapy, urlparse 
from scrapy.http import Request 
from scrapy.utils.response import get_base_url 
from urlparse import urlparse, urljoin 
from vacancies.items import JobItem 

#We need that in order to force Slovenian pages instead of English pages. It happened at "http://www.g-gmi.si/gmiweb/" that only English pages were found and no Slovenian. 
#from scrapy.conf import settings 
#settings.overrides['DEFAULT_REQUEST_HEADERS'] = {'Accept':'text/html,application/xhtml+xml;q=0.9,*/*;q=0.8','Accept-Language':'sl',} 
#settings.overrides['DEFAULT_REQUEST_HEADERS'] = {'Accept':'text/html,application/xhtml+xml;q=0.9,*/*;q=0.8','Accept-Language':'sl','en':q=0.8,} 

class JobSpider(scrapy.Spider): 

    name = "jobs" 

    #Test sample of SLO companies 
    start_urls = [ 

    #Result of the programme is this list of job vacancies webpages. 
    jobs_urls = [] 

    def parse(self, response): 


     #We take all urls; they are marked by "href". They point either to pages on our own website or to other websites.
     urls = response.xpath('//@href').extract() 

     #Base url. 
     base_url = get_base_url(response) 

     #Loop through all urls on the webpage. 
     for url in urls: 

      #If the url points to a picture, a document, an archive ... we ignore it. We might have to change that, because some companies publish job vacancies as PDF.
      if url.endswith((
       '.jpg', '.jpeg', '.png', '.gif', '.eps', '.ico', '.svg', '.tif', '.tiff', 
       '.JPG', '.JPEG', '.PNG', '.GIF', '.EPS', '.ICO', '.SVG', '.TIF', '.TIFF', 

       '.xls', '.ppt', '.doc', '.xlsx', '.pptx', '.docx', '.txt', '.csv', '.pdf', '.pd', 
       '.XLS', '.PPT', '.DOC', '.XLSX', '.PPTX', '.DOCX', '.TXT', '.CSV', '.PDF', '.PD', 

       #music and video 
       '.mp3', '.mp4', '.mpg', '.ai', '.avi', '.swf', 
       '.MP3', '.MP4', '.MPG', '.AI', '.AVI', '.SWF', 

       #compressions and other 
       '.zip', '.rar', '.css', '.flv', '.php', 
       '.ZIP', '.RAR', '.CSS', '.FLV', '.PHP', 


      #If the url includes characters like ?, %, &, # ... it is LIKELY NOT the one we are looking for, so we ignore it.
      #Note that this also excludes good urls such as http://www.mdm.si/company#employment
      if any(x in url for x in ['?', '%', '&', '#']): 

      #Ignore ftp. 
      if url.startswith("ftp"): 

      #We need to save original url for xpath, in case we change it later (join it with base_url) 
      url_xpath = url 

      #If url doesn't start with "http", it is relative url, and we add base url to get absolute url. 
      # -- It is true, that we may get some strange urls, but it is fine for now.    
      if not (url.startswith("http")): 

       url = urljoin(base_url,url) 

      #We don't want to go to other websites. We want to stay on our website, so we keep only urls with domain (netloc) of the company we are investigating.   
      if (urlparse(url).netloc == urlparse(base_url).netloc): 

       #The main part. We look for webpages, whose urls include one of the employment words as strings. 

       # -- Instruction. 
       # -- Users in other languages, please insert employment words in your own language, like jobs, vacancies, career, employment ... -- 
       if any(x in url for x in [ 















        #This is additional filter, suggested by Dan Wu, to improve accuracy. We will check the text of the url as well. 
        texts = response.xpath('//a[@href="%s"]/text()' % url_xpath).extract() 

        #1. Texts are empty. 
        if texts == []: 

         print "Ni teksta za url: " + str(url) 

         #The url includes one of the magic words, but there is no link text to check, so we accept it on the url alone.
         #We only yield the url if it is not already in the list "jobs_urls".
         if url not in self.jobs_urls: 

          item = JobItem() 
          #item["text"] = text 
          item["url"] = url 

          #We return the item. 
          yield item 

        # 2. There are texts, one or more. 

         #For the same partial url several texts are possible. 
         for text in texts: 

          if any(x in text for x in [ 















           #We found a url that includes one of the magic words, and its link text includes a magic word too.
           #We only yield the item if the url is not already in the list "jobs_urls".
           if url not in self.jobs_urls: 

            item = JobItem() 
            item["text"] = text 
            item["url"] = url 

            #We return the item. 
            yield item 

       #We don't use an "else" clause here, because we also want to explore the employment webpage itself to find further employment webpages.
       #We keep following links until we reach the DEPTH_LIMIT that we have set in settings.py.
       yield Request(url, callback = self.parse) 

     # We run the programme in the command line with this command: 

     #  scrapy crawl jobs -o jobs.csv -t csv --logfile log.txt 

     # We get two output files 
     # 1) jobs.csv 
     # 2) log.txt 

     # Then we manually put one of employment urls from jobs.csv into read.py 





pip install --upgrade twisted pypiwin32 



pip install --upgrade twisted[windows_platform] 



謝謝你,它可以運作了,真是難以置信。 – Marko


感謝您使用Twisted!如果這個答案解決了你的問題,你可以點擊綠色的選中標記(「接受」),也許upvote? :) – Glyph


嗨 @Glyph,我已經 upvote 了,也點了綠色的勾號。不過,我使用 Twisted 是因爲程序需要,並不是因爲我是 Twisted 的粉絲 :) – Marko