
I am trying to run a Scrapy spider from a PyQt interface. However, as soon as PyQt starts running, Scrapy freezes. I can't even get a proper error; it just spits out the first few lines of Scrapy output. P.S. The spider scrapes perfectly when run on its own, without the PyQt interface.

import lost_second
import sys
import json
import time
from PyQt4 import QtCore, QtGui
from twisted.internet import reactor, defer
from scrapy.crawler import CrawlerRunner
from scrapy.utils.project import get_project_settings
from scrapy.utils.log import configure_logging


class Form(QtGui.QDialog):

    def __init__(self, parent=None):
        super(Form, self).__init__(parent)
        w = QtGui.QWidget()
        w.resize(250, 150)
        w.move(300, 300)
        self.MyFunction()

    def Parse_me(self, spidername):
        self.spidername = spidername
        configure_logging({'LOG_LEVEL': 'DEBUG'})
        runner = CrawlerRunner(get_project_settings())

        d = runner.crawl(self.spidername)
        d.addBoth(lambda _: reactor.stop())
        # note: the parentheses are missing here, so the reactor is never
        # actually started; and even reactor.run() at this point would block
        # inside __init__ before app.exec_() is ever reached
        reactor.run

    def MyFunction(self):
        self.Parse_me(lost_second.MySpider2)


if __name__ == "__main__":
    app = QtGui.QApplication(sys.argv)
    form = Form()
    form.show()

    sys.exit(app.exec_())

The code of the lost_second spider:

import scrapy
import re
import json
from scrapy.http import Request


class MySpider2(scrapy.Spider):
    name = "lost2"
    allowed_domains = ["myurl"]

    def start_requests(self):
        self.video_link = {}
        self.finalone = {}

        with open('list.json', 'r') as z:
            self.links = json.load(z)

        for name in self.links:
            self.video_link[name] = 'myurl' + self.links[name]['link'] + '/video'
            request = Request(self.video_link[name],
                              callback=self.video_scrape_parse)
            request.meta['Movie_name'] = name
            yield request

        json.dump(self.finalone, open('video_links.json', 'w'))

    def video_scrape_parse(self, response):
        print('Response url :' + response.url)
        response_name = response.meta['Movie_name']
        print('Response meta name:' + response_name)
        movie_links = re.findall(r'anotherurl(.*?)mp4', response.body)
        self.x = []
        for item in range(len(movie_links)):
            self.x.append('anotherurl' + str(movie_links[item]) + 'mp4')
        self.finalone[response_name] = self.x
        print('Final response response_name:' + response_name)
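
Side note, unrelated to the freeze: the json.dump at the end of start_requests runs as soon as the generator is exhausted, i.e. once all requests have been scheduled, not once they have completed, so video_links.json is very likely written out before any response has been parsed. A minimal sketch of one fix, moving the dump into Scrapy's closed hook (which Scrapy calls once, after the spider finishes); this method would go in MySpider2 and the dump line above would be dropped:

    # Sketch: write the collected links only after the crawl has finished.
    # Scrapy calls closed(reason) once, after all requests are done.
    def closed(self, reason):
        with open('video_links.json', 'w') as f:
            json.dump(self.finalone, f)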

The Scrapy log:

2017-04-12 20:38:07 [scrapy.middleware] INFO: Enabled extensions: 
['scrapy.extensions.logstats.LogStats', 
'scrapy.extensions.telnet.TelnetConsole', 
'scrapy.extensions.corestats.CoreStats'] 
2017-04-12 20:38:07 [scrapy.middleware] INFO: Enabled downloader middlewares: 
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware', 
'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware', 
'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware', 
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware', 
'scrapy.downloadermiddlewares.retry.RetryMiddleware', 
'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware', 
'scrapy.downloadermiddlewares.redirect.RedirectMiddleware', 
'scrapy.downloadermiddlewares.cookies.CookiesMiddleware', 
'scrapy_splash.SplashCookiesMiddleware', 
'scrapy_splash.SplashMiddleware', 
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware', 
'scrapy.downloadermiddlewares.stats.DownloaderStats'] 
2017-04-12 20:38:07 [scrapy.middleware] INFO: Enabled spider middlewares: 
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware', 
'scrapy_splash.SplashDeduplicateArgsMiddleware', 
'scrapy.spidermiddlewares.offsite.OffsiteMiddleware', 
'scrapy.spidermiddlewares.referer.RefererMiddleware', 
'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware', 
'scrapy.spidermiddlewares.depth.DepthMiddleware'] 
2017-04-12 20:38:07 [scrapy.middleware] INFO: Enabled item pipelines: 
[] 
2017-04-12 20:38:07 [scrapy.core.engine] INFO: Spider opened 
2017-04-12 20:38:07 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min) 
2017-04-12 20:38:07 [scrapy.extensions.telnet] DEBUG: Telnet console listening on 127.0.0.1:6023 
2017-04-12 20:38:07 [scrapy.middleware] INFO: Enabled extensions: 
['scrapy.extensions.logstats.LogStats', 
'scrapy.extensions.telnet.TelnetConsole', 
'scrapy.extensions.corestats.CoreStats'] 
2017-04-12 20:38:07 [scrapy.middleware] INFO: Enabled downloader middlewares: 
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware', 
'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware', 
'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware', 
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware', 
'scrapy.downloadermiddlewares.retry.RetryMiddleware', 
'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware', 
'scrapy.downloadermiddlewares.redirect.RedirectMiddleware', 
'scrapy.downloadermiddlewares.cookies.CookiesMiddleware', 
'scrapy_splash.SplashCookiesMiddleware', 
'scrapy_splash.SplashMiddleware', 
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware', 
'scrapy.downloadermiddlewares.stats.DownloaderStats'] 
2017-04-12 20:38:07 [scrapy.middleware] INFO: Enabled spider middlewares: 
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware', 
'scrapy_splash.SplashDeduplicateArgsMiddleware', 
'scrapy.spidermiddlewares.offsite.OffsiteMiddleware', 
'scrapy.spidermiddlewares.referer.RefererMiddleware', 
'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware', 
'scrapy.spidermiddlewares.depth.DepthMiddleware'] 
2017-04-12 20:38:07 [scrapy.middleware] INFO: Enabled item pipelines: 
[] 
2017-04-12 20:38:07 [scrapy.core.engine] INFO: Spider opened 
2017-04-12 20:38:07 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min) 
2017-04-12 20:38:07 [scrapy.extensions.telnet] DEBUG: Telnet console listening on 127.0.0.1:6024 
Comments:

How do I install the libraries 'lost' and 'lost_second'? – eyllanesc

I have lost.py and lost_second.py with the scrapy classes that are being used –

Read this: How to create a Minimal, Complete, and Verifiable example: https://stackoverflow.com/help/mcve – eyllanesc

Answer

OK, the solution to my problem was to use threading. My code is below:

import lost_second
import sys
import json
import time
import threading
from PyQt4 import QtCore, QtGui
from scrapy.crawler import CrawlerRunner
from twisted.internet import reactor, defer
from scrapy.utils.log import configure_logging


## Qtreactor? Threads?
class CrawlerThread(threading.Thread):
    def __init__(self):
        super(CrawlerThread, self).__init__()

    def stop(self):
        try:
            reactor.stop()
        except:
            pass

    def run(self):
        configure_logging({'LOG_LEVEL': 'DEBUG'})
        crun = CrawlerRunner()  # get_project_settings?
        crun.crawl(lost_second.MySpider2)
        # installSignalHandlers=False because signal handlers can only be
        # installed from the main thread, and the reactor runs here instead
        reactor.run(installSignalHandlers=False)


class Form(QtGui.QDialog):

    def __init__(self, parent=None):
        super(Form, self).__init__(parent)
        w = QtGui.QWidget()
        w.resize(250, 150)
        w.move(300, 300)
        self.ct = CrawlerThread()
        self.ct.start()


if __name__ == "__main__":
    app = QtGui.QApplication(sys.argv)
    form = Form()
    form.show()

    sys.exit(app.exec_())
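
Two caveats worth noting with this approach: CrawlerThread.stop() is defined but never called, and Twisted's reactor is not thread-safe, so stopping it from the Qt (main) thread should go through reactor.callFromThread. A minimal sketch of a cleaner shutdown, assuming the Form class above (the closeEvent override is an addition, not part of the original code):

class Form(QtGui.QDialog):
    def __init__(self, parent=None):
        super(Form, self).__init__(parent)
        self.ct = CrawlerThread()
        self.ct.start()

    def closeEvent(self, event):
        # callFromThread is the only thread-safe way to poke a reactor
        # running in another thread.
        if reactor.running:
            reactor.callFromThread(reactor.stop)
        self.ct.join()  # wait for the crawler thread to exit
        super(Form, self).closeEvent(event)

Alternatively, the "## Qtreactor? Threads?" comment hints at the other standard route: a Qt-aware reactor such as the third-party qt4reactor package lets Twisted and the Qt event loop share the main thread, avoiding the extra thread entirely.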