2012-10-12 33 views
2

我試圖用QtWebKit解析由JS生成的網頁,並找到了一個獲取網頁源碼的例子:如何在python線程中使用qtwebkit?

import sys 
from PySide.QtGui import * 
from PySide.QtCore import * 
from PySide.QtWebKit import * 
class Render(QWebPage):
    """Load a URL in a QWebPage and block until loading has finished.

    Constructing an instance starts the load and then runs the Qt event
    loop (``app.exec_()``); the constructor only returns after
    ``_loadFinished`` has called ``app.quit()``.  After construction,
    ``self.frame`` holds the page's main frame, so the rendered document
    can be read with ``instance.frame.toHtml()``.

    Attributes set by this class:
        app:   the QApplication whose event loop is run.
        frame: the main frame, assigned once ``loadFinished`` fires.
    """

    def __init__(self, url):
        """Start loading *url* and run the event loop until it is loaded.

        Args:
            url: address to load; it is wrapped in ``QUrl`` before loading.
        """
        # Qt permits only one QApplication per process, so reuse the
        # existing one (if any) instead of always constructing a new one;
        # this lets Render be instantiated more than once.
        existing_app = QApplication.instance()
        if existing_app is None:
            existing_app = QApplication(sys.argv)
        # The application object is obtained before the QWebPage base
        # class is initialised, matching the original ordering.
        self.app = existing_app
        QWebPage.__init__(self)
        self.loadFinished.connect(self._loadFinished)
        self.mainFrame().load(QUrl(url))
        # Blocks here until _loadFinished() quits the event loop.
        self.app.exec_()

    def _loadFinished(self, result):
        """Slot for ``loadFinished``: keep the main frame and stop the loop.

        Args:
            result: the value delivered by the ``loadFinished`` signal;
                it is not inspected, so the frame is stored regardless.
        """
        self.frame = self.mainFrame()
        self.app.quit()
# Example usage.  Render.__init__ runs the Qt event loop and only returns
# after _loadFinished has stored the main frame in `r.frame`.
url = 'http://www.thesite.gov/search'
r = Render(url)
# Read the loaded main frame's content back as HTML text.
html = r.frame.toHtml()

但我不知道如何讓它在線程中工作。那麼,該如何做到這一點?如果這不可能——是否有另一種快速的方式來獲取由JS生成的網頁?

回答

3

鑑於Qt的異步性質,QtWebKit的方法也是非阻塞的,所以沒有必要在線程中運行它們。你可以像這樣並行啓動它們:

from functools import partial 

from PySide.QtCore import QUrl 
from PySide.QtGui import QApplication 
from PySide.QtWebKit import QWebView, QWebSettings 


# URLs handed to Crawler.start() by the script entry point below; each one
# is loaded in its own QWebView.
TARGET_URLS = (
    'http://stackoverflow.com',
    'http://github.com',
    'http://bitbucket.org',
    'http://news.ycombinator.com',
    'http://slashdot.org',
    'http://www.reddit.com',
    'http://www.dzone.com',
    'http://www.ideone.com',
    'http://jsfiddle.net',
)


class Crawler(object):
    """Load several URLs concurrently, one QWebView per URL.

    Nothing here blocks: ``start`` only kicks off the loads, and the caller
    is expected to run the application's event loop afterwards.  When the
    last outstanding view reports ``loadFinished``, ``app.quit()`` is called.

    Attributes:
        app:      application object whose ``quit()`` is called at the end.
        results:  maps each frame's ``url()`` to its ``toHtml()`` output.
        browsers: maps a browser id to a ``(web_view, finished)`` tuple.

    Note: if ``start`` is given no URLs, no ``loadFinished`` signal is ever
    delivered, so this class never calls ``app.quit()``.
    """

    def __init__(self, app):
        """Remember *app* and start with empty result/browser tables."""
        self.app = app
        self.results = dict()
        self.browsers = dict()

    def _load_finished(self, browser_id, ok):
        """Handle ``loadFinished`` for the view registered as *browser_id*.

        Args:
            browser_id: key into ``self.browsers`` (bound via ``partial``).
            ok: value delivered by the signal; it is printed but the page
                content is stored whether or not it is true.
        """
        # Single-argument print() produces the same text on Python 2 and 3.
        print('%s %s' % (ok, browser_id))
        web_view, _flag = self.browsers[browser_id]
        self.browsers[browser_id] = (web_view, True)

        frame = web_view.page().mainFrame()
        self.results[frame.url()] = frame.toHtml()

        # Detach the handler and halt the view so it cannot report again.
        web_view.loadFinished.disconnect()
        web_view.stop()

        if all(finished for _view, finished in self.browsers.values()):
            print('all finished')
            self.app.quit()

    def start(self, urls):
        """Create one QWebView per entry in *urls* and begin loading it.

        Args:
            urls: iterable of URL strings; the position of each URL in the
                iterable becomes its browser id.
        """
        for browser_id, url in enumerate(urls):
            web_view = QWebView()
            # Only the markup is wanted, so image loading is switched off.
            web_view.settings().setAttribute(QWebSettings.AutoLoadImages,
                                             False)
            # Register the view before loading starts so the handler can
            # always find its entry in self.browsers.
            self.browsers[browser_id] = (web_view, False)
            loaded = partial(self._load_finished, browser_id)
            web_view.loadFinished.connect(loaded)
            web_view.load(QUrl(url))


if __name__ == '__main__':
    # Script entry point: start every load, then run the event loop.
    # app.exec_() returns once Crawler has called app.quit() after the last
    # page reported loadFinished.
    app = QApplication([])
    crawler = Crawler(app)
    crawler.start(TARGET_URLS)
    app.exec_()
    # Single-argument print() works on Python 2 and 3; list() keeps the
    # output a plain list repr on Python 3 as well.
    print('got: %s' % (list(crawler.results),))
+0

如何用PySide或PyQt解析網頁(html + js)?這些代碼似乎無法解析JS – user1179442

+1

@user1179442:可以這樣設置:`QWebView().settings().setAttribute(QWebSettings.JavascriptEnabled, False)` – andrean

+0

感謝您的信息。但第二個參數應該是「真」:) – user1179442