2011-11-12 61 views
0
import lxml.html 
import mechanize, cookielib 
import multiprocessing 

# Module-level browser handle read by download(). It starts as None and is
# only rebound to a configured mechanize.Browser inside the __main__ block.
browser = None

def download(i):
    """Fetch the Google front page and print its parsed HTML tree.

    Uses the module-level ``browser`` to open the page, parses the
    response with ``lxml.html.parse`` and prints the resulting tree.

    Args:
        i: task index supplied by the pool; it is not used.

    Returns:
        Always 0.
    """
    # The URL must carry an explicit scheme: a bare host name such as
    # 'www.google.com' is not an absolute URL, so it cannot be fetched as
    # the browser's first request.
    link = 'http://www.google.com'
    response = browser.open(link)
    tree = lxml.html.parse(response)
    print(tree)
    return 0

if __name__ == '__main__':
    # Create and configure the shared browser, then fan eight download
    # tasks out over a default-sized process pool.
    # NOTE(review): this guarded block runs only in the main process.
    # Workers that re-import the module instead of forking (Windows) never
    # execute it, so `browser` is still None inside download() there.
    browser = mechanize.Browser()
    cookie_jar = cookielib.LWPCookieJar()
    browser.set_cookiejar(cookie_jar)

    # Handler switches, applied in the same order as before.
    for setter, enabled in (
            (browser.set_handle_equiv, True),
            (browser.set_handle_gzip, True),
            (browser.set_handle_redirect, True),
            # initially this was on, but off is probably better
            (browser.set_handle_referer, False),
            (browser.set_handle_robots, False)):
        setter(enabled)

    browser.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(),
                               max_time=1)
    browser.addheaders = [
        ('User-agent',
         'Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:2.0.1) '
         'Gecko/20100101 Ubuntu/11.04 maverick Firefox/4.0.1'),
    ]

    pool = multiprocessing.Pool()
    # Submit the eight tasks asynchronously and block until they finish.
    pool.map_async(download, range(8)).wait()

如果我刪除 multiprocessing 部分,它可以正常工作。如果我不在 download 函數中調用 browser,它也可以正常工作。但是,multiprocessing + mechanize 的組合似乎根本無法工作。Python:我該如何修復這段代碼才能在 Windows 上工作?

我該如何解決這個問題?它不會在linux下發生。

回答

0

只有主進程會執行受 if __name__ == '__main__' 保護的代碼塊。由於 Windows 缺少 fork 系統調用,因此池中創建的每個進程都需要自己的 browser。您可以使用初始化函數來完成此操作。作爲參考,請參閱 multiprocessing.Pool 的 initializer 和 initargs 選項。

import lxml.html 
import mechanize, cookielib 
import multiprocessing as mp 

def download(i):
    """Open the Google front page with the worker's browser and print its tree.

    The module-level ``browser`` is used to fetch the page; the response is
    parsed with ``lxml.html.parse`` and the parsed tree is printed.

    Args:
        i: task index handed over by the pool; it is not used.

    Returns:
        Always 0.
    """
    page = browser.open('http://www.google.com')
    print(lxml.html.parse(page))
    return 0

def init(count):
    """Pool initializer: build this process's own ``browser`` and report in.

    Binds the module-level ``browser`` to a freshly configured
    ``mechanize.Browser`` and then decrements the shared counter so the
    parent can tell when every worker has finished initializing.

    Args:
        count: shared ``multiprocessing.Value`` (created with its default
            lock) holding the number of workers still initializing.
    """
    global browser
    browser = mechanize.Browser()
    cookie_jar = cookielib.LWPCookieJar()
    browser.set_cookiejar(cookie_jar)
    browser.set_handle_equiv(True)
    # Original note here was just "warning" - presumably mechanize warns
    # about its gzip support; confirm before relying on it.
    browser.set_handle_gzip(True)
    browser.set_handle_redirect(True)
    browser.set_handle_referer(False)
    browser.set_handle_robots(False)
    browser.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(),
                               max_time=1)
    browser.addheaders = [('User-agent',
                           'Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:2.0.1) '
                           'Gecko/20100101 Ubuntu/11.04 maverick Firefox/4.0.1')]

    # `count.value -= 1` is a separate read and write, so two workers
    # finishing at the same time could lose a decrement and leave the
    # parent polling forever. Hold the Value's lock across the update.
    with count.get_lock():
        count.value -= 1

if __name__ == '__main__':
    import time

    # One worker per CPU; the shared unsigned-int counter starts at the
    # number of workers and is decremented by init() in each of them.
    n_workers = mp.cpu_count()
    count = mp.Value('I', n_workers)
    pool = mp.Pool(n_workers, initializer=init, initargs=(count,))
    # wait until all processes are initialized
    while count.value > 0:
        time.sleep(0.1)

    tasks = range(8)
    r = pool.map_async(download, tasks)
    r.wait()

    # All tasks are done: stop accepting work and reap the worker
    # processes instead of leaving them to interpreter shutdown.
    pool.close()
    pool.join()
0

我想嘗試:

  • 刪除browser = None
  • 將 __name__ == "__main__" 下的代碼移到 main() 函數中,並在 browser = mechanize.Browser() 之前加上 global browser
  • 或者將初始化 browser 的代碼移到 initializer 中

如果您的任務是 I/O 密集型的,那麼您不一定需要 multiprocessing 來發出併發請求。例如,您可以改爲使用 concurrent.futures.ThreadPoolExecutor、gevent 或 Twisted。

相關:Problem with multi threaded Python app and socket connections