2012-04-24 65 views
-1

對不起,我花了你的時間,但我真的被阻止!Python - 線程和urlopen(urllib2)和解析

我在Python中是一個n00b,但我努力學習,我試圖讓這個腳本運行。它的工作沒有線程,但爲了學習和提高我的Python技能,我想了解這有什麼問題!

問題: - 劇本永遠不會結束 - 它不解析什麼...的東西的urlopen似乎沒有正常工作

非常感謝您的幫助,我還在努力:-)

import Queue 
import threading 
import urllib2 
from urllib2 import urlopen 
import time 
from bs4 import BeautifulSoup as BeautifulSoup 
import xlwt 
import time 
import socket 

socket.setdefaulttimeout(20.0) 


class Retry(object): 
    default_exceptions = (Exception,) 
    def __init__(self, tries, exceptions=None, delay=0): 
     """ 
     Decorator for retrying a function if exception occurs 

     tries -- num tries 
     exceptions -- exceptions to catch 
     delay -- wait between retries 
     """ 
     self.tries = tries 
     if exceptions is None: 
      exceptions = Retry.default_exceptions 
     self.exceptions = exceptions 
     self.delay = delay 

    def __call__(self, f): 
     def fn(*args, **kwargs): 
      exception = None 
      for _ in range(self.tries): 
       try: 
        return f(*args, **kwargs) 
       except self.exceptions, e: 
        print "Retry, exception: "+str(e) 
        time.sleep(self.delay) 
        exception = e 
      #if no success after tries, raise last exception 
      raise exception 
     return fn 

@Retry(5) 
def open_url(source): 
    print("OPENING %s" % source) 
    print("Retrying to open and read the page") 
    resp = urlopen(source) 
    resp = resp.read() 
    return resp 



queue = Queue.Queue() 
out_queue = Queue.Queue() 

class ThreadUrl(threading.Thread): 
    """Threaded Url Grab""" 
    def __init__(self, queue, out_queue): 
     threading.Thread.__init__(self) 
     self.queue = queue 
     self.out_queue = out_queue 

    def run(self): 
     while True: 
      #grabs host from queue 
      host = self.queue.get() 

      #grabs urls of hosts and then grabs chunk of webpage 
      chunk = open_url(host) 
      #chunk = url.read() 

      #place chunk into out queue 
      self.out_queue.put(chunk) 

      #signals to queue job is done 
      self.queue.task_done() 

class DatamineThread(threading.Thread): 
    """Threaded Url Grab""" 
    def __init__(self, out_queue): 
     threading.Thread.__init__(self) 
     self.out_queue = out_queue 

    def run(self): 
     global x 
     while True: 
      #grabs host from queue 
      chunk = self.out_queue.get() 

      #parse the chunk 
      soup = BeautifulSoup(chunk) 
      #print soup 
      tableau = soup.findAll('table') 
     rows = tableau[1].findAll('tr') 
      print("DONE") 
     for tr in rows: 
      cols = tr.findAll('td') 
       y = 0 
       x = x + 1 
      for td in cols: 
        texte_bu = td.text 
        texte_bu = texte_bu.encode('utf-8') 
      print texte_bu 
        ws.write(x,y,td.text) 
        y = y + 1 
     wb.save("IA.xls") 

      #signals to queue job is done 
      self.out_queue.task_done() 
      break 

start = time.time() 
def main(): 

    #spawn a pool of threads, and pass them queue instance 
    for i in range(13): 
     t = ThreadUrl(queue, out_queue) 
     t.setDaemon(True) 
     t.start() 

    #populate queue with data 
    for host in hosts: 
     queue.put(host) 

    for i in range(1): 
     dt = DatamineThread(out_queue) 
     dt.setDaemon(True) 
     dt.start() 


    #wait on the queue until everything has been processed 
    queue.join() 
    out_queue.join() 


global x 
x = 0 

wb = xlwt.Workbook(encoding='utf-8') 
ws = wb.add_sheet("BULATS_IA_PARSED") 

Countries_List = ['Afghanistan','Armenia','Brazil','Argentina','Armenia','Australia','Austria','Azerbaijan','Bahrain','Bangladesh','Belgium','Belize','Bolivia','Bosnia and Herzegovina','Brazil','Brunei Darussalam','Bulgaria','Cameroon','Canada','Central African Republic','Chile','China','Colombia','Costa Rica','Croatia','Cuba','Cyprus','Czech Republic','Denmark','Dominican Republic','Ecuador','Egypt','Eritrea','Estonia','Ethiopia','Faroe Islands','Fiji','Finland','France','French Polynesia','Georgia','Germany','Gibraltar','Greece','Grenada','Hong Kong','Hungary','Iceland','India','Indonesia','Iran','Iraq','Ireland','Israel','Italy','Jamaica','Japan','Jordan','Kazakhstan','Kenya','Kuwait','Latvia','Lebanon','Libya','Liechtenstein','Lithuania','Luxembourg','Macau','Macedonia','Malaysia','Maldives','Malta','Mexico','Monaco','Montenegro','Morocco','Mozambique','Myanmar (Burma)','Nepal','Netherlands','New Caledonia','New Zealand','Nigeria','Norway','Oman','Pakistan','Palestine','Papua New Guinea','Paraguay','Peru','Philippines','Poland','Portugal','Qatar','Romania','Russia','Saudi Arabia','Serbia','Singapore','Slovakia','Slovenia','South Africa','South Korea','Spain','Sri Lanka','Sweden','Switzerland','Syria','Taiwan','Thailand','Trinadad and Tobago','Tunisia','Turkey','Ukraine','United Arab Emirates','United Kingdom','United States','Uruguay','Uzbekistan','Venezuela','Vietnam'] 
hosts = ["http://www.cambridgeesol.org/institutions/results.php?region=%s&type=&BULATS=on" % Countries for Countries in Countries_List] 

main() 

print "Elapsed Time: %s" % (time.time() - start) 

PS:另外,你認爲urllib3(keep-connexion)在這種情況下可以有用,你能解釋一下誰來實現這個。

回答

1

我必須承認我沒有審查您發佈的所有代碼,但「thread」和「urllib2」一起足以導致警報。

不要試圖將urllib2用於除單線程同步連接之外的其他任何事情!不是因爲urllib2有什麼問題,而只是因爲這個問題已經解決了,並且解決方案在Twisted中,這是一個用於Python的異常良好的異步網絡庫。

+1

嗯,我要把眼睛上扭曲了!謝謝 ! 但是,該腳本正在爲一些國家...並在沒有...廣告有時是...奇怪! – 2012-04-24 03:58:38

+1

編輯:據我的小研究...扭曲似乎對我來說太難了。我的代碼真的無法工作? (還有,urllib3在這裏可以用嗎?) – 2012-04-24 04:04:24

+0

Twisted肯定比嘗試自己實現線程更容易。我沒有嘗試過urllib3,但是從描述中聽起來,我發現它會比嘗試使用urllib2本身有所改進,但不如像Twisted這樣的全功能庫。 – 2012-04-24 04:11:01

1

腳本不會結束,因爲run方法包含無限循環,沒有什麼讓他們打破這個循環

while True: 
+0

哈哈,你是對的!我正在修復它! 謝謝! – 2012-04-24 09:33:10

+0

我想這是一個縮進問題 – marbdq 2012-04-24 09:50:33