對不起,我花了你的時間,但我真的被阻止!Python - 線程和urlopen(urllib2)和解析
我在Python中是一個n00b,但我努力學習,我試圖讓這個腳本運行。它的工作沒有線程,但爲了學習和提高我的Python技能,我想了解這有什麼問題!
問題: - 劇本永遠不會結束 - 它不解析什麼...的東西的urlopen似乎沒有正常工作
非常感謝您的幫助,我還在努力:-)
import Queue
import threading
import urllib2
from urllib2 import urlopen
import time
from bs4 import BeautifulSoup as BeautifulSoup
import xlwt
import time
import socket
socket.setdefaulttimeout(20.0)
class Retry(object):
default_exceptions = (Exception,)
def __init__(self, tries, exceptions=None, delay=0):
"""
Decorator for retrying a function if exception occurs
tries -- num tries
exceptions -- exceptions to catch
delay -- wait between retries
"""
self.tries = tries
if exceptions is None:
exceptions = Retry.default_exceptions
self.exceptions = exceptions
self.delay = delay
def __call__(self, f):
def fn(*args, **kwargs):
exception = None
for _ in range(self.tries):
try:
return f(*args, **kwargs)
except self.exceptions, e:
print "Retry, exception: "+str(e)
time.sleep(self.delay)
exception = e
#if no success after tries, raise last exception
raise exception
return fn
@Retry(5)
def open_url(source):
print("OPENING %s" % source)
print("Retrying to open and read the page")
resp = urlopen(source)
resp = resp.read()
return resp
queue = Queue.Queue()
out_queue = Queue.Queue()
class ThreadUrl(threading.Thread):
"""Threaded Url Grab"""
def __init__(self, queue, out_queue):
threading.Thread.__init__(self)
self.queue = queue
self.out_queue = out_queue
def run(self):
while True:
#grabs host from queue
host = self.queue.get()
#grabs urls of hosts and then grabs chunk of webpage
chunk = open_url(host)
#chunk = url.read()
#place chunk into out queue
self.out_queue.put(chunk)
#signals to queue job is done
self.queue.task_done()
class DatamineThread(threading.Thread):
"""Threaded Url Grab"""
def __init__(self, out_queue):
threading.Thread.__init__(self)
self.out_queue = out_queue
def run(self):
global x
while True:
#grabs host from queue
chunk = self.out_queue.get()
#parse the chunk
soup = BeautifulSoup(chunk)
#print soup
tableau = soup.findAll('table')
rows = tableau[1].findAll('tr')
print("DONE")
for tr in rows:
cols = tr.findAll('td')
y = 0
x = x + 1
for td in cols:
texte_bu = td.text
texte_bu = texte_bu.encode('utf-8')
print texte_bu
ws.write(x,y,td.text)
y = y + 1
wb.save("IA.xls")
#signals to queue job is done
self.out_queue.task_done()
break
start = time.time()
def main():
#spawn a pool of threads, and pass them queue instance
for i in range(13):
t = ThreadUrl(queue, out_queue)
t.setDaemon(True)
t.start()
#populate queue with data
for host in hosts:
queue.put(host)
for i in range(1):
dt = DatamineThread(out_queue)
dt.setDaemon(True)
dt.start()
#wait on the queue until everything has been processed
queue.join()
out_queue.join()
global x
x = 0
wb = xlwt.Workbook(encoding='utf-8')
ws = wb.add_sheet("BULATS_IA_PARSED")
Countries_List = ['Afghanistan','Armenia','Brazil','Argentina','Armenia','Australia','Austria','Azerbaijan','Bahrain','Bangladesh','Belgium','Belize','Bolivia','Bosnia and Herzegovina','Brazil','Brunei Darussalam','Bulgaria','Cameroon','Canada','Central African Republic','Chile','China','Colombia','Costa Rica','Croatia','Cuba','Cyprus','Czech Republic','Denmark','Dominican Republic','Ecuador','Egypt','Eritrea','Estonia','Ethiopia','Faroe Islands','Fiji','Finland','France','French Polynesia','Georgia','Germany','Gibraltar','Greece','Grenada','Hong Kong','Hungary','Iceland','India','Indonesia','Iran','Iraq','Ireland','Israel','Italy','Jamaica','Japan','Jordan','Kazakhstan','Kenya','Kuwait','Latvia','Lebanon','Libya','Liechtenstein','Lithuania','Luxembourg','Macau','Macedonia','Malaysia','Maldives','Malta','Mexico','Monaco','Montenegro','Morocco','Mozambique','Myanmar (Burma)','Nepal','Netherlands','New Caledonia','New Zealand','Nigeria','Norway','Oman','Pakistan','Palestine','Papua New Guinea','Paraguay','Peru','Philippines','Poland','Portugal','Qatar','Romania','Russia','Saudi Arabia','Serbia','Singapore','Slovakia','Slovenia','South Africa','South Korea','Spain','Sri Lanka','Sweden','Switzerland','Syria','Taiwan','Thailand','Trinadad and Tobago','Tunisia','Turkey','Ukraine','United Arab Emirates','United Kingdom','United States','Uruguay','Uzbekistan','Venezuela','Vietnam']
hosts = ["http://www.cambridgeesol.org/institutions/results.php?region=%s&type=&BULATS=on" % Countries for Countries in Countries_List]
main()
print "Elapsed Time: %s" % (time.time() - start)
PS:另外,你認爲urllib3(keep-connexion)在這種情況下可以有用,你能解釋一下誰來實現這個。
嗯,我要把眼睛上扭曲了!謝謝 ! 但是,該腳本正在爲一些國家...並在沒有...廣告有時是...奇怪! – 2012-04-24 03:58:38
編輯:據我的小研究...扭曲似乎對我來說太難了。我的代碼真的無法工作? (還有,urllib3在這裏可以用嗎?) – 2012-04-24 04:04:24
Twisted肯定比嘗試自己實現線程更容易。我沒有嘗試過urllib3,但是從描述中聽起來,我發現它會比嘗試使用urllib2本身有所改進,但不如像Twisted這樣的全功能庫。 – 2012-04-24 04:11:01