About 70% of the time I get the error: pool.map list index out of range (Python)
res = pool.map(feng, urls)
  File "c:\Python27\lib\multiprocessing\pool.py", line 251, in map
    return self.map_async(func, iterable, chunksize).get()
  File "c:\Python27\lib\multiprocessing\pool.py", line 567, in get
    raise self._value
IndexError: list index out of range
I don't know why this happens. If the data set is smaller than 100 items, the message only shows up about 5% of the time. Does anyone have an idea how to improve this?
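As far as I understand it, pool.map() re-raises in the parent process any exception that was raised inside a worker, when it calls .get() on the async result, so the traceback points at pool.py even though the IndexError most likely comes from my own function. A minimal sketch (not my real code, just a made-up worker for illustration) that produces the same kind of traceback:

#coding:utf-8
# Illustration only: an exception raised inside a worker function is
# re-raised in the parent when pool.map() calls .get() on the result.
import multiprocessing

def worker(n):
    data = []        # empty list on purpose
    return data[1]   # IndexError: list index out of range

if __name__ == "__main__":
    pool = multiprocessing.Pool(2)
    pool.map(worker, range(4))  # the IndexError surfaces here, via pool.py

My full script is below.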
#coding:utf-8
import multiprocessing
import requests
import bs4
import re
import string

root_url = 'http://www.haoshiwen.org'
#index_url = root_url+'/type.php?c=1'

def xianqin_url():
    f = 0
    h = 0
    x = 0
    y = 0
    b = []
    l = []
    for i in range(1, 64):  # number of index pages
        index_url = root_url + '/type.php?c=1' + '&page=' + "%s" % i
        response = requests.get(index_url)
        soup = bs4.BeautifulSoup(response.text, "html.parser")
        x = [a.attrs.get('href') for a in soup.select('div.sons a[href^=/]')]  # the links inside every div.sons on this page
        c = len(x)  # c links in total
        j = 0
        for j in range(c):
            url = root_url + x[j]
            us = str(url)
            print "collected %s" % us
            l.append(url)  # pool = multiprocessing.Pool(8)
    return l
# NOTE: this earlier version of feng() only counts 風 and is shadowed by
# the second definition below; only the later one is actually used.
def feng(url):
    response = requests.get(url)
    response.encoding = 'utf-8'
    #print response.text
    soup = bs4.BeautifulSoup(response.text, "html.parser")
    #content = soup.select('div.shileft')
    qq = str(soup)
    soupout = re.findall(r"原文(.+?)</div>", qq, re.S)  # the text between 「原文」 and the closing </div>
    #print soupout[1]
    content = str(soupout[1])
    b = "風"
    cc = content.count(b, 0, len(content))
    return cc

def start_process():
    print 'Starting', multiprocessing.current_process().name
def feng(url):
    response = requests.get(url)
    response.encoding = 'utf-8'
    #print response.text
    soup = bs4.BeautifulSoup(response.text, "html.parser")
    #content = soup.select('div.shileft')
    qq = str(soup)
    soupout = re.findall(r"原文(.+?)</div>", qq, re.S)  # the text between 「原文」 and the closing </div>
    #print soupout[1]
    content = str(soupout[1])
    b = "風"
    c = "花"
    d = "雪"
    e = "月"
    f = content.count(b, 0, len(content))
    h = content.count(c, 0, len(content))
    x = content.count(d, 0, len(content))
    y = content.count(e, 0, len(content))
    return f, h, x, y
def find(urls):
    r = [0, 0, 0, 0]
    pool = multiprocessing.Pool()
    res = pool.map(feng, urls)
    for i in range(len(res)):
        r = map(lambda (a, b): a + b, zip(r, res[i]))
    return r
if __name__ == "__main__":
    print "collecting URLs"
    qurls = xianqin_url()
    print "collected %s links" % len(qurls)
    print "matching the pre-Qin texts"
    result = find(qurls)
    print '''
In %s pre-Qin texts:
---------------------------
風: %s
花: %s
雪: %s
月: %s
data source: %s
''' % (len(qurls), result[0], result[1], result[2], result[3], root_url)
stackoverflow: the post body cannot contain 「pool.map」, so I changed that line to res = pool.map4(feng, urls) when posting. I am trying to extract some substrings from this website and process them with multiprocessing.
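In case it helps, here is a sketch of what one worker is supposed to do per URL, with an extra guard for the case where the regex finds fewer than two matches. This is only my guess that soupout[1] is what fails when a request comes back different; the name feng_safe and the guard are illustrative, not part of my script:

#coding:utf-8
# Sketch of a guarded worker (assumption: the IndexError comes from
# soupout[1] when re.findall() returns fewer than two matches).
import re
import requests
import bs4

def feng_safe(url):
    response = requests.get(url)
    response.encoding = 'utf-8'
    soup = bs4.BeautifulSoup(response.text, "html.parser")
    soupout = re.findall(r"原文(.+?)</div>", str(soup), re.S)
    if len(soupout) < 2:
        return 0, 0, 0, 0  # page layout differs or the request failed
    content = str(soupout[1])
    return (content.count("風"), content.count("花"),
            content.count("雪"), content.count("月"))

It returns the same four-element tuple as feng(), so find() would aggregate it the same way.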
We can't really help you unless you A) give us the code, and B) explain what it is supposed to do... – Toastrackenigma
I tried; the system said I posted too much code, so I will try uploading a picture. –
Code uploaded. –