這是我的代碼:Python — 抓取完整數據(網頁抓取)
from bs4 import BeautifulSoup
import urllib2
import re
import sys
main_url = "http://sukhansara.com/سخن-سرا-پر-خوش-آمدید/newposts/parveenshakir/psghazals/"
test_url = urllib2.urlopen(main_url)
readHtml = test_url.read()
test_url.close()
soup = BeautifulSoup(readHtml, "html.parser")
url = soup.find('div',attrs={"class":"entry-content"}).findAll('div', attrs={"class":None})
count = 1
fobj = open('D:\Scrapping\parveen_again2.xml', 'w')
for getting in url:
url = getting.find('a')
if url.has_attr('href'):
urls = url['href']
test_url = urllib2.urlopen(urls, timeout=36)
readHtml = test_url.read()
test_url.close()
soup1 = BeautifulSoup(readHtml, "html.parser")
title = soup1.find('title')
title = title.get_text('+')
title = title.split("|")
author = soup1.find('div',attrs={"class":"entry-meta"}).find('span',attrs={"class":"categories-links"})
author = author.findAll('a')
fobj.write("<add><doc>\n")
fobj.write("<field name=\"id\">sukhansara.com_pg1Author"+author[0].string.encode('utf8')+"Count"+str(count)+"</field>\n")
fobj.write("<field name=\"title\">"+title[0].encode('utf8')+"</field>\n")
fobj.write("<field name=\"content\">")
count += 1
poetry = soup1.find('div',attrs={"class":"entry-content"}).findAll('div')
x=1
check = True
while check:
if poetry[x+1].string.encode('utf8') != author[0].string.encode('utf8'):
fobj.write(poetry[x].string.encode('utf8')+"|")
x+=1
else:
check = False
fobj.write(poetry[x].string.encode('utf8'))
fobj.write("</field>\n")
fobj.write("<field name=\"group\">ur_poetry</field>\n")
fobj.write("<field name=\"author\">"+author[0].string.encode('utf8')+"</field>\n")
fobj.write("<field name=\"url\">"+urls+"</field>\n")
fobj.write("<add><doc>\n\n")
fobj.close()
print "Done printing"
有時我能從這些網址抓到24首詩歌,有時81首,但網址總共將近100個。每次抓到第81個時就發生這個錯誤:
AttributeError: 'NoneType' object has no attribute 'encode'
或者有時出現連接超時錯誤。我究竟做錯了什麼?