2016-02-08 115 views
-2

這是我的代碼:Python-完備數據(網頁抓取)

from bs4 import BeautifulSoup 
import urllib2 
import re 
import sys 


main_url = "http://sukhansara.com/سخن-سرا-پر-خوش-آمدید/newposts/parveenshakir/psghazals/" 
test_url = urllib2.urlopen(main_url) 
readHtml = test_url.read() 
test_url.close() 


soup = BeautifulSoup(readHtml, "html.parser") 

url = soup.find('div',attrs={"class":"entry-content"}).findAll('div', attrs={"class":None}) 

count = 1 

fobj = open('D:\Scrapping\parveen_again2.xml', 'w') 
for getting in url: 
    url = getting.find('a') 
    if url.has_attr('href'): 
      urls = url['href']  
      test_url = urllib2.urlopen(urls, timeout=36) 
      readHtml = test_url.read() 
      test_url.close() 

      soup1 = BeautifulSoup(readHtml, "html.parser") 

      title = soup1.find('title') 
      title = title.get_text('+') 
      title = title.split("|") 

      author = soup1.find('div',attrs={"class":"entry-meta"}).find('span',attrs={"class":"categories-links"}) 


      author = author.findAll('a') 

      fobj.write("<add><doc>\n") 
      fobj.write("<field name=\"id\">sukhansara.com_pg1Author"+author[0].string.encode('utf8')+"Count"+str(count)+"</field>\n") 
      fobj.write("<field name=\"title\">"+title[0].encode('utf8')+"</field>\n") 
      fobj.write("<field name=\"content\">") 

      count += 1 


      poetry = soup1.find('div',attrs={"class":"entry-content"}).findAll('div') 

      x=1 
      check = True 

      while check: 
       if poetry[x+1].string.encode('utf8') != author[0].string.encode('utf8'): 
         fobj.write(poetry[x].string.encode('utf8')+"|") 
         x+=1 
       else: 
         check = False 
      fobj.write(poetry[x].string.encode('utf8')) 

      fobj.write("</field>\n") 
      fobj.write("<field name=\"group\">ur_poetry</field>\n") 
      fobj.write("<field name=\"author\">"+author[0].string.encode('utf8')+"</field>\n") 
      fobj.write("<field name=\"url\">"+urls+"</field>\n") 
      fobj.write("<add><doc>\n\n") 



fobj.close() 

print "Done printing" 

有時我從 24 個網址抓到 24 首詩歌,有時是 81 首,但實際上有將近 100 個網址。每次抓到第 81 首時就會發生這個錯誤:

AttributeError: 'NoneType' object has no attribute 'encode'

或者有時會出現逾時(timeout)錯誤。我究竟做錯了什麼?

回答

0

改用 requests 並維持一個開啟的會話(Session),應該就能正常運作:

import requests 

# Open one requests session and load the ghazal index page through it.
with requests.Session() as session:
    main_url = "http://sukhansara.com/سخن-سرا-پر-خوش-آمدید/newposts/parveenshakir/psghazals/"

    # Fetch the index page; .content is the raw response body.
    index_response = session.get(main_url)
    readHtml = index_response.content

    soup = BeautifulSoup(readHtml, "html.parser")

    # ...