2015-09-04 78 views
1

我正在嘗試使用BeautifulSoup來編寫一個Python腳本,抓取網頁http://tbc-python.fossee.in/completed-books/並從中收集必要的數據。基本上它必須將所有書籍的章節中的所有page loading errors, SyntaxErrors, NameErrors, AttributeErrors, etc提取到文本文件errors.txt。大約有273本書。編寫的腳本很好地完成了任務,我使用的帶寬速度也很快,但是代碼需要花費很多時間才能瀏覽所有書籍。請幫我做必要的修改以優化這個Python腳本,也許可以使用函數等等。謝謝。

import urllib2, urllib 
from bs4 import BeautifulSoup 
website = "http://tbc-python.fossee.in/completed-books/" 
soup = BeautifulSoup(urllib2.urlopen(website)) 
errors = open('errors.txt','w') 

# Completed books webpage has data stored in table format 
BookTable = soup.find('table', {'class': 'table table-bordered table-hover'}) 
for BookCount, BookRow in enumerate(BookTable.find_all('tr'), start = 1): 
    # Grab book names 
    BookCol = BookRow.find_all('td') 
    BookName = BookCol[1].a.string.strip() 
    print "%d: %s" % (BookCount, BookName) 
    # Open each book 
    BookSrc = BeautifulSoup(urllib2.urlopen('http://tbc-python.fossee.in%s' %(BookCol[1].a.get("href")))) 
    ChapTable = BookSrc.find('table', {'class': 'table table-bordered table-hover'}) 

    # Check if each chapter page opens, if not store book & chapter name in error.txt 
    for ChapRow in ChapTable.find_all('tr'): 
     ChapCol = ChapRow.find_all('td') 
     ChapName = (ChapCol[0].a.string.strip()).encode('ascii', 'ignore') # ignores error : 'ascii' codec can't encode character u'\xef' 
     ChapLink = 'http://tbc-python.fossee.in%s' %(ChapCol[0].a.get("href")) 

     try: 
      ChapSrc = BeautifulSoup(urllib2.urlopen(ChapLink)) 
     except: 
      print '\t%s\n\tPage error' %(ChapName) 
      errors.write("Page; %s;%s;%s;%s" %(BookCount, BookName, ChapName, ChapLink)) 
      continue 

     # Check for errors in chapters and store the errors in error.txt 
     EgError = ChapSrc.find_all('div', {'class': 'output_subarea output_text output_error'}) 
     if EgError: 
      for e, i in enumerate(EgError, start=1): 
       errors.write("Example;%s;%s;%s;%s\n" %(BookCount,BookName,ChapName,ChapLink)) if 'ipython-input' or 'Error' in i.pre.get_text() else None   
      print '\t%s\n\tExample errors: %d' %(ChapName, e)  

errors.close() 

回答

0

你可能要考慮用multiprocessing來拆分工作量。

如果您一次只使用1個連接,則連接速度並不重要。

+0

@ OneOfOne。我一次使用1個連接。還有其他建議嗎?謝謝。 –

+0

@ThirumaleshHS我沒有看到讓它更快分解的方法,但也許別人會這麼做。祝你好運。 – OneOfOne

0

我試圖分解代碼並用函數表示它。 有沒有進一步改進代碼的建議?另外,如何把從網站抓取到的錯誤轉儲爲一個新的HTML文件,以表格形式列出含有錯誤的書籍和章節的細節?

下面是更新後的代碼:

import urllib2, sys 
from bs4 import BeautifulSoup 

def get_details(link, index): 
    """ 
    Fetch *link* and scrape its first table, returning a list of 
    [name, href] pairs taken from column *index* of each row. 

    The column to read selects what is scraped: 
    * index = 1 --> book details (name and link) 
    * index = 0 --> chapter details (name and link) 
    """ 
    page = BeautifulSoup(urllib2.urlopen(link)) 

    # One anchor of interest per table row; collect its text and href.
    pairs = [] 
    for table_row in page.find('table').find_all('tr'): 
        anchor = table_row.find_all('td')[index].a 
        pairs.append([anchor.string, anchor.get("href")]) 

    return pairs 


def get_chapter_errors(chap_link): 
    """ 
    This function takes in chapter link from chapter_details_list as argument and returns 
    * Number of example errors(SyntaxErrors, NameErrors, ValueErrors, etc) present in the chapter 
       OR 
    * HTTPError while loading the chapter 
    """ 
    try: 
     chp_src = BeautifulSoup(urllib2.urlopen(chap_link)) 
     example_errors = chp_src.find_all('div', {'class': 'output_subarea output_text output_error'}) 
     error = len(example_errors) 
     if not example_errors: 
      error = None 

    except urllib2.HTTPError as e: 
     print e 
     error = "Page fetch error" 

    return error 


def main(): 
    log_dict = {} 
    book_dict = {} 

    url = sys.argv[1] # accept url as argument 
    book_details_list = get_details(url, index=1) 
    for book_name, book_link in book_details_list: 
     chapter_details_list = get_details('http://tbc-python.fossee.in%s' % book_link, index=0) 
     _id = book_link.strip('/book-details') 
     book_dict = {'name': book_name, 
        'url': 'http://tbc-python.fossee.in%s' % book_link, 
        'id': _id, 
        'chapters': [] 
        } 

     for chap_name, chap_link in chapter_details_list: 
      error = get_chapter_errors('http://tbc-python.fossee.in%s' % chap_link) 
      book_dict.get('chapters').append({'name': chap_name, 
               'url': 'http://tbc-python.fossee.in%s' % chap_link, 
               'errors': error 
              }) 

     log_dict.update({_id: book_dict}) 

     print log_dict 
     print "\n\n\n\n" 


if __name__ == '__main__': 
    main()