
Need help debugging a Python web crawler

Despite my best efforts over the past few hours, I still can't get a crawler (named searchengine.py) to run. It doesn't seem to index pages successfully. I'll give you the complete crawler code below. The error I'm getting looks like this:

Indexing http://www.4futureengineers.com/company.html 
Could not parse page http://www.4futureengineers.com/company.html 

I call searchengine.py from my interactive Python session (shell) by entering the following commands:

>>> import searchengine 
>>> crawler=searchengine.crawler('searchindex.db') 
>>> pages= \ 
... ['http://www.4futureengineers.com/company.html'] 
>>> crawler.crawl(pages) 

The error, i.e. the failed parse, comes from the command crawler.crawl(pages).

Here is the complete source code of searchengine.py:

import urllib2 
from BeautifulSoup import * 
from urlparse import urljoin 
from pysqlite2 import dbapi2 as sqlite 


# Create a list of words to ignore 
ignorewords={'the':1,'of':1,'to':1,'and':1,'a':1,'in':1,'is':1,'it':1} 


class crawler:
    # Initialize the crawler with the name of database
    def __init__(self,dbname):
        self.con=sqlite.connect(dbname)

    def __del__(self):
        self.con.close()

    def dbcommit(self):
        self.con.commit()


    # Auxiliary function for getting an entry id and adding
    # it if it's not present
    def getentryid(self,table,field,value,createnew=True):
        cur=self.con.execute(
            "select rowid from %s where %s='%s'" % (table,field,value))
        res=cur.fetchone()
        if res==None:
            cur=self.con.execute(
                "insert into %s (%s) values ('%s')" % (table,field,value))
            return cur.lastrowid
        else:
            return res[0]


    # Index an individual page
    def addtoindex(self,url,soup):
        if self.isindexed(url): return
        print 'Indexing '+url

        # Get the individual words
        text=self.gettextonly(soup)
        words=self.separatewords(text)

        # Get the URL id
        urlid=self.getentryid('urllist','url',url)

        # Link each word to this url
        for i in range(len(words)):
            word=words[i]
            if word in ignorewords: continue
            wordid=self.getentryid('wordlist','word',word)
            self.con.execute("insert into wordlocation(urlid,wordid,location) values (%d,%d,%d)" % (urlid,wordid,i))


    # Extract the text from an HTML page (no tags)
    def gettextonly(self,soup):
        v=soup.string
        if v==Null:
            c=soup.contents
            resulttext=''
            for t in c:
                subtext=self.gettextonly(t)
                resulttext+=subtext+'\n'
            return resulttext
        else:
            return v.strip()

    # Separate the words by any non-whitespace character
    def separatewords(self,text):
        splitter=re.compile('\\W*')
        return [s.lower() for s in splitter.split(text) if s!='']


    def isindexed(self,url):
        u=self.con.execute \
            ("select rowid from urllist where url='%s'" % url).fetchone()
        if u!=None:
            # Check if it has actually been crawled
            v=self.con.execute(
                'select * from wordlocation where urlid=%d' % u[0]).fetchone()
            if v!=None: return True
        return False


    def crawl(self,pages,depth=2):
        for i in range(depth):
            newpages={}
            for page in pages:
                try:
                    c=urllib2.urlopen(page)
                except:
                    print "Could not open %s" % page
                    continue

                try:
                    soup=BeautifulSoup(c.read())
                    self.addtoindex(page,soup)

                    links=soup('a')
                    for link in links:
                        if ('href' in dict(link.attrs)):
                            url=urljoin(page,link['href'])
                            if url.find("'")!=-1: continue
                            url=url.split('#')[0] # remove location portion
                            if url[0:4]=='http' and not self.isindexed(url):
                                newpages[url]=1
                            linkText=self.gettextonly(link)
                            self.addlinkref(page,url,linkText)

                    self.dbcommit()
                except:
                    print "Could not parse page %s" % page

            pages=newpages


    # Create the database tables
    def createindextables(self):
        self.con.execute('create table urllist(url)')
        self.con.execute('create table wordlist(word)')
        self.con.execute('create table wordlocation(urlid,wordid,location)')
        self.con.execute('create table link(fromid integer,toid integer)')
        self.con.execute('create table linkwords(wordid,linkid)')
        self.con.execute('create index wordidx on wordlist(word)')
        self.con.execute('create index urlidx on urllist(url)')
        self.con.execute('create index wordurlidx on wordlocation(wordid)')
        self.con.execute('create index urltoidx on link(toid)')
        self.con.execute('create index urlfromidx on link(fromid)')
        self.dbcommit()

Answer


The way crawl handles errors makes debugging extremely difficult:

try: 
    # too much stuff here 
except: # bare except 
    print "Could not parse page %s" % page # generic message 

While this makes it very robust (i.e. the program keeps running if anything at all goes wrong), it also makes it impossible to figure out what the problem is, beyond the fact that one of the thirteen lines inside the try block failed. Refactor this section of the code with shorter try blocks that test for specific errors (see "the evils of except").
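For illustration, here is a minimal sketch of what narrower try blocks might look like in the same environment as the question (Python 2, urllib2, BeautifulSoup 3). The fetch_and_parse helper and the particular exception classes chosen are assumptions for the example, not part of the original searchengine.py:

import urllib2 
from BeautifulSoup import BeautifulSoup 

def fetch_and_parse(page): 
    # Hypothetical helper (not in the original code): each risky step gets its 
    # own short try block with a specific exception, so the real cause of a 
    # failure stays visible instead of being swallowed by a bare except. 
    try: 
        c = urllib2.urlopen(page) 
    except urllib2.URLError as e: 
        print "Could not open %s: %s" % (page, e) 
        return None 

    try: 
        html = c.read() 
    except IOError as e: 
        print "Could not read %s: %s" % (page, e) 
        return None 

    # Parsing is deliberately left unguarded here: if BeautifulSoup (or, later, 
    # addtoindex) raises, the full traceback points at the exact failing line. 
    return BeautifulSoup(html) 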

You could try running it without any error handling at all (comment out the try:, except: and print ... lines and dedent the lines currently inside the try blocks), read the specific error tracebacks to work out what is going on, then put appropriate error handling back in afterwards.
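Concretely, the body of the second try block in crawl() would temporarily look like this while debugging (the question's own code, just dedented and with the except clause removed; to be reverted once the real error is found):

# With the try/except stripped, whatever fails inside addtoindex (or deeper, 
# in gettextonly/separatewords) now surfaces as a full traceback instead of 
# the generic "Could not parse page ..." message. 
soup = BeautifulSoup(c.read()) 
self.addtoindex(page, soup) 

links = soup('a') 
# ... rest of the link-handling loop unchanged ... 
self.dbcommit() 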


Tried to follow your notes, but no luck so far. – anayb


"No luck so far" isn't very helpful. What exactly did you do? If you followed my last paragraph you should be getting more specific error tracebacks; what are they? – jonrsharpe