
Global name error in Python

I have a searchengine.py file, and I have also created an index for it.

searchengine.py

import sqlite3 
import urllib2 
from bs4 import BeautifulSoup 
from urlparse import urljoin 

# Create a list of words to ignore 
ignorewords=set(['the','of','to','and','a','in','is','it']) 

class crawler: 
    # Initialize the crawler with the name of database 
    def __init__(self,dbname): 
     self.con=sqlite3.connect(dbname) 

    def __del__(self): 
     self.con.close() 

    def dbcommit(self): 
     pass 

    # Auxiliary function for getting an entry id and 
    # adding it if not present 
    def getentryid(self, table, field, value, createnew=True): 
     cur=self.con.execute("select rowid from %s where %s='%s'" % (table,field,value)) 
     res=cur.fetchone() 
     if res==None: 
      cur=self.con.execute("insert into %s (%s) values ('%s')" % (table,field,value)) 
      return cur.lastrowid 
     else: 
      return res[0] 

    # Index an individual page 
    def addtoindex(self,url,soup): 
     if self.isindexed(url): return 
     print 'Indexing %s' %url 

     # Get the individual words 
     text=self.gettextonly(soup) 
     words=self.separatewords(text) 

     # Get the URL id 
     urlid=self.getentryid('urllist','url',url) 

     # Link each word to this url 
     for i in range(len(words)): 
      word=words[i] 
      if word in ignorewords: continue 
      wordid=self.getentryid('wordlist','word',word) 
      self.con.execute("insert into wordlocation(urlid,wordid,location) \ 
       values (%d,%d,%d)" % (urlid,wordid,i)) 


    # Extract the text from an HTML page (no tags) 
    def gettextonly(self,soup): 
     v=soup.string 
     if v==None: 
      c=soup.contents 
      resulttext='' 
      for t in c: 
       subtext=self.gettextonly(t) 
       resulttext+=subtext+'\n' 
      return resulttext 
     else: 
      return v.strip() 


    # Separate the words by any non-alphanumeric character 
    def separatewords(self, text): 
     splitter=re.compile('\\W*') 
     return [s.lower() for s in splitter.split(text) if s!=''] 

    # Return true if this url is already indexed 
    def isindexed(self, url): 
     u=self.con.execute("select rowid from urllist where url='%s'" % url).fetchone() 
     if u!=None: 
      # Check if it has actually been crawled 
      v=self.con.execute('select * from wordlocation where urlid=%d' % u[0]).fetchone() 
      if v!=None: return True 
     return False 

    # Add a link between two pages 
    def addlinkref(self,urlFrom,urlTo,linkText): 
     pass 

    # Starting with a list of pages, do a breadth first search to 
    # the given depth, indexing pages as we go 
    def crawl(self,pages,depth=2): 
     pass 

    # Create the database tables 
    def createindextables(self): 
     pass 

    def crawl(self,pages,depth=2): 
     for i in range(depth): 
      newpages=set() 
      for page in pages: 
       try: 
        c=urllib2.urlopen(page) 
       except: 
        print "Could not open %s" % page 
        continue 
       soup=BeautifulSoup(c.read()) 
       self.addtoindex(page,soup) 

       links=soup('a') 
       for link in links: 
        if ('href' in dict(link.attrs)): 
         url=urljoin(page,link['href']) 
         if url.find("'")!=-1: continue 
         url=url.split('#')[0] # remove location portion 
         if url[0:4]=='http' and not self.isindexed(url): 
          newpages.add(url) 
         linkText=self.gettextonly(link) 
         self.addlinkref(page,url,linkText) 

       self.dbcommit() 

      pages=newpages 

    # Creating index tables 
    def createindextables(self): 
     self.con.execute('create table urllist(url)') 
     self.con.execute('create table wordlist(word)') 
     self.con.execute('create table wordlocation(urlid,wordid,location)') 
     self.con.execute('create table link(fromid integer,toid integer)') 
     self.con.execute('create table linkwords(wordid,linkid)') 
     self.con.execute('create index wordid on wordlist(word)') 
     self.con.execute('create index urlid on urllist(url)') 
     self.con.execute('create index wordurlidx on wordlocation(wordid)') 
     self.con.execute('create index urltoidx on link(toid)') 
     self.con.execute('create index urlfromidx on link(fromid)') 
     self.dbcommit() 

The index was created like this:

>>> reload(searchengine) 
>>> crawler=searchengine.crawler('searchindex.db') 
>>> crawler.createindextables() 

I tried to use searchindex.db like this from the Python shell, but it raises an error:

>>> reload(searchengine) 
>>> crawler=searchengine.crawler('searchindex.db') 
>>> pages=['http://kiwitobes.co/wiki/Categorical_list_of_programming_languages.html'] 
>>> crawler.crawl(pages) 
Indexing http://www.tartarus.org/~martin/PorterStemmer/index.html 

Traceback (most recent call last): 
  File "<pyshell#22>", line 1, in <module> 
    crawler.crawl(pages) 
  File "C:/Users/dj/Desktop\searchengine.py", line 103, in crawl 
    self.addtoindex(page,soup) 
  File "C:/Users/dj/Desktop\searchengine.py", line 38, in addtoindex 
    words=self.separatewords(text) 
  File "C:/Users/dj/Desktop\searchengine.py", line 68, in separatewords 
    splitter=re.compile('\\W*') 
NameError: global name 're' is not defined 

Python version: 2.7, OS: Windows 8


You are missing 'import re'. – falsetru

Answers


Your code uses the re module:

def separatewords(self, text): 
    splitter=re.compile('\\W*') 
    # here --^ 
    return [s.lower() for s in splitter.split(text) if s!=''] 

But nowhere do I see you have:

import re 

This loads the re module into memory. Trying to use a module that has not been loaded into memory produces a NameError.
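
As a quick illustration (a hypothetical shell session, not the OP's output), the same class of error can be reproduced and fixed interactively:

    >>> re.compile('\\W*')    # 're' has not been imported yet 
    Traceback (most recent call last): 
      File "<pyshell#0>", line 1, in <module> 
        re.compile('\\W*') 
    NameError: name 're' is not defined 
    >>> import re             # load the module 
    >>> re.compile('\\W*')    # now the name resolves 
    <_sre.SRE_Pattern object at 0x...> 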

So, to fix your problem, just add import re to the top of your script, alongside all of your other imports.
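
With that one-line change, the import block at the top of searchengine.py becomes:

    import re                  # needed by separatewords() 
    import sqlite3 
    import urllib2 
    from bs4 import BeautifulSoup 
    from urlparse import urljoin 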


I had posted this same answer to the question, then the OP posted a new traceback in a comment: an HTML parsing error. I don't have time to reply properly, so I have deleted my answer for now. – Martijn Pieters


@MartijnPieters - I see. Well, I can't really help there, since I don't have BeautifulSoup installed (ironic, I know, but I haven't needed it for the Python work I do). I'll keep my post up, since it solves the problem he asked about in the question. After all, SO is for answering your question / solving your _specific_ problem... not for debugging your entire code. He should ask a new question if he needs anything else. – iCodez