
I am writing a simple crawler in Python that stores its results in MySQL. When I run it, an error occurs and the content fetched from the web is never written to the MySQL table. The error is a ProgrammingError reporting an SQL syntax error, but I don't think I typed the SQL wrong. What puzzles me is that HTML tags appear in the error message. Why do HTML tags show up there? I suspect something is going wrong between MySQL and Python. Here is the error message:

Traceback (most recent call last): 
    File "crawl.py", line 237, in <module> 
    parseArticle(u) 
    File "crawl.py", line 166, in parseArticle 
    db.updateURL(url , contents) 
    File "crawl.py", line 206, in updateURL 
    self.cursor.execute("UPDATE urls SET state=%d,content='%s' WHERE url='%s'"%(state,content,url)) 
    File "/usr/lib/python2.7/dist-packages/MySQLdb/cursors.py", line 174, in execute 
    self.errorhandler(self, exc, value) 
    File "/usr/lib/python2.7/dist-packages/MySQLdb/connections.py", line 36, in defaulterrorhandler 
    raise errorclass, errorvalue 
ProgrammingError: (1064, 'You have an error in your SQL syntax; check the manual that corresponds to your MySQL server version for the right syntax to use near \'\xeb\x8f\x8b\xec\x9b\x80\', dotum, sans-serif; }\r\n\t//--&gt;\r\n\t</style>\n<p style="TEXT-ALIGN: center\' at line 1') 

Here is the source code. Thanks for taking a look and helping me.

# -*- coding: utf-8 -*- 

from bs4 import BeautifulSoup 
import robotparser 
import urllib2 
import time, traceback, re, sys, os 
import MySQLdb 

crawler_name = 'daum_blog_crawler'  
mainpage = 'http://blog.daum.net/'  

rp = robotparser.RobotFileParser(mainpage + 'robots.txt') 
rp.read() 

def canFetch(url): 
     return rp.can_fetch(crawler_name, url) 

def getContent(url, delay=1): 
     time.sleep(delay) 

     if not canFetch(url): 
       print 'This url can NOT be fetched by our crawler :', url 
       return None 
     try: 
       opener = urllib2.build_opener() 
       opener.addheaders = [('User-agent',crawler_name)] 
       contents = opener.open(url).read() 
     except: 
       traceback.print_exc() 
       return None 
     return contents 

def getArticleInfo(soup): 

     rBlog = re.compile('.+blog.daum.net/\w+/\d+.*?') 
     URLs = soup('a',{'href':rBlog}) 

     return [ u.get('href').split('?')[0] for u in URLs ] 

def getOwnArticles(contents):   
     ret = [] 
     soup = BeautifulSoup(contents) 
     rBlog = re.compile('.+/BlogTypeView.+') 
     for u in soup('a',{'href':rBlog}): 
       href = u.get('href') 
       article = href.split('articleno=')[1].split('&')[0] 
       if ret.count(article)<1: 
         ret.append(article) 
     return ret 

def gatherNeighborInfo(soup): 

     rBlog = re.compile('http://blog.daum.net/\w+') 
     Neighbors = soup('a',{'href':rBlog}) 
     cnt = 0 
     for n in Neighbors: 
       url = n.get('href') 
       blogname = url.split('/')[-1] 
       if url and url.startswith('http://') and db.isCrawledURL(url)<1: 
         db.insertURL(url, 1) 

         url2 = getRedirectedURL(url) 
         if not url2: continue 
         re_url = 'http://blog.daum.net' + url2 
         body = getContent(re_url, 0) 
         if body: 
           for u in getOwnArticles(body): 

             fullpath = 'http://blog.daum.net/'+blogname+'/'+u 
             cnt+=db.insertURL(fullpath) 

     if cnt>0: print '%d neighbor articles inserted'%cnt 

def getRedirectedURL(url): 
     contents = getContent(url) 
     if not contents: return None 

     #redirect 
     try: 
       soup = BeautifulSoup(contents) 
       frame = soup('frame')   
       src = frame[0].get('src') 
     except: 
       src = None 
     return src 

def getBody(soup, parent): 

     rSrc = re.compile('.+/ArticleContentsView.+') 
     iframe = soup('iframe',{'src':rSrc}) 
     if len(iframe)>0: 
       src = iframe[0].get('src') 
       iframe_src = 'http://blog.daum.net'+src 


       req = urllib2.Request(iframe_src) 
       req.add_header('Referer', parent) 
       body = urllib2.urlopen(req).read() 
       soup = BeautifulSoup(body) 
       strbody= str(soup.body) 
       return strbody 
     else: 
       print 'NULL contents' 
       return '' 

def parseArticle(url): 

     article_id = url.split('/')[-1] 
     blog_id = url.split('/')[-2] 

     #for debugging, temp 
     if blog_id.isdigit(): 
       print 'digit:', url.split('/') 

     newURL = getRedirectedURL(url) 

     if newURL: 

       newURL = 'http://blog.daum.net'+newURL 
       print 'redirecting', newURL 
       contents = getContent(newURL, 0) 
       if not contents: 
         print 'Null Contents...' 

         db.updateURL(url, -1) 
         return 


       soup = BeautifulSoup(contents) 


       gatherNeighborInfo(soup)    


       n=0 
       for u in getArticleInfo(soup): 
         n+=db.insertURL(u) 
       if n>0: print 'inserted %d urls from %s'%(n,url) 


       sp = contents.find('<title>') 
       if sp>-1: 
         ep = contents[sp+7:].find('</title>') 
         title = contents[sp+7:sp+ep+7] 
       else: 
         title = '' 


       contents = getBody(soup, newURL) 


       db.updateURL(url , contents) 

     else: 
       print 'Invalid blog article...' 

       db.updateURL(url, 'None', -1) 

class DB: 
     "MySQL wrapper class" 
     def __init__(self): 
       self.conn = MySQLdb.connect(db='crawlDB', user='root', passwd='qltkd') 
       self.conn.query("set character_set_connection=utf8;") 
       self.conn.query("set character_set_server=utf8;") 
       self.conn.query("set character_set_client=utf8;") 
       self.conn.query("set character_set_results=utf8;") 
       self.conn.query("set character_set_database=utf8;") 
       self.cursor = self.conn.cursor() 
       self.cursor.execute('CREATE TABLE IF NOT EXISTS urls(url CHAR(150), state INT, content TEXT)') 
     def commit(self): 
       self.conn.commit() 
     def __del__(self): 
       self.conn.commit() 
       self.cursor.close() 

     def insertURL(self, url, state=0, content=None): 
       #'/' delete 
       if url[-1]=='/': url=url[:-1] 
       try:  
         self.cursor.execute("INSERT INTO urls VALUES ('%s',%d,'%s')"%(url,state,content)) 
       except: 
         return 0 
       else: 
         return 1 

     def selectUncrawledURL(self): 
       self.cursor.execute("SELECT * FROM urls where state=0") 
       return [ row[0] for row in self.cursor.fetchall() ] 

     def updateURL(self, url, content, state=1): 
       if url[-1]=='/': url=url[:-1] 
     self.cursor.execute("UPDATE urls SET state=%d,content='%s' WHERE url='%s'"%(state,content,url)) 

     def isCrawledURL(self, url): 
       if url[-1]=='/': url=url[:-1] 
       self.cursor.execute("SELECT COUNT(*) FROM urls WHERE url='%s' AND state=1"%url) 
       ret = self.cursor.fetchone() 
       return ret[0] 

db = DB() 

if __name__=='__main__': 
     print 'starting crawl.py...' 


     contents = getContent(mainpage) 
     URLs = getArticleInfo(BeautifulSoup(contents)) 
     nSuccess = 0 
     for u in URLs: 
       nSuccess += db.insertURL(u) 
     print 'inserted %d new pages.'%nSuccess 


     while 1: 
       uncrawled_urls = db.selectUncrawledURL() 
       if not uncrawled_urls: break 
       for u in uncrawled_urls: 

         print 'downloading %s'%u 
         try: 
           parseArticle(u) 
         except: 
           traceback.print_exc() 
           db.updateURL(u, -1) 
         db.commit() 
       #bs.UpdateIndex() 

What is the actual, complete SQL statement here? A string quoting problem? –


I don't know. All I get is ProgrammingError: (1064, 'You have an error in your SQL syntax; check the manual that corresponds to your MySQL server version for the right syntax to use near \'\xeb\x8f\x8b\xec\x9b\x80\', dotum, sans-serif; }\r\n\t//-->\r\n\t ...' –


You don't know? It's your code, not ours, so please provide the requested information. Adding a simple debug print statement is straightforward. If you can't supply that small piece of information, please stay away from programming. –
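
A minimal sketch of that kind of debug print, assuming the updateURL method from the code above (the statement is built first so the exact SQL can be inspected before execute runs):

     def updateURL(self, url, content, state=1): 
       if url[-1]=='/': url=url[:-1] 
       # Build the SQL up front so it can be printed for debugging. 
       sql = "UPDATE urls SET state=%d,content='%s' WHERE url='%s'"%(state,content,url) 
       print 'about to run:', sql[:200]   # truncated: content may be a whole HTML page 
       self.cursor.execute(sql) 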

Answer


You could try:

self.cursor.execute("UPDATE urls SET state=%d,content='%s' WHERE url='%s'"%(state,self.conn.escape_string(content),url)) 

Wow!!!! Thank you! It works now! This error cost me a lot of time. mccakici, could you please explain what was actually wrong? I don't understand it. I would really appreciate it. Thank you. –


Your problem is the single quote. For example, without escaping you get "UPDATE urls SET state=%d, content='bla'vs vs'.........." (the quote inside the content closes the string literal early), whereas with escape_string it becomes "UPDATE urls SET state=%d, content='bla\'vs vs'..........". – mccakici
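
For completeness, the more robust way to do this with MySQLdb is to pass the values as a separate argument to execute and let the driver quote them, rather than formatting them into the string yourself. A minimal sketch of updateURL written that way (note that every placeholder becomes %s, even for the integer state; this is a suggested alternative, not the fix accepted above):

     def updateURL(self, url, content, state=1): 
       if url[-1]=='/': url=url[:-1] 
       # Parameterized query: MySQLdb escapes and quotes each value itself, so 
       # quotes or backslashes inside the crawled HTML cannot break the statement. 
       self.cursor.execute("UPDATE urls SET state=%s, content=%s WHERE url=%s", 
                           (state, content, url)) 

This also handles a None content (it becomes SQL NULL) and protects the url value in the WHERE clause.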
