I'm writing a simple crawler in Python, using MySQL together with Python. But when I run this simple program, an error occurs and the content crawled from the web is not updated in the MySQL table. The error message reports a ProgrammingError and a syntax error, but I don't think I typed the code wrong, because the point the error complains about is an HTML tag. Why do HTML tags show up in the error message? I think there is a problem between MySQL and Python. Here is the error message.
Traceback (most recent call last):
  File "crawl.py", line 237, in <module>
    parseArticle(u)
  File "crawl.py", line 166, in parseArticle
    db.updateURL(url , contents)
  File "crawl.py", line 206, in updateURL
    self.cursor.execute("UPDATE urls SET state=%d,content='%s' WHERE url='%s'"%(state,content,url))
  File "/usr/lib/python2.7/dist-packages/MySQLdb/cursors.py", line 174, in execute
    self.errorhandler(self, exc, value)
  File "/usr/lib/python2.7/dist-packages/MySQLdb/connections.py", line 36, in defaulterrorhandler
    raise errorclass, errorvalue
ProgrammingError: (1064, 'You have an error in your SQL syntax; check the manual that corresponds to your MySQL server version for the right syntax to use near \'\xeb\x8f\x8b\xec\x9b\x80\', dotum, sans-serif; }\r\n\t//-->\r\n\t</style>\n<p style="TEXT-ALIGN: center\' at line 1')
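The fragment quoted in the error ('dotum, sans-serif' and the rest of the style block) is part of the fetched page itself, which suggests the HTML is being substituted directly into the SQL string, so any quote character inside the page ends the string literal early. Below is a minimal sketch of how updateURL could use MySQLdb's parameter binding so the driver does the escaping; this is an illustration, not the code as posted:

def updateURL(self, url, content, state=1):
    if url[-1] == '/': url = url[:-1]
    # every placeholder is %s and the values are passed as a tuple;
    # MySQLdb escapes any quotes inside `content` itself
    self.cursor.execute(
        "UPDATE urls SET state=%s, content=%s WHERE url=%s",
        (state, content, url))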
Here is the source code. Thanks for looking at this and for your help.
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import robotparser
import urllib2
import time, traceback, re, sys, os
import MySQLdb

crawler_name = 'daum_blog_crawler'
mainpage = 'http://blog.daum.net/'

rp = robotparser.RobotFileParser(mainpage + 'robots.txt')
rp.read()

def canFetch(url):
    return rp.can_fetch(crawler_name, url)

def getContent(url, delay=1):
    time.sleep(delay)
    if not canFetch(url):
        print 'This url can NOT be fetched by our crawler :', url
        return None
    try:
        opener = urllib2.build_opener()
        opener.addheaders = [('User-agent',crawler_name)]
        contents = opener.open(url).read()
    except:
        traceback.print_exc()
        return None
    return contents

def getArticleInfo(soup):
    rBlog = re.compile('.+blog.daum.net/\w+/\d+.*?')
    URLs = soup('a',{'href':rBlog})
    return [ u.get('href').split('?')[0] for u in URLs ]

def getOwnArticles(contents):
    ret = []
    soup = BeautifulSoup(contents)
    rBlog = re.compile('.+/BlogTypeView.+')
    for u in soup('a',{'href':rBlog}):
        href = u.get('href')
        article = href.split('articleno=')[1].split('&')[0]
        if ret.count(article)<1:
            ret.append(article)
    return ret
def gatherNeighborInfo(soup):
    rBlog = re.compile('http://blog.daum.net/\w+')
    Neighbors = soup('a',{'href':rBlog})
    cnt = 0
    for n in Neighbors:
        url = n.get('href')
        blogname = url.split('/')[-1]
        if url and url.startswith('http://') and db.isCrawledURL(url)<1:
            db.insertURL(url, 1)
            url2 = getRedirectedURL(url)
            if not url2: continue
            re_url = 'http://blog.daum.net' + url2
            body = getContent(re_url, 0)
            if body:
                for u in getOwnArticles(body):
                    fullpath = 'http://blog.daum.net/'+blogname+'/'+u
                    cnt+=db.insertURL(fullpath)
    if cnt>0: print '%d neighbor articles inserted'%cnt

def getRedirectedURL(url):
    contents = getContent(url)
    if not contents: return None
    #redirect
    try:
        soup = BeautifulSoup(contents)
        frame = soup('frame')
        src = frame[0].get('src')
    except:
        src = None
    return src

def getBody(soup, parent):
    rSrc = re.compile('.+/ArticleContentsView.+')
    iframe = soup('iframe',{'src':rSrc})
    if len(iframe)>0:
        src = iframe[0].get('src')
        iframe_src = 'http://blog.daum.net'+src
        req = urllib2.Request(iframe_src)
        req.add_header('Referer', parent)
        body = urllib2.urlopen(req).read()
        soup = BeautifulSoup(body)
        strbody= str(soup.body)
        return strbody
    else:
        print 'NULL contents'
        return ''
def parseArticle(url):
    article_id = url.split('/')[-1]
    blog_id = url.split('/')[-2]
    #for debugging, temp
    if blog_id.isdigit():
        print 'digit:', url.split('/')
    newURL = getRedirectedURL(url)
    if newURL:
        newURL = 'http://blog.daum.net'+newURL
        print 'redirecting', newURL
        contents = getContent(newURL, 0)
        if not contents:
            print 'Null Contents...'
            db.updateURL(url, -1)
            return
        soup = BeautifulSoup(contents)
        gatherNeighborInfo(soup)
        n=0
        for u in getArticleInfo(soup):
            n+=db.insertURL(u)
        if n>0: print 'inserted %d urls from %s'%(n,url)
        sp = contents.find('<title>')
        if sp>-1:
            ep = contents[sp+7:].find('</title>')
            title = contents[sp+7:sp+ep+7]
        else:
            title = ''
        contents = getBody(soup, newURL)
        db.updateURL(url , contents)
    else:
        print 'Invalid blog article...'
        db.updateURL(url, 'None', -1)
class DB:
    "MySQL wrapper class"
    def __init__(self):
        self.conn = MySQLdb.connect(db='crawlDB', user='root', passwd='qltkd')
        self.conn.query("set character_set_connection=utf8;")
        self.conn.query("set character_set_server=utf8;")
        self.conn.query("set character_set_client=utf8;")
        self.conn.query("set character_set_results=utf8;")
        self.conn.query("set character_set_database=utf8;")
        self.cursor = self.conn.cursor()
        self.cursor.execute('CREATE TABLE IF NOT EXISTS urls(url CHAR(150), state INT, content TEXT)')

    def commit(self):
        self.conn.commit()

    def __del__(self):
        self.conn.commit()
        self.cursor.close()

    def insertURL(self, url, state=0, content=None):
        #'/' delete
        if url[-1]=='/': url=url[:-1]
        try:
            self.cursor.execute("INSERT INTO urls VALUES ('%s',%d,'%s')"%(url,state,content))
        except:
            return 0
        else:
            return 1

    def selectUncrawledURL(self):
        self.cursor.execute("SELECT * FROM urls where state=0")
        return [ row[0] for row in self.cursor.fetchall() ]

    def updateURL(self, url, content, state=1):
        if url[-1]=='/': url=url[:-1]
        self.cursor.execute("UPDATE urls SET state=%d,content='%s' WHERE url='%s'"%(state,content,url))

    def isCrawledURL(self, url):
        if url[-1]=='/': url=url[:-1]
        self.cursor.execute("SELECT COUNT(*) FROM urls WHERE url='%s' AND state=1"%url)
        ret = self.cursor.fetchone()
        return ret[0]
db = DB()

if __name__=='__main__':
    print 'starting crawl.py...'
    contents = getContent(mainpage)
    URLs = getArticleInfo(BeautifulSoup(contents))
    nSuccess = 0
    for u in URLs:
        nSuccess += db.insertURL(u)
    print 'inserted %d new pages.'%nSuccess
    while 1:
        uncrawled_urls = db.selectUncrawledURL()
        if not uncrawled_urls: break
        for u in uncrawled_urls:
            print 'downloading %s'%u
            try:
                parseArticle(u)
            except:
                traceback.print_exc()
                db.updateURL(u, -1)
            db.commit()
        #bs.UpdateIndex()
What is the actual, complete SQL statement here? A string quoting problem? –
I don't know, it is just ProgrammingError: (1064, 'You have an error in your SQL syntax; check the manual that corresponds to your MySQL server version for the right syntax to use near \'\xeb\x8f\x8b\xec\x9b\x80\', dotum, sans-serif; }\r\n\t//-->\r\n\t</style>\n
You don't know? It is your code, not ours, so please provide the requested information. Adding a simple debug print statement is straightforward. If you can't provide that small piece of information, please stay away from programming –
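For reference, the debug print suggested above could be as simple as this inside updateURL (a hypothetical tweak, just to see the exact statement that reaches MySQL):

query = "UPDATE urls SET state=%d,content='%s' WHERE url='%s'" % (state, content, url)
print query  # shows the full statement, including the raw HTML that breaks the quoting
self.cursor.execute(query)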