我使用網絡爬蟲來抓取數據,並將結果 - (來自Twitter頁面的推文)作爲單獨的html文件存儲在我正在爬取的每個用戶中。我打算稍後解析html文件並將數據存儲到數據庫中以供分析。但是,我有一個奇怪的問題。Python網絡爬蟲的文件存儲問題
當我運行下面的程序 - 從整體履帶小片段 - 我能獲得針對每個跟隨一個單獨的HTML文件:
import re
import urllib2
import twitter
start_follower = "NYTimesKrugman"
depth = 3
searched = set()
api = twitter.Api()
def crawl(follower, in_depth):
if in_depth > 0:
directory = "C:\\Python28\\Followertest1\\" + follower + ".html"
output = open(directory, 'a')
users = api.GetFriends(follower)
names = set([str(u.screen_name) for u in users])
names -= searched
for name in list(names)[0:5]:
crawl(name, in_depth-1)
crawl(start_follower, depth)
for x in searched:
print x
print "Program is completed."
import twitter
import urllib
from BeautifulSoup import BeautifulSoup
import re
import time
start_follower = "NYTimeskrugman"
depth = 2
searched = set()
api = twitter.Api()
def add_to_U(user):
def site(follower): #creates a twitter site url in string format based on the follower username
followersite = "http://mobile.twitter.com/" + follower
return followersite
def getPage(follower): #obtains access to a webapge
url = site(follower)
response = urllib.urlopen(url)
return response
def getSoup(response): #creates the parsing module
html = response.read()
soup = BeautifulSoup(html)
return soup
def gettweets(soup, output):
tags = soup.findAll('div', {'class' : "list-tweet"})#to obtain tweet of a follower
for tag in tags:
a = tag.renderContents()
b = str (a)
def are_more_tweets(soup):#to check whether there is more than one page on mobile twitter
links = soup.findAll('a', {'href': True}, {id: 'more_link'})
for link in links:
b = link.renderContents()
test_b = str(b)
if test_b.find('more') != -1:
return True
return False
def getnewlink(soup): #to get the link to go to the next page of tweets on twitter
links = soup.findAll('a', {'href': True}, {id : 'more_link'})
for link in links:
b = link.renderContents()
if str(b) == 'more':
c = link['href']
d = 'http://mobile.twitter.com' +c
return d
def crawl(follower, in_depth): #main method of sorts
if in_depth > 0:
directory = "C:\\Python28\\Followertest2\\" + follower + ".html"
output = open(directory, 'a')
a = getPage(follower)
soup = getSoup(a)
gettweets(soup, output)
tweets = are_more_tweets(soup)
b = getnewlink(soup)
red = urllib.urlopen(b)
html = red.read()
soup = BeautifulSoup(html)
gettweets(soup, output)
tweets = are_more_tweets(soup)
users = api.GetFriends(follower)
names = set([str(u.screen_name) for u in users])
names -= searched
for name in list(names)[0:5]:
print name
crawl(name, in_depth - 1)
crawl(start_follower, depth)
print("Program done. Look at output file.")
我知道這可能無助於你的問題在這裏 - 但一段時間後,我在twitter上進行數據挖掘,請問爲什麼你不只是使用API? – eWizardII
Python28 ?????? –
啊..我正在使用API來獲取用戶的朋友列表,但我不想使用該API來獲取由於費率限制而發出的推文。我相信,使用當前的代碼(鑑於爬行程序必須暫停以獲取用戶的所有推文),我不會超過當前的速率限制。 – snehoozle