0
我在一份教程的幫助下寫了一個網絡爬蟲:它會從給定的 url 取得所有鏈接,並且可以傳入一個數字,代表要爬取的步數(即鏈接距離的深度)。例如,在 scraperOut = scraper(url,3)
中把這個數字設爲 3 時,爬蟲會深入三步,並把所有鏈接都附加到同一個列表中。我的問題是:應該如何修改代碼,才能讓每一步的鏈接各自放在單獨的列表裏,而不是全部放在同一個列表中,或者例如只打印第二步的列表?完整代碼如下(標題:Python網絡爬蟲,在不同的列表中打印每一步):
import urllib
import re
import time
from threading import Thread
import MySQLdb
import mechanize
import readability
from bs4 import BeautifulSoup
from readability.readability import Document
import urlparse
# Start page of the crawl; passed to scraper() by the driver code at the
# bottom of this script.
url = "http://www.adbnews.com/area51/"
def scraper(root,steps):
urls = [root]
visited = [root]
counter = 0
while counter < steps:
step_url = scrapeStep(urls)
urls = []
for u in step_url:
if u not in visited:
urls.append(u)
visited.append(u)
counter +=1
return visited
def scrapeStep(root):
    """Open each page in *root* and collect the links found on it.

    Args:
        root: iterable of page URLs to open (a list, despite the singular
            name).

    Returns:
        A list with one absolute URL per link found, in page order.
        Duplicates are not removed. A page that raises while being opened
        or while its links are read is skipped after printing "error";
        links collected from that page before the failure are kept.
    """
    result_urls = []
    br = mechanize.Browser()
    br.set_handle_robots(False)
    br.addheaders = [('User-agent', 'Firefox')]

    for page_url in root:
        try:
            br.open(page_url)
            for link in br.links():
                # Resolve relative hrefs against the page they came from.
                result_urls.append(urlparse.urljoin(link.base_url, link.url))
        except Exception:
            # Best-effort crawl: one bad page must not abort the whole step.
            # Catching Exception rather than using a bare except lets
            # KeyboardInterrupt / SystemExit still stop the program.
            print("error")
    return result_urls
# Shared results dict: dungalo() stores one entry per URL here and
# getMultiHtml() returns it.
d = dict()
# Every Thread object created by getMultiHtml(); it joins all of them
# before returning.
threadlist = list()
def getReadableArticle(url):
    """Download *url* and return the page's short title.

    Args:
        url: address of the page to fetch.

    Returns:
        The value of ``Document(html).short_title()`` for the downloaded
        page.

    Any exception raised while opening or reading the page propagates to
    the caller.
    """
    br = mechanize.Browser()
    br.set_handle_robots(False)
    br.addheaders = [('User-agent', 'Firefox')]

    html = br.open(url).read()

    # NOTE(review): the previous version also computed the readable article
    # text and the <img> tags, but its second `return final_article` was
    # unreachable, so only the title ever left this function. That dead work
    # has been removed; if the article body is wanted too, confirm the
    # intended return shape with the callers before changing it.
    return Document(html).short_title()
def dungalo(urls):
    """Thread worker: fetch one page and record the result in ``d``.

    Args:
        urls: a single page URL (despite the plural name); it is also used
            as the key written to the module-level dict ``d``.

    Stores element 0 of ``getReadableArticle(urls)`` under ``d[urls]``.
    """
    # NOTE(review): getReadableArticle appears to return only the title
    # string, in which case [0] is that title's first character -- confirm
    # whether the whole value was intended.
    d[urls] = getReadableArticle(urls)[0]
def getMultiHtml(urlsList):
    """Fetch every URL in *urlsList* concurrently, one thread per URL.

    Args:
        urlsList: iterable of page URLs; each is handed to ``dungalo`` in
            its own thread.

    Returns:
        The module-level dict ``d`` (the same object, not a copy) after
        all threads recorded in ``threadlist`` have finished.
    """
    for page_url in urlsList:
        # The original passed `urls1` (digit one) while the loop variable was
        # `urlsl` (letter l); the resulting NameError was swallowed by a bare
        # except, so no thread was ever started.
        worker = Thread(target=dungalo, args=(page_url,))
        try:
            worker.start()
        except Exception:
            # start() can fail when no more threads can be created; skip this
            # URL and keep going with the rest.
            print("error")
            continue
        # Record only threads that actually started: join() raises on a
        # thread that was never started.
        threadlist.append(worker)

    for worker in threadlist:
        worker.join()

    return d
# Driver: crawl three link-steps out from the start URL, then print every
# URL that was found, one per line.
scraperOut = scraper(url, 3)
for s in scraperOut:
    print(s)
謝謝,我已經改變了,但那不是我的主要問題 – dzordz