2
我正在嘗試學習如何使用 asyncio 構建異步 Web 爬蟲。下面是一個粗糙的爬蟲,用來測試這個框架(我的問題之一是:如何用 aiohttp 檢測 HTTP 響應的編碼):
import asyncio, aiohttp
from bs4 import BeautifulSoup
@asyncio.coroutine
def fetch(url):
    """Fetch *url* and return the response body as text.

    Concurrency is throttled by the module-level semaphore ``sem`` so at
    most that many requests are in flight at once.

    :param url: absolute URL to GET.
    :returns: decoded response body (``str``).
    """
    with (yield from sem):  # hold a semaphore slot for the whole request
        print(url)
        response = yield from aiohttp.request('GET', url)
        try:
            # ClientResponse.text() decodes using the charset declared in
            # the response headers instead of assuming UTF-8, which is the
            # correct way to "detect the response encoding" with aiohttp.
            body = yield from response.text()
        finally:
            # Always release the connection, even if decoding fails
            # (read_and_close() is deprecated in aiohttp).
            response.close()
        return body
@asyncio.coroutine
def get_links(url):
    """Fetch *url* and return the hrefs of its outbound-looking links.

    :param url: page URL to fetch via :func:`fetch`.
    :returns: list of ``href`` strings that contain the substring ``'www'``
        (a crude heuristic for absolute links).
    """
    page = yield from fetch(url)
    # Pin the parser explicitly: the bare BeautifulSoup(page) default
    # depends on which parsers are installed and warns in modern bs4.
    soup = BeautifulSoup(page, 'html.parser')
    links = soup.find_all('a', href=True)
    return [link['href'] for link in links if link['href'].find('www') != -1]
@asyncio.coroutine
def crawler(seed, depth, max_depth=3):
    """Recursively crawl outward from *seed* up to *max_depth* levels.

    :param seed: URL to crawl.
    :param depth: current recursion depth (callers start at 0).
    :param max_depth: depth at which recursion stops.
    """
    # Guard clause replaces the original ``while True`` loop, which
    # re-fetched the same seed once per remaining depth level —
    # exponential duplicate work for the same page.
    if depth > max_depth:
        return
    links = yield from get_links(seed)
    # gather() accepts coroutines directly; no asyncio.Task wrapper needed.
    # Propagate max_depth so a non-default limit survives recursion.
    coros = [crawler(link, depth + 1, max_depth) for link in links]
    yield from asyncio.gather(*coros)
# Allow at most 5 concurrent HTTP requests across all fetch() calls.
sem = asyncio.Semaphore(5)
loop = asyncio.get_event_loop()
try:
    loop.run_until_complete(crawler("http://www.bloomberg.com", 0))
finally:
    # Close the loop so its resources are released even if the crawl raises.
    loop.close()
雖然 asyncio 本身看起來相當不錯,但 aiohttp 的文檔似乎很少,有些東西我只能自己摸索。
首先,有沒有辦法檢測頁面響應的編碼?其次,能否讓會話中的連接保持活動(keep-alive)?還是說像 requests 那樣,默認就是這樣?