我想寫一個webcrawler,但我卡住,因爲我不能看到無限循環在我的代碼中的某處。看不到無限循環
class Crawler(object):
def __init__(self, url, query, dir = os.path.dirname(__file__)):
self.start_url = url
self.start_parsed = urllib3.util.parse_url(url)
self.query = re.compile(query, re.IGNORECASE)
self.dir = dir
self.__horizon = set()
self.log = []
self.__horizon.add(url)
self.log.append(url)
print("initializing crawler....")
print(locals())
def start(self, depth= 5, url = '/'):
print(url, depth)
self.log.append(url)
if depth > 0:
pool = urllib3.PoolManager()
data = pool.request("GET", self.start_url if url == '/' else url).data.decode('utf-8')
valid_list = []
self.add_horizon(parser_soup.get_links(data), valid_list)
if re.search(self.query, parser_soup.get_text(data)):
self.output(data)
for u in valid_list:
self.start(depth = (depth-1), url = u)
def output(self, data):
with open(os.path.join(self.dir, get_top_domain(self.start_parsed.host) + '.' + str(time.time()) + '.html'), 'w+') as f:
f.write(data)
def add_horizon(self, url_list, valid_list = []):
for url in url_list:
if get_top_domain(url) == get_top_domain(self.start_parsed.host) \
and (not str(url) in self.log or not str(url) in self.__horizon):
valid_list.append(str(url))
self.__horizon.update(valid_list)
它永遠運行。我應該如何確保消除重複鏈接?
你是什麼意思「看不到無限循環?」 –
@uoɥʇʎPʎzɐɹC他不明白爲什麼他的代碼會陷入無限循環。 –
與您的問題無關,但有一個建議:使'__init__'中的PoolManager成爲可能,並在整個過程中使用它以獲得最大收益。 – shazow