I want to crawl a site and record all of its internal and external links, while following the internal links so the whole site gets crawled. I have just started with Scrapy and cannot figure out why my spider won't follow internal links when crawling the site.
It only scrapes the links it finds on a page but never follows them to go deeper.
    class BRS(CrawlSpider):
        name = "brs"
        rules = (Rule(SgmlLinkExtractor(allow=()), callback='parse_obj', follow=True),)

        def __init__(self):
            global start_urls
            #settings.overrides['DEPTH_LIMIT'] = 10
            path = os.path.dirname(os.path.abspath(__file__))
            with open(os.path.join(path, "urls.txt"), "rt") as f:
                self.start_urls = filter(None, [url.strip() for url in f.readlines()])
            start_urls = self.start_urls

        def parse(self, response):
            brsitem = BrsItem()
            brsitem['url'] = response.url
            internal = LinkExtractor(allow_domains=[response.url])
            external = LinkExtractor(deny_domains=[response.url])
            links = internal.extract_links(response)
            internal = []
            fd = open('output.txt', 'a+')
            for link in links:
                internal.append(link.url)
            links = external.extract_links(response)
            external = []
            for link in links:
                external.append(link.url)
            for link in internal:
                fd.write(link + "\tinternal\n")
            for link in external:
                fd.write(link + "\texternal\n")
            return brsitem
My urls.txt currently contains: http://www.stackoverflow.com
Any help is appreciated.
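
A likely culprit is the overridden parse method: the Scrapy documentation warns that CrawlSpider implements its rule-following logic inside parse itself, so defining your own parse silently disables the rules, which matches the "extracts links but never follows them" symptom. Below is a minimal sketch of one possible restructuring, assuming Python 3 and a recent Scrapy release; BrsItem, urls.txt, and output.txt come from the question above, while the parse_obj callback name (taken from the rule), the urlparse-based domain handling, and the import path for BrsItem are illustrative assumptions:

    import os
    from urllib.parse import urlparse

    from scrapy.spiders import CrawlSpider, Rule
    from scrapy.linkextractors import LinkExtractor

    from brs.items import BrsItem  # the question's item class; import path is an assumption


    class BRS(CrawlSpider):
        name = "brs"
        # follow=True tells CrawlSpider to queue every extracted link for crawling
        rules = (Rule(LinkExtractor(), callback='parse_obj', follow=True),)

        def __init__(self, *args, **kwargs):
            # Calling super() matters here: CrawlSpider compiles its rules in __init__
            super().__init__(*args, **kwargs)
            path = os.path.dirname(os.path.abspath(__file__))
            with open(os.path.join(path, "urls.txt")) as f:
                self.start_urls = [url.strip() for url in f if url.strip()]

        # Named parse_obj to match the rule's callback; CrawlSpider reserves parse
        def parse_obj(self, response):
            # allow_domains/deny_domains expect bare domain names
            # ("stackoverflow.com"), not full URLs, so extract the netloc first
            domain = urlparse(response.url).netloc
            internal = LinkExtractor(allow_domains=[domain]).extract_links(response)
            external = LinkExtractor(deny_domains=[domain]).extract_links(response)
            with open('output.txt', 'a') as fd:
                for link in internal:
                    fd.write(link.url + "\tinternal\n")
                for link in external:
                    fd.write(link.url + "\texternal\n")
            item = BrsItem()
            item['url'] = response.url
            return item

Writing to output.txt directly in the callback works for a quick test, but an item pipeline or Scrapy's feed exports would be the more idiomatic place for that output.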