4
我不明白爲什麼Scrapy抓取第一頁但沒有跟蹤鏈接來抓取後續頁面。這必須與規則有關。非常感激。謝謝!Scrapy抓取第一頁,但沒有遵循鏈接
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from craigslist_sample.items import CraigslistItem
class MySpider(CrawlSpider):
name = "craig"
allowed_domains = ["sfbay.craigslist.org"]
start_urls = ["http://sfbay.craigslist.org/acc/"]
rules = (Rule (SgmlLinkExtractor(allow=("index100\.html",),restrict_xpaths=('//p[@id="nextpage"]',))
, callback="parse_items", follow= True),
)
def parse_items(self, response):
hxs = HtmlXPathSelector(response)
titles = hxs.select("//p")
items = []
for titles in titles:
item = CraigslistItem()
item ["title"] = titles.select("a/text()").extract()
item ["link"] = titles.select("a/@href").extract()
items.append(item)
return(items)
spider = MySpider()
craigs_sample.items中有什麼?你是否也可以分享這段代碼片段,以便從craigslist_sample.items導入CraigslistItem'起作用。 –