標題:Scrapy 只抓取一頁。我寫了一個 Scrapy 蜘蛛,想抓取所有分頁,但它只抓取到第二頁就停止了。在 if next_page:
分支中,url 似乎只會變成第二頁的地址,然後就卡在那裏。我想我可能誤解了 HTTP 響應的工作方式,因爲它看起來只跟隨了起始頁面上的「下一頁」鏈接。
import scrapy
from tutorial.items import TriniCarsItem
class TCS(scrapy.Spider):
    """Spider that crawls the TCS featured-cars listing and scrapes each car's detail page.

    `parse` follows every car link on a listing page into `parse_dir_contents`,
    then follows the "next page" link (if present) back into `parse` so all
    listing pages are crawled.
    """

    name = "TCS"
    allowed_domains = ["TCS.com"]
    start_urls = [
        "http://www.TCS.com/database/featuredcarsList.php",
    ]

    def parse(self, response):
        """Yield a request per car-detail link, then one request for the next listing page."""
        for href in response.css("table > tr > td > a::attr('href')"):
            url = response.urljoin(href.extract())
            yield scrapy.Request(url, callback=self.parse_dir_contents)

        # NOTE(review): this selector takes only the first matching link. If the
        # pagination block puts a "previous" link first (or only links the
        # current page), this will revisit the same page and the dupe filter
        # will stop the crawl after page 2 — verify against the page's HTML.
        next_page = response.css("body > table > tr > td > font > b > a::attr('href')")
        if next_page:
            url = response.urljoin(next_page[0].extract())
            print("THIS IS THE URL =----------------------------- " + url)
            yield scrapy.Request(url, self.parse)

    def parse_dir_contents(self, response):
        """Scrape one car-detail page into a TriniCarsItem.

        Fixed: the original instantiated ``TCSItem()``, which is never defined or
        imported (the import at the top of the file is ``TriniCarsItem``), so this
        callback raised NameError and no items were ever produced.
        """
        for sel in response.xpath('//table[@width="543"]/tr/td/table/tr/td[2]/table'):
            item = TriniCarsItem()  # was TCSItem() — undefined name
            item['id'] = sel.xpath('tr[1]/td[1]//text()').extract()
            item['make'] = sel.xpath('tr[3]/td[2]//text()').extract()
            item['model'] = sel.xpath('tr[4]/td[2]//text()').extract()
            item['year'] = sel.xpath('tr[5]/td[2]//text()').extract()
            item['colour'] = sel.xpath('tr[6]/td[2]//text()').extract()
            item['engine_size'] = sel.xpath('tr[7]/td[2]//text()').extract()
            item['mileage'] = sel.xpath('tr[8]/td[2]//text()').extract()
            item['transmission'] = sel.xpath('tr[9]/td[2]//text()').extract()
            item['features'] = sel.xpath('tr[11]/td[2]//text()').extract()
            item['additional_info'] = sel.xpath('tr[12]/td[2]//text()').extract()
            item['contact_name'] = sel.xpath('tr[14]/td[2]//text()').extract()
            item['contact_phone'] = sel.xpath('tr[15]/td[2]//text()').extract()
            item['contact_email'] = sel.xpath('tr[16]/td[2]//text()').extract()
            item['asking_price'] = sel.xpath('tr[17]/td[2]//text()').extract()
            item['date_added'] = sel.xpath('tr[19]/td[2]//text()').extract()
            item['page_views'] = sel.xpath('tr[20]/td[2]//text()').extract()
            yield item
你的 print 語句是否輸出了你期待的 URL? –
它確實,但只有第二頁,它沒有得到第三頁,等等 – Jimbo
是否一遍又一遍地打印相同的URL或只打印一次? –