2
我是使用Scrapy的新手,並嘗試使用Xpath獲取頁面上所有列表的URL。在Scrapy中使用XPath
第一XPath的工作原理
sel.xpath('//[contains(@class, "attraction_element")]')
但第二XPath是給了一個錯誤
get_parsed_string(snode_attraction, '//[@class="property_title"]/a/@href')
什麼是錯的,我們如何解決這個問題?
Scrapy代碼
def clean_parsed_string(string):
if len(string) > 0:
ascii_string = string
if is_ascii(ascii_string) == False:
ascii_string = unicodedata.normalize('NFKD', ascii_string).encode('ascii', 'ignore')
return str(ascii_string)
else:
return None
def get_parsed_string(selector, xpath):
return_string = ''
extracted_list = selector.xpath(xpath).extract()
if len(extracted_list) > 0:
raw_string = extracted_list[0].strip()
if raw_string is not None:
return_string = htmlparser.unescape(raw_string)
return return_string
class TripAdvisorSpider(Spider):
name = 'tripadvisor'
allowed_domains = ["tripadvisor.com"]
base_uri = "http://www.tripadvisor.com"
start_urls = [
base_uri + '/Attractions-g155032-Activities-c47-t163-Montreal_Quebec.html'
]
# Entry point for BaseSpider
def parse(self, response):
tripadvisor_items = []
sel = Selector(response)
snode_attractions = sel.xpath('//[contains(@class, "attraction_element")]')
# Build item index
for snode_attraction in snode_attractions:
print clean_parsed_string(get_parsed_string(snode_attraction, '//[@class="property_title"]/a/@href'))