Need help crawling AJAX pagination with Python and Scrapy

I am crawling a website with Python and Scrapy that has AJAX pagination. I can crawl the first page, but after the second page is loaded via AJAX I cannot get the links to the other pages. So please guide me on how to get the links of the AJAX-loaded pages. My spider looks like this:
class SitenameSpider(CrawlSpider):
    start_urls = []
    rules = (
        Rule(SgmlLinkExtractor(allow=('/trends/', '/tones/')), callback='parse_item'),
    )
    def parse_item(self, response):
        print('Hi, crawling this page! %s' % response.url)
        extract_tuple_list = site_product_crawl.parse_product_page('site url')
        items = []
        for extract_tuple in extract_tuple_list:
            item = SitenameItem()
            item['site_id'] = extract_tuple[0]
            item['name'] = extract_tuple[1]
            item['price'] = extract_tuple[2]
            item['rating'] = extract_tuple[3]
            item['num_reviews'] = extract_tuple[4]
            item['category'] = cat_code
            item['url'] = response.url
            item['date'] = date_created
            item['description'] = extract_tuple[6]
            items.append(item)
        return items
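A minimal sketch of the direction I am asking about, assuming the later pages come from an XHR endpoint that I could copy out of the browser's network tab (the /ajax/product_list path, the page parameter, the example.com start URL and the page count are all made-up placeholders, not the site's real URLs):

from scrapy.contrib.spiders import CrawlSpider
from scrapy.http import Request

class SitenameAjaxSpider(CrawlSpider):
    name = 'sitename_ajax'
    # only page 1 of the listing is present in the plain HTML
    start_urls = ['http://www.example.com/trends/']

    def parse_start_url(self, response):
        # The pagination links never show up in response.body because they are
        # inserted by JavaScript, so request the AJAX endpoint for each page
        # number directly instead of extracting links from the page.
        for page in range(1, 11):  # upper bound is a placeholder
            url = 'http://www.example.com/ajax/product_list?page=%d' % page
            yield Request(url, callback=self.parse_item)

    def parse_item(self, response):
        # response.body here is whatever the AJAX call returns for that page
        # (an HTML fragment or JSON); product links/items get extracted here.
        print('Crawled AJAX page %s' % response.url)

Is this the right way to reach the pages that only exist after the AJAX call, or is there a better way to get those links?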
And here is the product page parser (the site_product_crawl module used above):

from bs4 import BeautifulSoup as bsoup
import requests
import pprint
import re
def return_html(url):
    try:
        return requests.get(url).text
    except Exception as e:
        print e
        return None
def parse_product_page(prod_url):
    # print prod_url
    soup = bsoup(return_html(prod_url))
    tuple_list = []
    avg_rating = None
    num_reviews = None
    prod_category = None
    prod_name = None
    prod_price = None
    prod_number = None
prod_price = '0' # the price is not available on site so it was put 0
#num_rev_div = soup.find('a', {'class' : 'bv-rating-label bv-text-link bv-focusable', 'href' : 'javascript:void(0)'})
url_split_prod_number = prod_url.split('://')
prod_number = url_split_prod_number[1].split('/')[1] + '_' + url_split_prod_number[1].split('/')[2].strip().encode('utf-8');
print prod_number
prod_description = soup.find('div', {'class' : 'articleText'}).get_text().strip().replace('<br/>','').encode('utf-8')
print prod_description
prod_name_div = soup.find('div', id = 'titleSection')
prod_name = prod_name_div.h2.get_text().strip().encode('utf-8');
print prod_name
num_reviews = soup.find('span',itemprop='votes').get_text().strip().encode('utf-8').replace(',','');
avg_rating = soup.find('span',{'class' :'featuredstatbox'}).find('span',itemprop='rating').get_text().strip().encode('utf-8') #get_text().strip().encode('utf-8').replace(',','');
#print price_text
#if price_text != None:
#prod_price = price_text.get_text().strip().encode('utf-8').replace('$','').replace(',','').split('-')[0].strip()
#print prod_price
tuple = (prod_number,
prod_name.strip().encode('utf-8'),
prod_price,
avg_rating,
num_reviews,
prod_category,
prod_description.replace('\n','').replace("'","''"))
tuple_list.append(tuple)
pprint.pprint(tuple_list)
return tuple_list
def main():
    parse_product_page('site url')
if __name__ == '__main__':
    main()
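Side note on my own parser above, separate from the pagination question: since parse_item already has the downloaded page in response.body, I could pass that HTML straight into the parser instead of downloading the URL a second time with requests. A rough sketch (parse_product_page_from_html is a name I made up; only the titleSection lookup from above is shown):

from bs4 import BeautifulSoup as bsoup

def parse_product_page_from_html(prod_url, html):
    # Build the soup from HTML that Scrapy already downloaded, so the product
    # page is not fetched again with requests.get.
    soup = bsoup(html)
    prod_name_div = soup.find('div', id='titleSection')
    prod_name = prod_name_div.h2.get_text().strip().encode('utf-8')
    # ...same idea for price, rating, reviews and description as above...
    return [(prod_url, prod_name)]

In parse_item the call would then become site_product_crawl.parse_product_page_from_html(response.url, response.body).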