2012-05-18 40 views
6

我正在使用scrapy框架抓取一個網站,並且無法單擊JavaScript鏈接打開另一個頁面。(問題:如何在Python中使用scrapy執行JavaScript以提交表單)

我可以在頁面上識別碼爲:

<a class="Page" alt="Click to view job description" title="Click to view job description" href="javascript:sysSubmitForm('frmSR1');">Accountant&nbsp;</a> 

有沒有人可以建議我如何在scrapy中執行JavaScript並打開另一個頁面,以便我可以從該頁面獲取數據。

在此先感謝

回答

8

請參考以下關於如何將Selenium與scrapy結合使用的代碼片段。由於您不只是下載HTML,爬取會更慢,但您將可以完全訪問DOM。

注意:由於之前提供的鏈接不再有效,因此我已複製粘貼此代碼段。

# Snippet imported from snippets.scrapy.org (which no longer works) 

import time

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.http import Request

from selenium import selenium

class SeleniumSpider(CrawlSpider): 
    name = "SeleniumSpider" 
    start_urls = ["http://www.domain.com"] 

    rules = (
     Rule(SgmlLinkExtractor(allow=('\.html',)), 
     callback='parse_page',follow=True), 
    ) 

    def __init__(self): 
     CrawlSpider.__init__(self) 
     self.verificationErrors = [] 
     self.selenium = selenium("localhost", 4444, "*chrome", "http://www.domain.com") 
     self.selenium.start() 

    def __del__(self): 
     self.selenium.stop() 
     print self.verificationErrors 
     CrawlSpider.__del__(self) 

    def parse_page(self, response): 
     item = Item() 

     hxs = HtmlXPathSelector(response) 
     #Do some XPath selection with Scrapy 
     hxs.select('//div').extract() 

     sel = self.selenium 
     sel.open(response.url) 

     #Wait for javscript to load in Selenium 
     time.sleep(2.5) 

     #Do some crawling of javascript created content with Selenium 
     sel.get_text("//div") 
     yield item 
+0

這兩個鏈接都沒有幫助了,這就是爲什麼stackoverflow會要求您至少在這裏總結網頁。你可以多說一些,或找到最初的答案?謝謝! – nealmcb

3

據我所知,scrapy爬蟲是基於urllib2和urllib實現的,顯然無法執行JS。爲了使用JS,您可以使用Qt WebKit或Selenium。或者您可以在頁面上找到所有ajax鏈接,查看與服務器的數據交換是如何實現的,然後直接向服務器的API發送請求。

+0

@Dennis:感謝您的回答 –

7

如果你想看一個同時使用scrapy和Selenium、相當龐大且實用的代碼庫,請查看https://github.com/nicodjimenez/bus_catchers。下面是一個更簡單的例子。

# stripped down BoltBus script 
from selenium import webdriver 
from selenium.common.exceptions import TimeoutException 
from selenium.webdriver.support.ui import WebDriverWait 
from selenium.webdriver.support import expected_conditions as EC 
from selenium.webdriver.common.keys import Keys 
from scrapy.selector import HtmlXPathSelector 
from scrapy.http import Response 
from scrapy.http import TextResponse 
import time 

# set dates, origin, destination 
cityOrigin="Baltimore" 
cityDeparture="New York" 
day_array=[0] 
browser = webdriver.Firefox() 

# we are going the day of the days of the month from 15,16,...,25 
# there is a discrepancy between the index of the calendar days and the day itself: for example day[10] may correspond to Feb 7th 
for day in day_array: 

    # Create a new instance of the Firefox driver 
    browser.get("http://www.boltbus.com") 

    # click on "region" tab 
    elem_0=browser.find_element_by_id("ctl00_cphM_forwardRouteUC_lstRegion_textBox") 
    elem_0.click() 
    time.sleep(5) 

    # select Northeast 
    elem_1=browser.find_element_by_partial_link_text("Northeast") 
    elem_1.click() 
    time.sleep(5) 

    # click on origin city 
    elem_2=browser.find_element_by_id("ctl00_cphM_forwardRouteUC_lstOrigin_textBox") 
    elem_2.click() 
    time.sleep(5) 

    # select origin city 
    elem_3=browser.find_element_by_partial_link_text(cityOrigin) 
    elem_3.click() 
    time.sleep(5) 

    # click on destination city 
    elem_4=browser.find_element_by_id("ctl00_cphM_forwardRouteUC_lstDestination_textBox") 
    elem_4.click() 
    time.sleep(5) 

    # select destination city 
    elem_5=browser.find_element_by_partial_link_text(cityDeparture) 
    elem_5.click() 
    time.sleep(5) 

    # click on travel date 
    travel_date_elem=browser.find_element_by_id("ctl00_cphM_forwardRouteUC_imageE") 
    travel_date_elem.click()  

    # gets day rows of table 
    date_rows=browser.find_elements_by_class_name("daysrow") 

    # select actual day (use variable day) 
    # NOTE: you must make sure these day elements are "clickable" 
    days=date_rows[0].find_elements_by_xpath("..//td") 
    days[day].click() 
    time.sleep(3) 

    # retrieve actual departure date from browser 
    depart_date_elem=browser.find_element_by_id("ctl00_cphM_forwardRouteUC_txtDepartureDate") 
    depart_date=str(depart_date_elem.get_attribute("value")) 

    # PARSE TABLE 

    # convert html to "nice format" 
    text_html=browser.page_source.encode('utf-8') 
    html_str=str(text_html) 

    # this is a hack that initiates a "TextResponse" object (taken from the Scrapy module) 
    resp_for_scrapy=TextResponse('none',200,{},html_str,[],None) 

    # takes a "TextResponse" object and feeds it to a scrapy function which will convert the raw HTML to a XPath document tree 
    hxs=HtmlXPathSelector(resp_for_scrapy) 

    # the | sign means "or" 
    table_rows=hxs.select('//tr[@class="fareviewrow"] | //tr[@class="fareviewaltrow"]') 
    row_ct=len(table_rows) 

    for x in xrange(row_ct): 

     cur_node_elements=table_rows[x] 
     travel_price=cur_node_elements.select('.//td[@class="faresColumn0"]/text()').re("\d{1,3}\.\d\d") 

     # I use a mixture of xpath selectors to get me to the right location in the document, and regular expressions to get the exact data 

     # actual digits of time 
     depart_time_num=cur_node_elements.select('.//td[@class="faresColumn1"]/text()').re("\d{1,2}\:\d\d") 

     # AM or PM (time signature) 
     depart_time_sig=cur_node_elements.select('.//td[@class="faresColumn1"]/text()').re("[AP][M]") 

     # actual digits of time 
     arrive_time_num=cur_node_elements.select('.//td[@class="faresColumn2"]/text()').re("\d{1,2}\:\d\d") 

     # AM or PM (time signature) 
     arrive_time_sig=cur_node_elements.select('.//td[@class="faresColumn2"]/text()').re("[AP][M]") 

     print "Depart date: " + depart_date 
     print "Depart time: " + depart_time_num[0] + " " + depart_time_sig[0] 
     print "Arrive time: " + arrive_time_num[0] + " " + arrive_time_sig[0] 
     print "Cost: " + "$" + travel_price[0] 
     print "\n" 
+0

嘿@nicodjimenez,感謝您的代碼。我明白了,除了選擇日期之外。當你說「我們要從15,16,...,25的月份那天開始」時,我不明白。此外,你注意到:「你必須確保這些日子元素是」可點擊的「。」你能詳細解釋一下嗎? – cd98