1

如何將Scrapy響應中的URL傳遞給Selenium,然後將Selenium的響應傳回Scrapy

我有這個Scrapy蜘蛛first.py

# -*- coding: utf-8 -*- 
import scrapy 
import re 
import json 


class FirstSpider(scrapy.Spider):
    """Spider that walks a search-results page and scrapes every linked car page.

    For each car page it yields one item (the page's ``utag_data`` object plus
    the telephone number) and then follows the page's quotation link.
    """

    name = "first"
    allowed_domains = ["someautosite.co.uk"]
    start_urls = (
        'http://www.someautosite.co.uk/some_specific_search_results',
    )

    def parse(self, response):
        """Yield one request per car link found on the search-results page."""
        car_links = response.xpath(
            '//article[contains(@class, "standard")]/div/div[2]/div[1]/h1/a/@href'
        ).extract()
        for car_url in car_links:
            # hrefs may be relative, so resolve them against the page URL.
            yield scrapy.Request(response.urljoin(car_url), callback=self.parse_car)

    def parse_car(self, response):
        """Yield the car item for one detail page, then a request for its quotation link.

        The item is the JSON object assigned to ``var utag_data`` in an inline
        script, extended with a ``tel`` key (``None`` when no telephone node
        matches).  Pages without the ``utag_data`` script are skipped with a
        warning instead of failing with ``IndexError``.
        """
        pattern = re.compile(r"var utag_data = ({.*?});", re.MULTILINE | re.DOTALL)
        matches = response.xpath('//script[contains(.,"var utag")]/text()').re(pattern)
        if not matches:
            self.logger.warning("No utag_data found on %s", response.url)
            return
        utag_data_obj = json.loads(matches[0])

        tel = response.xpath(
            '//article/div[3]/section/div/div[@itemprop="telephone"]/text()'
        ).extract_first()

        # Copy so the parsed object itself is left untouched.
        car_json = utag_data_obj.copy()
        car_json["tel"] = tel
        yield car_json

        quotations_url = response.xpath(
            '/html/body/article/section/ul/li[2]/a/@href'
        ).extract_first()
        # extract_first() returns None when the link is missing, and
        # scrapy.Request rejects a None URL, so only follow a real link.
        if quotations_url:
            yield scrapy.Request(
                response.urljoin(quotations_url),
                callback=self.parse_quotations,
            )

    def parse_quotations(self, response):
        """Hand the insurance quotation page over to the Selenium script.

        NOTE(review): importing ``filldata2`` only executes that module's
        top-level code the first time it is imported in a process, and
        ``response.url`` is never passed to it.
        """
        import filldata2

另外我有一個Selenium模塊filldata2.py,它會打開上面Scrapy蜘蛛的parse_car方法所提取的URL鏈接,並嘗試從中獲取汽車保險報價。

目前這個Selenium模塊的開頭是這樣的:

from selenium import webdriver 

from selenium.webdriver.common.by import By 
from selenium.webdriver.support.ui import WebDriverWait # available since 2.4.0 
from selenium.webdriver.support import expected_conditions as EC # available since 2.26.0 
from selenium.webdriver.common.keys import Keys 
import time 
import six 
from six.moves.configparser import SafeConfigParser 

# Sample data typed into the quotation form further down in this script.
regno = 'AA00AAA'  # vehicle registration number; also used in the output file name
mile = '15000'  # sent to the 'annual-mileage' field
firstname = 'John' 
lastname = 'Smith' 
# Placeholder left by the author: further settings were elided here.
[...] 

def yesno(idul):
    """Return the XPath expression selecting the element whose id is ``idul``."""
    return '//*[@id="{}"]'.format(idul)


def findid(idul):
    """Return the element whose id is ``idul``, using the global ``driver``.

    ``find_element(By.ID, ...)`` replaces ``find_element_by_id``, which was
    removed in Selenium 4.3; the ``By`` form also works on older releases.
    """
    return driver.find_element(By.ID, idul)


def clickyes(idul):
    """Click the ``<span>`` containing "Yes" under the element with id ``idul``.

    Returns the result of ``click()``.  ``find_element(By.XPATH, ...)``
    replaces ``find_element_by_xpath``, which was removed in Selenium 4.3.
    """
    yes_xpath = '//*[@id="{}"]//span[contains(text(), "Yes")]'.format(idul)
    return driver.find_element(By.XPATH, yes_xpath).click()


def clickno(idul):
    """Click the ``<span>`` containing "No" under the element with id ``idul``.

    Returns the result of ``click()``.  ``find_element(By.XPATH, ...)``
    replaces ``find_element_by_xpath``, which was removed in Selenium 4.3.
    """
    no_xpath = '//*[@id="{}"]//span[contains(text(), "No")]'.format(idul)
    return driver.find_element(By.XPATH, no_xpath).click()


def clickspan(idul):
    """Click the ``span[1]`` match under the element with id ``idul``.

    ``find_element(By.XPATH, ...)`` replaces ``find_element_by_xpath``, which
    was removed in Selenium 4.3.
    """
    span_xpath = '//*[@id="{}"]//span[1]'.format(idul)
    driver.find_element(By.XPATH, span_xpath).click()


class DivSelect(object):
    """Click a ``<div>`` option nested under the element with a given id.

    ``divtext`` selects the option: a value of two or more characters that is
    not all digits is matched against the div's text; anything else is used
    as a positional index in the XPath (``div[N]``).
    """

    def __init__(self, idul, divtext):
        self.idul = idul
        self.divtext = divtext
        # XPath of the container, e.g. '//*[@id="steering"]'.
        self.idxpath = '//*[@id="{}"]'.format(self.idul)

    def findid(self):
        """Return, as text, the Selenium call that would look up the container.

        NOTE(review): this returns a string, not an element; kept as-is
        because callers may rely on it.
        """
        return 'driver.find_element_by_id({})'.format(self.idul)

    @property
    def clicky(self):
        """Click the selected option (only works for div-based selectors).

        Implemented as a property, so merely reading ``.clicky`` performs the
        click through the global ``driver``.
        """
        # str() lets an integer index such as 12 be passed without
        # ``int.isdigit`` raising AttributeError.
        label = str(self.divtext)
        if len(label) >= 2 and not label.isdigit():
            # Example: '//div[contains(text(), "Right Hand")]'
            arg = '{}//div[contains(text(), "{}")]'.format(self.idxpath, self.divtext)
        else:
            arg = '{}//div[{}]/label/div'.format(self.idxpath, self.divtext)
            print('driver.find_element_by_xpath("{}").click()'.format(arg))
        # find_element_by_xpath was removed in Selenium 4.3.
        driver.find_element(By.XPATH, arg).click()


def printval(cee, cssid):
    """Read a form field's value, re-enter it into the field and print it.

    Args:
        cee: Label used in the printed message (e.g. ``'Pret'``).
        cssid: id of the form element to read and fill.

    When the field is empty the user is prompted for a value on stdin.
    """
    def getval():
        """Return the field's value, prompting for one if the field is empty."""
        field_xpath = '//*[@id="{}"]'.format(cssid)
        # find_element_by_xpath was removed in Selenium 4.3.
        val = driver.find_element(By.XPATH, field_xpath).get_attribute('value')
        if not val:
            val = input('Care e valoarea masinii:\n')
        # NOTE(review): the click/fill/sleep below run even when the field
        # already had a value, exactly as in the original code.
        driver.find_element(By.XPATH, field_xpath).click()
        fillin(cssid, val)
        time.sleep(2)
        return val

    valoare = getval()
    if not valoare:
        # The original built this message but never printed it.
        print('Nu era nici un {}({}) estimat'.format(cee, cssid))
    elif cee.lower() == 'pret':
        # The price label selects the currency format; the original compared
        # the value itself with 'pret', which never matched.
        print('{} estimat este : £ {} '.format(cee, valoare))
    else:
        print('{} estimat/a/e este : {} '.format(cee, valoare))


def clickbutton(cssid):
    """Click the element whose id is ``cssid``, using the global ``driver``.

    ``find_element(By.XPATH, ...)`` replaces ``find_element_by_xpath``, which
    was removed in Selenium 4.3.
    """
    driver.find_element(By.XPATH, '//*[@id="{}"]'.format(cssid)).click()


def fillin(cssid, var):
    """Type ``var`` into the element whose id is ``str(cssid)``.

    Returns the result of ``send_keys``.  ``find_element(By.ID, ...)``
    replaces ``find_element_by_id``, which was removed in Selenium 4.3.
    """
    return driver.find_element(By.ID, str(cssid)).send_keys(var)


def fillinsugestionbox(cssid, var):
    """Type ``var`` into the element with id ``cssid`` and then press RETURN.

    The element is looked up a second time before RETURN is sent, as in the
    original code.  Returns the result of the second ``send_keys`` call.
    ``find_element(By..., ...)`` replaces the ``find_element_by_*`` helpers,
    which were removed in Selenium 4.3.
    """
    driver.find_element(By.ID, str(cssid)).send_keys(var)
    return driver.find_element(By.XPATH, '//*[@id="{0}"]'.format(cssid)).send_keys(Keys.RETURN)


# 1.1 Answer the 'knows-registration-number' question.
# NOTE(review): ``Yesno`` is not defined in the visible part of this module;
# ``.clickyes`` is read without being called, so it is presumably a property
# that performs the click -- confirm.
knowsRegistrationNumber = Yesno('knows-registration-number').clickyes

# 1.2 Then please enter it here to get started:
# find_element(By.ID, ...) replaces find_element_by_id (removed in Selenium 4.3).
registrationNumber = driver.find_element(By.ID, 'registration-number')
registrationNumber.send_keys(regno)

# 1.3 Find your vehicle find-vehicle-by-reg
findVehicleByReg = driver.find_element(By.ID, 'find-vehicle-by-reg')
findVehicleByReg.click()
time.sleep(1)

# TODO : if no other variants
# 1.3.1 multiple-vehicles-section : a select list with more options
# multipleVehiclesSection = driver.find_element_by_id('multiple-vehicles-section')
# multipleVehiclesSection.click()
#  possible-vehicles : the select list id
try:
    # Wait up to 10 seconds for the select list to be present in the DOM.
    element = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, "possible-vehicles")))
    # find_element(By.ID, ...) replaces find_element_by_id (removed in Selenium 4.3).
    possibleVehicles = driver.find_element(By.ID, 'possible-vehicles')
    possibleVehicles.click()
    print('am asteptat destul')
    dropdown = possibleVehicles.find_elements(By.TAG_NAME, 'option')
    # Index 1 is the option that gets picked, so check that it exists before
    # touching dropdown[1]; the original only checked after using it.
    if len(dropdown) > 1:
        print('Am selectat :\n  {} \n dintre urmatoarele:'.format(dropdown[1].text))
        for option in dropdown[1:]:
            print(option.text)
        dropdown[1].click()
except Exception:
    # Still best-effort, but no longer swallows KeyboardInterrupt/SystemExit
    # the way a bare ``except:`` does.
    print('Elementul possible-vehicles nu e prezent')

time.sleep(2)

# Vehicle details section of the form.
# NOTE(review): ``Yesno`` is not defined in the visible part of this module;
# its attributes are read without being called, so they are presumably
# properties that perform the click (like DivSelect.clicky) -- confirm.

# //*[@id="has-tracker"] Yes/No 
hasTracker = Yesno('has-tracker').clickno 

# //*[@id="imported"] Yes/No 
imported = Yesno('imported').clickno 

# //*[@id="steering"] - 2 Divs 
# Choose from options : 
# Left Hand or # Right Hand 
steering = DivSelect('steering', 'Right Hand').clicky 

# TODO: decide what to do when no price is set; supply one yourself.
# //*[@id="current-value"] - read the value
# driver.find_element_by_xpath('//*[@id="current-value"]') 

# Label 'Pret' (price) with the 'current-value' field.
printval('Pret', 'current-value') 
# print('Pretul estimat este : £ {} '.format(currentValue)) if currentValue else 'Nu era nici un pret estimat' 

# Label 'scaune' (seats) with the 'numberOfSeats-dropdown' field.
printval('scaune', 'numberOfSeats-dropdown') 

# //*[@id="has-modifications"] 
hasModifications = Yesno('has-modifications').clickno 

# Click the Next button.
# //*[@id="vehicle-lookup-next"] 
clickbutton('vehicle-lookup-next') 
time.sleep(1) 
# ============================================
# 2. Vehicle usage       |
# ============================================
# 2.1 When did you buy the car?
# //*[@id="vehicle-usage"]//span[1]
vehicleUsage = Yesno('vehicle-usage').clickspan  # I haven't bought this car yet

# 2.2 What do you use your car for?
# //*[@id="use-of-vehicle"]/ol/li[2]/div[2]/label/div/div[2]
# //*[@id="use-of-vehicle"]//div[2]
useOfVehicle = DivSelect('use-of-vehicle', '2').clicky  # Social, Domestic, Pleasure and Commuting (SDPC)

# 2.3 What would you say your annual personal mileage is?
# //*[@id="annual-mileage"]
# find_element(By.ID, ...) replaces find_element_by_id (removed in Selenium 4.3).
annualMileage = driver.find_element(By.ID, 'annual-mileage')
annualMileage.send_keys(mile)
[...much more...] 
... 
... 

fillin('email', email)
# Main telephone number
# Let the insurance providers answer your queries
# Let us keep you up to date
# //*[@id="communication-options"]/ol/li[2]/div[4]/label/div/div[2]
DivSelect('communication-options', 'Post').clicky
# Please tick this box to confirm you have read and understood our website Terms and Conditions, \
# any assumptions we may have made and Your Rewards Terms and Conditions. \
# If you do not understand any items within this document please contact us.
# //*[@id="contact-details"]/div/ol/li[6]/ol/li[2]/div[2]/label/span
# Yesno('contact-details').clickspan - does not work
# find_element(By.XPATH, ...) replaces find_element_by_xpath (removed in Selenium 4.3).
driver.find_element(By.XPATH, '//*[@id="contact-details"]/div/ol/li[6]/ol/li[2]/div[2]/label/span').click()
# //*[@id="contact-details-next"]
clickbutton('contact-details-next')
driver.implicitly_wait(10)
# Wait for the quotes table to become visible, then for its loading bar to go.
# ``except Exception`` keeps both waits best-effort while letting
# KeyboardInterrupt/SystemExit propagate, which the bare ``except:`` did not.
try:
    element = WebDriverWait(driver, 10).until(EC.visibility_of_element_located((By.ID, "quotes")))
    print('element = ', element)
    try:
        # Wait (up to 60 seconds) for the loading bar to go away.
        element2 = WebDriverWait(driver, 60).until(EC.invisibility_of_element_located((By.XPATH, '//*[@id="quotes-loading-container"]/div/div[1]')))
        print('element2 = ', element2)
    except Exception:
        print('bara de loading inca este activa. butonul more details cu cotatii nu e vizibil')
except Exception:
    print('tabelul cu cotatii nu e vizibil')

# Save the inner HTML of the quotes table to a file named after the
# registration number.
# find_element(By.ID, ...) replaces find_element_by_id (removed in Selenium 4.3).
source_code = driver.find_element(By.ID, 'quotes').get_attribute('innerHTML')
# element.get_attribute('innerHTML')
# ``with`` closes the file even if the write fails.
with open('C:\\Users\\ZZZ\\PycharmProjects\\selenscrapy\\'+str(regno)+'.html', 'wb') as f:
    f.write(source_code.encode('utf-8'))

我知道代碼很凌亂。我是一名Python初學者,正在用這段代碼做試驗:從一個汽車銷售網站抓取一些汽車的信息,然後嘗試到另一個網站爲這些汽車獲取保險報價。外部保險報價網站(全是JavaScript,這就是爲什麼我需要Selenium WebDriver)的鏈接是汽車銷售網站上的重定向鏈接,因爲這兩個網站有合作關係。正如我之前所說,這個報價URL需要由Selenium來解析,我希望把這部分代碼保存在單獨的模塊文件中,甚至可以拆成2個文件:一個是配置文件,一個包含需要執行的操作。

如何把Scrapy的FirstSpider在parse_car()方法中獲得的保險報價URL,通過parse_quotations()方法傳遞給Selenium模塊,並把Selenium腳本的響應(即上面第二個模塊中的source_code)傳回Scrapy?

謝謝!

+0

我也有興趣知道Scrapy的響應是否可以直接傳遞給Selenium WebDriver以繼續抓取。我的情況是,我的蜘蛛有時會碰到一個用戶驗證頁面,需要人工干預來解決CAPTCHA驗證碼問題,這樣蜘蛛才能繼續工作。 – panc

回答

0

與其在first.py中爲quotations_url產生(yield)一個請求,不如直接創建一個Selenium WebDriver,並在WebDriver中繼續抓取:

def parse_car(self, response):
    """Open the car page's quotation link in a Selenium-driven Chrome browser."""
    ...
    quotations_url = response.xpath('/html/body/article/section/ul/li[2]/a/@href').extract_first()
    if not quotations_url:
        # extract_first() returns None when the link is missing; nothing to open.
        return
    # Start to work in a webdriver
    browser = webdriver.Chrome()
    try:
        # urljoin leaves absolute URLs untouched and resolves relative ones.
        browser.get(response.urljoin(quotations_url))
        # ... do whatever you want in the webdriver ...
        # yield your item
    finally:
        # Close the browser so it is not left running after each car page.
        browser.quit()