
I am scraping an ASP website and cannot get past page 10. The site is:

http://isystems.hpcsa.co.za/iregister/RegisterSearch.aspx

My code reads the page-number links and iterates through them, but it fails when it tries to go past page 10, because the pager shows three dots (...) which, when clicked in a browser, load page 11 (and similarly after page 20, page 30, and so on). How can I update my code so that it handles this paging without breaking?
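In the browser, the fallback I think I need looks roughly like this (a rough sketch; `click_page` is just an illustrative name, not part of my code):

    from selenium.common.exceptions import NoSuchElementException

    def click_page(driver, pageno):
        """Click the numbered link for `pageno`; if it is not in the current
        block of ten, fall back to the trailing '...' link that loads the
        next block. Returns False when there is nothing left to click."""
        try:
            driver.find_element_by_link_text(str(pageno)).click()
            return True
        except NoSuchElementException:
            dots = driver.find_elements_by_link_text('...')
            if dots:
                dots[-1].click()  # the last '...' advances to the next block of pages
                return True
            return False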

The code I am using is:

import re
import string
import urlparse

from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import NoSuchElementException
from bs4 import BeautifulSoup

class DoctorScraper(object):
    def __init__(self):
        self.url = "http://isystems.hpcsa.co.za/iregister/RegisterSearch.aspx"
        self.driver = webdriver.PhantomJS()
        self.driver.set_window_size(1120, 550)

    def scrape(self):
        self.driver.get(self.url)

        # choose to search using the region
        try:
            self.driver.find_element_by_id('SearchChkb_5').click()
        except NoSuchElementException:
            pass

        # get the provinces that are available
        select = Select(self.driver.find_element_by_id('ddlProvince'))
        option_indexes = range(1, len(select.options))

        # iterate through the provinces
        for index in option_indexes[:3]:
            select.select_by_index(index)
            # click the search button
            self.driver.find_element_by_id('cmdSearch').click()

            pageno = 2

            while True:
                # create a BeautifulSoup of the page source code
                s = BeautifulSoup(self.driver.page_source)
                # get all links that match viewing a practitioner profile
                r1 = re.compile(r'^PractitionerView\.aspx\?FILENO=([A-Z0-9-]+)$')
                # create a dictionary of the attributes
                x = {'href': r1}

                # in the page source, find all links that have the attributes stated in x
                for a in s.findAll('a', attrs=x):
                    print 'View Doctor URL: ', urlparse.urljoin(self.driver.current_url, a['href'])
                    print

                # Pagination
                try:
                    next_page_elem = self.driver.find_element_by_xpath("//a[text()='%d']" % pageno)
                    print "Next page: ", next_page_elem
                except NoSuchElementException:
                    break  # no more pages

                print 'page ', pageno, '\n'
                next_page_elem.click()

                pageno += 1

        self.driver.quit()

if __name__ == '__main__':
    scraper = DoctorScraper()
    scraper.scrape()

This is the error I get:

StaleElementReferenceException: {"errorMessage":"Element does not exist in cache","request":{"headers":{"Accept":"application/json","Accept-Encoding":"identity","Connection":"close","Content-Length":"121","Content-Type":"application/json;charset=UTF-8","Host":"127.0.0.1:63135","User-Agent":"Python http auth"},"httpVersion":"1.1","method":"POST","post":"{\"using\": \"tag name\", \"sessionId\": \"ef6d0590-a2d6-11e7-91fa-5773b3326267\", \"id\": \":wdc:1506442969197\", \"value\": \"option\"}","url":"/elements","urlParsed":{"anchor":"","query":"","file":"elements","directory":"/","path":"/elements","relative":"/elements","port":"","host":"","password":"","user":"","userInfo":"","authority":"","protocol":"","source":"/elements","queryKey":{},"chunks":["elements"]},"urlOriginal":"/session/ef6d0590-a2d6-11e7-91fa-5773b3326267/element/:wdc:1506442969197/elements"}} 
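For context, a StaleElementReferenceException means WebDriver is holding a reference to an element that no longer exists in the DOM. Every click on this site fires an ASP.NET postback that rebuilds the page, so the `Select` created once before the province loop (and the cached options the error log shows Selenium re-querying) goes stale on the second iteration. A minimal sketch of that part of the fix is to re-locate the dropdown on every pass instead of reusing it:

    # Sketch: re-locate the dropdown on each iteration instead of caching it.
    # Each postback replaces the DOM, so a Select created before the loop
    # goes stale as soon as the first search/pagination click lands.
    for index in option_indexes[:3]:
        select = Select(self.driver.find_element_by_id('ddlProvince'))  # fresh lookup
        select.select_by_index(index)
        self.driver.find_element_by_id('cmdSearch').click()
        # ... paginate as before ...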

Answer


The main problem with this site is that the element to be clicked often scrolls out of view, which throws an element not clickable error. However, I've fixed it. If you have ChromeDriver installed on your machine, just run this and watch the magic. It will traverse all the pages flawlessly, no matter how many there are. I've checked.

from selenium import webdriver
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

main_link = 'http://isystems.hpcsa.co.za/iregister/RegisterSearch.aspx'

def get_content(driver, wait, link):
    driver.get(link)
    # search by region and pick a province
    driver.find_element_by_id('SearchChkb_5').click()
    select = Select(driver.find_element_by_id('ddlProvince'))
    select.select_by_visible_text('WESTERN CAPE')
    # scroll down so the search button is in view before clicking it
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    elem = wait.until(EC.visibility_of_element_located((By.ID, 'cmdSearch')))
    elem.click()
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    page_counter = 2
    while True:
        try:
            if page_counter % 10 != 1:
                # pages 2-10, 12-20, ...: the numbered link is visible
                driver.find_element_by_link_text(str(page_counter)).click()
                page_counter += 1
            else:
                # pages 11, 21, 31, ...: the trailing '...' loads the next block
                driver.find_elements_by_link_text("...")[-1].click()
                time.sleep(2)
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                page_counter += 1
        except NoSuchElementException:
            break

if __name__ == '__main__':
    driver = webdriver.Chrome()
    wait = WebDriverWait(driver, 10)
    try:
        get_content(driver, wait, main_link)
    finally:
        driver.close()
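Note that this only drives the pagination; to also collect the practitioner links on each page, as in the question, something along these lines (an untested sketch reusing the question's regex) could be called inside the `while` loop:

    import re
    import urlparse  # Python 2; on Python 3 use urllib.parse

    from bs4 import BeautifulSoup

    def collect_doctor_links(driver):
        """Return absolute PractitionerView URLs from the current results page."""
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        pattern = re.compile(r'^PractitionerView\.aspx\?FILENO=([A-Z0-9-]+)$')
        return [urlparse.urljoin(driver.current_url, a['href'])
                for a in soup.find_all('a', href=pattern)]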

And the same thing using a class:

from selenium import webdriver
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

class DoctorScraper(object):
    def __init__(self):
        self.url = "http://isystems.hpcsa.co.za/iregister/RegisterSearch.aspx"
        self.driver = webdriver.Chrome()
        self.wait = WebDriverWait(self.driver, 10)

    def __del__(self):
        self.driver.close()

    def controlling_pagination(self):
        self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        page_counter = 2
        while True:
            try:
                if page_counter % 10 != 1:
                    # the numbered link is visible within the current block of ten
                    self.driver.find_element_by_link_text(str(page_counter)).click()
                    page_counter += 1
                else:
                    # every tenth page: the trailing '...' loads the next block
                    self.driver.find_elements_by_link_text("...")[-1].click()
                    time.sleep(2)
                    self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                    page_counter += 1
            except NoSuchElementException:
                break

    def get_content(self):
        self.driver.get(self.url)
        self.driver.find_element_by_id('SearchChkb_5').click()
        select = Select(self.driver.find_element_by_id('ddlProvince'))
        select.select_by_visible_text('WESTERN CAPE')
        self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        elem = self.wait.until(EC.visibility_of_element_located((By.ID, 'cmdSearch')))
        elem.click()
        self.controlling_pagination()

if __name__ == '__main__':
    scraper = DoctorScraper()
    scraper.get_content()
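As a design note, relying on `__del__` for cleanup is fragile, since it may run late or not at all; if the `__del__` method is dropped, an explicit `try/finally` (a sketch) is more predictable:

    if __name__ == '__main__':
        scraper = DoctorScraper()
        try:
            scraper.get_content()
        finally:
            # quit() ends both the browser session and the chromedriver process
            scraper.driver.quit()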

By the way, look at the image below: you can see the pagination change at the bottom of the page:

[screenshot: the pagination links at the bottom of the search results page]

Thank you Shahil. It handles page 10 perfectly, but it gets stuck at page 20 and gives an error similar to the one in my question. Any idea why? –

In my case it went through 100 pages without any error. Check it again, though; it may have been some technical difficulty. Thanks. – SIM

You're right. There is some error, but I don't know where it's coming from. It stops at a different page each time; right now it's stopping at page 12. –
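One possible cause of the intermittent stops: `find_elements_by_link_text("...")` returns a plain list, so the bare `[-1]` raises an `IndexError` (which the `except NoSuchElementException` clause does not catch) whenever the trailing '...' link is absent, and clicking too soon after a postback can also hit a stale element. A hardened version of the loop (a sketch, assuming the same `driver` and `wait` as in the answer) might look like:

    from selenium.common.exceptions import (NoSuchElementException,
                                            StaleElementReferenceException)
    from selenium.webdriver.support import expected_conditions as EC

    page_counter = 2
    while True:
        try:
            if page_counter % 10 != 1:
                link = driver.find_element_by_link_text(str(page_counter))
            else:
                dots = driver.find_elements_by_link_text("...")
                if not dots:  # no further block of pages: we are done
                    break
                link = dots[-1]
            driver.execute_script("arguments[0].scrollIntoView();", link)
            link.click()
            wait.until(EC.staleness_of(link))  # let the postback replace the page
            page_counter += 1
        except (NoSuchElementException, StaleElementReferenceException):
            break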
