2017-10-08 71 views
1
from bs4 import BeautifulSoup 
from selenium import webdriver 
#import urllib2 
import time 

# Question's original attempt (Python 2): drive Chrome with Selenium, click the
# collapsible header, then parse the page source with BeautifulSoup.

# Start a Chrome session and maximize its window.
driver = webdriver.Chrome() 
driver.maximize_window() 
# Load the Zillow "recently sold" listing page.
driver.get("https://www.zillow.com/homes/recently_sold/Culver-City-CA/house,condo,apartment_duplex,townhouse_type/20432063_zpid/51617_rid/12m_days/globalrelevanceex_sort/34.048605,-118.340178,33.963223,-118.47785_rect/12_zm/") 
# Fixed 3-second pause after navigation; this is not a wait on any specific element.
time.sleep(3) 
# Click the element with class "collapsible-header".
driver.find_element_by_class_name("collapsible-header").click() 
# The page source is captured right after the click, with no wait in between,
# and parsed with the lxml parser.
soup = BeautifulSoup(driver.page_source,"lxml") 

# Locate the div with id "hdp-price-history", then the table carrying this exact
# class string inside it.
region = soup.find("div",{"id":"hdp-price-history"}) 
# NOTE(review): the question reports that this lookup yields None; presumably the
# table is not yet in the captured source at this point - confirm against the page.
table = region.find('table',{'class':'zsg-table yui3-toggle-content-minimized'}) 
# Python 2 print statement.
print table 

我嘗試從 Zillow 抓取價格/稅務歷史表格,但是我得到的結果是 None。我該如何取得那張表格?(圖片:enter image description here)如何在 Zillow 中抓取價格/稅務歷史記錄表?

回答

1

以下使用 requests 和 BeautifulSoup 來獲取數據,不需要 Selenium 等工具。

from bs4 import BeautifulSoup 
import requests 
import re 

# Fetch the listing page, find the AjaxRender.htm request URLs embedded in its
# HTML, replay one of them, and print the first rows of the returned table.
# Runs on both Python 2 and Python 3.

# Browser-like User-Agent sent with both requests.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:55.0) Gecko/20100101 Firefox/55.0"}
page_url = "https://www.zillow.com/homes/recently_sold/Culver-City-CA/house,condo,apartment_duplex,townhouse_type/20432063_zpid/51617_rid/12m_days/globalrelevanceex_sort/34.048605,-118.340178,33.963223,-118.47785_rect/12_zm/"

# A timeout keeps the script from hanging forever on a stalled connection.
r = requests.get(page_url, headers=headers, timeout=30)

# Use the decoded text rather than the raw bytes: a str pattern cannot be
# applied to bytes on Python 3.
urls = re.findall(re.escape('AjaxRender.htm?') + '(.*?)"', r.text)

# The script relies on the entry at index 4; fail with a clear message if the
# page did not contain that many matches.
if len(urls) < 5:
    raise IndexError(
        "expected at least 5 AjaxRender.htm URLs in the page, found {}".format(len(urls))
    )

url = "https://www.zillow.com/AjaxRender.htm?{}".format(urls[4])
r = requests.get(url, headers=headers, timeout=30)

# Backslashes are stripped from the response before it is parsed as HTML.
soup = BeautifulSoup(r.text.replace('\\', ''), "html.parser")

# One list of cell texts per <tr>; rows without <td> cells become empty lists.
data = [[td.text for td in tr.find_all('td')] for tr in soup.find_all('tr')]

for row in data[:5]:  # Show first 5 entries
    print(row)

這顯示前5項是:

[u'06/16/17', u'Sold', u'$940,000-0.9%', u'K. Miller, A. Masket', u''] 
[u'06/14/17', u'Price change', u'$949,000-1.0%', u'', u''] 
[u'05/08/17', u'Pending sale', u'$959,000', u'', u''] 
[u'04/17/17', u'Price change', u'$959,000+1.1%', u'', u''] 
[u'02/27/17', u'Pending sale', u'$949,000', u'', u''] 

所需的 HTML 不存在於第一次 GET 請求的回應中,而是在 Price/Tax History 部分被展開時才按需生成。展開時會在瀏覽器中觸發一個 AJAX 請求。這段代碼在最初的 HTML 中搜索所有這類請求的 URL,併發出相同的請求。代碼使用其中索引爲 4 的請求(即 `urls[4]`)來獲取所需的部分。返回的 HTML 需要先刪除其中的反斜線 `\`,然後才可以傳遞給 BeautifulSoup 作爲表格進行解析。

+0

你幫了我很多!非常感謝 ! –

+0

很高興能幫到你!不要忘記點擊上/下投票按鈕下方的灰色勾號,將一個答案選爲已接受的解決方案。 –

1

您不需要使用 BeautifulSoup。您可以用下面的代碼得到所需的表:

from selenium.webdriver.common.by import By 
from selenium.webdriver.support.ui import WebDriverWait as wait 
from selenium.webdriver.support import expected_conditions as EC 

# Open the listing in Chrome, expand the collapsible section, and print the text
# of the price-history table once it is present in the DOM.

# webdriver.Chrome() needs this import; this snippet's other imports do not provide it.
from selenium import webdriver

driver = webdriver.Chrome()
try:
    driver.maximize_window()
    driver.get("https://www.zillow.com/homes/recently_sold/Culver-City-CA/house,condo,apartment_duplex,townhouse_type/20432063_zpid/51617_rid/12m_days/globalrelevanceex_sort/34.048605,-118.340178,33.963223,-118.47785_rect/12_zm/")
    # Wait up to 10 s for the header to become clickable, then click it.
    wait(driver, 10).until(EC.element_to_be_clickable((By.CLASS_NAME, "collapsible-header"))).click()
    # Wait up to 10 s for the table to be present in the DOM before reading it.
    table = wait(driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, "div#hdp-price-history table.zsg-table.yui3-toggle-content-minimized")))
    print(table.text)
finally:
    # Always close the browser, including when a wait times out.
    driver.quit()

所需的表是動態(dynamically)產生的,所以你需要等待一段時間,直到它出現在 DOM 中。這就是爲什麼你點擊後無法立即在頁面源中找到該表的原因。

+0

它可以正常運作,非常感謝! –