
Scraping multiple web pages with BeautifulSoup when the first page link is different from the others

I'm trying to scrape this page for a project I'm working on. I want to pull the details of every car (price, mileage, transmission and age) from all of the pages. The problems I'm running into with the code below are:

  1. The link for the first page is different from the others (it has no page number, i.e. no &page=1; see the sketch after the code below)
  2. The price of the car is not inside the details you get after clicking into each advert

I was wondering if anyone would be willing to take a look at this and offer suggestions. Thanks.

from bs4 import BeautifulSoup
import requests
import urllib.parse
import csv

# the Toyota Camry model page is used
url = "https://www.olx.com.ng/vehicles/cars/toyota/?search%5Bfilter_enum_model%5D%5B0%5D=toyota%2Fcamry"
r = requests.get(url)
data = r.text

soup = BeautifulSoup(data, "html.parser")
carLinks = set()
pageLinks = set()
data_set = []

parsed = urllib.parse.urlparse(soup.select('a')[0].get('href'))
nbPage = urllib.parse.parse_qs(parsed.query)['page'][1]
print("There are " + str(nbPage) + " web pages to process")

# for each web page that contains a grid of car offers
for i in range(1, int(nbPage), 1):

    print("Processing web page: " + str(i))

    # each car offer link is saved into the carLinks
    for link in soup.select('#listContainer > div > section > div > tr > a'):
        carLinks.add(link.get('href').replace("//", "http://"))

    # the next url page is set
    url = "https://www.olx.com.ng/vehicles/cars/toyota/?search%5Bfilter_enum_model%5D%5B0%5D=toyota%2Fcamry&page=" + str(i)
    r = requests.get(url)
    data = r.text
    soup = BeautifulSoup(data, "html.parser")

# for each car link
for carLink in carLinks:

    print("Processing car page: " + carLink)

    # we load the car page
    r = requests.get(carLink)
    data = r.text
    soup = BeautifulSoup(data, "html.parser")
    km = 0
    transmission = ""
    age = 0
    price = 0

    # for each attribute of the car
    for info in soup.select("table.item tr div.pricelabel"):

        # we keep the ones that we need
        if info.select('.item')[0].text == u'Mileage':
            km = int(info.select('.value')[0].text.replace(" ", "").replace("KM", ""))
        if info.select('.item')[0].text == u'Transmission':
            transmission = info.select('.value')[0].text
        if info.select('.item')[0].text == u'Year':
            age = 2017 - int(info.select('.value')[0].text)
        if info.select('.pricelabel')[0].text == u'Price':
            price = int(info.select('.pricelabel')[0].text.replace(" ", "").replace(u"₦", ""))

    # each car is an array of four features added to the data_set
    data_set.append([km, transmission, age, price])

# the data_set is saved into the CSV file
fl = open('car_features.csv', 'w')
writer = csv.writer(fl)
writer.writerow(['km', 'transmission', 'age', 'price'])
for values in data_set:
    writer.writerow(values)

fl.close()
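
For context, point 1 above amounts to this: page 1 is the bare listing URL, while pages 2 and up append &page=N, so the page URL has to be built conditionally. A minimal sketch of that (build_page_url is a hypothetical helper, not part of the code above):

import urllib.parse

BASE = "https://www.olx.com.ng/vehicles/cars/toyota/"
QUERY = {"search[filter_enum_model][0]": "toyota/camry"}

def build_page_url(page: int) -> str:
    """Hypothetical helper: page 1 has no page parameter, later pages add &page=N."""
    params = dict(QUERY)
    if page > 1:
        params["page"] = page
    return BASE + "?" + urllib.parse.urlencode(params)

print(build_page_url(1))  # ...?search%5Bfilter_enum_model%5D%5B0%5D=toyota%2Fcamry
print(build_page_url(2))  # same query string plus &page=2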

Answer


The site is badly broken: if you keep going to the next page, you eventually hit an endless loading screen in the browser. For me, page 501 brings you back to 500 if I paste the URL directly, and if I use the next link from 500 I can see 501 but get a never-ending loading loop, so we use the redirect back to a previous page to terminate our loop.
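
In isolation, the termination idea is: requests follows redirects, so if fetching the next-page link lands you back on the URL you just scraped, response.url equals the previous URL and you can stop. A minimal standalone sketch of just that idea (find_next_link is a hypothetical stand-in for the a.pageNextPrev lookup in the full code below):

import requests
from lxml import html

def find_next_link(resp: requests.Response) -> str:
    """Hypothetical stand-in: pull the last a.pageNextPrev href from the page."""
    node = html.fromstring(resp.content)
    return node.cssselect("a.pageNextPrev")[-1].get("href")

def follow_pages(start_url: str):
    """Yield one response per listing page; stop when the next link
    redirects back to the page we just scraped."""
    with requests.Session() as s:
        resp = s.get(start_url)
        while True:
            yield resp
            previous_url = resp.url
            resp = s.get(find_next_link(resp))  # requests follows the redirect
            if resp.url == previous_url:  # bounced back, so that was the last page
                break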

I also used lxml.html and cssselect. You can use bs4 if you prefer, the logic is the same, but I would strongly recommend lxml; the dependencies are outlined here, and you will also need to pip install cssselect.

import requests
from lxml import html
from typing import Iterator

url = "https://www.olx.com.ng/vehicles/cars/toyota/?search%5Bfilter_enum_model%5D%5B0%5D=toyota%2Fcamry"


def parse_data(node_: html.Element, price: str) -> dict:
    """Parses the details section per individual car details page and returns a dict."""

    # Price we pulled from the main page.
    details = {"price": price.strip("₦ ")}

    # Details are in a table.
    details_table = node_.cssselect("table.details")[0]

    # The th has the description, the anchor has the value.
    # We lower-case and join the description, i.e. "Type of car" -> type_of_car.
    data = iter(details_table.cssselect("tr th, .value"))
    details.update(("_".join(th.text.lower().split()), "".join(td.xpath(".//text()")).strip())
                   for th, td in zip(data, data))
    return details


def get_link_and_price(s: requests.Session, node_: html.Element) -> Iterator[dict]:
    """Gets the link and the associated price from each tr."""

    for child in node_.cssselect("table.offers td.offer"):
        link, price = child.cssselect("a.link")[0].get("href"), child.cssselect(".price strong")[0].text

        yield parse_data(html.fromstring(s.get(link).content), price)


def start_request(url: str):
    with requests.Session() as s:
        get_ = s.get(url)
        node = html.fromstring(get_.content)

        # yield from subsequent iterators, i.e. a dict of details.
        yield from get_link_and_price(s, node)

        # The site is broken: you click the next page button,
        # and eventually you get stuck in a loading loop.
        # At some stage the next link should disappear,
        # or only go back, but it is wrongly implemented.
        # This will stop when we try a next page
        # and end up back at the current url, i.e. ?page=501 -> ?page=500.
        current_url = get_.url
        next_page = node.cssselect("a.pageNextPrev")[-1].get("href")
        get_next = s.get(next_page)
        node = html.fromstring(get_next.content)

        # Keep going through pages till our break condition is met.
        while current_url != get_next.url:
            node = html.fromstring(get_next.content)
            yield from get_link_and_price(s, node)
            current_url = get_next.url
            next_page = node.cssselect("a.pageNextPrev")[-1].get("href")
            get_next = s.get(next_page)


for dict_ in start_request(url):
    print(dict_)
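
One detail worth calling out in parse_data: data = iter(...) followed by zip(data, data) pairs consecutive nodes, because both arguments to zip advance the same iterator, so each th lines up with the .value node that follows it. For example, it = iter([1, 2, 3, 4]); list(zip(it, it)) gives [(1, 2), (3, 4)].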

A snippet of the output:

{'price': '3,300,000', 'offer_from': 'Individual', 'year': '2008', 'transmission': 'Automatic', 'model': 'Camry', 'type_of_car': '4 door', 'mileage': '65000'} 
{'price': '3,000,000', 'offer_from': 'Individual', 'year': '2007', 'transmission': 'Automatic', 'model': 'Camry', 'type_of_car': '4 door', 'mileage': '86500'} 
{'price': '4,200,000', 'offer_from': 'Individual', 'year': '2013', 'transmission': 'Automatic', 'model': 'Camry', 'type_of_car': '4 door', 'mileage': '39011'} 
{'price': '4,500,000', 'offer_from': 'Business', 'year': '2010', 'transmission': 'Automatic', 'model': 'Camry', 'type_of_car': '4 door', 'mileage': '93000'} 
{'price': '890,000', 'offer_from': 'Business', 'year': '2001', 'transmission': 'Automatic', 'model': 'Camry', 'type_of_car': '4 door', 'mileage': '110000'} 
{'price': '1,050,000', 'offer_from': 'Individual', 'year': '2005', 'transmission': 'Automatic', 'model': 'Camry', 'type_of_car': '4 door', 'mileage': 'Low'} 
{'price': '1,500,000', 'offer_from': 'Individual', 'year': '2008', 'transmission': 'Automatic', 'model': 'Camry', 'type_of_car': '4 door', 'mileage': 'Low'} 
{'price': '1,150,000', 'offer_from': 'Individual', 'year': '2002', 'transmission': 'Automatic', 'model': 'Camry', 'type_of_car': '4 door', 'mileage': '167'} 
{'price': '2,200,000', 'offer_from': 'Individual', 'year': '2010', 'transmission': 'Automatic', 'model': 'Camry', 'type_of_car': '4 door', 'mileage': '24689'} 
{'price': '1,050,000', 'offer_from': 'Individual', 'year': '2004', 'transmission': 'Automatic', 'model': 'Camry', 'type_of_car': '4 door', 'mileage': '92,000'} 
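
Since the original goal was a CSV file, the dicts yielded by start_request can be written out with csv.DictWriter. A minimal sketch, assuming the keys shown in the output above (restval and extrasaction guard against detail pages with missing or extra fields):

import csv

fields = ['price', 'offer_from', 'year', 'transmission', 'model', 'type_of_car', 'mileage']

with open('car_features.csv', 'w', newline='') as fl:
    writer = csv.DictWriter(fl, fieldnames=fields, restval='', extrasaction='ignore')
    writer.writeheader()
    # start_request and url come from the answer's code above.
    for dict_ in start_request(url):
        writer.writerow(dict_)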

Whoa! I just saw this. Thank you so much @padraic-cunningham – TechySupport