1
我試圖抓取page!對於我正在進行的一個項目。我想獲取每輛車的所有頁面的詳細信息(價格,里程,傳輸和年齡)。我下面我的代碼遇到的問題是:當第一頁鏈接不同於其他人時,用BeautifulSoup刮掉多個網頁
- 第一頁的鏈接是從其他人(無頁碼1 < &頁= 1>)
- 汽車的價格是不是裏面的不同點擊每個廣告以獲取詳細信息後,
我想知道如果有人願意幫助我看看這個建議。由於
from bs4 import BeautifulSoup
import requests
import urllib.parse
import csv
# the Toyota Camry model page is used
url = "https://www.olx.com.ng/vehicles/cars/toyota/?search%5Bfilter_enum_model%5D%5B0%5D=toyota%2Fcamry"
r = requests.get(url)
data = r.text
soup = BeautifulSoup(data, "html.parser")
carLinks = set()
pageLinks = set()
data_set = []
parsed = urllib.parse.urlparse(soup.select('a')[0].get('href'))
nbPage = urllib.parse.parse_qs(parsed.query)['page'][1]
print("There are " + str(nbPage) + " web pages to process")
# for each web page that contains a grid of car offers
for i in range(1, int(nbPage), 1):
print("Processing web page: " + str(i))
# each car offer link is saved into the carLinks
for link in soup.select('#listContainer > div > section > div > tr > a'):
carLinks.add(link.get('href').replace("//", "http://"))
# the next url page is set
url = "https://www.olx.com.ng/vehicles/cars/toyota/?
search%5Bfilter_enum_model%5D%5B0%5D=toyota%2Fcamry&page= + str(i) + "
r = requests.get(url)
data = r.text
soup = BeautifulSoup(data, "html.parser")
#for each car link
for carLink in carLinks:
print("Processing car page: " + carLink)
# we load the car page
r = requests.get(carLink)
data = r.text
soup = BeautifulSoup(data, "html.parser")
km = 0
transmission = ""
age = 0
price = 0
# for each attribute of the car
for info in soup.select("table.item tr div.pricelabel"):
# we keep the ones that we need
if info.select('.item')[0].text == u'Mileage':
km = int(info.select('.value')[0].text.replace(" ", "").replace("KM", ""))
if info.select('.item')[0].text == u'Transmission':
transmission = info.select('.value')[0].text
if info.select('.item')[0].text == u'Year':
age = 2017 - int(info.select('.value')[0].text)
if info.select('.pricelabel')[0].text == u'Price':
price = int(info.select('.pricelabel')[0].text.replace(" ", "").replace(u"₦", ""))
# each car is an array of four features added to the data_set
data_set.append([km, transmission, age, price])
# the data_set is save into the CSV file
fl = open('car_features.csv', 'w')
writer = csv.writer(fl)
writer.writerow(['km', 'transmission', 'age', 'price'])
for values in data_set:
writer.writerow(values)
fl.close()
哇哦!我只是看到這一點。非常感謝你@ padraic-cunningham – TechySupport