2017-07-15 132 views
1

我有一個本地HTML文件,內容見以下鏈接:https://pastebin.com/L3iFQgQH 。問題:從本地HTML文件中提取字典。

<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> 
<html xmlns="http://www.w3.org/1999/xhtml" xmlns:og="http://opengraphprotocol.org/schema/" xmlns:fb="http://www.facebook.com/2008/fbml"> 
<head><title> 
    335i | autoTRADER.ca 
</title><link id="ctl00_ctl00_canonical" rel="canonical" href="http://www.autotrader.ca/cars/bmw/3%20series/2013/" /><meta name="viewport" content="width=device-width, height=device-height, user-scalable=0, minimum-scale=0.75, maximum-scale=1.0" /><meta name="SKYPE_TOOLBAR" content="SKYPE_TOOLBAR_PARSER_COMPATIBLE" /><script> 
var dataLayer = [ 
{ 
    'search': { 
    'pageNumber': '1', 
    'searchType': 'unique', 
    'filterFieldsUsed': '10', 
    'category': 'Cars, Trucks & SUVs', 
    'minPrice': 'not used', 
    'maxPrice': 'not used', 
    'make': 'BMW', 
    'model': '3 Series', 
    'new': 'yes', 
    'used': 'yes', 
    'CPO': 'yes', 
    'distance': 'national', 
    'location': 'canada', 
    'searchLocation': 'advancedSearch', 
    'minYear': '2013', 
    'maxYear': '2013', 
    'transmission': 'Automatic', 
    'fuelType': 'not used', 
    'exteriorColor': 'not used', 
    'refinedKeywords': '335i', 
    'bodyType': 'not used', 
    'minKms': 'not used', 
    'maxKms': 'not used', 
    'damaged': 'yes', 
    'dealer': 'yes', 
    'privateSeller': 'yes', 
    'withPrice': 'yes', 
    'withPhotos': 'yes', 
    'withFreeCarProof': 'not used', 
    'sortOrder': 'Price: High to Low' 
}, 
'lists': [ 
    { 
    'key': 'advancedSearch', 
    'vehicles': [ 
     { 
     'make': 'BMW', 
     'model': '3 Series', 
     'year': '2013', 
     'category': 'PassengerVehicles', 
     'price': '37800', 
     'condition': 'used', 
     'adType': 'dealer', 
     'adID': '5-33635639', 
     'dealerID': '5-BS2004915125635', 
     'listingPosition': 'ppl', 
     'upgradeExecUpgrade': 'no', 
     'upgradePL': 'no', 
     'upgradeHL': 'no', 
     'upgradePPL': 'no', 
     'mobialsParticipation': 'no', 
     'strikethrough': 'no', 
     'vehicleSpecialist': 'no', 
     'priceHistory': '1', 
     'priceAnalysis': 'above average', 
     'transparency': 'yes', 
     'car360enabled': 'no', 
     'province': 'BC', 
     'financingPrice': 'no', 
     'merchandising': 'gold' 
     }, 
     { 
     'make': 'BMW', 
     'model': '3 Series', 
     'year': '2013', 
     'category': 'PassengerVehicles', 
     'price': '33995', 
     'condition': 'used', 
     'adType': 'dealer', 
     'ad 
     } 
    ] 
    } 
], 
'pageType': 'search-results', 
'mvt': null 
} 
]; 
dataLayer.push({'ShowNewCoPath': 'True'}); 

</script> 
<!--Google Tag Manager --> 
<script>(function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start': 
new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0], 
j=d.createElement(s),dl=l!='dataLayer'?'&l='+l:'';j.async=true;j.src= 
'https://www.googletagmanager.com/gtm.js?id='+i+dl;f.parentNode.insertBefore(j,f); 
})(window,document,'script','dataLayer','GTM-K7JHZJ');</script> 
<!-- End Google Tag Manager --> 

在文件最頂端有一個名爲 dataLayer 的變量,它是一個字典,其後跟着大量HTML和其他內容。我想提取這個變量,並用Python將其存儲爲JSON字典。目前我使用 split 函數,但這種做法過於針對這個特定文件。有沒有辦法處理更一般的HTML文件?

+0

您是否試過將它加載到BeautifulSoup中?如果你接着調用 script_tag = soup.script,就差不多完成了…… – jlaur

回答

0

一種選擇是:先使用HTML解析器(例如 BeautifulSoup)提取腳本內容,然後用JavaScript解析器(如 slimit 或 pyjsparser)提取 dataLayer 變量的值,再對其做後處理,使其可以被JSON加載,最後通過 json.loads() 加載爲Python列表。

使用 slimit 的可運行示例:

from ast import literal_eval 
import json 

from bs4 import BeautifulSoup 

from slimit import ast 
from slimit.parser import Parser 
from slimit.visitors import nodevisitor 


data = """ 
    <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> 
<html xmlns="http://www.w3.org/1999/xhtml" xmlns:og="http://opengraphprotocol.org/schema/" xmlns:fb="http://www.facebook.com/2008/fbml"> 
<head><title> 
    335i | autoTRADER.ca 
</title><link id="ctl00_ctl00_canonical" rel="canonical" href="http://www.autotrader.ca/cars/bmw/3%20series/2013/" /><meta name="viewport" content="width=device-width, height=device-height, user-scalable=0, minimum-scale=0.75, maximum-scale=1.0" /><meta name="SKYPE_TOOLBAR" content="SKYPE_TOOLBAR_PARSER_COMPATIBLE" /><script> 
var dataLayer = [ 
{ 
    'search': { 
    'pageNumber': '1', 
    'searchType': 'unique', 
    'filterFieldsUsed': '10', 
    'category': 'Cars, Trucks & SUVs', 
    'minPrice': 'not used', 
    'maxPrice': 'not used', 
    'make': 'BMW', 
    'model': '3 Series', 
    'new': 'yes', 
    'used': 'yes', 
    'CPO': 'yes', 
    'distance': 'national', 
    'location': 'canada', 
    'searchLocation': 'advancedSearch', 
    'minYear': '2013', 
    'maxYear': '2013', 
    'transmission': 'Automatic', 
    'fuelType': 'not used', 
    'exteriorColor': 'not used', 
    'refinedKeywords': '335i', 
    'bodyType': 'not used', 
    'minKms': 'not used', 
    'maxKms': 'not used', 
    'damaged': 'yes', 
    'dealer': 'yes', 
    'privateSeller': 'yes', 
    'withPrice': 'yes', 
    'withPhotos': 'yes', 
    'withFreeCarProof': 'not used', 
    'sortOrder': 'Price: High to Low' 
}, 
'lists': [ 
    { 
    'key': 'advancedSearch', 
    'vehicles': [ 
     { 
     'make': 'BMW', 
     'model': '3 Series', 
     'year': '2013', 
     'category': 'PassengerVehicles', 
     'price': '37800', 
     'condition': 'used', 
     'adType': 'dealer', 
     'adID': '5-33635639', 
     'dealerID': '5-BS2004915125635', 
     'listingPosition': 'ppl', 
     'upgradeExecUpgrade': 'no', 
     'upgradePL': 'no', 
     'upgradeHL': 'no', 
     'upgradePPL': 'no', 
     'mobialsParticipation': 'no', 
     'strikethrough': 'no', 
     'vehicleSpecialist': 'no', 
     'priceHistory': '1', 
     'priceAnalysis': 'above average', 
     'transparency': 'yes', 
     'car360enabled': 'no', 
     'province': 'BC', 
     'financingPrice': 'no', 
     'merchandising': 'gold' 
     }, 
     { 
     'make': 'BMW', 
     'model': '3 Series', 
     'year': '2013', 
     'category': 'PassengerVehicles', 
     'price': '33995', 
     'condition': 'used', 
     'adType': 'dealer' 
     } 
    ] 
    } 
], 
'pageType': 'search-results', 
'mvt': null 
} 
]; 
dataLayer.push({'ShowNewCoPath': 'True'}); 

</script> 
<!--Google Tag Manager --> 
<script>(function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start': 
new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0], 
j=d.createElement(s),dl=l!='dataLayer'?'&l='+l:'';j.async=true;j.src= 
'https://www.googletagmanager.com/gtm.js?id='+i+dl;f.parentNode.insertBefore(j,f); 
})(window,document,'script','dataLayer','GTM-K7JHZJ');</script> 
<!-- End Google Tag Manager -->""" 


# Find the inline <script> element whose text mentions ``dataLayer``.
# ``string=`` is the current spelling of the filter formerly named ``text=``.
soup = BeautifulSoup(data, "html.parser")
script_tag = soup.find("script", string=lambda s: s and "dataLayer" in s)
if script_tag is None:
    # Fail with a clear message instead of an AttributeError on None.
    raise ValueError("no <script> element mentioning dataLayer was found")
script = script_tag.get_text()

# Parse the JavaScript source into a slimit syntax tree.
parser = Parser()
tree = parser.parse(script)

# Walk the tree for the ``var dataLayer = [...]`` declaration and serialise the
# first element of its array initializer back to source text.
# NOTE: the quote swap below rewrites every apostrophe in that text, so it is
# only safe while no value contains an apostrophe or a double quote.
data_layer = next(
    (
        node.initializer.items[0].to_ecma().replace("'", '"')
        for node in nodevisitor.visit(tree)
        if isinstance(node, ast.VarDecl) and node.identifier.value == "dataLayer"
    ),
    None,  # avoid a bare StopIteration when the declaration is missing
)
if data_layer is None:
    raise ValueError("no 'var dataLayer' declaration found in the script")

print(json.loads(data_layer))

另一種選擇,可能更實用,但整體不太可靠,是使用正則表達式 - 匹配所需的對象,從HTML字符串中提取它,後處理並使用json模塊加載到Python對象中。工作代碼:

import json 
from pprint import pprint 
import re 


html = """your HTML here (same as above)""" 

match = re.search(r"var dataLayer = (\[.*?\]);$", html, re.MULTILINE | re.DOTALL) 
if match: 
    data = match.group(1).replace("'", '"') 
    data = json.loads(data) 
    pprint(data)