如何解析JavaScript類型Html

<script type="text/javascript"> 
var modelData = [{"Id":958,"Date":"20160428","Title":"Design","Description":"London Auction 28 April 2016","Department":"Design","Location":"LONDON","Permalink":"/auctions/auction/UK050116","Year":"2016","Image":"/Xigen/image.ashx?path=\\\\diskstation\\website\\Certificates\\UK050116\\UK050116.jpg\u0026width=308\u0026height=222","addThis":" addthis:url=\"https://www.example.com/auctions/auction/UK050116\" ","results_html":"\u003cli class=\"expandable past-auction-exp closed\"\u003e\u003ca href=\"#\"\u003eVIEW RESULTS\u003c/a\u003e\u003cdiv class=\"panel\" style=\"display:none\"\u003e\u003ca href=\"/auctions/auction/UK050116\"\u003eOnline\u003c/a\u003e\u003ca target=\"_blank\" href=\"/Xigen/file.ashx?path=\\\\diskstation\\website\\Media\\Auction\\auctionResultsFile_UK050116.pdf\"\u003ePDF\u003c/a\u003e\u003c/div\u003e\u003c/li\u003e","Download_catalog_html":"\u003cli class=\"expandable past-auction-exp closed\"\u003e\u003ca href=\"#\"\u003eCATALOGUES\u003c/a\u003e\u003cdiv class=\"panel\" style=\"display:none\"\u003e\u003ca target=\"_blank\" id=\"linkDownloadCatalog\" href=\"http://www.example.com/Xigen/file.ashx?path=\\\\diskstation\\website\\Certificates/UK050116/UK050116_catalog.pdf\"\u003eDownload Catalogue\u003c/a\u003e\u003ca href=\"/catalogues/buy\"\u003ePurchase Catalogue\u003c/a\u003e\u003c/div\u003e\u003c/li\u003e"}]</script>

我想解析出其中的日期、標題和鏈接，該怎麼做？我嘗試過使用 PyQt4，但也無法做到。要如何解析這種嵌在 HTML 中的 JavaScript？

來源

2016-05-01 CodeNinja101

假設這段內容位於 script 標籤內，您可以使用 BeautifulSoup 模組解析 HTML，並用與提取 modelData 值相同的正則表達式來定位該 script。然後，在確保 modelData 的值可以被 json.loads() 載入（必要時先加以修正）之後，您就會得到一個易於操作的 Python 數據結構：

import json 
from bs4 import BeautifulSoup 

import re 

# Sample page fragment. It is a *raw* string on purpose: in a normal string
# literal Python would consume the JSON escapes (\" , \\ , \uXXXX) and leave
# behind text that is no longer valid JSON.
data = r"""
<script>
var modelData = [{"Id":958,"Date":"20160428","Title":"Design","Description":"London Auction 28 April 2016","Department":"Design","Location":"LONDON","Permalink":"/auctions/auction/UK050116","Year":"2016","Image":"/Xigen/image.ashx?path=\\\\diskstation\\website\\Certificates\\UK050116\\UK050116.jpg\u0026width=308\u0026height=222","addThis":" addthis:url=\"https://www.example.com/auctions/auction/UK050116\" ","results_html":"\u003cli class=\"expandable past-auction-exp closed\"\u003e\u003ca href=\"#\"\u003eVIEW RESULTS\u003c/a\u003e\u003cdiv class=\"panel\" style=\"display:none\"\u003e\u003ca href=\"/auctions/auction/UK050116\"\u003eOnline\u003c/a\u003e\u003ca target=\"_blank\" href=\"/Xigen/file.ashx?path=\\\\diskstation\\website\\Media\\Auction\\auctionResultsFile_UK050116.pdf\"\u003ePDF\u003c/a\u003e\u003c/div\u003e\u003c/li\u003e","Download_catalog_html":"\u003cli class=\"expandable past-auction-exp closed\"\u003e\u003ca href=\"#\"\u003eCATALOGUES\u003c/a\u003e\u003cdiv class=\"panel\" style=\"display:none\"\u003e\u003ca target=\"_blank\" id=\"linkDownloadCatalog\" href=\"http://www.example.com/Xigen/file.ashx?path=\\\\diskstation\\website\\Certificates/UK050116/UK050116_catalog.pdf\"\u003eDownload Catalogue\u003c/a\u003e\u003ca href=\"/catalogues/buy\"\u003ePurchase Catalogue\u003c/a\u003e\u003c/div\u003e\u003c/li\u003e"}]
</script>
"""

# Matches only the assignment prefix. The value that follows is read by the
# JSON decoder, so a ']' inside a string or nested array cannot cut it short.
pattern = re.compile(r"var\s+modelData\s*=\s*")


def extract_model_data(script_text):
    """Return the JSON value assigned to ``modelData`` in *script_text*.

    Args:
        script_text: JavaScript source (or any text) containing a
            ``var modelData = <JSON>`` assignment.

    Returns:
        The decoded Python object (a list of dicts for the sample above).

    Raises:
        ValueError: if no ``var modelData =`` assignment is present, or if
            the text that follows it is not valid JSON (``json`` raises a
            ``ValueError`` subclass in that case).
    """
    match = pattern.search(script_text)
    if match is None:
        raise ValueError("no 'var modelData = ...' assignment found")
    # raw_decode parses exactly one JSON value starting at the given index
    # and ignores whatever follows it (';', more JavaScript, whitespace).
    result, _end = json.JSONDecoder().raw_decode(script_text, match.end())
    return result


def find_model_data(html):
    """Parse *html* and return ``modelData`` from the first script defining it.

    Raises:
        ValueError: if no ``<script>`` element contains the assignment.
    """
    soup = BeautifulSoup(html, 'lxml')
    for script in soup.find_all("script"):
        text = script.string  # None when the tag is empty or has mixed children
        if text and pattern.search(text):
            return extract_model_data(text)
    raise ValueError("no <script> element defines modelData")


if __name__ == "__main__":
    item = find_model_data(data)[0]
    print(item["Id"])
    print(item["Title"])

輸出（原答案的寫法只能在 Python 2 中運作）：

958 
Design

來源

2016-05-01 03:53:09 alecxe

如何解析JavaScript類型Html

回答

相關問題