2013-04-09 25 views
2

我正在從一個網站上抓取我需要的某些信息。我需要的信息是Sw.preloadedData["overview"] =之後的字典:如何使用BeautifulSoup從JSON數據製作Python字典

<script type="text/javascript"> 
    Sw.preloadedData = {}; 
    Sw.preloadedData["overview"] = {"Title":"Facebook","Description":"A social utility that connects people, to keep up with friends, upload photos, share links and videos.","GlobalRank":[1,28115594,0],"Country":840,"CountryRanks":{"12":[1,830254,0],"818":[1,463162,0],"604":[1,599566,0],"608":[1,986465,0],"484":[1,1329484,0],"504":[1,672216,0],"862":[1,724854,0],"688":[1,534093,0],"702":[1,427637,0],"703":[1,341310,0],"756":[1,903074,0],"840":[1,5142062,0],"250":[1,1887449,0],"724":[2,1432992,0],"764":[1,857348,0],"76":[1,2733763,0],"784":[1,564929,0],"376":[1,390754,0],"792":[1,979507,0],"804":[8,1073943,1],"344":[1,284415,0],"348":[1,471458,0],"643":[8,1424933,1],"682":[1,692392,0],"380":[1,1441457,0],"392":[3,979893,1],"170":[1,1048409,0],"191":[1,348589,0],"620":[1,841554,0],"642":[1,814441,0],"356":[2,2356839,1],"528":[1,1092022,0],"616":[2,1430485,0],"360":[1,1541560,0],"372":[1,361215,0],"458":[1,851821,0],"36":[1,857177,0],"578":[1,349987,0],"586":[1,553155,0],"704":[2,918752,0],"710":[2,439567,1],"826":[1,2062694,0],"124":[1,1950051,0],"752":[1,577990,0],"300":[1,654931,0],"203":[1,623702,0],"208":[1,350294,0],"32":[1,1223765,0],"100":[1,473283,0],"554":[1,268216,0],"56":[1,1124680,0],"152":[1,725504,0],"156":[25,375144,-1],"158":[1,408462,0],"276":[1,2752131,0],"40":[1,700209,0],"410":[1,327519,0],"246":[1,387528,0]},"Category":"Internet_and_Telecom/Social_Network","CategoryRank":[1,27564,0],"TrafficReach":[0.32364475161620337,0.32385066912312122,0.32476481437213323,0.31948943452696626,0.310612833573507,0.30867420840432391,0.30666509584041279,0.31334128772658171,0.33551546090119239,0.3260064922555041,0.33396164810609369,0.33999592327084549,0.33711315799626795,0.32152719433964483,0.31986157880865085,0.32069766148623413,0.3306823871380894,0.32266565637788247,0.29034777869603251,0.29286953998372667,0.29969130766646174,0.3071060984450904,0.28517166164955293,0.29038329556338477,0.2845053957123595],"TrafficReachStart":1346457600,"TrafficReachEnd":1362096000,"Eng
agments":[{"Year":2012,"Month":9,"Reach":[0.32364839148251978,0.012621437484750864],"Time":[1225.8536260294338,0.00090266734593069664],"PPV":[21.312597646825566,0.034059623863791355],"Bounce":[0.18813037420762707,0.043481349041723627]},{"Year":2012,"Month":10,"Reach":[0.31325536305080282,-0.032112096661782052],"Time":[1308.5613956266043,0.0674695313053506],"PPV":[25.612224490959978,0.20174109770119109],"Bounce":[0.17672838267013638,-0.060606861520974054]},{"Year":2012,"Month":11,"Reach":[0.33350274816471975,0.064635398151613677],"Time":[1300.8263833937028,-0.0059110808699942563],"PPV":[24.020971463806184,-0.062128653749518592],"Bounce":[0.186024790640559,0.052602801145837264]},{"Year":2012,"Month":12,"Reach":[0.32441610872340648,-0.027246070658540122],"Time":[1331.3137947173564,0.023436956470790138],"PPV":[24.916914500937356,0.03729836815638965],"Bounce":[0.18107629094748873,-0.026601291559208651]},{"Year":2013,"Month":1,"Reach":[0.29998222452228729,-0.075316494909170029],"Time":[1334.5042854365543,0.0023964979044441836],"PPV":[25.52485794831804,0.024398825438752159],"Bounce":[0.18097482510209897,-0.00056034859593612207]},{"Year":2013,"Month":2,"Reach":[0.2842911869016958,-0.052306557982157109],"Time":[1281.8427161473487,-0.039461521303379321],"PPV":[23.201378273544368,-0.091028113828417134],"Bounce":[0.18673378186827794,0.031821866731629678]}],"TrafficSources":{"Search":0.12679771428369516,"Social":0.0095590714393366649,"Mail":0.018352638254343783,"Paid Referrals":0.0010665044954870533,"Direct":0.60148809501325917,"Referrals":0.24273597651387802},"RedirectUrl":"facebook.com"}; 
    Sw.period = { month:2 ,year:2013,period:6 }; 
    Sw.siteDomain = "Facebook.com"; 
    Sw.siteCategory = "Internet_and_Telecom/Social_Network"; 
    Sw.siteCountry = "840"; 

</script> 

如果我用BeautifulSoup選取了這個script標籤,怎樣才能把其中的(JSON?)字典取出來作爲一個Python字典?

首先,我需要只選擇那個JSON對象 - 我該怎麼做?

然後我還需要將該JSON對象轉換爲Python字典(dict)。

回答

5

你需要做一些文字處理:

import json 

# Extract the JSON object assigned to Sw.preloadedData["overview"] when the
# whole assignment sits on a single line of the <script> text.
# `scripttag` is the script element selected beforehand (presumably a
# BeautifulSoup Tag whose .string holds the JavaScript source - confirm).
marker = 'Sw.preloadedData["overview"]'

# Pick the first line containing the marker. A default of None is passed to
# next() so that a page without the marker produces a clear error instead of a
# bare StopIteration.
scriptline = next(
    (line for line in scripttag.string.splitlines() if marker in line),
    None
)
if scriptline is None:
    raise ValueError('Sw.preloadedData["overview"] not found in script tag')

# Keep everything after the first '=', then trim the surrounding spaces and
# the trailing ';' so that only the JSON text remains.
data = scriptline.split('=', 1)[1].strip(' ;')
data = json.loads(data)

next(...)調用選出包含Sw.preloadedData["overview"]的第一行。然後,我們在第一個=處分割該行,取其後的部分,去掉首尾的空格和分號,最後將其解析爲JSON。

這給了我:

{u'Category': u'Internet_and_Telecom/Social_Network', 
u'CategoryRank': [1, 27564, 0], 
u'Country': 840, 
u'CountryRanks': {u'100': [1, 473283, 0], 
        u'12': [1, 830254, 0], 
        u'124': [1, 1950051, 0], 
        u'152': [1, 725504, 0], 
        u'156': [25, 375144, -1], 
        u'158': [1, 408462, 0], 
        u'170': [1, 1048409, 0], 
        u'191': [1, 348589, 0], 
        u'203': [1, 623702, 0], 
        u'208': [1, 350294, 0], 
        u'246': [1, 387528, 0], 
        u'250': [1, 1887449, 0], 
        u'276': [1, 2752131, 0], 
        u'300': [1, 654931, 0], 
        u'32': [1, 1223765, 0], 
        u'344': [1, 284415, 0], 
        u'348': [1, 471458, 0], 
        u'356': [2, 2356839, 1], 
        u'36': [1, 857177, 0], 
        u'360': [1, 1541560, 0], 
        u'372': [1, 361215, 0], 
        u'376': [1, 390754, 0], 
        u'380': [1, 1441457, 0], 
        u'392': [3, 979893, 1], 
        u'40': [1, 700209, 0], 
        u'410': [1, 327519, 0], 
        u'458': [1, 851821, 0], 
        u'484': [1, 1329484, 0], 
        u'504': [1, 672216, 0], 
        u'528': [1, 1092022, 0], 
        u'554': [1, 268216, 0], 
        u'56': [1, 1124680, 0], 
        u'578': [1, 349987, 0], 
        u'586': [1, 553155, 0], 
        u'604': [1, 599566, 0], 
        u'608': [1, 986465, 0], 
        u'616': [2, 1430485, 0], 
        u'620': [1, 841554, 0], 
        u'642': [1, 814441, 0], 
        u'643': [8, 1424933, 1], 
        u'682': [1, 692392, 0], 
        u'688': [1, 534093, 0], 
        u'702': [1, 427637, 0], 
        u'703': [1, 341310, 0], 
        u'704': [2, 918752, 0], 
        u'710': [2, 439567, 1], 
        u'724': [2, 1432992, 0], 
        u'752': [1, 577990, 0], 
        u'756': [1, 903074, 0], 
        u'76': [1, 2733763, 0], 
        u'764': [1, 857348, 0], 
        u'784': [1, 564929, 0], 
        u'792': [1, 979507, 0], 
        u'804': [8, 1073943, 1], 
        u'818': [1, 463162, 0], 
        u'826': [1, 2062694, 0], 
        u'840': [1, 5142062, 0], 
        u'862': [1, 724854, 0]}, 
u'Description': u'A social utility that connects people, to keep up with friends, upload photos, share links and videos.', 
u'Engagments': [{u'Bounce': [0.18813037420762707, 0.04348134904172363], 
        u'Month': 9, 
        u'PPV': [21.312597646825566, 0.034059623863791355], 
        u'Reach': [0.3236483914825198, 0.012621437484750864], 
        u'Time': [1225.8536260294338, 0.0009026673459306966], 
        u'Year': 2012}, 
       {u'Bounce': [0.17672838267013638, -0.060606861520974054], 
        u'Month': 10, 
        u'PPV': [25.612224490959978, 0.20174109770119109], 
        u'Reach': [0.3132553630508028, -0.03211209666178205], 
        u'Time': [1308.5613956266043, 0.0674695313053506], 
        u'Year': 2012}, 
       {u'Bounce': [0.186024790640559, 0.052602801145837264], 
        u'Month': 11, 
        u'PPV': [24.020971463806184, -0.06212865374951859], 
        u'Reach': [0.33350274816471975, 0.06463539815161368], 
        u'Time': [1300.8263833937028, -0.005911080869994256], 
        u'Year': 2012}, 
       {u'Bounce': [0.18107629094748873, -0.02660129155920865], 
        u'Month': 12, 
        u'PPV': [24.916914500937356, 0.03729836815638965], 
        u'Reach': [0.3244161087234065, -0.027246070658540122], 
        u'Time': [1331.3137947173564, 0.023436956470790138], 
        u'Year': 2012}, 
       {u'Bounce': [0.18097482510209897, -0.0005603485959361221], 
        u'Month': 1, 
        u'PPV': [25.52485794831804, 0.02439882543875216], 
        u'Reach': [0.2999822245222873, -0.07531649490917003], 
        u'Time': [1334.5042854365543, 0.0023964979044441836], 
        u'Year': 2013}, 
       {u'Bounce': [0.18673378186827794, 0.03182186673162968], 
        u'Month': 2, 
        u'PPV': [23.201378273544368, -0.09102811382841713], 
        u'Reach': [0.2842911869016958, -0.05230655798215711], 
        u'Time': [1281.8427161473487, -0.03946152130337932], 
        u'Year': 2013}], 
u'GlobalRank': [1, 28115594, 0], 
u'RedirectUrl': u'facebook.com', 
u'Title': u'Facebook', 
u'TrafficReach': [0.3236447516162034, 
        0.3238506691231212, 
        0.3247648143721332, 
        0.31948943452696626, 
        0.310612833573507, 
        0.3086742084043239, 
        0.3066650958404128, 
        0.3133412877265817, 
        0.3355154609011924, 
        0.3260064922555041, 
        0.3339616481060937, 
        0.3399959232708455, 
        0.33711315799626795, 
        0.32152719433964483, 
        0.31986157880865085, 
        0.32069766148623413, 
        0.3306823871380894, 
        0.32266565637788247, 
        0.2903477786960325, 
        0.29286953998372667, 
        0.29969130766646174, 
        0.3071060984450904, 
        0.28517166164955293, 
        0.29038329556338477, 
        0.2845053957123595], 
u'TrafficReachEnd': 1362096000, 
u'TrafficReachStart': 1346457600, 
u'TrafficSources': {u'Direct': 0.6014880950132592, 
        u'Mail': 0.018352638254343783, 
        u'Paid Referrals': 0.0010665044954870533, 
        u'Referrals': 0.24273597651387802, 
        u'Search': 0.12679771428369516, 
        u'Social': 0.009559071439336665}} 

如果該值的定義跨越多行,我們可以使用JSONDecoder.raw_decode()方法,讓解析這些信息更容易一些:

import json 

# Extract the JSON object assigned to Sw.preloadedData["overview"] even when
# the value spans several lines or is followed by more JavaScript.
# `scripttag` is the script element selected beforehand (presumably a
# BeautifulSoup Tag whose .string holds the JavaScript source - confirm).
marker = 'Sw.preloadedData["overview"]'

# partition() reports whether the marker was present, so a missing marker
# gives a clear error instead of an IndexError.
_, found, script_rest = scripttag.string.partition(marker)
if not found:
    raise ValueError('Sw.preloadedData["overview"] not found in script tag')

# raw_decode() does not skip leading whitespace, so remove the '=' together
# with any surrounding whitespace, including a newline when the value starts
# on the line after the assignment.
script_rest = script_rest.lstrip(' \t\r\n=')

# raw_decode() parses one complete JSON document from the start of the string
# and ignores whatever follows it; the end index it returns is not needed.
decoder = json.JSONDecoder()
data, _ = decoder.raw_decode(script_rest)

即使後面還有尾隨數據,raw_decode()調用也能解析JSON;它會從Sw.preloadedData["overview"]文本後面的=之後開始,嘗試找出一個完整的JSON對象。

+0

當然,這只有在整個JSON數據的定義都位於同一行時才有效。OP必須對它加以擴充——預處理各行,並一直掃描到下一個不在引號內的「;」——才能處理數據跨越多行的情況。 – jsbueno 2013-04-09 12:45:44

+0

@jsbueno:我們也可以使用`raw_decode`方法;我會更新答案。 – 2013-04-09 12:47:31

3

類似這樣:

import re, json 
# Pull the JSON text that follows the Sw.preloadedData["overview"] assignment
# out of `data` (presumably the JavaScript source of the script tag - confirm)
# and parse it. Whitespace around '=' is optional; '.' does not match a
# newline, so the capture stops at the end of that line.
found = re.search(r'Sw\.preloadedData\["overview"\]\s*=\s*(.*)', data)
if found is None:
    raise ValueError('Sw.preloadedData["overview"] not found in script text')

# Drop the terminating ';' together with any whitespace after it (including
# the '\r' left by CRLF line endings); otherwise json.loads() rejects the
# leftover characters as extra data.
jsondata = json.loads(found.group(1).rstrip('; \t\r'))