以下是某個頁面的一部分 HTML 代碼,我想從中提取「name」和「event_place」。但是,我從來沒有見過數據以這麼複雜的方式塞在頁面裡。在 <script> 標籤中有一個 `var person`,其中名字出現在「personBestName」鍵之下,即 'John Stuart'。(問題標題:使用 BeautifulSoup 從 <script> 標籤和 var 變量中提取數據)
同樣,「event_place」也位於 `var person` 之下(在 "facts" …… 等層級裡),應該提取出的事件地點是「B, Hamilton (city/cité), Ontario, Canada」。
<script>
var person = {"id":"p_14062397399","links":{"record":{"href":"https://familysearch.org/platform/records/records/9MFX-7VLY"},"persona":{"href":"https://familysearch.org/platform/records/personas/KH21-F11"}},"extracted":true,"identifiers":{"http://gedcomx.org/Persistent":["https://familysearch.org/ark:/61903/1:1:KH21-F11"],"$":["https://familysearch.org/platform/externalId/easy/1001080442645"]},"principal":true,"gender":{"type":"http://gedcomx.org/Male","fields":[{"type":"http://gedcomx.org/Gender","values":[{"type":"http://gedcomx.org/Interpreted","labelId":"PR_SEX_CODE","text":"Male","resource":"http://gedcomx.org/Male"}]}]},"names":[{"type":"http://gedcomx.org/BirthName","nameForms":[{"fullText":"John Stuart","parts":[{"type":"http://gedcomx.org/Given","value":"John","fields":[{"type":"http://gedcomx.org/Given","values":[{"type":"http://gedcomx.org/Interpreted","labelId":"PR_NAME_GN","text":"John"}]}]},{"type":"http://gedcomx.org/Surname","value":"Stuart","fields":[{"type":"http://gedcomx.org/Surname","values":[{"type":"http://gedcomx.org/Interpreted","labelId":"PR_NAME_SURN","text":"Stuart"}]}]}],"fields":[{"type":"http://gedcomx.org/Name","values":[{"type":"http://gedcomx.org/Interpreted","labelId":"PR_NAME","text":"John Stuart"}]}]}]}],"facts":[{"type":"http://gedcomx.org/MaritalStatus","value":"Single","fields":[{"type":"http://gedcomx.org/MaritalStatus","values":[{"type":"http://gedcomx.org/Interpreted","labelId":"PR_MARITAL_STATUS","text":"Single"}]}]},{"type":"http://gedcomx.org/Religion","value":"Presbyterian","fields":[{"type":"http://gedcomx.org/Religion","values":[{"type":"http://gedcomx.org/Interpreted","labelId":"PR_RELIGION","text":"Presbyterian"}]}]},{"type":"http://gedcomx.org/Nationality","value":"Canadian","fields":[{"type":"http://gedcomx.org/Nationality","values":[{"type":"http://gedcomx.org/Interpreted","labelId":"PR_NATIONALITY","text":"Canadian"}]}]},{"type":"http://gedcomx.org/Census","date":{"original":"31 Mar 
1901","fields":[{"type":"http://gedcomx.org/Date","values":[{"type":"http://gedcomx.org/Interpreted","labelId":"EVENT_DATE","text":"31 Mar 1901"}]},{"type":"http://gedcomx.org/Year","values":[{"type":"http://gedcomx.org/Interpreted","labelId":"EVENT_YEAR","text":"1901"}]}]},"place":{"original":"B, Hamilton (city/cité), Ontario, Canada","fields":[{"type":"http://gedcomx.org/Place","values":[{"type":"http://gedcomx.org/Interpreted","labelId":"EVENT_PLACE","text":"B, Hamilton (city/cité), Ontario, Canada"}]}]},"primary":true},{"type":"http://gedcomx.org/Birth","date":{"original":"1831","fields":[{"type":"http://gedcomx.org/Year","values":[{"type":"http://gedcomx.org/Interpreted","labelId":"PR_BIRTH_YEAR_ESTIMATED","text":"1831"}]}]},"place":{"original":"Scotland","fields":[{"type":"http://gedcomx.org/Place","values":[{"type":"http://gedcomx.org/Interpreted","labelId":"PR_BIRTH_PLACE","text":"Scotland"}]}]}},{"type":"http://gedcomx.org/Immigration","date":{"original":"1848","fields":[{"type":"http://gedcomx.org/Year","values":[{"type":"http://gedcomx.org/Interpreted","labelId":"PR_IMMIGRATION_YEAR","text":"1848"}]}]}}],"fields":[{"type":"http://gedcomx.org/Age","values":[{"type":"http://gedcomx.org/Original","labelId":"PR_AGE_ORIG","text":"70"}]},{"type":"http://familysearch.org/types/fields/UniqueIdentifier","values":[{"type":"http://gedcomx.org/Original","labelId":"UNIQUE_IDENTIFIER","text":"1001080442645"}]},{"type":"http://familysearch.org/types/fields/HouseholdId","values":[{"type":"http://gedcomx.org/Original","labelId":"HOUSEHOLD_ID","text":"66"}]},{"type":"http://gedcomx.org/RelationshipToHead","values":[{"type":"http://gedcomx.org/Interpreted","labelId":"PR_RELATIONSHIP_TO_HEAD","text":"Head"}]},{"type":"http://familysearch.org/types/fields/RelationshipToHeadCode","values":[{"type":"http://gedcomx.org/Original","labelId":"RELATIONSHIP_CODE","text":"SELF"}]},{"type":"http://familysearch.org/types/fields/CollectionId","values":[{"type":"http://gedcomx.org/Origina
l","labelId":"COLLECTION_ID","text":"1584557"}]},{"type":"http://familysearch.org/types/fields/EventDistrict","values":[{"type":"http://gedcomx.org/Original","labelId":"EVENT_DISTRICT","text":"Hamilton (city/cité)"}]},{"type":"http://familysearch.org/types/fields/EventProvince","values":[{"type":"http://gedcomx.org/Original","labelId":"EVENT_PROVINCE","text":"Ontario"}]},{"type":"http://familysearch.org/types/fields/EventSubDistrict","values":[{"type":"http://gedcomx.org/Original","labelId":"EVENT_SUB_DISTRICT","text":"B"}]},{"type":"http://familysearch.org/types/fields/EventType","values":[{"type":"http://gedcomx.org/Original","labelId":"EVENT_TYPE","text":"Census"}]},{"type":"http://familysearch.org/types/fields/Id","values":[{"type":"http://gedcomx.org/Original","labelId":"ID","text":"z002-z000067618"}]},{"type":"http://familysearch.org/types/fields/Page","values":[{"type":"http://gedcomx.org/Original","labelId":"PAGE","text":"8"}]},{"type":"http://familysearch.org/types/fields/Pid","values":[{"type":"http://gedcomx.org/Original","labelId":"PID","text":"11335440"}]},{"type":"http://familysearch.org/types/fields/PpqId","values":[{"type":"http://gedcomx.org/Original","labelId":"PPQ_ID","text":"08-0278"}]},{"type":"http://familysearch.org/types/fields/PrAgeInYears","values":[{"type":"http://gedcomx.org/Original","labelId":"PR_AGE_IN_YEARS","text":"70"}]},{"type":"http://familysearch.org/types/fields/PrRacialOrTribalOrigin","values":[{"type":"http://gedcomx.org/Original","labelId":"PR_RACIAL_OR_TRIBAL_ORIGIN","text":"Scotch"}]},{"type":"http://familysearch.org/types/fields/RollNumber","values":[{"type":"http://gedcomx.org/Original","labelId":"ROLL_NUMBER","text":"CC1901_47"}]},{"type":"http://familysearch.org/types/fields/SortKey","values":[{"type":"http://gedcomx.org/Interpreted","labelId":"SORT_KEY","text":"z002-z000067618_0000066_11335440_1001080442645"}]}],"url":"https://familysearch.org/ark:/61903/1:1:KH21-F11","personBestName":"John 
Stuart","localizedGender":"Male","title":"John Stuart, \"Canada Census, 1901\"","personRecordTitle":"John Stuart","metadata":{"bibliographicCitation":"\"Canada Census, 1901,\" , <i>FamilySearch</i> (https://familysearch.org/ark:/61903/1:1:KH21-F11 : accessed 14 August 2015), John Stuart, B, Hamilton (city/cité), Ontario, Canada; citing p. 8, Library and Archives of Canada, Ottawa."},"imageMeta":{"thirdPartyHostName":"","isExternalImage":false,"thirdPartyURL":"","imageURL":"","wikiCollectionURL":"/learn/wiki/en/api.php?action=query&list=search&srwhat=text&format=json&srsearch=CID1584557"}};
我已經能夠從 HTML 的另一部分(此處未示出)中,通過標籤和指定的 class 提取出名稱:
# coding=utf-8
import urllib2
import re
import csv
from bs4 import BeautifulSoup
import time
from unicodedata import normalize
Url = "https://familysearch.org/pal:/MM9.1.1/KHR6-D6D"

# Fetch the record page and parse it.  The response is closed in a `finally`
# so the connection is released even when parsing raises.
# NOTE(review): no parser is named here, so bs4 chooses one itself; consider
# passing one explicitly (e.g. "html.parser") for reproducible parse trees.
Page = urllib2.urlopen(Url)
try:
    Soup = BeautifulSoup(Page)
finally:
    Page.close()

# The print-only heading carries the record title; fail with a clear message
# instead of an AttributeError on None when the page layout differs.
title_tag = Soup.find("h3", {"class": "print-only print-title"})
if title_tag is None:
    raise ValueError(
        'no <h3 class="print-only print-title"> heading found at %s' % Url)

# get_text() also copes with a heading whose text is wrapped in child tags,
# a case where .string would be None.
title_text = title_tag.get_text().encode('utf-8')

# Keep only the person's name: cut at the ', "Can...' suffix first, then at
# any ' in household' suffix (same result as split(...)[0] for each marker).
name = title_text.partition(', "Can')[0].partition(' in household')[0]
print(name)
編輯:
# Get other fields
import json

PERSON_MARKER = 'var person = '

# Locate the inline <script> by its content rather than by a fixed position
# in the list: the number and order of script tags on the page can change.
rawJ = Soup.find_all('script')
person_js = None
for script_tag in rawJ:
    script_body = script_tag.string  # None for scripts without inline text
    if script_body and PERSON_MARKER in script_body:
        person_js = script_body
        break
if person_js is None:
    raise ValueError("no <script> containing %r was found" % PERSON_MARKER)

# raw_decode() parses exactly one JSON value from the start of the string and
# reports where it ended, so the trailing ';' and any JavaScript that follows
# (e.g. another `var ... =` statement) never has to be sliced off by hand.
json_tail = person_js.split(PERSON_MARKER, 1)[1].lstrip()
s, json_end = json.JSONDecoder().raw_decode(json_tail)
JsonText = json_tail[:json_end]

# json yields unicode strings.  Printing them to an ASCII-only stdout raises
# "UnicodeEncodeError: 'ascii' codec can't encode character ..." as soon as a
# value contains a non-ASCII character, so encode explicitly before printing.
print(s["personBestName"].encode('utf-8'))
這就是JSON。您可以使用python中的'json'模塊輕鬆讀取。 – Rishav