from bs4 import BeautifulSoup
from urllib.request import urlopen
#html = '''<p>
#<b>Father:</b> Michael Haughton
#<br>
#<b>Mother:</b> Diane
#<br><b>Brother:</b>
#Rashad Haughton<br>
#<b>Husband:</b> <a href="/people/540/000024468/">R. Kelly</a> (m. 1994, annulled that same year)
#<br><b>Boyfriend:</b> <a href="/people/420/000109093/">Damon Dash</a> (Roc-a-Fella co-CEO)<br></p>'''
# Download the NNDB profile page. The context manager closes the HTTP
# response as soon as the body has been read.
with urlopen('http://www.nndb.com/people/742/000024670/') as page:
    source = page.read()
# NOTE(review): no parser is named here, so bs4 picks whichever one is
# installed; the hard-coded <p> index below may depend on that choice.
soup = BeautifulSoup(source)
# The ninth <p> is selected by position; the index is specific to this page.
needed_p = soup.find_all('p')[8]
bs = needed_p.find_all('b')
# Maps each <b> label's text (e.g. 'Father:') to the value found after it.
res = {}
for b in bs:
    # Linked value: text of the first <a> that follows the label, if any.
    # find_next() returns None when no <a> follows, so guard before .text.
    link = b.find_next('a')
    if link is not None and link.text:
        res[b.text] = link.text.strip()
    # Plain-text value: a text node directly after the label overrides the
    # link text. next_sibling can be None (label is the last node) or a Tag
    # (no text between label and next element); neither has a usable
    # .strip(), so only string nodes are accepted.
    sibling = b.next_sibling
    if isinstance(sibling, str) and sibling != ' ':
        res[b.text] = sibling.strip()
# Bare expression: displays the dict when run in a REPL/notebook.
res
輸出:
{'Brother:': 'Rashad Haughton',
'Mother:': 'Diane',
'Husband:': 'R. Kelly',
'Father:': 'Michael Haughton',
'Boyfriend:': 'Damon Dash'}
編輯: 有關頁面頂部的附加信息:
... (code above) ...
# NOTE(review): no parser is named here, so bs4 picks whichever one is
# installed; the hard-coded <p> indices below may depend on that choice.
soup = BeautifulSoup(source)
# Explicitly select the <p> tags to parse: the 2nd-4th plus the 9th.
# The indices are specific to this page. find_all() is run once and reused.
paragraphs = soup.find_all('p')
needed_p = paragraphs[1:4] + [paragraphs[8]]
# Maps each <b> label's text (e.g. 'Born:') to the value found after it.
res = {}
for p in needed_p:
    bs = p.find_all('b')
    for b in bs:
        # Linked value: text of the first <a> that follows the label, if any.
        # find_next() returns None when no <a> follows, so guard before .text.
        link = b.find_next('a')
        if link is not None and link.text:
            res[b.text] = link.text.strip()
        # Plain-text value: a text node directly after the label overrides
        # the link text. next_sibling can be None or a Tag, neither of which
        # has a usable .strip(), so only string nodes are accepted.
        sibling = b.next_sibling
        if isinstance(sibling, str) and sibling != ' ':
            res[b.text] = sibling.strip()
# Bare expression: displays the dict when run in a REPL/notebook.
res
輸出:
{'Race or Ethnicity:': 'Black',
'Husband:': 'R. Kelly',
'Died:': '25-Aug',
'Nationality:': 'United States',
'Executive summary:': 'R&B singer, died in plane crash',
'Mother:': 'Diane',
'Birthplace:': 'Brooklyn, NY',
'Born:': '16-Jan',
'Boyfriend:': 'Damon Dash',
'Sexual orientation:': 'Straight',
'Occupation:': 'Singer',
'Cause of death:': 'Accident - Airplane',
'Brother:': 'Rashad Haughton',
'Remains:': 'Interred,',
'Gender:': 'Female',
'Father:': 'Michael Haughton',
'Location of death:': 'Marsh Harbour, Abaco Island, Bahamas'}
僅針對這個頁面,您還可以擷取高中資訊,例如,像這樣:
# Take the text of the tenth <p> (index specific to this page) and keep the
# piece between its first and second ':' as the high-school value.
high_school_text = soup.find_all('p')[9].text
res['High School'] = high_school_text.split(':')[1].strip()
你介意解釋你的代碼嗎? –
@Rightleg,你不明白的是什麼? –
@DmitriyFialkovskiy 對 URL 'http://www.nndb.com/people/742/000024670/' 運行時,它會給出錯誤: `res[b.text] = b.next_sibling.strip().strip('\n')` AttributeError: 'NoneType' 對象沒有屬性 'strip' – Volatil3