什麼
from bs4 import BeautifulSoup
from urlparse import urlparse, parse_qs
# Sample markup used as parser input: <td class="rec_title_ppnlist"> cells
# holding <a class=" link_gen "> anchors whose href carries a FRST=<number>
# query parameter. The text is kept verbatim, including the stray "," and "."
# after some closing tags and the unclosed second <td> and <body>.
html = '''
<html>
<body>
<table>
<tr>
<td class="rec_title_ppnlist">
<div><a class=" link_gen " href="SHW?FRST=0">Wambold von Umstadt, Anselm Kasimir, 1583-1647 (Zeit, Lebensdaten)</a></div>
<div><a class=" link_gen " href="SHW?FRST=1">Wambold von Umstadt, Anselm Kasimir, 1583-1647 (Zeit, Lebensdaten)</a></div>
<div class="rec_sep"><img alt="" border="" height="1" src="http://gsowww.gbv.de/images/gui/empty.gif" title="" width="1"/></div>
</td>,
<td class="rec_title_ppnlist">
<div><a class=" link_gen " href="SHW?FRST=2">Vomelius, Cyprianus, 1535-1587 (Zeit, Wirkungsdaten)</a></div>
<div><a class=" link_gen " href="SHW?FRST=2">Vomelius, Cyprianus, 1535-1587 (Zeit, Wirkungsdaten)</a></div>
<div class="rec_sep"><img alt="" border="" height="1" src="http://gsowww.gbv.de/images/gui/empty.gif" title="" width="1"/></div>.
<div><a class=" link_gen " href="SHW?FRST=3">Wambold von Umstadt, Anselm Kasimir, 1583-1647 (Zeit, Lebensdaten)</a></div>
<div><a class=" link_gen " href="SHW?FRST=4">4Vomelius, Cyprianus, 1535-1587 (Zeit, Wirkungsdaten)</a></div>
<div><a class=" link_gen " href="SHW?FRST=5">5Vomelius, Cyprianus, 1535-1587 (Zeit, Wirkungsdaten)</a></div>
<div><a class=" link_gen " href="SHW?FRST=6">6Vomelius, Cyprianus, 1535-1587 (Zeit, Wirkungsdaten)</a></div>
<div><a class=" link_gen " href="SHW?FRST=7">7Vomelius, Cyprianus, 1535-1587 (Zeit, Wirkungsdaten)</a></div>
<div><a class=" link_gen " href="SHW?FRST=8">8Vomelius, Cyprianus, 1535-1587 (Zeit, Wirkungsdaten)</a></div>
<div><a class=" link_gen " href="SHW?FRST=9">9Vomelius, Cyprianus, 1535-1587 (Zeit, Wirkungsdaten)</a></div>
<div><a class=" link_gen " href="SHW?FRST=10">10Vomelius, Cyprianus, 1535-1587 (Zeit, Wirkungsdaten)</a></div>
<div><a class=" link_gen " href="SHW?FRST=11">11Vomelius, Cyprianus, 1535-1587 (Zeit, Wirkungsdaten)</a></div>
<div><a class=" link_gen " href="SHW?FRST=12">12Vomelius, Cyprianus, 1535-1587 (Zeit, Wirkungsdaten)</a></div>
<div><a class=" link_gen " href="SHW?FRST=13">13Vomelius, Cyprianus, 1535-1587 (Zeit, Wirkungsdaten)</a></div>
<div><a class=" link_gen " href="SHW?FRST=25000">13Vomelius, Cyprianus, 1535-1587 (Zeit, Wirkungsdaten)</a></div>
<div><a class=" link_gen " href="SHW?FRST=25001">13Vomelius, Cyprianus, 1535-1587 (Zeit, Wirkungsdaten)</a></div>
</tr>
</table>
</html>
'''
# Extract the text of every "link_gen" anchor found inside a
# "rec_title_ppnlist" table cell whose href carries a FRST query parameter
# within the accepted range. Each match is echoed to stdout as
# "<FRST> <text>" and its text is written to data.txt, one per line.

# Inclusive bounds on the FRST record number that is kept.
FRST_MIN, FRST_MAX = 1, 25000

# Name the parser explicitly so the result does not depend on which optional
# parser libraries happen to be installed (and to avoid bs4's warning).
soup = BeautifulSoup(html, 'html.parser')

with open('data.txt', 'w') as outfile:
    for cell in soup.find_all('td', {'class': 'rec_title_ppnlist'}):
        # href=True skips anchors that have no href attribute at all,
        # so link['href'] below cannot raise KeyError.
        for link in cell.find_all('a', {'class': 'link_gen'}, href=True):
            # parse_qs maps each parameter name to a list of its values and
            # copes with queries that carry more than one parameter.
            frst_values = parse_qs(urlparse(link['href']).query).get('FRST')
            if not frst_values:
                continue
            frst = frst_values[0]
            try:
                in_range = FRST_MIN <= int(frst) <= FRST_MAX
            except ValueError:
                # Non-numeric FRST value: not a record link, ignore it.
                continue
            if not in_range:
                continue
            text = link.get_text()
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print("%s %s" % (frst, text))
            outfile.write(text + '\n')
我確信讀取查詢字串的那一部分可以寫得更好(urlparse.parse_qs 回傳的是一個以列表為值的字典,我覺得這有點奇怪)。
此代碼不檢查輸入數據的有效性(例如鏈接是否具有href屬性),但它提供了關於如何執行解析的想法。
你只需要href鏈接? – Hackaholic 2014-11-02 22:06:05
其實我需要的是與 href 連結相關的 span 內容,例如:Theologia Germanica: Libellus aureus hoc est brevis et praegnans quomodo sit exuendus vetus homo induendusque novus – user2278505 2014-11-02 23:19:17