import csv
import io
import urllib2

import lxml
import requests
from bs4 import BeautifulSoup
# Scrape the headline and paragraph text of every page listed in
# 'ala2009link.csv' (one URL in the first column of each row) and append the
# text, one item per line, to '2009alanews.csv'.
#
# NOTE: this script targets Python 2 (it imports urllib2); every construct
# below is valid on that interpreter.
opener = urllib2.build_opener()
opener.addheaders = [('User-agent', 'Mozilla/5.0')]

# The urllib2 opener is never used to download anything -- requests does the
# fetching -- so its browser-like User-agent has to be passed to requests
# explicitly, otherwise the default "python-requests" agent is sent.
request_headers = dict(opener.addheaders)

# `with` closes both files even if a request or parse step raises.
# The output is opened through io.open so unicode text is encoded to UTF-8
# once, at the file boundary, instead of per write.
with open('ala2009link.csv', 'r') as f, \
        io.open('2009alanews.csv', 'w', encoding='utf-8') as s:
    for row in csv.reader(f):
        # Blank lines in the CSV yield empty rows; skip them.
        if not row or not row[0].strip():
            continue
        url = row[0].strip()
        print(url)

        try:
            res = requests.get(url, headers=request_headers, timeout=30)
            res.raise_for_status()
        except requests.RequestException as exc:
            # One unreachable page should not abort the whole run.
            print("skipping %s: %s" % (url, exc))
            continue
        print(res.content)

        # Name the parser explicitly so results do not depend on whichever
        # parser bs4 happens to pick on this machine.
        soup = BeautifulSoup(res.content, "lxml")
        print(soup)

        for item in soup.find_all("article",
                                  {"class": "article-wrapper news"}):
            # find() returns None instead of raising when the headline is
            # missing, unlike find_all(...)[0].
            headline = item.find("h2", {"class": "article-headline"})
            if headline is None:
                continue
            s.write(u"%s \n" % headline.text)

        # NOTE(review): the pasted source had lost its indentation, so the
        # nesting of this loop is a reconstruction: paragraphs are written
        # once per page, after the headlines -- confirm this is intended.
        for paragraph in soup.find_all("p"):
            s.write(u"%s \n" % paragraph.text)
這是我用來從網頁中提取數據的代碼,但我不知道爲什麼提取不到數據。是不是廣告攔截警告阻止了 BeautifulSoup?這是示例鏈接:http://www.rolltide.com/news/2009/6/23/Bert_Bank_Passes_Away.aspx?path=football (BeautifulSoup 在這個網站上無法使用。)
`for row in csv.reader(f)` 這個 for 循環內的最後一行是哪一行?能否提供示例 HTML 或鏈接? – pawelty
你能提供鏈接嗎? –
樣本:http://www.rolltide.com/news/2009/6/23/Bert_Bank_Passes_Away.aspx?path=football @pawelty –