I am writing a BeautifulSoup web crawler. The script grabs the correct data and writes it to a file in CSV format. However, when I try to read the data back (near the end of the code), I open the file again under a different variable name and read from it. The output printed by the last line is a pile of HTML from the original website, which I think is coming from the soup string. What is going on here?
import datetime
import csv
import urllib
from bs4 import BeautifulSoup
import urllib2
file_name = "/Users/ripple/Dropbox/Python/FinViz.txt"
file = open(file_name,"w")
url = "http://www.finviz.com"
print 'Grabbing from: ' + url + '...\n'
try:
    r = urllib2.urlopen(url)
except urllib2.URLError as e:
    r = e

if r.code in (200, 401):
    #get the table data from the page
    data = urllib.urlopen(url).read()
    #send to beautiful soup
    soup = BeautifulSoup(data)

    i = 1
    for table in soup("table", {"class": "t-home-table"}):
        #First and second tables
        if i == 1 or i == 2:
            for tr in table.findAll('tr')[1:]:
                if i < 3:
                    col = tr.findAll('td')
                    ticker = col[0].get_text().encode('ascii', 'ignore')
                    price = col[1].get_text().encode('ascii', 'ignore')
                    change = col[2].get_text().encode('ascii', 'ignore')
                    volume = col[3].get_text().encode('ascii', 'ignore')
                    metric = col[5].get_text().encode('ascii', 'ignore')
                    record = ticker + ',' + price + ',' + change + ',' + volume + ',' + metric + '\n'
                    print record
                    file.write(record)

        if i == 3:
            file.write('END\n')

        # Third and fourth tables
        if i == 3 or i == 4:
            for tr in table.findAll('tr')[1:]:
                col = tr.findAll('td')
                ticker1 = col[0].get_text().encode('ascii', 'ignore')
                ticker2 = col[1].get_text().encode('ascii', 'ignore')
                ticker3 = col[2].get_text().encode('ascii', 'ignore')
                ticker4 = col[3].get_text().encode('ascii', 'ignore')
                metric = col[5].get_text().encode('ascii', 'ignore')
                record = ticker1 + ',' + ticker2 + ',' + ticker3 + ',' + ticker4 + ',' + metric + '\n'
                print record
                file.write(record)

        i += 1

#if the page does not open
else:
    print "ERROR:"

file.close()
#open written file to read tickers and download tables from finviz
file = open(file_name, "r")
finviz_csv = csv.reader(file)

for row in finviz_csv:
    stock = col[0]
    print stock