2011-07-30 162 views
0

我寫了一個程序,用來在 SEC 的 EDGAR 數據庫中搜索年度報告(10-K),並在列表框中返回 40 個項目的列表。現在我想加一個「Next 40」按鈕,讓列表框顯示接下來的 40 個項目;下面的代碼就是這個按鈕的實現(問題標題:Tkinter Python 按鈕命令的問題):

def Next():
    """Show the "Next 40" page of filings for the entered ticker.

    Reads the ticker from the global ``entryWidget``, downloads the first
    EDGAR results page, follows that page's "Next 40" button and replaces
    the contents of the global listbox ``Lb1`` with the filings found on
    the page it points to.

    NOTE: every call starts again from the first results page, so this
    can only ever display the second page of results.
    """
    # Percent-encode the ticker so characters such as '&' or '=' cannot
    # alter the query string.
    ticker = urllib.quote(entryWidget.get().strip().encode('utf-8'))
    page = ('http://www.sec.gov/cgi-bin/browse-edgar?company=&match=&CIK='
            + ticker
            + '&filenum=&State=&Country=&SIC=&owner=exclude&Find=Find+Companies&action=getcompany')
    sock = urllib.urlopen(page)
    try:
        raw = sock.read()
    finally:
        sock.close()
    soup = BeautifulSoup(raw)

    # Without this guard a missing button becomes the string 'None' and the
    # slicing below builds a bogus URL.
    next_tag = soup.find(value="Next 40")
    if next_tag is None:
        return

    # The target URL is embedded in the button markup; cut it out from
    # '/cgi' up to and including 'count=40'.
    npar = str(next_tag)
    index = npar.find('/cgi')
    index2 = npar.find('count=40') + len('count=40')
    nextpage = 'http://www.sec.gov' + npar[index:index2]

    sock2 = urllib.urlopen(nextpage)
    try:
        raw2 = sock2.read()
    finally:
        sock2.close()
    soup2 = BeautifulSoup(raw2)

    psoup = str(soup2.findAll(nowrap=True))

    myparser = MyParser()
    myparser.parse(psoup)

    # Drop the "Documents" link captions and the file-number cells.
    filinglist = [s for s in myparser.get_descriptions()
                  if s not in ('Documents', 'Documents Interactive Data')
                  and not re.match(r'\d{3}-', s)]

    # Document links only; not used further inside this function.
    linklist = [s for s in myparser.get_hyperlinks()
                if not s.startswith('/cgi-')]

    Lb1.delete(0, END)
    for position, filing in enumerate(filinglist):
        Lb1.insert(position, filing)

如你所見,按下按鈕時,它先讀取原始鏈接(page),然後在該 HTML 頁面(page)上查找「Next 40」超鏈接。接着解析新的 HTML 文檔(nextpage),並取得其中的項目名稱和關聯的鏈接。現在,這段代碼可以成功地從原始頁面轉到下一頁,但它只能顯示這一個「下一頁」。

那麼,我怎樣才能讓(nextpage)成為新的原始頁面(page),從而在每次按下「Next」按鈕時,都能列出再下一頁(nextnextpage)HTML 文檔中的項目?抱歉,如果這樣說令人困惑;我實在不知道還能怎麼解釋。

作爲進一步說明,這是我想要解析的實際站點鏈接:http://www.sec.gov/cgi-bin/browse-edgar ... getcompany 我希望我的「Next」按鈕能夠不斷取得該網站「Next 40」按鈕所對應的 HTML 超鏈接。

如果需要的話,這是我的完整程序代碼:

import BeautifulSoup 
from BeautifulSoup import BeautifulSoup 
import urllib 
import sgmllib 
from Tkinter import * 
import tkMessageBox 
import re 

class MyParser(sgmllib.SGMLParser):
    """SGML parser collecting ``<td nowrap>`` cell text and all link targets.

    After ``parse()`` has run, ``get_descriptions()`` returns one string per
    ``nowrap`` table cell that contained text, and ``get_hyperlinks()``
    returns every ``href`` value seen on an ``<a>`` tag.
    """

    def __init__(self, verbose=0):
        sgmllib.SGMLParser.__init__(self, verbose)
        self.descriptions = []
        self.hyperlinks = []
        # Truthy while the parser is between <td nowrap> and </td>.
        self.inside_td_element = 0
        # Truthy until the first text chunk of the current cell is stored.
        self.starting_description = 0

    def parse(self, psoup):
        """Feed the markup string ``psoup`` to the parser and finish parsing."""
        self.feed(psoup)
        self.close()

    def start_td(self, attributes):
        """Start collecting text when the cell carries a ``nowrap`` attribute."""
        if any(name == "nowrap" for name, value in attributes):
            self.inside_td_element = 1
            self.starting_description = 1

    def end_td(self):
        """Stop collecting text at the end of a table cell."""
        self.inside_td_element = 0

    def start_a(self, attributes):
        """Record the ``href`` value(s) of an anchor tag."""
        self.hyperlinks.extend(
            value for name, value in attributes if name == "href")

    def handle_data(self, data):
        """Store text found inside a ``nowrap`` cell.

        The first chunk of a cell opens a new description; later chunks of
        the same cell are appended to it.
        """
        if not self.inside_td_element:
            return
        if self.starting_description:
            self.descriptions.append(data)
            self.starting_description = 0
        else:
            self.descriptions[-1] += data

    def get_descriptions(self):
        """Return the list of collected cell texts."""
        return self.descriptions

    def get_hyperlinks(self):
        """Return the list of collected ``href`` values."""
        return self.hyperlinks

def Submit():
    """Search EDGAR for the entered ticker and list the first page of filings.

    Reads the ticker from the global ``entryWidget``. An empty entry shows an
    error dialog and nothing else happens. Otherwise the first results page
    is downloaded, the filings found in its ``nowrap`` cells replace the
    contents of the global listbox ``Lb1``, and the Download and Next buttons
    are enabled.
    """
    ticker = entryWidget.get().strip()
    if ticker == "":
        tkMessageBox.showerror("Tkinter Entry Widget", "Enter a text value")
        return

    # Percent-encode the ticker so characters such as '&' or '=' cannot
    # alter the query string.
    page = ('http://www.sec.gov/cgi-bin/browse-edgar?company=&match=&CIK='
            + urllib.quote(ticker.encode('utf-8'))
            + '&filenum=&State=&Country=&SIC=&owner=exclude&Find=Find+Companies&action=getcompany')
    sock = urllib.urlopen(page)
    try:
        raw = sock.read()
    finally:
        sock.close()
    soup = BeautifulSoup(raw)
    psoup = str(soup.findAll(nowrap=True))
    myparser = MyParser()
    myparser.parse(psoup)

    # Drop the "Documents" link captions and the file-number cells.
    filinglist = [s for s in myparser.get_descriptions()
                  if s not in ('Documents', 'Documents Interactive Data')
                  and not re.match(r'\d{3}-', s)]

    # Document links only; not used further inside this function.
    linklist = [s for s in myparser.get_hyperlinks()
                if not s.startswith('/cgi-')]

    # Clear the previous search first, as Next() and Previous() do; otherwise
    # repeated searches keep appending to the old results.
    Lb1.delete(0, END)
    for position, filing in enumerate(filinglist):
        Lb1.insert(position, filing)

    downloadbutton.configure(state=NORMAL)
    nextbutton.configure(state=NORMAL)
    # A fresh search always shows the first page, so there is nothing to go
    # back to even if Previous was enabled by an earlier search.
    previousbutton.configure(state=DISABLED)

def Next():
    """Show the "Next 40" page of filings and flip the paging buttons.

    Reads the ticker from the global ``entryWidget``, downloads the first
    EDGAR results page, follows that page's "Next 40" button and replaces
    the contents of the global listbox ``Lb1`` with the filings found on
    the page it points to. Afterwards the Previous button is enabled and the
    Next button is disabled.

    NOTE: every call starts again from the first results page, so this
    can only ever display the second page of results.
    """
    # Percent-encode the ticker so characters such as '&' or '=' cannot
    # alter the query string.
    ticker = urllib.quote(entryWidget.get().strip().encode('utf-8'))
    page = ('http://www.sec.gov/cgi-bin/browse-edgar?company=&match=&CIK='
            + ticker
            + '&filenum=&State=&Country=&SIC=&owner=exclude&Find=Find+Companies&action=getcompany')
    sock = urllib.urlopen(page)
    try:
        raw = sock.read()
    finally:
        sock.close()
    soup = BeautifulSoup(raw)

    # Without this guard a missing button becomes the string 'None' and the
    # slicing below builds a bogus URL.
    next_tag = soup.find(value="Next 40")
    if next_tag is None:
        nextbutton.configure(state=DISABLED)
        return

    # The target URL is embedded in the button markup; cut it out from
    # '/cgi' up to and including 'count=40'.
    npar = str(next_tag)
    index = npar.find('/cgi')
    index2 = npar.find('count=40') + len('count=40')
    nextpage = 'http://www.sec.gov' + npar[index:index2]

    sock2 = urllib.urlopen(nextpage)
    try:
        raw2 = sock2.read()
    finally:
        sock2.close()
    soup2 = BeautifulSoup(raw2)

    psoup = str(soup2.findAll(nowrap=True))

    myparser = MyParser()
    myparser.parse(psoup)

    # Drop the "Documents" link captions and the file-number cells.
    filinglist = [s for s in myparser.get_descriptions()
                  if s not in ('Documents', 'Documents Interactive Data')
                  and not re.match(r'\d{3}-', s)]

    # Document links only; not used further inside this function.
    linklist = [s for s in myparser.get_hyperlinks()
                if not s.startswith('/cgi-')]

    Lb1.delete(0, END)
    for position, filing in enumerate(filinglist):
        Lb1.insert(position, filing)

    previousbutton.configure(state=NORMAL)
    nextbutton.configure(state=DISABLED)

def Previous():
    """Go back to the first page of filings for the entered ticker.

    Reads the ticker from the global ``entryWidget``, downloads the first
    EDGAR results page again and replaces the contents of the global listbox
    ``Lb1`` with its filings. Afterwards the Next button is enabled and the
    Previous button is disabled.
    """
    # Percent-encode the ticker so characters such as '&' or '=' cannot
    # alter the query string.
    ticker = urllib.quote(entryWidget.get().strip().encode('utf-8'))
    page = ('http://www.sec.gov/cgi-bin/browse-edgar?company=&match=&CIK='
            + ticker
            + '&filenum=&State=&Country=&SIC=&owner=exclude&Find=Find+Companies&action=getcompany')
    sock = urllib.urlopen(page)
    try:
        raw = sock.read()
    finally:
        sock.close()
    soup = BeautifulSoup(raw)

    psoup = str(soup.findAll(nowrap=True))

    myparser = MyParser()
    myparser.parse(psoup)

    # Drop the "Documents" link captions and the file-number cells.
    filinglist = [s for s in myparser.get_descriptions()
                  if s not in ('Documents', 'Documents Interactive Data')
                  and not re.match(r'\d{3}-', s)]

    # Document links only; not used further inside this function.
    linklist = [s for s in myparser.get_hyperlinks()
                if not s.startswith('/cgi-')]

    Lb1.delete(0, END)
    for position, filing in enumerate(filinglist):
        Lb1.insert(position, filing)

    nextbutton.configure(state=NORMAL)
    previousbutton.configure(state=DISABLED)

if __name__ == "__main__":
    # Build the window. The widgets stay module-level names because the
    # button callbacks (Submit, Next, Previous) look them up as globals.
    root = Tk()
    root.title("SEC Edgar Search")
    root.configure(padx=10, pady=25)

    # Container frames; the two bottom frames host the buttons.
    top = Frame(root)
    bottom = Frame(root)
    bottom2 = Frame(root)
    top.pack(side=TOP)
    bottom.pack(side=BOTTOM, fill=BOTH, expand=True)
    bottom2.pack(side=BOTTOM, fill=BOTH, expand=True)

    # Ticker entry with its caption.
    textFrame = Frame(root)

    entryLabel = Label(textFrame, text="Ticker symbol:")
    entryLabel.pack(side=TOP)

    entryWidget = Entry(textFrame, width=15)
    entryWidget.pack(side=LEFT)

    textFrame.pack()

    # Result list with a vertical scrollbar.
    scrollbar = Scrollbar(root)
    scrollbar.pack(side=RIGHT, fill=Y)

    Lb1 = Listbox(root, width=20, height=15, yscrollcommand=scrollbar.set,
                  selectmode=EXTENDED)
    Lb1.pack()

    scrollbar.config(command=Lb1.yview)

    # Only Submit starts out enabled; the others are enabled by callbacks.
    submitbutton = Button(root, text="Submit", command=Submit)
    submitbutton.pack(in_=bottom2, side=TOP)

    downloadbutton = Button(root, text="Download", state=DISABLED)
    downloadbutton.pack(in_=bottom2, side=TOP)

    previousbutton = Button(root, text="Previous 40", command=Previous,
                            state=DISABLED)
    previousbutton.pack(in_=bottom, side=LEFT)

    nextbutton = Button(root, text="Next 40", command=Next, state=DISABLED)
    nextbutton.pack(in_=bottom, side=LEFT)

    root.mainloop()

回答

1

使用應用程序類,而不是全局變量。目前你總是下載第一頁。你的應用程序類應該緩存當前頁面的「soup」,讓 next 方法從中取出「Next 40」表單按鈕的 onClick 值:

class Application(Frame): 
    """Skeleton of the search window as a Frame subclass.

    State that the original program kept in globals lives on the instance.
    In particular ``self.soup`` holds the most recently parsed results page,
    so ``next`` can continue from it. Parts marked ``#...`` are elided.
    """
    def __init__(self, parent=None): 
     """Create the frame inside ``parent`` and build its child widgets."""
     Frame.__init__(self, parent) 
     self.pack() 

     # Container frames are children of this frame and kept as attributes.
     self.top = Frame(self) 
     self.bottom = Frame(self) 
     self.bottom2 = Frame(self) 
     self.top.pack(side=TOP) 
     self.bottom.pack(side=BOTTOM, fill=BOTH, expand=True) 
     self.bottom2.pack(side=BOTTOM, fill=BOTH, expand=True) 
     #... 
     # The callback is the bound method self.submit, so the handler
     # receives the instance automatically.
     self.submitbutton = Button(self, text="Submit", command=self.submit) 
     self.submitbutton.pack(in_=self.bottom2, side=TOP) 
     #... 

    #... 

    def submit(self): 
     """Build the first-page URL from the entry widget and store its soup.

     The download and parsing steps are elided here.
     """
     page = ('http://www.sec.gov/cgi-bin/browse-edgar?company=&match=&CIK=' + 
       self.entryWidget.get().strip() + 
       '&filenum=&State=&Country=&SIC=&owner=exclude' 
       '&Find=Find+Companies&action=getcompany') 
     #... 
     # Placeholder: the parsed first page is stored here for next() to use.
     self.soup = ... 

    def next(self): 
     """Follow the "Next 40" button of the cached page and cache the result."""
     #... 
     #there must be a better way than this to extract the onclick value 
     #but I don't use/know BeautifulSoup to help with this part 

     # Cut the target URL out of the button markup, from '/cgi' up to and
     # including 'count=40'.
     npar = str(self.soup.find(value="Next 40")) 
     index1 = npar.find('/cgi') 
     index2 = npar.find('count=40') + len('count=40') 
     page = 'http://www.sec.gov' + npar[index1:index2] 

     sock = urllib.urlopen(page) 
     raw = sock.read() 
     # Replacing the cached soup lets the following call continue from
     # this page rather than from the first one.
     self.soup = BeautifulSoup(raw) 

     #... 

if __name__ == '__main__':
    # Create the top-level window and hand it to the application frame.
    root = Tk()
    root.title("SEC Edgar Search")
    root["padx"] = 10
    root["pady"] = 25

    app = Application(root)

    app.mainloop()
    # mainloop() also returns when the user closes the window. In that case
    # the root widget no longer exists and destroy() raises TclError, so
    # that one error is ignored.
    try:
        root.destroy()
    except TclError:
        pass

每個新頁面的 onClick 鏈接只是更新了 &start 參數。所以,你也可以在類中維護一個計數器,而不必費力地解析當前的 soup 來取得這個值。

+0

我嘗試用以下代碼創建一個新類:class Application(): def submit(self): ……等等。但我不斷收到這個異常:Exception in Tkinter callback Traceback (most recent call last): File "C:\Python27\lib\lib-tk\Tkinter.py", line 1410, in __call__ return self.func(*args) TypeError:未綁定方法 Submit() 必須以 Application 實例作爲第一個參數來調用(實際上沒有傳入任何參數)。知道這是什麼原因造成的嗎? – kr21

+0

是的,這個作品非常完美,非常感謝! – kr21