Tkinter Python中的按钮命令问题
我有一个程序,它会在SEC的Edgar数据库中搜索年度报告(10-K),并在一个列表框中返回40个不同的项目。现在我想创建一个“下一组40”按钮,来显示列表框中的下40个项目,下面的代码就实现了这个功能:
def Next():
    """Button callback: jump from the first EDGAR result page to the second.

    NOTE(review): this always re-fetches the FIRST page and then follows its
    "Next 40" link, so repeated presses can never get past page two — that is
    exactly the bug the question describes.
    """
    global entryWidget
    # Build the EDGAR company-search URL for the ticker typed by the user.
    page = 'http://www.sec.gov/cgi-bin/browse-edgar?company=&match=&CIK=' + entryWidget.get().strip() + '&filenum=&State=&Country=&SIC=&owner=exclude&Find=Find+Companies&action=getcompany'
    sock = urllib.urlopen(page)
    raw = sock.read()
    soup = BeautifulSoup(raw)
    # The "Next 40" button's markup embeds the relative URL of page two;
    # slice it out between '/cgi' and the end of 'count=40'.
    npar = str(soup.find(value="Next 40"))
    index = npar.find('/cgi')
    index2 = npar.find('count=40') + len('count=40')
    nextpage = 'http://www.sec.gov' + npar[index:index2]
    sock2 = urllib.urlopen(nextpage)
    raw2 = sock2.read()
    soup2 = BeautifulSoup(raw2)
    # Only the nowrap table cells hold the filing rows of interest.
    psoup = str(soup2.findAll(nowrap=True))
    myparser = MyParser()
    myparser.parse(psoup)
    filinglist = myparser.get_descriptions()
    linklist = myparser.get_hyperlinks()
    # Strip table-header captions and entries starting with three digits and
    # a dash (presumably SEC file numbers — TODO confirm).
    filinglist = [s for s in filinglist if s != 'Documents']
    filinglist = [s for s in filinglist if s != 'Documents Interactive Data']
    filinglist = [s for s in filinglist if not re.match(r'\d{3}-', s)]
    linklist = [s for s in linklist if not s.startswith('/cgi-')]
    # Replace the listbox contents with the new page's descriptions.
    Lb1.delete(0, END)
    counter = 0
    while counter < len(filinglist):
        Lb1.insert(counter, filinglist[counter])
        counter = counter +1
你可以看到,当按钮被按下时,它会读取原始链接(页面),然后在网页的HTML中查找“下一组40”的超链接。接着,它会解析新的HTML文档(nextpage),然后获取项目名称和相关链接。现在这段代码成功地从原始页面跳转到了下一页,但它只能显示下一页的内容。
那么,我该如何把(nextpage)变成原始的(page),然后每次按下“下一组”按钮时都能列出(nextnextpage)HTML文档中的项目呢?抱歉,如果这样说让你感到困惑,我也不知道怎么更好地解释。
为了更清楚,这里是我想解析的实际网站链接:http://www.sec.gov/cgi-bin/browse-edgar ... getcompany 我希望“下一组”按钮能够不断从该网站的“下一组40”按钮中获取HTML超链接。
这是我的整个程序代码,以防你需要:
import BeautifulSoup
from BeautifulSoup import BeautifulSoup
import urllib
import sgmllib
from Tkinter import *
import tkMessageBox
import re
class MyParser(sgmllib.SGMLParser):
    """SGML parser that collects the text content of <td nowrap> cells and
    every <a href> target from the HTML fragment it is fed."""

    def __init__(self, verbose=0):
        sgmllib.SGMLParser.__init__(self, verbose)
        self.descriptions = []         # text gathered from nowrap <td> cells
        self.hyperlinks = []           # every href value, in document order
        self.inside_td_element = 0     # nonzero while inside a nowrap <td>
        self.starting_description = 0  # next data chunk opens a new cell

    def parse(self, psoup):
        """Feed the whole fragment to the parser and finalise it."""
        self.feed(psoup)
        self.close()

    def start_td(self, attributes):
        # A <td> only counts when it carries a nowrap attribute.
        if any(name == "nowrap" for name, value in attributes):
            self.inside_td_element = 1
            self.starting_description = 1

    def end_td(self):
        self.inside_td_element = 0

    def start_a(self, attributes):
        # Record every href target.
        for name, value in attributes:
            if name == "href":
                self.hyperlinks.append(value)

    def handle_data(self, data):
        # Text outside a nowrap <td> is ignored.  Inside one, the first
        # chunk starts a new description; later chunks are appended to it.
        if not self.inside_td_element:
            return
        if self.starting_description:
            self.descriptions.append(data)
            self.starting_description = 0
        else:
            self.descriptions[-1] += data

    def get_descriptions(self):
        return self.descriptions

    def get_hyperlinks(self):
        return self.hyperlinks
def Submit():
    """Button callback: fetch the first EDGAR result page for the ticker in
    the entry box and show the filing descriptions in the listbox.

    Pops an error dialog (instead of raising) when the entry is empty.
    On success it enables the Download and Next buttons.
    """
    global entryWidget
    ticker = entryWidget.get().strip()
    if ticker == "":
        tkMessageBox.showerror("Tkinter Entry Widget", "Enter a text value")
        return
    page = ('http://www.sec.gov/cgi-bin/browse-edgar?company=&match=&CIK=' +
            ticker +
            '&filenum=&State=&Country=&SIC=&owner=exclude'
            '&Find=Find+Companies&action=getcompany')
    sock = urllib.urlopen(page)
    raw = sock.read()
    soup = BeautifulSoup(raw)
    # Only the nowrap table cells hold the filing rows of interest.
    psoup = str(soup.findAll(nowrap=True))
    myparser = MyParser()
    myparser.parse(psoup)
    # Drop header captions and entries starting with three digits and a dash
    # (presumably SEC file numbers — TODO confirm) in a single pass.
    filinglist = [s for s in myparser.get_descriptions()
                  if s not in ('Documents', 'Documents Interactive Data')
                  and not re.match(r'\d{3}-', s)]
    # NOTE(review): linklist is computed but never used here — presumably
    # intended for the Download button; verify before removing.
    linklist = [s for s in myparser.get_hyperlinks()
                if not s.startswith('/cgi-')]
    # BUG FIX: clear any previous results first (Next/Previous already do
    # this); without it, repeated submits kept appending duplicate rows.
    Lb1.delete(0, END)
    for row, description in enumerate(filinglist):
        Lb1.insert(row, description)
    downloadbutton.configure(state=NORMAL)
    nextbutton.configure(state=NORMAL)
def Next():
    """Button callback: advance the listbox to the next 40 results.

    BUG FIX: the original always re-downloaded the FIRST result page and
    followed *its* "Next 40" link, so it could never get past page two.
    We now cache the URL of the currently displayed page on the function
    itself (Next._page) and follow THAT page's "Next 40" link, so every
    press advances one more page.  The cache resets when the ticker in the
    entry box changes.

    NOTE(review): Previous() redisplays page one but does not reset this
    cache, so pressing Next afterwards resumes from the deepest page
    reached — confirm this is the desired behaviour.
    """
    global entryWidget
    ticker = entryWidget.get().strip()
    firstpage = ('http://www.sec.gov/cgi-bin/browse-edgar?company=&match=&CIK=' +
                 ticker +
                 '&filenum=&State=&Country=&SIC=&owner=exclude'
                 '&Find=Find+Companies&action=getcompany')
    if getattr(Next, '_ticker', None) != ticker:
        # New search: start paging from the first result page again.
        Next._ticker = ticker
        Next._page = firstpage
    sock = urllib.urlopen(Next._page)
    raw = sock.read()
    soup = BeautifulSoup(raw)
    # The "Next 40" button markup embeds the relative URL of the next page;
    # slice it out between '/cgi' and the end of 'count=40'.
    npar = str(soup.find(value="Next 40"))
    index = npar.find('/cgi')
    if index == -1:
        # No "Next 40" link on the current page: we are on the last page.
        nextbutton.configure(state=DISABLED)
        return
    index2 = npar.find('count=40') + len('count=40')
    nextpage = 'http://www.sec.gov' + npar[index:index2]
    Next._page = nextpage  # remember it so the next press advances again
    sock2 = urllib.urlopen(nextpage)
    raw2 = sock2.read()
    soup2 = BeautifulSoup(raw2)
    psoup = str(soup2.findAll(nowrap=True))
    myparser = MyParser()
    myparser.parse(psoup)
    # Drop header captions and file-number-like entries in one pass.
    filinglist = [s for s in myparser.get_descriptions()
                  if s not in ('Documents', 'Documents Interactive Data')
                  and not re.match(r'\d{3}-', s)]
    linklist = [s for s in myparser.get_hyperlinks()
                if not s.startswith('/cgi-')]
    Lb1.delete(0, END)
    for row, description in enumerate(filinglist):
        Lb1.insert(row, description)
    previousbutton.configure(state=NORMAL)
    # Next stays enabled (it is disabled above only when no further
    # "Next 40" link exists), so the user can keep paging forward.
def Previous():
    """Button callback: go back to the first result page for the current
    ticker, refill the listbox, and swap which paging button is enabled."""
    global entryWidget
    url = ('http://www.sec.gov/cgi-bin/browse-edgar?company=&match=&CIK=' +
           entryWidget.get().strip() +
           '&filenum=&State=&Country=&SIC=&owner=exclude'
           '&Find=Find+Companies&action=getcompany')
    html = urllib.urlopen(url).read()
    # Only the nowrap table cells hold the filing rows of interest.
    cells = str(BeautifulSoup(html).findAll(nowrap=True))
    parser = MyParser()
    parser.parse(cells)
    descriptions = parser.get_descriptions()
    links = parser.get_hyperlinks()
    # Same filtering the other callbacks apply to the scraped rows.
    descriptions = [d for d in descriptions if d != 'Documents']
    descriptions = [d for d in descriptions if d != 'Documents Interactive Data']
    descriptions = [d for d in descriptions if not re.match(r'\d{3}-', d)]
    links = [h for h in links if not h.startswith('/cgi-')]
    Lb1.delete(0, END)
    for position, description in enumerate(descriptions):
        Lb1.insert(position, description)
    nextbutton.configure(state=NORMAL)
    previousbutton.configure(state=DISABLED)
if __name__ == "__main__":
    # Root window with some outer padding.
    root = Tk()
    root.title("SEC Edgar Search")
    root.configure(padx=10, pady=25)

    # Three stacked frames: a top row plus two bottom rows for the buttons.
    top = Frame(root)
    bottom = Frame(root)
    bottom2 = Frame(root)
    top.pack(side=TOP)
    bottom.pack(side=BOTTOM, fill=BOTH, expand=True)
    bottom2.pack(side=BOTTOM, fill=BOTH, expand=True)

    # Ticker entry with its label.
    textFrame = Frame(root)
    entryLabel = Label(textFrame, text="Ticker symbol:")
    entryLabel.pack(side=TOP)
    entryWidget = Entry(textFrame, width=15)
    entryWidget.pack(side=LEFT)
    textFrame.pack()

    # Scrollable listbox that shows the filing descriptions.
    scrollbar = Scrollbar(root)
    scrollbar.pack(side=RIGHT, fill=Y)
    Lb1 = Listbox(root, width=20, height=15, yscrollcommand=scrollbar.set,
                  selectmode=EXTENDED)
    Lb1.pack()
    scrollbar.config(command=Lb1.yview)

    # Buttons; Download/Previous/Next start disabled until a search runs.
    submitbutton = Button(root, text="Submit", command=Submit)
    submitbutton.pack(in_=bottom2, side=TOP)
    downloadbutton = Button(root, text="Download", state=DISABLED)
    downloadbutton.pack(in_=bottom2, side=TOP)
    previousbutton = Button(root, text="Previous 40", command=Previous,
                            state=DISABLED)
    previousbutton.pack(in_=bottom, side=LEFT)
    nextbutton = Button(root, text="Next 40", command=Next, state=DISABLED)
    nextbutton.pack(in_=bottom, side=LEFT)

    root.mainloop()
1 个回答
1
使用一个应用程序类来代替全局变量。目前你每次按下按钮都会重新下载第一页。应用程序类应当把当前页面解析出来的“汤”(soup)缓存为实例属性,这样 next 方法就可以直接从**当前**页面的“Next 40”按钮中取出下一页的链接:
class Application(Frame):
    """Sketch of a Tk application class: widgets AND the currently displayed
    page's parsed soup live on the instance instead of module globals."""

    def __init__(self, parent=None):
        Frame.__init__(self, parent)
        self.pack()
        self.top = Frame(self)
        self.bottom = Frame(self)
        self.bottom2 = Frame(self)
        self.top.pack(side=TOP)
        self.bottom.pack(side=BOTTOM, fill=BOTH, expand=True)
        self.bottom2.pack(side=BOTTOM, fill=BOTH, expand=True)
        #...
        self.submitbutton = Button(self, text="Submit", command=self.submit)
        self.submitbutton.pack(in_=self.bottom2, side=TOP)
        #...

    #...
    def submit(self):
        # Fetch the first result page and cache its soup on the instance.
        page = ('http://www.sec.gov/cgi-bin/browse-edgar?company=&match=&CIK=' +
                self.entryWidget.get().strip() +
                '&filenum=&State=&Country=&SIC=&owner=exclude'
                '&Find=Find+Companies&action=getcompany')
        #...
        self.soup = ...

    def next(self):
        #...
        # Follow the "Next 40" link of the CACHED page, then replace the
        # cache with the newly fetched page so the next press advances again.
        #there must be a better way than this to extract the onclick value
        #but I don't use/know BeautifulSoup to help with this part
        npar = str(self.soup.find(value="Next 40"))
        index1 = npar.find('/cgi')
        index2 = npar.find('count=40') + len('count=40')
        page = 'http://www.sec.gov' + npar[index1:index2]
        sock = urllib.urlopen(page)
        raw = sock.read()
        self.soup = BeautifulSoup(raw)
        #...
if __name__ == '__main__':
    root = Tk()
    root.title("SEC Edgar Search")
    root["padx"] = 10
    root["pady"] = 25
    # All widgets and per-page state live inside the Application instance.
    app = Application(root)
    app.mainloop()
    root.destroy()
对于每一新页面,点击链接会更新 &Start 参数。所以你也可以在你的类中增加一个计数器,而不需要去解析当前的内容来获取这个值。