Parent class's __getattr__ causes a recursion error in the subclass's __init__
Following the advice of an answer to "Subclassing BeautifulSoup's HTML parser, getting a TypeError", I'm trying to use class composition instead of directly inheriting from BeautifulSoup.
The basic Scrape class works fine on its own (at least in my limited testing).
Here is the code for the Scrape class:
from BeautifulSoup import BeautifulSoup
import urllib2


class Scrape():
    """Base class to be subclassed.
    Basically a wrapper that provides basic url fetching with urllib2
    and basic html parsing with BeautifulSoup.
    Some useful methods are provided through class composition with
    BeautifulSoup. For direct access to the soup you can use the
    _soup attribute."""
    def __init__(self, file):
        self._file = file
        # very basic input validation
        try:
            self._page = urllib2.urlopen(self._file)  # fetching the page
        except (urllib2.URLError):
            print ('please enter a valid url starting with http/https/ftp/file')
        self._soup = BeautifulSoup(self._page)  # calling the html parser
        # the next part is the class composition part - we forward attribute
        # and method calls to the BeautifulSoup instance
        # search methods:
        self.find = self._soup.find
        self.findAll = self._soup.findAll
        self.__iter__ = self._soup.__iter__  # enables iterating and looping over the object
        self.__len__ = self._soup.__len__
        self.__contains__ = self._soup.__contains__
        # attribute fetching and setting - __getattr__ is implemented by the Scrape class
        self.__setattr__ = self._soup.__setattr__
        self.__getattribute__ = self._soup.__getattribute__
        # called to implement evaluation of self[key]
        self.__getitem__ = self._soup.__getitem__
        self.__setitem__ = self._soup.__setitem__
        self.__delitem__ = self._soup.__delitem__
        self.__call__ = self._soup.__call__  # called when the instance is "called" as a function
        self._getAttrMap = self._soup._getAttrMap
        self.has_key = self._soup.has_key
        # methods for walking the html document
        self.contents = self._soup.contents
        self.text = self._soup.text
        self.extract = self._soup.extract
        self.next = self._soup.next
        self.parent = self._soup.parent
        self.fetch = self._soup.fetch
        self.fetchText = self._soup.fetchText
        self.findAllNext = self._soup.findAllNext
        self.findChild = self._soup.findChild
        self.findChildren = self._soup.findChildren
        self.findNext = self._soup.findNext
        self.findNextSibling = self._soup.findNextSibling
        self.first = self._soup.first
        self.name = self._soup.name
        self.get = self._soup.get
        self.getString = self._soup.getString
        # comparison operators and similar boolean checks
        self.__eq__ = self._soup.__eq__
        self.__ne__ = self._soup.__ne__
        self.__hash__ = self._soup.__hash__
        self.__nonzero__ = self._soup.__nonzero__
        # the class representation magic methods:
        self.__str__ = self._soup.__str__
        self.__repr__ = self._soup.__repr__

    def __getattr__(self, method):
        """This 'magic' method forwards lookups of unknown attributes to the
        soup and enables traversing the html document with dot notation -
        for example, instancename.div will return the first div.
        Explanation: python calls __getattr__ when it can't find a method or
        attribute corresponding to the lookup.
        I'm not sure this is a good or the right use for the method."""
        return self._soup.find(method)

    def clean(self, work=False, element=False):
        """Basic cleaning of head, scripts etc.
        'work' is the soup object to strip of unnecessary parts
        (scripts, head, style).
        The optional 'element' argument can take a tuple of elements that
        overrides which elements to clean."""
        self._work = work or self._soup
        self._cleanelements = element or ("head", "style", "script")
        for elem in self.findAll(self._cleanelements):
            elem.extract()
But when I subclass it, I run into some kind of recursion loop, and I can't figure out why.
Here is the relevant part of the subclass:
class MainTraffic(Scrape):
    """Traffic class - subclasses the Scrape class.
    Takes a page url and a category."""
    def __init__(self, file, cat, caller=False):
        if not caller:
            self._file = file
            Scrape.__init__(self, self._file)
            self.pagecat = cat
            self.clean(self)
            self.cleansoup = self.cleantotable(self)
            self.fetchlinks(self.cleansoup)

    def cleantotable(self):
        pass

    def fetchlinks(self, fetch):
        pass

    def length(self):
        from sqlalchemy import func
        self.len = session.query(func.count(Question.id)).scalar()
        return int(self.len)

    def __len__(self):
        return self.length()

    def __repr__(self):
        self.repr = "traffic theory question, current number of questions:{0}".format(self.length())
        return self.repr

    def __getitem__(self, key):
        try:
            self._item = session.query(Question).filter_by(question_num=key).first()
            return self._item
        except (IndexError, KeyError):
            print "no such key:{0}".format(key)
Here is the traceback:
File "C:\Python27\learn\traffic.py", line 117, in __init__
Scrape.__init__(self,self._file)
File "C:\Python27\learn\traffic.py", line 26, in __init__
self._soup = BeautifulSoup(self._page) #calling the html parser
File "C:\Python27\learn\traffic.py", line 92, in __getattr__
return self._soup.find(method)
File "C:\Python27\learn\traffic.py", line 92, in __getattr__
return self._soup.find(method)
File "C:\Python27\learn\traffic.py", line 92, in __getattr__
return self._soup.find(method)
RuntimeError: maximum recursion depth exceeded
I suspect the problem is my misuse of __getattr__, but I don't know what to change.
1 Answer
Part 1
Your code doesn't work because __getattr__() accesses self._soup before it has been initialized. The culprit is these four innocent-looking lines:

try:
    self._page = urllib2.urlopen(self._file)
except (urllib2.URLError):
    print ('please enter a valid url starting with http/https/ftp/file')
Why catch an exception if you aren't going to handle it?
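If the intent is just to print a friendlier message, one option is to re-raise after printing, so __init__ fails loudly instead of continuing with _page unset (a sketch of one way to do it, not the only one):

try:
    self._page = urllib2.urlopen(self._file)
except (urllib2.URLError):
    print ('please enter a valid url starting with http/https/ftp/file')
    raise  # propagate the failure instead of silently falling through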
The line right after the try/except accesses self._page, which was never set if urlopen() raised an exception:

self._soup = BeautifulSoup(self._page)
Since _page hasn't been set, accessing it calls __getattr__(), and that method accesses self._soup, which hasn't been set either, so it goes right back into __getattr__, and so on.
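The loop is easy to reproduce in isolation; here is a minimal sketch (the class name is made up for illustration):

class Broken:
    def __getattr__(self, name):
        # _soup was never assigned, so this lookup fails and re-enters
        # __getattr__, which fails again, until the recursion limit is hit
        return self._soup.find(name)

Broken().div  # RuntimeError: maximum recursion depth exceeded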
The simplest "fix" is to special-case _soup so it can't trigger the infinite loop. Beyond that, it seems more sensible for __getattr__ to simply do a normal attribute lookup on the soup:
def __getattr__(self, attr):
    if attr == "_soup":
        raise AttributeError()
    return getattr(self._soup, attr)
Part 2
Copying over all those methods is unlikely to work well, and it seems to miss the point of class composition entirely.
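Once __getattr__ delegates lookups, the wrapper can drop the long list of assignments entirely. A minimal sketch of what the composed class could look like (same BeautifulSoup and urllib2 calls as above; error handling omitted for brevity):

class Scrape(object):
    """Fetches a url and delegates parsing calls to a BeautifulSoup instance."""
    def __init__(self, file):
        self._file = file
        self._page = urllib2.urlopen(self._file)  # let URLError propagate
        self._soup = BeautifulSoup(self._page)

    def __getattr__(self, attr):
        if attr == "_soup":
            # guard: _soup may not exist yet, e.g. while __init__ is running
            raise AttributeError(attr)
        # everything else - find, findAll, contents, dot-notation tag
        # traversal - reaches the soup without being copied onto the wrapper
        return getattr(self._soup, attr)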