Python中的“MemoryError”错误信息
我遇到了一个问题:我正在尝试解析一个很大的文本文件(大约15,000 KB),然后把它写入MySQL数据库。我使用的是Python 2.6,这个脚本能解析文件的一半并把它添加到数据库,但之后就卡住了。有时候它会显示这个信息:
内存错误。
其他时候它就直接卡住了。我原以为可以通过尽量使用生成器来避免这个问题,但看来我错了。
我到底哪里做错了呢?
当我按下 Ctrl + C 来中断程序时,它会显示这个错误信息:
...
sucessfully added vote # 2281
sucessfully added vote # 2282
sucessfully added vote # 2283
sucessfully added vote # 2284
floorvotes_db.py:35: Warning: Data truncated for column 'vote_value' at row 1
r['bill ID'] , r['last name'], r['vote'])
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "floorvotes_db.py", line 67, in addAllFiles
addFile(file)
File "floorvotes_db.py", line 61, in addFile
add(record)
File "floorvotes_db.py", line 35, in add
r['bill ID'] , r['last name'], r['vote'])
File "build/bdist.linux-i686/egg/MySQLdb/cursors.py", line 166, in execute
File "build/bdist.linux-i686/egg/MySQLdb/connections.py", line 35, in defaulte rrorhandler
KeyboardInterrupt
import os, re, datetime, string
# Data
DIR = '/mydir'
tfn = r'C:\Documents and Settings\Owner\Desktop\data.txt'
rgxs = {
'bill number': {
'rgx': r'(A|S)[0-9]+-?[A-Za-z]* {50}'}
}
# Compile rgxs for speediness
for rgx in rgxs: rgxs[rgx]['rgx'] = re.compile(rgxs[rgx]['rgx'])
splitter = rgxs['bill number']['rgx']
# Guts
class floor_vote_file:
def __init__(self, fn):
self.iterdata = (str for str in
splitter.split(open(fn).read())
if str and str <> 'A' and str <> 'S')
def iterVotes(self):
for record in self.data:
if record: yield billvote(record)
class billvote(object):
def __init__(self, section):
self.data = [line.strip() for line
in section.splitlines()]
self.summary = self.data[1].split()
self.vtlines = self.data[2:]
self.date = self.date()
self.year = self.year()
self.votes = self.parse_votes()
self.record = self.record()
# Parse summary date
def date(self):
d = [int(str) for str in self.summary[0].split('/')]
return datetime.date(d[2],d[0],d[1]).toordinal()
def year(self):
return datetime.date.fromordinal(self.date).year
def session(self):
"""
arg: 2-digit year int
returns: 4-digit session
"""
def odd():
return divmod(self.year, 2)[1] == 1
if odd():
return str(string.zfill(self.year, 2)) + \
str(string.zfill(self.year + 1, 2))
else:
return str(string.zfill(self.year - 1, 2))+ \
str(string.zfill(self.year, 2))
def house(self):
if self.summary[2] == 'Assembly': return 1
if self.summary[2] == 'Senate' : return 2
def splt_v_line(self, line):
return [string for string in line.split(' ')
if string <> '']
def splt_v(self, line):
return line.split()
def prse_v(self, item):
"""takes split_vote item"""
return {
'vote' : unicode(item[0]),
'last name': unicode(' '.join(item[1:]))
}
# Parse votes - main
def parse_votes(self):
nested = [[self.prse_v(self.splt_v(vote))
for vote in self.splt_v_line(line)]
for line in self.vtlines]
flattened = []
for lst in nested:
for dct in lst:
flattened.append(dct)
return flattened
# Useful data objects
def record(self):
return {
'date' : unicode(self.date),
'year' : unicode(self.year),
'session' : unicode(self.session()),
'house' : unicode(self.house()),
'bill ID' : unicode(self.summary[1]),
'ayes' : unicode(self.summary[5]),
'nays' : unicode(self.summary[7]),
}
def iterRecords(self):
for vote in self.votes:
r = self.record.copy()
r['vote'] = vote['vote']
r['last name'] = vote['last name']
yield r
test = floor_vote_file(tfn)
import MySQLdb as dbapi2
import floorvotes_parse as v
import os
# Initial database crap
db = dbapi2.connect(db=r"db",
user="user",
passwd="XXXXX")
cur = db.cursor()
if db and cur: print "\nConnected to db.\n"
def commit(): db.commit()
def ext():
cur.close()
db.close()
print "\nConnection closed.\n"
# DATA
DIR = '/mydir'
files = [DIR+fn for fn in os.listdir(DIR)
if fn.startswith('fvote')]
# Add stuff
def add(r):
"""add a record"""
cur.execute(
u'''INSERT INTO ny_votes (vote_house, vote_date, vote_year, bill_id,
member_lastname, vote_value) VALUES
(%s , %s , %s ,
%s , %s , %s )''',
(r['house'] , r['date'] , r['year'],
r['bill ID'] , r['last name'], r['vote'])
)
#print "added", r['year'], r['bill ID']
def crt():
"""create table"""
SQL = """
CREATE TABLE ny_votes (openleg_id INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY,
vote_house int(1), vote_date int(5), vote_year int(2), bill_id varchar(8),
member_lastname varchar(50), vote_value varchar(10));
"""
cur.execute(SQL)
print "\nCreate ny_votes.\n"
def rst():
SQL = """DROP TABLE ny_votes"""
cur.execute(SQL)
print "\nDropped ny_votes.\n"
crt()
def addFile(fn):
"""parse and add all records in a file"""
n = 0
for votes in v.floor_vote_file(fn).iterVotes():
for record in votes.iterRecords():
add(record)
n += 1
print 'sucessfully added vote # ' + str(n)
def addAllFiles():
for file in files:
addFile(file)
if __name__=='__main__':
rst()
addAllFiles()
4 个回答
1
这不是Python内存的问题,但也许值得考虑一下。之前的回答让我觉得你很快就能解决这个问题。
我在想MySQL中的回滚日志。如果一个事务太大了,或许你可以把它分成小块来处理。每次单独提交一小块,而不是试图回滚一个15MB的文件。
2
我注意到你用了很多slit()这个函数。根据这个链接的内容,使用这个函数会消耗很多内存。你可以开始研究一下这个问题。
7
生成器是个不错的主意,但你似乎忽略了一个很大的问题:
(str for str in splitter.split(open(fn).read()) if str and str <> 'A' and str <> 'S')
你一次性把整个文件都读进来了,尽管你其实只需要逐部分处理。你的代码对我来说太复杂了,我没法帮你修正,但你可以用文件的迭代器来完成你的任务:
(line for line in open(fn))