Python中的“MemoryError”错误信息

1 投票
4 回答
10574 浏览
提问于 2025-04-15 12:10

我遇到了一个问题:我正在尝试解析一个很大的文本文件(大约15,000 KB),然后把它写入MySQL数据库。我使用的是Python 2.6,这个脚本能解析文件的一半并把它添加到数据库,但之后就卡住了。有时候它会显示这个信息:

MemoryError(内存错误)。

其他时候它就直接卡住了。我原以为可以通过尽量使用生成器来避免这个问题,但看来我错了。

我到底哪里做错了呢?

当我按下 Ctrl + C 来中断程序时,它会显示这个错误信息:

...
sucessfully added vote # 2281
sucessfully added vote # 2282
sucessfully added vote # 2283
sucessfully added vote # 2284
floorvotes_db.py:35: Warning: Data truncated for column 'vote_value' at row 1
  r['bill ID']  , r['last name'], r['vote'])
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "floorvotes_db.py", line 67, in addAllFiles
    addFile(file)
  File "floorvotes_db.py", line 61, in addFile
    add(record)
  File "floorvotes_db.py", line 35, in add
    r['bill ID']  , r['last name'], r['vote'])
  File "build/bdist.linux-i686/egg/MySQLdb/cursors.py", line 166, in execute
  File "build/bdist.linux-i686/egg/MySQLdb/connections.py", line 35, in defaulterrorhandler
KeyboardInterrupt


import os, re, datetime, string

# Data: input locations plus the patterns used to carve up a vote file.
DIR = '/mydir'
tfn = r'C:\Documents and Settings\Owner\Desktop\data.txt'
rgxs = {
    'bill number': {
        'rgx': r'(A|S)[0-9]+-?[A-Za-z]* {50}',
    },
}

# Swap every pattern string for its compiled form once, at import time.
for rgx in rgxs:
    rgxs[rgx]['rgx'] = re.compile(rgxs[rgx]['rgx'])

# The compiled bill-number pattern is what splits a file into sections.
splitter = rgxs['bill number']['rgx']

# Guts
class floor_vote_file:
    """A floor-vote text file, split into one text section per bill.

    The file is cut wherever the module-level ``splitter`` pattern matches;
    each remaining section is handed to ``billvote`` by ``iterVotes``.
    """

    def __init__(self, fn):
        """Read the file named ``fn`` and prepare its sections.

        Raises whatever ``open``/``read`` raise if the file is unreadable.
        """
        # Read inside ``with`` so the handle is closed as soon as the text
        # has been read instead of whenever the garbage collector gets to it.
        # NOTE: the whole file is still read in one go; only the per-section
        # parsing that follows is lazy.
        with open(fn) as source:
            text = source.read()
        # ``splitter`` contains a capturing group, so re.split() also returns
        # the captured 'A'/'S' prefix of every match; drop those and any
        # empty pieces.
        self.iterdata = (section for section in splitter.split(text)
                         if section and section != 'A' and section != 'S')

    def iterVotes(self):
        """Yield one ``billvote`` per section of the file.

        The underlying generator is single-use: a second call yields nothing
        once the first has been exhausted.
        """
        # The sections live in ``self.iterdata``; the original looked up a
        # non-existent ``self.data`` attribute here.
        for record in self.iterdata:
            yield billvote(record)

class billvote(object):
    """One section of a floor-vote file, parsed into per-member records.

    Line 1 of the section (index 0 is not used) is the summary line; its
    whitespace-separated fields are read positionally: [0] date as M/D/Y,
    [1] bill ID, [2] chamber name, [5] ayes, [7] nays.  Every following
    line carries the individual votes.
    """

    def __init__(self, section):
        stripped = []
        for raw in section.splitlines():
            stripped.append(raw.strip())
        self.data = stripped
        self.summary = stripped[1].split()
        self.vtlines = stripped[2:]
        # NOTE: each assignment below replaces the method of the same name
        # with its result on this instance, so after construction ``date``,
        # ``year`` and ``record`` are plain values, not callables.
        self.date = self.date()
        self.year = self.year()
        self.votes = self.parse_votes()
        self.record = self.record()

    def date(self):
        """Return the summary date as a proleptic Gregorian ordinal."""
        fields = [int(piece) for piece in self.summary[0].split('/')]
        when = datetime.date(fields[2], fields[0], fields[1])
        return when.toordinal()

    def year(self):
        """Return the calendar year of the ordinal stored in ``self.date``."""
        as_date = datetime.date.fromordinal(self.date)
        return as_date.year

    def session(self):
        """Return the two-year session label, odd year first.

        The label is the two years concatenated, each zero-padded to at
        least two characters.
        NOTE(review): the original docstring spoke of a 2-digit year and a
        4-digit result; with the 4-digit ``self.year`` set by ``__init__``
        the result is 8 characters long -- confirm which is intended.
        """
        if self.year % 2 == 1:
            first, second = self.year, self.year + 1
        else:
            first, second = self.year - 1, self.year
        return str(first).zfill(2) + str(second).zfill(2)

    def house(self):
        """Return 1 for 'Assembly', 2 for 'Senate', otherwise None."""
        chamber_codes = {'Assembly': 1, 'Senate': 2}
        return chamber_codes.get(self.summary[2])

    def splt_v_line(self, line):
        """Split a vote line on runs of three spaces, dropping empty pieces."""
        return [chunk for chunk in line.split('   ') if chunk]

    def splt_v(self, line):
        """Split a single vote entry into whitespace-separated words."""
        words = line.split()
        return words

    def prse_v(self, item):
        """Turn a split vote entry into a {'vote', 'last name'} dict.

        The first word is the vote; the remaining words, re-joined with
        single spaces, form the last name.
        """
        choice = unicode(item[0])
        surname = unicode(' '.join(item[1:]))
        return {'vote': choice, 'last name': surname}

    def parse_votes(self):
        """Return every vote on every vote line as one flat list of dicts."""
        parsed = []
        for line in self.vtlines:
            for chunk in self.splt_v_line(line):
                parsed.append(self.prse_v(self.splt_v(chunk)))
        return parsed

    def record(self):
        """Return the bill-level fields shared by all votes, as unicode."""
        shared = {}
        shared['date'] = unicode(self.date)
        shared['year'] = unicode(self.year)
        shared['session'] = unicode(self.session())
        shared['house'] = unicode(self.house())
        shared['bill ID'] = unicode(self.summary[1])
        shared['ayes'] = unicode(self.summary[5])
        shared['nays'] = unicode(self.summary[7])
        return shared

    def iterRecords(self):
        """Yield one dict per vote: the shared fields plus that vote."""
        for vote in self.votes:
            row = dict(self.record)
            row['vote'] = vote['vote']
            row['last name'] = vote['last name']
            yield row

# Ad-hoc smoke test.  Only run it when this module is executed directly:
# at import time it would read the hard-coded file ``tfn`` and keep its
# split contents referenced for the life of the importing process.
if __name__ == '__main__':
    test = floor_vote_file(tfn)


import MySQLdb as dbapi2
import floorvotes_parse as v
import os

# Open the connection and the single cursor shared by the helpers below.
db = dbapi2.connect(
    db=r"db",
    user="user",
    passwd="XXXXX")
cur = db.cursor()

if db and cur:
    print("\nConnected to db.\n")

def commit():
    """Commit the current transaction on the shared connection."""
    db.commit()

def ext():
    """Close the cursor, then the connection, and say so on stdout."""
    for handle in (cur, db):
        handle.close()
    print("\nConnection closed.\n")

# DATA

DIR  = '/mydir'
# Paths of every vote file in DIR (names starting with 'fvote').
# os.path.join supplies the separator; plain ``DIR+fn`` glued the file name
# straight onto the directory name whenever DIR had no trailing slash.
files = [os.path.join(DIR, fn) for fn in os.listdir(DIR)
         if fn.startswith('fvote')]

# Add stuff
def add(r):
    """Insert one vote record into ny_votes.

    ``r`` must provide the 'house', 'date', 'year', 'bill ID', 'last name'
    and 'vote' entries; they are passed to the driver as query parameters.
    """
    statement = u'''INSERT INTO ny_votes (vote_house, vote_date, vote_year, bill_id,
member_lastname, vote_value) VALUES
(%s            , %s       , %s          ,
 %s            , %s       , %s      )'''
    params = (r['house'], r['date'], r['year'],
              r['bill ID'], r['last name'], r['vote'])
    cur.execute(statement, params)

    #print "added", r['year'], r['bill ID']

def crt():
    """Create the ny_votes table and report it on stdout."""
    cur.execute("""
CREATE TABLE ny_votes (openleg_id INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY,
vote_house int(1), vote_date int(5), vote_year int(2), bill_id varchar(8),
member_lastname varchar(50), vote_value varchar(10));
""")
    print("\nCreate ny_votes.\n")

def rst():
    """Reset ny_votes: drop the table if it is there, then recreate it."""
    # IF EXISTS keeps a first run, when the table has never been created,
    # from failing on the DROP before crt() gets a chance to run.
    SQL = """DROP TABLE IF EXISTS ny_votes"""
    cur.execute(SQL)
    print("\nDropped ny_votes.\n")
    crt()

def addFile(fn):
    """Parse the vote file ``fn`` and insert every record it contains.

    Each bill vote yielded by the parser is expanded into one row per
    member vote, inserted with add(), and then committed.
    """
    for n, votes in enumerate(v.floor_vote_file(fn).iterVotes(), 1):
        for record in votes.iterRecords():
            add(record)
        # Commit after every bill vote: the original never committed, so
        # all rows piled up in a single open transaction (and are discarded
        # on exit when the connection is not in autocommit mode).
        commit()
        print('sucessfully added vote # ' + str(n))

def addAllFiles():
    """Run addFile() on every path in the module-level ``files`` list."""
    for path in files:
        addFile(path)

if __name__ == '__main__':
    # Script entry point: rebuild the table and load every file.  The
    # connection is released even if the load fails or is interrupted.
    try:
        rst()
        addAllFiles()
        # Final commit so no inserted rows are left in an open transaction.
        commit()
    finally:
        ext()

4 个回答

1

这不是Python内存的问题,但也许值得考虑一下。之前的回答让我觉得你很快就能解决这个问题。

我在想MySQL中的回滚日志。如果一个事务太大了,或许你可以把它分成小块来处理。每次单独提交一小块,而不是试图回滚一个15MB的文件。

2

我注意到你用了很多 split() 这个函数。根据这个链接的内容,使用这个函数会消耗很多内存。你可以开始研究一下这个问题。

7

生成器是个不错的主意,但你似乎忽略了一个很大的问题:

(str for str in splitter.split(open(fn).read()) if str and str <> 'A' and str <> 'S')

你一次性把整个文件都读进来了,尽管你其实只需要逐部分处理。你的代码对我来说太复杂了,我没法帮你修正,但你可以用文件的迭代器来完成你的任务:

(line for line in open(fn))

撰写回答