使用SQLite3和Mutagen优化Python代码
我正在改进一个开源的音乐数据库程序,它可以读取我收藏的歌曲,并把它们的信息存储到一个SQLite数据库里。借助这个数据库,我可以查询我的音乐收藏,还可以(如果我想的话)找出其中重复的歌曲。为了读取音乐文件的元数据,我使用了Mutagen这个库;存储这些元数据则用的是SQLite3。
我想在一个比较大的音乐收藏上测试我写的代码,所以我联系了一些同学和家人,最终得到了大约90,000首歌曲的测试数据。这些歌曲的格式也不一样,有.mp3、.ogg和.flac。
我面临的主要问题是速度——我写的代码虽然能工作,但运行得太慢了。目前的状态下,处理这些测试数据大约需要35:45(分:秒,下同)。我主要想知道:我该怎么做才能提高这段代码的性能呢?我觉得这个问题可能和Mutagen或者SQLite3有关,不过我也欢迎其他提高效率的建议。
我已经对这段关键代码进行了两次改进。第一次改进后,运行时间缩短到了21:30,但这仍然很糟糕。我决定重构代码,减少函数调用的次数,试图提高性能。然而,结果却是性能下降了,虽然函数调用的次数大幅减少——第二次运行的时间接近51:51,这简直无法接受。
下面依次展示“改进”后的代码和重构后的代码,每段代码后面都附上了各自的性能分析结果。
代码块 1: “改进”后的代码及其运行时间
def walk(self, d, batch_size=2000):
    '''Walk the directory tree under d and parse every supported music file.

    Every file whose extension is .mp3, .ogg or .flac (compared
    case-insensitively) is passed to self.parse().  The rows collected in
    self.buf are written with self.db.execute_batch_insert_statement()
    whenever self.filecount reaches batch_size or self.leftover is truthy,
    and once more after the walk has finished.

    d          -- root directory to scan; it is made absolute first.
    batch_size -- number of parsed files to buffer before each insert.

    Exceptions raised while parsing a file or inserting a batch are printed
    and the walk continues.  self.buf is emptied after every insert attempt,
    so the rows of a batch that failed to insert are dropped.
    '''
    insert_sql = u"INSERT INTO song VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"
    # Built once instead of once per file.
    supported = frozenset(('.mp3', '.ogg', '.flac'))

    def flush():
        # Write the buffered rows, then start a fresh buffer whether or not
        # the insert succeeded, so another batch can be collected.
        try:
            self.db.execute_batch_insert_statement(insert_sql, self.buf)
        except Exception as e:
            print(e.__unicode__())
        finally:
            self.buf = []

    for dirname, _subdirs, filenames in os.walk(os.path.abspath(d)):
        for f in filenames:
            # splitext() only reports a real suffix, so a file that is merely
            # named "mp3" is not mistaken for a song.
            if os.path.splitext(f)[1].lower() not in supported:
                continue
            try:
                self.parse(os.path.join(dirname, f))
                if self.filecount >= batch_size or self.leftover:
                    self.filecount = 0
                    flush()
            except Exception as e:
                # One unreadable file must not stop the scan.
                print(e.__unicode__())

    # Insert whatever is left over from the last, partial batch.
    flush()
def parse(self, filename):
    '''Read one music file's tags and append a database row to self.buf.

    The reader (OggVorbis, MP3 or FLAC) is chosen from the file's extension,
    compared case-insensitively.  Artist and title are mandatory; genre,
    track number, album and date fall back to a default when the tag is
    missing.  The appended row is (path, artist, extension, title, genre,
    track, album, bitrate, year, time of parsing), and self.filecount is
    incremented by one.

    It may be the case that, in the future, we require more information from
    a song than is provided at this time.  Examine all tags that can be
    retrieved from a mutagen.File object, and adjust the database's schema
    accordingly.

    Raises InvalidSongException when the extension is not supported or when
    the artist/title tags cannot be read.
    '''
    readers = {'ogg': OggVorbis, 'mp3': MP3, 'flac': FLAC}
    # Look at the real suffix only: a substring test would treat
    # "my.oggs/track.mp3" as an Ogg file.
    extension = os.path.splitext(filename)[1][1:].lower()
    if extension not in readers:
        raise InvalidSongException(u"Song is not supported by K'atun at this time.")
    song = readers[extension](filename)

    # Keep the real path.  A byte-string path is decoded only after the file
    # has been opened under its original name, so that the unicode error
    # message below and the stored row both carry unicode text.
    if isinstance(filename, bytes):
        filename = filename.decode('utf_8')

    try:
        artist = song['artist'][0]
        title = song['title'][0]
    except Exception:
        # Any failure to read the two mandatory tags makes the song unusable.
        raise InvalidSongException(u"Cannot read " + filename + ": missing critical song information.")

    genre = song['genre'][0] if 'genre' in song else u'Unknown'
    track = song['tracknumber'][0] if 'tracknumber' in song else 0
    album = song['album'][0] if 'album' in song else u'Unknown'
    year = song['date'][0] if 'date' in song else 'Unknown'

    try:
        bitrate = int(song.info.bitrate)
    except AttributeError:  # Likely due to us messing with FLAC
        bitrate = 999999  # Special flag value: this is a lossless file.

    self.buf.append((filename, artist, extension, title, genre, track, album, bitrate, year, time.time()))
    self.filecount += 1
Sat Dec 24 21:24:23 2011 modified.dat
70626027 function calls (70576436 primitive calls) in 1290.127 CPU seconds
Ordered by: cumulative time
List reduced from 666 to 28 due to restriction <28>
ncalls tottime percall cumtime percall filename:lineno(function)
1 0.033 0.033 1290.127 1290.127 parser.py:6(<module>)
1 0.000 0.000 1290.090 1290.090 parser.py:90(main)
1 0.000 0.000 1286.493 1286.493 parser.py:24(__init__)
1 1.826 1.826 1286.335 1286.335 parser.py:35(walk)
90744 2.376 0.000 1264.788 0.014 parser.py:55(parse)
90744 11.840 0.000 1250.401 0.014 lib/mutagen/__init__.py:158(File)
376019 613.881 0.002 613.881 0.002 {method 'seek' of 'file' objects}
90744 1.231 0.000 580.143 0.006 /usr/lib/pymodules/python2.7/mutagen/apev2.py:458(score)
671848 530.346 0.001 530.346 0.001 {method 'read' of 'file' objects}
90742 0.846 0.000 242.337 0.003 /usr/lib/pymodules/python2.7/mutagen/__init__.py:68(__init__)
63944 2.471 0.000 177.050 0.003 /usr/lib/pymodules/python2.7/mutagen/id3.py:1973(load)
63944 0.526 0.000 119.326 0.002 /usr/lib/pymodules/python2.7/mutagen/easyid3.py:161(__init__)
63944 4.649 0.000 118.077 0.002 /usr/lib/pymodules/python2.7/mutagen/id3.py:89(load)
26782 1.073 0.000 64.435 0.002 /usr/lib/pymodules/python2.7/mutagen/ogg.py:434(load)
127531 0.464 0.000 59.314 0.000 /usr/lib/pymodules/python2.7/mutagen/id3.py:76(__fullread)
63944 1.078 0.000 54.060 0.001 /usr/lib/pymodules/python2.7/mutagen/mp3.py:68(__init__)
26782 0.638 0.000 53.613 0.002 /usr/lib/pymodules/python2.7/mutagen/ogg.py:379(find_last)
66487 3.167 0.000 50.136 0.001 /usr/lib/pymodules/python2.7/mutagen/mp3.py:106(__try)
855079 6.415 0.000 33.237 0.000 /usr/lib/pymodules/python2.7/mutagen/id3.py:279(__read_frames)
816987 0.904 0.000 24.491 0.000 /usr/lib/pymodules/python2.7/mutagen/id3.py:321(__load_framedata)
816987 2.805 0.000 23.587 0.000 /usr/lib/pymodules/python2.7/mutagen/id3.py:1023(fromData)
60803/11257 0.370 0.000 19.036 0.002 /usr/lib/python2.7/os.py:209(walk)
11256 14.651 0.001 14.651 0.001 {posix.listdir}
816973 3.265 0.000 13.140 0.000 /usr/lib/pymodules/python2.7/mutagen/id3.py:996(_readData)
879103 4.936 0.000 11.473 0.000 /usr/lib/pymodules/python2.7/mutagen/id3.py:964(__init__)
872462 0.967 0.000 11.336 0.000 /usr/lib/pymodules/python2.7/mutagen/__init__.py:78(__getitem__)
63944 1.969 0.000 10.871 0.000 /usr/lib/pymodules/python2.7/mutagen/id3.py:443(update_to_v24)
619380 1.396 0.000 8.521 0.000 /usr/lib/pymodules/python2.7/mutagen/easyid3.py:175(__getitem__)
代码块 2: 重构后的代码
def walk(self, d, batch_size=2000):
    '''Walk the directory tree under d and read every supported music file.

    Files are dispatched on their lower-cased extension: mp3 goes to
    self.read_mp3(), ogg to self.read_vorbis() and flac to self.read_flac().
    Any other file is skipped.  self.batch_commit() is called whenever
    self.filecount reaches batch_size or self.leftover is truthy, and once
    more after the walk has finished.  Timing information is printed along
    the way.

    d          -- root directory to scan; it is made absolute first.
    batch_size -- number of parsed files to buffer before each commit.

    An exception raised while reading a single file is printed and the walk
    continues.  An exception from a commit made during the walk propagates
    to the caller; one from the final commit is printed.
    '''
    readers = {
        'mp3': self.read_mp3,
        'ogg': self.read_vorbis,
        'flac': self.read_flac,
    }
    start = time.time()
    for dirname, _subdirs, filenames in os.walk(os.path.abspath(d)):
        for f in filenames:
            reader = readers.get(os.path.splitext(f)[1][1:].lower())
            if reader is None:
                continue  # not a supported music file
            try:
                path = os.path.join(dirname, f)
                # os.walk() yields unicode names when d is unicode; only
                # byte-string paths still need decoding.
                if isinstance(path, bytes):
                    path = path.decode('utf_8')
                reader(path)
            except Exception as e:
                # One unreadable file must not stop the scan.
                print(e.__unicode__())
            if self.filecount >= batch_size or self.leftover:
                self.filecount = 0
                print("Time differential: %1.4f s" % (time.time() - start))
                self.batch_commit()
    try:
        print("Wrapping up")
        self.batch_commit()
    except Exception as e:
        print(e.__unicode__())
    finally:
        print("Elapsed time: " + str(time.time() - start))
def batch_commit(self):
    '''Insert all buffered rows into the song table, then empty the buffer.

    Does nothing when self.buf is empty, so a flush with no pending rows
    does not touch the database.  If the insert raises, the exception
    propagates and self.buf is left as it was.
    '''
    if not self.buf:
        return
    self.db.execute_batch_insert_statement(u"INSERT INTO song VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", self.buf)
    self.buf = []
def read_mp3(self, filename):
    '''Read an MP3 file's tags and append a database row to self.buf.

    This makes use of the ID3 standard, not the easy ID3 tag system: the
    artist is read from frame TPE1, the title from TIT2, the genre from TCON,
    the track from TRCK, the album from TALB and the year from TDRC.  Artist
    and title are mandatory; the other frames fall back to a default when
    absent.  The appended row is (path, artist, extension, title, genre,
    track, album, bitrate, year, time of parsing), and self.filecount is
    incremented by one.

    Raises InvalidSongException when the artist or title frame is missing.
    '''
    song = MP3(filename)

    def frame_text(frame_id, default):
        # Unicode text of one ID3 frame, or default when the frame is absent.
        try:
            frame = song[frame_id]
        except KeyError:
            return default
        return frame.__unicode__()

    try:
        artist = song['TPE1'].__unicode__()
        title = song['TIT2'].__unicode__()
    except KeyError:
        raise InvalidSongException(u"Cannot read " + filename + ": missing critical song information.")

    genre = frame_text('TCON', u'Unknown')
    track = frame_text('TRCK', u'0')
    album = frame_text('TALB', u'Unknown')
    bitrate = int(song.info.bitrate)
    year = frame_text('TDRC', u'Unknown')

    self.buf.append((filename, artist, filename.split('.')[-1], title, genre, track, album, bitrate, year, time.time()))
    self.filecount += 1
def read_vorbis(self, filename):
    '''Read an Ogg Vorbis file's tags and append a database row to self.buf.

    Artist and title are mandatory; genre, track number, album and date fall
    back to a default when the tag is absent.  The appended row is (path,
    artist, extension, title, genre, track, album, bitrate, year, time of
    parsing), and self.filecount is incremented by one.

    Raises InvalidSongException when the artist or title tag is missing.
    '''
    song = OggVorbis(filename)
    try:
        artist = song['artist'][0]
        title = song['title'][0]
    except KeyError:
        raise InvalidSongException(u"Cannot read " + filename + ": missing critical song information.")

    # Test the file's tags, not a local variable: checking the (empty) local
    # genre string would make every song's genre u'Unknown'.
    genre = song['genre'][0] if 'genre' in song else u'Unknown'
    track = song['tracknumber'][0] if 'tracknumber' in song else u'0'
    album = song['album'][0] if 'album' in song else u'Unknown'
    bitrate = int(song.info.bitrate)
    year = song['date'][0] if 'date' in song else 'Unknown'

    self.buf.append((filename, artist, filename.split('.')[-1], title, genre, track, album, bitrate, year, time.time()))
    self.filecount += 1
def read_flac(self, filename):
    '''Read a FLAC file's tags and append a database row to self.buf.

    Artist and title are mandatory; genre, track number, album and date fall
    back to a default when the tag is absent.  The bitrate column is always
    set to the flag value 999999.  The appended row is (path, artist,
    extension, title, genre, track, album, bitrate, year, time of parsing),
    and self.filecount is incremented by one.

    Raises InvalidSongException when the artist or title tag is missing.
    '''
    song = FLAC(filename)
    try:
        artist = song['artist'][0]
        title = song['title'][0]
    except KeyError:
        raise InvalidSongException(u"Cannot read " + filename + ": missing critical song information.")

    # Test the file's tags, not a local variable: checking the (empty) local
    # genre string would make every song's genre u'Unknown'.
    genre = song['genre'][0] if 'genre' in song else u'Unknown'
    track = song['tracknumber'][0] if 'tracknumber' in song else u'0'
    album = song['album'][0] if 'album' in song else u'Unknown'
    bitrate = 999999  # Special flag for K'atun; will know that this is a lossless file
    year = song['date'][0] if 'date' in song else 'Unknown'

    self.buf.append((filename, artist, filename.split('.')[-1], title, genre, track, album, bitrate, year, time.time()))
    self.filecount += 1
Mon Dec 26 03:22:34 2011 refactored.dat
59939763 function calls (59890172 primitive calls) in 3111.490 CPU seconds
Ordered by: cumulative time
List reduced from 559 to 28 due to restriction <28>
ncalls tottime percall cumtime percall filename:lineno(function)
1 0.001 0.001 3111.490 3111.490 parser.py:6(<module>)
1 0.000 0.000 3111.477 3111.477 parser.py:138(main)
1 0.000 0.000 3108.242 3108.242 parser.py:27(__init__)
1 1.760 1.760 3108.062 3108.062 parser.py:40(walk)
46 0.103 0.002 2220.618 48.274 parser.py:78(batch_commit)
46 0.002 0.000 2220.515 48.272 db_backend.py:127(execute_batch_insert_statement)
46 2184.900 47.498 2184.900 47.498 {method 'executemany' of 'sqlite3.Connection' objects}
90747 0.515 0.000 845.343 0.009 /usr/lib/pymodules/python2.7/mutagen/__init__.py:68(__init__)
426651 640.459 0.002 640.459 0.002 {method 'read' of 'file' objects}
63945 1.847 0.000 582.267 0.009 parser.py:83(read_mp3)
63945 2.372 0.000 577.245 0.009 /usr/lib/pymodules/python2.7/mutagen/id3.py:1973(load)
63945 0.307 0.000 514.927 0.008 /usr/lib/pymodules/python2.7/mutagen/id3.py:72(__init__)
63945 0.256 0.000 514.620 0.008 /usr/lib/pymodules/python2.7/mutagen/_util.py:103(__init__)
63945 0.225 0.000 514.363 0.008 /usr/lib/pymodules/python2.7/mutagen/__init__.py:35(__init__)
63945 4.188 0.000 514.139 0.008 /usr/lib/pymodules/python2.7/mutagen/id3.py:89(load)
127533 0.802 0.000 455.713 0.004 /usr/lib/pymodules/python2.7/mutagen/id3.py:76(__fullread)
63945 1.029 0.000 432.574 0.007 /usr/lib/pymodules/python2.7/mutagen/id3.py:202(__load_header)
26786 0.504 0.000 270.216 0.010 parser.py:102(read_vorbis)
26786 1.095 0.000 267.578 0.010 /usr/lib/pymodules/python2.7/mutagen/ogg.py:434(load)
26782 0.627 0.000 143.492 0.005 /usr/lib/pymodules/python2.7/mutagen/ogg.py:379(find_last)
221337 121.448 0.001 121.448 0.001 {method 'seek' of 'file' objects}
97603 1.797 0.000 118.799 0.001 /usr/lib/pymodules/python2.7/mutagen/ogg.py:66(__init__)
26786 0.342 0.000 114.656 0.004 lib/mutagen/oggvorbis.py:40(__init__)
63945 0.646 0.000 58.809 0.001 lib/mutagen/mp3.py:68(__init__)
66480 3.377 0.000 57.489 0.001 lib/mutagen/mp3.py:106(__try)
47 35.609 0.758 35.609 0.758 {method 'commit' of 'sqlite3.Connection' objects}
855108 6.184 0.000 32.181 0.000 /usr/lib/pymodules/python2.7/mutagen/id3.py:279(__read_frames)
60803/11257 0.385 0.000 31.885 0.003 /usr/lib/python2.7/os.py:209(walk)
1 个回答
如果你觉得有兴趣,可以试试线程编程 :) 在Python中其实挺简单的。
你可以创建两个队列,一个用来接收待处理的文件(IN),一个用来输出处理结果(OUT),然后让多个线程(工作线程)在这两个队列之间处理这些文件。队列本身是线程安全的,工作线程可以放心地同时读写它们。
这样大概只需要多写80行代码,而且你可能可以保留现在的功能,只需要把它们放到合适的类里面(让“线程”可以使用它们)。
不过,处理90,000首歌花20分钟似乎也不是完全不合理。因为这需要频繁地访问磁盘,而磁盘寻道比较慢(每次大约10毫秒)。处理90,000个文件,即使每个文件只寻道一次,90,000 × 10毫秒 = 900秒,光这一项就已经要15分钟了。