获取大小写敏感路径的Pythonic方式?
我在想有没有更快的方法来实现一个在Python中返回区分大小写路径的函数。我想到的一个解决方案在Linux和Windows上都能用,但需要我遍历os.listdir,这可能会比较慢。
这个解决方案在一些不需要特别快的应用和场景下是可以正常工作的:
def correctPath(start, path):
'Returns a unix-type case-sensitive path, works in windows and linux'
start = unicode(start);
path = unicode(path);
b = '';
if path[-1] == '/':
path = path[:-1];
parts = path.split('\\');
d = start;
c = 0;
for p in parts:
listing = os.listdir(d);
_ = None;
for l in listing:
if p.lower() == l.lower():
if p != l:
c += 1;
d = os.path.join(d, l);
_ = os.path.join(b, l);
break;
if not _:
return None;
b = _;
return b, c; #(corrected path, number of corrections)
>>> correctPath('C:\\Windows', 'SYSTEM32\\CmD.EXe')
(u'System32\\cmd.exe', 2)
不过,当需要从一个有超过50,000个条目的大数据库中收集文件名时,这种方法就不会那么快了。
一种方法是为每个目录创建一个字典树。把这个字典树和路径中的目录部分进行匹配,如果找不到对应的键,就执行os.listdir来查找并为新目录创建一个字典条目,同时删除未使用的部分,或者保持一个变量计数器来给每个目录分配一个“生命周期”。
2 个回答
0
这是一个扩展版本,使用不区分大小写的缓存来提取正确的路径:
import os,re
def corrected_paths(start, pathlist):
''' This wrapper function takes a list of paths to correct vs. to allow caching '''
start = unicode(start)
pathlist = [unicode(path[:-1]) if path[-1] == '/' else unicode(path) for path in pathlist ]
# Use a dict as a cache, storing oldpath > newpath for first-pass replacement
# with path keys from incorrect to corrected paths
cache = dict()
corrected_path_list = []
corrections_count = 0
path_split = re.compile('(/+|\+)')
for path in pathlist:
cd = start
corrected_path = ''
parts = path_split.split(path)
# Pre-process against the cache
for n,p in enumerate(parts):
# We pass *parts to send through the contents of the list as a series of strings
uncorrected_path= os.path.join( cd, *parts[0:len(parts)-n] ).lower() # Walk backwards
if uncorrected_path in cache:
# Move up the basepath to the latest matched position
cd = os.path.join(cd, cache[uncorrected_path])
parts = parts[len(parts)-n:] # Retrieve the unmatched segment
break; # First hit, we exit since we're going backwards
# Fallback to walking, from the base path cd point
for n,p in enumerate(parts):
if not os.path.exists(os.path.join(cd,p)): # Check it's not correct already
#if p not in os.listdir(cd): # Alternative: The above does not work on Mac Os, returns case-insensitive path test
listing = os.listdir(cd)
cip = p.lower()
cilisting = [l.lower() for l in listing]
if cip in cilisting:
l = listing[ cilisting.index(cip) ] # Get our real folder name
# Store the path correction in the cache for next iteration
cache[ os.path.join(cd,p).lower() ] = os.path.join(cd, l)
cd = os.path.join(cd, l)
corrections_count += 1
else:
print "Error %s not in folder %s" % (cip, cilisting)
return False # Error, this path element isn't found
else:
cd = os.path.join(cd, p)
corrected_path_list.append(cd)
return corrected_path_list, corrections_count
在对一组路径进行示例运行时,这大大减少了列出目录的次数(这显然取决于你的路径有多相似):
corrected_paths('/Users/', ['mxF793/ScRiPtS/meTApaTH','mxF793/ScRiPtS/meTApaTH/metapAth/html','mxF793/ScRiPtS/meTApaTH/metapAth/html/css','mxF793/ScRiPts/PuBfig'])
([u'/Users/mxf793/Scripts/metapath', u'/Users/mxf793/Scripts/metapath/metapath/html', u'/Users/mxf793/Scripts/metapath/metapath/html/css', u'/Users/mxf793/Scripts/pubfig'], 14)
([u'/Users/mxf793/Scripts/metapath', u'/Users/mxf793/Scripts/metapath/metapath/html', u'/Users/mxf793/Scripts/metapath/metapath/html/css', u'/Users/mxf793/Scripts/pubfig'], 5)
在这个过程中,我发现Mac OSX上的Python返回的路径匹配是大小写不敏感的,所以检查路径是否存在总是会成功。在这种情况下,可以把列出目录的操作替换掉。
2
下面是对你自己代码的一个小改动,做了三个修改:首先检查文件名是否已经正确,然后在测试之前把文件名转换成小写,最后用索引来找到正确的文件名。
def corrected_path(start, path):
'''Returns a unix-type case-sensitive path, works in windows and linux'''
start = unicode(start)
path = unicode(path)
corrected_path = ''
if path[-1] == '/':
path = path[:-1]
parts = path.split('\\')
cd = start
corrections_count = 0
for p in parts:
if not os.path.exists(os.path.join(cd,p)): # Check it's not correct already
listing = os.listdir(cd)
cip = p.lower()
cilisting = [l.lower() for l in listing]
if cip in cilisting:
l = listing[ cilisting.index(cip) ] # Get our real folder name
cd = os.path.join(cd, l)
corrected_path = os.path.join(corrected_path, l)
corrections_count += 1
else:
return False # Error, this path element isn't found
else:
cd = os.path.join(cd, p)
corrected_path = os.path.join(corrected_path, p)
return corrected_path, corrections_count
我不确定这样做会不会快很多,不过测试的次数确实少了一点,而且一开始检查文件名是否已经正确可能会有所帮助。