在 Python 中如何实现与 "find . -regex ..." 等效的功能,查找完整路径与正则表达式匹配的文件?

10 投票
2 回答
6732 浏览
提问于 2025-04-16 22:07

我想找一些文件,这些文件的完整名称(相对路径,绝对路径也可以)要符合给定的正则表达式(也就是说,像 glob 模块那样,但这里是用正则表达式匹配,而不是用通配符匹配)。使用 find 命令的话,比如可以这样做:

find . -regex ./foo/\w+/bar/[0-9]+-\w+.dat

当然,我可以通过 os.system(...) 或 os.exec*(...) 来使用 find,但我想要一个纯 Python 的解决方案。下面这段代码结合了 os.walk(...) 和 re 模块的正则表达式,是一个简单的 Python 解决方案。(虽然这个方法不够健壮,可能会漏掉很多边缘情况,但对于我这个单次使用的目的来说,找到特定的数据文件以便一次性插入数据库,已经足够用了。)

import os
import re

def find(regex, top='.'):
    """Yield files below *top* whose path relative to *top* matches *regex*.

    The whole tree is walked with ``os.walk`` (no subtree is skipped).  Each
    file path is made relative to *top* and tested with ``re.match``, so the
    pattern is anchored at the start of the path only.
    """
    pattern = re.compile(regex)
    for folder, _subdirs, names in os.walk(top):
        # Lazily turn every file name of this folder into a path relative
        # to the starting directory.
        candidates = (os.path.relpath(os.path.join(folder, name), top)
                      for name in names)
        for rel_path in candidates:
            if pattern.match(rel_path) is not None:
                yield rel_path

if __name__ == "__main__":
    # Demo: list every file below the current directory whose path, relative
    # to that directory, matches the pattern.
    top = "."
    # Raw string: "\w" and "\d" are regex escapes, not string escapes.  The
    # pattern text is unchanged; a plain literal only worked because Python
    # leaves unknown escapes alone (and warns about them on recent versions).
    regex = r"foo/\w+/bar/\d+-\w+.dat"
    for f in find(regex, top):
        # A single parenthesised argument prints the same on Python 2 and is
        # valid on Python 3.
        print(f)

但是这样效率不高。那些内容无法匹配正则表达式的子树(例如,./foo/\w+/baz/,继续上面的例子)被不必要地遍历了。理想情况下,这些子树应该在遍历时被剪除;任何路径名与正则表达式没有部分匹配的子目录都不应该被遍历。(我猜 GNU 的 find 实现了这样的优化,但我还没有通过测试或查看源代码来确认这一点。)

有没有人知道有没有一个基于正则表达式的健壮的 Python 实现的 find,最好能有子树剪除的优化?我希望我只是错过了 os.path 模块中的某个方法或者某个第三方模块。

2 个回答

1

我写了一个叫select_walk()的函数,用来在一个目录树中搜索和选择文件。

在下面的例子中,搜索的文件扩展名包括 .dat、.rtf 和 .jpeg,这些文件位于名称符合以下正则表达式模式的目录中:

r'J:\\f[ruv]?o+\\\w+\\b[ae]r(\d+)?\\(?(1)TURI\1\d*|MONO\d+)'

注意其中包含一个条件子模式:

(?(1)TURI\1\d*|MONO\d+)

其中有两处组引用 (1) 和 \1,它们都指向匹配数字的组 (\d+),这个组出现在前面的子模式 b[ae]r(\d+)? 中。

1 )

下面是一个代码,用来创建作为例子的目录树:

(注意,它首先会删除目录'foo\'、'fooo\'、'froooo\'、'faooo\',然后再创建这些目录)

import os
from shutil import rmtree

# Root under which the sample tree is built (a Windows drive in this example).
top = 'J:\\'

# Start from a clean slate: remove any sample directory that already exists.
for x in ('foo\\', 'fooo\\', 'froooo\\', 'faooo\\'):
    stale_dir = top + x
    if not os.path.isdir(stale_dir):
        continue
    rmtree(stale_dir)


# Layout of the sample tree.  Each entry is a pair
# (parent directory relative to ``top``, names of the sub-directories to
# create inside it).  The entries are consumed in order, so a parent is
# always listed after the entry that creates it.
li = [('foo\\',('basil\\','poto%\\','tamata\\')),
      ('foo\\basil\\',('ber89','ber300')),
      ('foo\\basil\\ber89\\',('TURI850','TURI1023')),
      ('foo\\poto%\\',('ocean','earth')),
      ('foo\\tamata\\',('vahine',)),

      ('fooo\\',('york#\\','plain\\','atlantis\\')),
      ('fooo\\york#\\',('noto','nata')),
      ('fooo\\plain\\',('zx13ao','ws89rt','bar999')),
      ('fooo\\plain\\bar999\\',('TURI99905','TURI2227','MONO2')),
      ('fooo\\plain\\bar999\\TURI99905\\',('AERIAL','minidisc')),
      ('fooo\\plain\\bar999\\TURI99905\\AERIAL\\',('bumbum','corean')),
      ('fooo\\atlantis\\',('atlABC','atlDEFG')),
      ('fooo\\atlantis\\atlABC\\',('atlantis_sound','atlantis_image')),

      ('froooo\\',('one_dir\\','another_dir\\')),
      ('froooo\\one_dir\\',('bar25','ber')),
      ('froooo\\one_dir\\bar25\\',('TURI2501','TURI2502','TURI4813','MONO8')),
      ('froooo\\one_dir\\ber\\',('TURI30','TURI','MONO532')),
      ('froooo\\another_dir\\',('notseen','notseen2')),

      ('faooo\\',('somolo-\\','samala+\\'))]


# Build the sample tree entry by entry: make sure the parent directory
# exists, then create each of its listed children.
for rep, several in li:
    parent = top + rep
    if not os.path.isdir(parent):
        os.mkdir(parent)

    for name in several:
        os.mkdir(parent + name)

# Paths, relative to ``top``, of the sample files to create.
sample_files = (
    'foo\\kalaomi.xls',
    'foo\\basil\\ber89\\TURI850\\quetzal.jpeg',
    'foo\\basil\\ber89\\TURI850\\tehoi.txt',
    'foo\\poto%\\curcuma in poto%.txt',
    'foo\\poto%\\ocean\\file in ocean.rtf',
    'foo\\tamata\\vahine\\tahiti.jpeg',
    'fooo\\york#\\yorkshire.jpeg',
    'fooo\\plain\\bar999\\TURI99905\\galileo.jpeg',
    'fooo\\plain\\bar999\\TURI99905\\polynesia.dat',
    'fooo\\plain\\bar999\\TURI99905\\concrete.txt',
    'fooo\\plain\\bar999\\TURI2227\\Monroe.jpeg',
    'fooo\\plain\\bar999\\MONO2\\elastic.jpeg',
    'froooo\\one_dir\\photo in one_dir.jpeg',
    'froooo\\one_dir\\tabula.xls',
    'froooo\\one_dir\\bar25\\TURI2501\\matallelo.jpeg',
    'froooo\\one_dir\\bar25\\TURI2501\\italy.dat',
    'froooo\\one_dir\\bar25\\TURI2501\\beretta.xls',
    'froooo\\one_dir\\bar25\\TURI2501\\turi2501_ser.rtf',
    'froooo\\one_dir\\bar25\\TURI4813\\boaf_inTURI4813.jpeg',
    'froooo\\one_dir\\bar25\\TURI4813\\troui_in_TURI4813.txt',
    'froooo\\one_dir\\bar25\\MONO8\\in_mono8.dat',
    'froooo\\one_dir\\bar25\\MONO8\\in_mono8.rtf',
    'froooo\\one_dir\\bar25\\MONO8\\in_mono8.xls',
    'froooo\\one_dir\\bar25\\TURI2502\\adamante.jpeg',
    'froooo\\one_dir\\bar25\\TURI2502\\egyptic.txt',
    'froooo\\one_dir\\bar25\\TURI2502\\urubu.rtf',
    'froooo\\one_dir\\ber\\MONO532\\bacillus.jpeg',
    'froooo\\one_dir\\ber\\MONO532\\blueberry.dat',
    'froooo\\one_dir\\ber\\MONO532\\Perfume.doc',
    'faooo\\samala+\\kfaz.dat',
    'faooo\\somolo-\\ytek.rtf',
    'faooo\\123.txt',
    'faooo\\458.rtf',
)

for relative in sample_files:
    filepath = top + relative
    # Opening for writing and closing straight away leaves an empty file.
    open(filepath, 'w').close()

这段代码创建了以下的目录树:

J:
|
|--foo
|   |--basil
|      |--ber89
|         |--TURI850
|            |--file quetzal.jpeg
|            |--file tehoi.txt
|         |--TURI1023
|      |--ber300
|   |--poto%
|      |--ocean
|         |--file in ocean.rtf
|      |--earth
|      |--file curcuma in poto%.txt
|   |--tamata
|      |--vahine
|         |--file tahiti.jpeg
|   |--file kalaomi.xls
|
|--fooo
|  |--york#
|     |--noto
|     |--nata
|     |--file yorkshire.jpeg
|  |--plain
|     |--zx13ao
|     |--ws89rt
|     |--bar999
|        |--TURI99905
|           |--AERIAL
|              |--bumbum
|              |--corean
|           |--minidisc
|           |--file galileo.jpeg
|           |--file polynesia.dat
|           |--file concrete.txt
|        |--TURI2227
|           |--file Monroe.jpeg
|        |--MONO2
|           |--file elastic.jpeg
|  |--atlantis
|     |--atlABC
|        |--atlantis_sound
|        |--atlantis_image
|     |--atlDEFG
|
|--froooo
|  |--one_dir
|     |--bar25
|        |--TURI2501
|           |--file matallelo.jpeg
|           |--file italy.dat
|           |--file beretta.xls
|           |--file turi2501_ser.rtf
|        |--TURI2502
|           |--file adamante.jpeg
|           |--file egyptic.txt
|           |--file urubu.rtf
|        |--TURI4813
|           |--file boaf_inTURI4813.jpeg
|           |--file troui_in_TURI4813.txt
|        |--MONO8
|           |--file in_mono8.dat
|           |--file in_mono8.rtf
|           |--file in_mono8.xls
|     |--ber
|        |--TURI30
|        |--TURI
|        |--MONO532
|           |--file bacillus.jpeg
|           |--file blueberry.dat
|           |--file Perfume.doc
|     |--file photo in one_dir.jpeg
|     |--file tabula.xls
|  |--another_dir
|     |--notseen
|     |--notseen2
|
|--faooo
|  |--somolo-
|     |--file ytek.rtf
|  |--samala+
|     |--file kfaz.dat
|  |--file 123.txt
|  |--file 458.rtf

匹配文件的正则表达式模式是:

r'J:\\f[ruv]?o+\\\w+\\b[ae]r(\d+)?\\(?(1)TURI\1\d*|MONO\d+)\\\w+\.(dat|rtf|jpeg)'

而将被选择性探索以搜索这种文件的目录将是:

'J:\\fooo\\plain\\bar999\\TURI99905'
'J:\\froooo\\one_dir\\bar25\\TURI2501'
'J:\\froooo\\one_dir\\bar25\\TURI2502'
'J:\\froooo\\one_dir\\ber\\MONO532'

.

2 )

作为初步演示,这里有一段代码,展示了select_walk()函数中构建正则表达式的部分,这些正则表达式用于在遍历目录树时只探索选定的目录,并返回选定的文件:

import re


def compute_regexes(pat_file, displ = True):
    r"""Derive the regexes used to search a directory tree for *pat_file*.

    *pat_file* is the text of a regex describing a complete file path.  Its
    path components must be separated by an escaped backslash (the two
    characters ``\\``) when ``os.sep`` is a backslash, and by ``/`` otherwise.

    Parameters:
        pat_file: regex text for the wanted files (full path).
        displ: when true, print the intermediate results.

    Returns:
        A tuple ``(regx_file, regx_dirs, regx_parent_dir)`` of compiled
        patterns:

        * ``regx_file`` is *pat_file* itself;
        * ``regx_dirs`` matches a directory path level by level: a path is
          worth exploring when ``regx_dirs.match(path).group() == path``;
        * ``regx_parent_dir`` is *pat_file* without its last component, i.e.
          the pattern of a directory that may directly contain wanted files.

    Limitations kept from the original algorithm: group references are
    renumbered on the assumption that every pattern component containing a
    parenthesised construct contributes exactly one group, and a reference to
    a group that was not detected raises ``KeyError``.
    """
    from os import sep

    # Text that separates two path components inside the pattern itself.
    sep_pat = r'\\' if sep == '\\' else '/'

    splitted_pat = pat_file.split(sep_pat)
    pat_parent_dir = sep_pat.join(splitted_pat[:-1])

    if displ:
        print('IN FUNCTION compute_regexes() :'
              '\n\npat_file== %s'
              '\n\nsplitted_pat :\n%s'
              '\n\npat_parent_dir== %s\n'
              % (pat_file, '\n'.join(splitted_pat), pat_parent_dir))

    # dgr maps a group number to the index of the component that holds it.
    dgr = {}
    for i, el in enumerate(splitted_pat):
        if re.search(r'\(.*?\)', el):
            dgr[len(dgr) + 1] = i
    if displ:
        print('dgr :')
        print('\n'.join('group(%s) is in splitted_pat[%s]' % (g, i)
                        for g, i in sorted(dgr.items())))

    def repl(mat):
        # In pat_dirs each component from index 1 on is wrapped in one extra
        # capturing group, so a group living in component i is preceded by i
        # additional groups and its number must be shifted by i.
        the = int(mat.group(1) or mat.group(2))
        return str(the + dgr[the])

    # Matches the number in a conditional "(?(N)" or in a backreference "\N".
    ref_regex = re.compile(r'(?<=\(\?\()(\d+)(?=\))|(?<=\\)(\d+)')
    splitted_pat = [ref_regex.sub(repl, el) for el in splitted_pat]

    # Nest the components from the innermost directory outwards.  Each level
    # is optional and must be followed by a separator or the end of the path.
    # The separator is the one chosen above, so the pattern also works where
    # os.sep is "/" (it used to be a hard-coded backslash).
    pat_dirs = ''
    for x in splitted_pat[-2:0:-1]:
        pat_dirs = r'(?=%s|\Z)(%s%s%s)?' % (sep_pat, sep_pat, x, pat_dirs)
    pat_dirs = splitted_pat[0] + pat_dirs
    if displ:
        print('\npat_dirs== %s' % pat_dirs)

    return (re.compile(pat_file), re.compile(pat_dirs), re.compile(pat_parent_dir))




# Demonstration of compute_regexes(): build the three regexes for a sample
# pattern, then try regx_file on two file paths and regx_dirs on several
# directory paths.
pat_file = r'J:\\f[ruv]?o+\\\w+\\b[ae]r(\d+)?\\(?(1)TURI\1\d*|MONO\d+)\\\w+\.(dat|rtf|jpeg)'
regx_file, regx_dirs, regx_parent_dir = compute_regexes(pat_file)

print '\n\nEXAMPLES with regx_file :\n'
print 'pat_file==',pat_file
# "\T" and "\M" below are not recognised string escapes, so the backslash is
# kept as-is.  The two trailing spaces are part of the tested strings;
# match() only anchors at the start, so they do not prevent a match.
for filepath in ('J:\\fooo\\basil\\ber92\TURI9258\\beru.rtf  ',
                 'J:\\froooooo\\ki_ki\\bar\MONO47\\madrid.jpeg  '):
    print filepath,bool(regx_file.match(filepath))

print '\n\nEXAMPLES with regx_dirs :\n'
for path in ('J:\\fooo',
             'J:\\fooo\\basil',
             'J:\\fooo\\basil\\ber92',
             'J:\\fooo\\basil\\ber92\\TURI777',
             'J:\\fooo\\basil\\ber92\\TURI9258',
             # NOTE(review): there is no comma after the next literal, so it
             # is implicitly concatenated with the one that follows and the
             # two are tested as a single path
             # 'J:\\frooooooJ:\\froooooo\\ki_ki'.
             'J:\\froooooo'
             'J:\\froooooo\\ki_ki',
             'J:\\froooooo\\ki_ki\\bar',
             'J:\\froooooo\\ki=ki\\bar',
             'J:\\froooooo\\ki_ki\\bar\MONO47'):
    # A directory is accepted when regx_dirs consumes the whole path.
    # group() already returns a string, so ''.join(...) leaves it unchanged.
    # match() returns None when the pattern cannot match at the start of the
    # path; that case would raise AttributeError here.
    print path,("   : ~~ this dir's name is OK ~~" if path==''.join(regx_dirs.match(path).group())
                else "   : ## this dir's name doesn't match ##")

函数compute_regexes()首先将原始的pat_file正则表达式模式拆分成多个元素,目的是匹配路径中的目录名称。

然后它计算:

  • 一个正则表达式模式pat_dirs,用于匹配所需文件的包含目录的不同路径层级

  • 一个正则表达式模式pat_parent_dir,用于匹配所需文件的任何直接父目录

.

涉及 dgr 和函数 repl() 的处理看起来有些复杂,它的作用是让 compute_regexes() 能够正确处理组引用(即特殊序列 \1、\2 等,以及条件 (?(1)...) 中的组号):由于 pat_dirs 中额外添加了括号,组的编号会发生偏移,因此必须相应地修改这些引用,使它们在 pat_dirs 中仍然指向正确的组。

这段代码的结果是:

IN FUNCTION compute_regexes() :

pat_file== J:\\f[ruv]?o+\\\w+\\b[ae]r(\d+)?\\(?(1)TURI\1\d*|MONO\d+)\\\w+\.(dat|rtf|jpeg)

splitted_pat :
J:
f[ruv]?o+
\w+
b[ae]r(\d+)?
(?(1)TURI\1\d*|MONO\d+)
\w+\.(dat|rtf|jpeg)

pat_parent_dir== J:\\f[ruv]?o+\\\w+\\b[ae]r(\d+)?\\(?(1)TURI\1\d*|MONO\d+)

dgr :
group(1) is in splitted_pat[3]
group(2) is in splitted_pat[4]
group(3) is in splitted_pat[5]

pat_dirs== J:(?=\\|\Z)(\\f[ruv]?o+(?=\\|\Z)(\\\w+(?=\\|\Z)(\\b[ae]r(\d+)?(?=\\|\Z)(\\(?(4)TURI\4\d*|MONO\d+))?)?)?)?


EXAMPLES with regx_file :

pat_file== J:\\f[ruv]?o+\\\w+\\b[ae]r(\d+)?\\(?(1)TURI\1\d*|MONO\d+)\\\w+\.(dat|rtf|jpeg)
J:\fooo\basil\ber92\TURI9258\beru.rtf   True
J:\froooooo\ki_ki\bar\MONO47\madrid.jpeg   True


EXAMPLES with regx_dirs :

J:\fooo    : ~~ this dir's name is OK ~~
J:\fooo\basil    : ~~ this dir's name is OK ~~
J:\fooo\basil\ber92    : ~~ this dir's name is OK ~~
J:\fooo\basil\ber92\TURI777    : ## this dir's name doesn't match ##
J:\fooo\basil\ber92\TURI9258    : ~~ this dir's name is OK ~~
J:\frooooooJ:\froooooo\ki_ki    : ## this dir's name doesn't match ##
J:\froooooo\ki_ki\bar    : ~~ this dir's name is OK ~~
J:\froooooo\ki=ki\bar    : ## this dir's name doesn't match ##
J:\froooooo\ki_ki\bar\MONO47    : ~~ this dir's name is OK ~~

.

.

3 )

最后,这里是select_walk()函数,它的工作是搜索在一个目录树中名称符合某个正则表达式的文件:
它产生与内置函数 os.walk() 形式相同的三元组 (dirpath, dirnames, filenames),但只针对路径符合 pat_file 中目录部分的直接父目录,并且 filenames 中只保留名称符合 pat_file 的文件。

当然,在迭代过程中,函数 select_walk() 不会进入那些仅凭目录名称就能确定其中的文件不可能匹配关键正则表达式模式 pat_file 的目录。

def select_walk(pat_file,start_dir):
    r"""Walk *start_dir*, visiting only directories that can lead to a match.

    *pat_file* is the text of a regex describing a complete file path.  Its
    path components must be separated by an escaped backslash (the two
    characters ``\\``) when ``os.sep`` is a backslash, and by ``/`` otherwise.
    The first component has to match the beginning of *start_dir*.

    This is a generator.  For every directory whose path matches *pat_file*
    minus its last component it yields ``(dirpath, dirnames, filenames)``
    where ``dirnames`` has been emptied (such a directory is not explored
    further) and ``filenames`` keeps only the files matching *pat_file*.
    Directories whose path cannot be a prefix of a matching file path are
    removed from the walk.  Progress is reported with ``print``.

    Limitations kept from the original algorithm: group references are
    renumbered on the assumption that every pattern component containing a
    parenthesised construct contributes exactly one group, and a reference to
    a group that was not detected raises ``KeyError``.
    """
    import os

    sep = os.sep
    # Text that separates two path components inside the pattern itself.
    sep_pat = r'\\' if sep == '\\' else '/'

    splitted_pat = pat_file.split(sep_pat)
    pat_parent_dir = sep_pat.join(splitted_pat[:-1])

    # dgr maps a group number to the index of the component that holds it.
    dgr = {}
    for i, el in enumerate(splitted_pat):
        if re.search(r'\(.*?\)', el):
            dgr[len(dgr) + 1] = i

    def repl(mat):
        # In pat_dirs each component from index 1 on is wrapped in one extra
        # capturing group, so a group living in component i is preceded by i
        # additional groups and its number must be shifted by i.
        the = int(mat.group(1) or mat.group(2))
        return str(the + dgr[the])

    # Matches the number in a conditional "(?(N)" or in a backreference "\N".
    ref_regex = re.compile(r'(?<=\(\?\()(\d+)(?=\))|(?<=\\)(\d+)')
    splitted_pat = [ref_regex.sub(repl, el) for el in splitted_pat]

    # Nest the components from the innermost directory outwards.  Each level
    # is optional and must be followed by a separator or the end of the path.
    # The separator is the one chosen above, so the pattern also works where
    # os.sep is "/" (it used to be a hard-coded backslash).
    pat_dirs = ''
    for x in splitted_pat[-2:0:-1]:
        pat_dirs = r'(?=%s|\Z)(%s%s%s)?' % (sep_pat, sep_pat, x, pat_dirs)
    pat_dirs = splitted_pat[0] + pat_dirs
    print('pat_dirs== %s' % pat_dirs)

    regx_file = re.compile(pat_file)
    regx_dirs = re.compile(pat_dirs)
    regx_parent_dir = re.compile(pat_parent_dir)

    def worth_exploring(path):
        # The directory is kept only when regx_dirs consumes the whole path.
        # A failed match (pattern root not matching at all) means "skip",
        # where calling .group() on None used to raise AttributeError.
        mat = regx_dirs.match(path)
        return mat is not None and mat.group() == path

    start_dir = start_dir.rstrip(sep) + sep
    print('\nstart_dir == ' + start_dir)

    for dirpath, dirnames, filenames in os.walk(start_dir):

        dirpath = dirpath.rstrip(sep)
        is_direct_parent = bool(regx_parent_dir.match(dirpath))
        print('\n'.join(('explored dirpath : %s    is_direct_parent: %s'
                         % (dirpath, ('NO', 'YES')[is_direct_parent]),
                         '           dirnames  : %s' % dirnames,
                         '          filenames  : %s' % filenames)))

        if is_direct_parent:
            filenames[:] = [filename for filename in filenames
                            if regx_file.match(dirpath + sep + filename)]
            # Slice assignment edits the list os.walk holds, which stops the
            # walk from descending below a direct parent.
            dirnames[:] = []
            print('\n'.join(('           dirnames  : not to be explored ',
                             '  yielded filenames  : %s\n' % filenames)))
            yield (dirpath, dirnames, filenames)

        else:
            dirnames[:] = [dirname for dirname in dirnames
                           if worth_exploring(dirpath + sep + dirname)]
            print('\n'.join(('dirnames to explore  : %s ' % dirnames,
                             '          filenames  : not to be yielded\n')))




# Pattern of the wanted files: .dat/.rtf/.jpeg files located in a TURI<n>... or
# MONO<n> directory, depending on whether its parent directory name ends
# with digits.
pat_file = r'J:\\f[ruv]?o+\\\w+\\b[ae]r(\d+)?\\(?(1)TURI\1\d*|MONO\d+)\\\w+\.(dat|rtf|jpeg)'
# The generator is consumed by join(), so its progress messages are printed
# first and the summary last.  A single parenthesised argument prints the
# same on Python 2 and is valid on Python 3.
print('\n\nSELECTED (dirpath, dirnames, filenames) :\n'
      + '\n'.join(map(repr, select_walk(pat_file, 'J:\\'))))

结果

pat_dirs== J:(?=\\|\Z)(\\f[ruv]?o+(?=\\|\Z)(\\\w+(?=\\|\Z)(\\b[ae]r(\d+)?(?=\\|\Z)(\\(?(4)TURI\4\d*|MONO\d+))?)?)?)?

start_dir == J:\
explored dirpath : J:    is_direct_parent: NO
           dirnames  : ['Amazon', 'faooo', 'Favorites', 'foo', 'fooo', 'froooo', 'Python', 'RECYCLER', 'System Volume Information']
          filenames  : ['image00.pfm', 'rep.py']
dirnames to explore  : ['foo', 'fooo', 'froooo'] 
          filenames  : not to be yielded

explored dirpath : J:\foo    is_direct_parent: NO
           dirnames  : ['basil', 'poto%', 'tamata']
          filenames  : ['kalaomi.xls']
dirnames to explore  : ['basil', 'tamata'] 
          filenames  : not to be yielded

explored dirpath : J:\foo\basil    is_direct_parent: NO
           dirnames  : ['ber300', 'ber89']
          filenames  : []
dirnames to explore  : ['ber300', 'ber89'] 
          filenames  : not to be yielded

explored dirpath : J:\foo\basil\ber300    is_direct_parent: NO
           dirnames  : []
          filenames  : []
dirnames to explore  : [] 
          filenames  : not to be yielded

explored dirpath : J:\foo\basil\ber89    is_direct_parent: NO
           dirnames  : ['TURI1023', 'TURI850']
          filenames  : []
dirnames to explore  : [] 
          filenames  : not to be yielded

explored dirpath : J:\foo\tamata    is_direct_parent: NO
           dirnames  : ['vahine']
          filenames  : []
dirnames to explore  : [] 
          filenames  : not to be yielded

explored dirpath : J:\fooo    is_direct_parent: NO
           dirnames  : ['atlantis', 'plain', 'york#']
          filenames  : []
dirnames to explore  : ['atlantis', 'plain'] 
          filenames  : not to be yielded

explored dirpath : J:\fooo\atlantis    is_direct_parent: NO
           dirnames  : ['atlABC', 'atlDEFG']
          filenames  : []
dirnames to explore  : [] 
          filenames  : not to be yielded

explored dirpath : J:\fooo\plain    is_direct_parent: NO
           dirnames  : ['bar999', 'ws89rt', 'zx13ao']
          filenames  : []
dirnames to explore  : ['bar999'] 
          filenames  : not to be yielded

explored dirpath : J:\fooo\plain\bar999    is_direct_parent: NO
           dirnames  : ['MONO2', 'TURI2227', 'TURI99905']
          filenames  : []
dirnames to explore  : ['TURI99905'] 
          filenames  : not to be yielded

explored dirpath : J:\fooo\plain\bar999\TURI99905    is_direct_parent: YES
           dirnames  : ['AERIAL', 'minidisc']
          filenames  : ['concrete.txt', 'galileo.jpeg', 'polynesia.dat']
           dirnames  : not to be explored 
  yielded filenames  : ['galileo.jpeg', 'polynesia.dat']

explored dirpath : J:\froooo    is_direct_parent: NO
           dirnames  : ['another_dir', 'one_dir']
          filenames  : []
dirnames to explore  : ['another_dir', 'one_dir'] 
          filenames  : not to be yielded

explored dirpath : J:\froooo\another_dir    is_direct_parent: NO
           dirnames  : ['notseen', 'notseen2']
          filenames  : []
dirnames to explore  : [] 
          filenames  : not to be yielded

explored dirpath : J:\froooo\one_dir    is_direct_parent: NO
           dirnames  : ['bar25', 'ber']
          filenames  : ['photo in one_dir.jpeg', 'tabula.xls']
dirnames to explore  : ['bar25', 'ber'] 
          filenames  : not to be yielded

explored dirpath : J:\froooo\one_dir\bar25    is_direct_parent: NO
           dirnames  : ['MONO8', 'TURI2501', 'TURI2502', 'TURI4813']
          filenames  : []
dirnames to explore  : ['TURI2501', 'TURI2502'] 
          filenames  : not to be yielded

explored dirpath : J:\froooo\one_dir\bar25\TURI2501    is_direct_parent: YES
           dirnames  : []
          filenames  : ['beretta.xls', 'italy.dat', 'matallelo.jpeg', 'turi2501_ser.rtf']
           dirnames  : not to be explored 
  yielded filenames  : ['italy.dat', 'matallelo.jpeg', 'turi2501_ser.rtf']

explored dirpath : J:\froooo\one_dir\bar25\TURI2502    is_direct_parent: YES
           dirnames  : []
          filenames  : ['adamante.jpeg', 'egyptic.txt', 'urubu.rtf']
           dirnames  : not to be explored 
  yielded filenames  : ['adamante.jpeg', 'urubu.rtf']

explored dirpath : J:\froooo\one_dir\ber    is_direct_parent: NO
           dirnames  : ['MONO532', 'TURI', 'TURI30']
          filenames  : []
dirnames to explore  : ['MONO532'] 
          filenames  : not to be yielded

explored dirpath : J:\froooo\one_dir\ber\MONO532    is_direct_parent: YES
           dirnames  : []
          filenames  : ['bacillus.jpeg', 'blueberry.dat', 'Perfume.doc']
           dirnames  : not to be explored 
  yielded filenames  : ['bacillus.jpeg', 'blueberry.dat']



SELECTED (dirpath, dirnames, filenames) :
('J:\\fooo\\plain\\bar999\\TURI99905', [], ['galileo.jpeg', 'polynesia.dat'])
('J:\\froooo\\one_dir\\bar25\\TURI2501', [], ['italy.dat', 'matallelo.jpeg', 'turi2501_ser.rtf'])
('J:\\froooo\\one_dir\\bar25\\TURI2502', [], ['adamante.jpeg', 'urubu.rtf'])
('J:\\froooo\\one_dir\\ber\\MONO532', [], ['bacillus.jpeg', 'blueberry.dat'])
6

来自 help(os.walk) 的内容:

当设置为从上到下遍历时,调用者可以直接修改目录名列表(比如通过删除或切片赋值),这样遍历时只会进入那些仍然在目录名列表中的子目录;这可以用来减少搜索范围...

所以,一旦确定某个子目录(在 dirnames 列表中)是不需要的,就应该把它从 dirnames 中删除。这样就能实现你想要的减少子树的效果。(只要确保先从 dirnames 的末尾开始删除,这样就不会影响到剩下要删除的项目的索引。)

import os
import re

def prune(regex,top='.'):
    """Print the files below *top* matching *regex*, skipping hopeless subtrees.

    *regex* is matched (with ``re.match``, i.e. anchored at the start only)
    against each path relative to *top*.  The pattern text is split on
    ``os.path.sep`` and a directory at depth ``n`` is kept only if its
    relative path matches the first ``n + 1`` pieces; otherwise a
    ``pruning ...`` line is printed and the directory is removed from the
    walk.  This assumes that every piece is a valid regex on its own and that
    no piece matches across a path separator.

    Parameters:
        regex: pattern text for the wanted files, relative to *top*.
        top: directory where the search starts.

    Returns:
        None; matching paths and pruned directories are printed.
    """
    sep = os.path.sep
    matcher = re.compile(regex)
    pieces = regex.split(sep)
    # A real list, not a lazy ``map`` object: it is indexed below, which a
    # Python 3 ``map`` iterator does not support.
    partial_matchers = [re.compile(sep.join(pieces[:i + 1]))
                        for i in range(len(pieces))]
    deepest = len(partial_matchers) - 1
    for root, dirs, files in os.walk(top, topdown=True):
        # Walk the indices backwards so deleting an entry does not shift the
        # ones still to be examined.
        for i in reversed(range(len(dirs))):
            dirname = os.path.relpath(os.path.join(root, dirs[i]), top)
            # Directories deeper than the pattern are tested against the
            # complete pattern instead of indexing past the end of the list.
            dirlevel = min(dirname.count(sep), deepest)
            # print(dirname,dirlevel,sep.join(pieces[:dirlevel+1]))
            if not partial_matchers[dirlevel].match(dirname):
                print('pruning {0}'.format(dirname))
                del dirs[i]

        for filename in files:
            # Relative to ``top`` (not to the current working directory), the
            # same base the directory paths above use.
            filename = os.path.relpath(os.path.join(root, filename), top)
            # print('checking {0}'.format(filename))
            if matcher.match(filename):
                print(filename)

if __name__ == '__main__':
    # Search from the current directory (the default ``top``).
    prune(regex=r'foo/\w+/bar/\d+-\w+.dat')

在这样的目录结构下运行脚本:

~/test% tree .
.
|-- foo
|   `-- baz
|       |-- bad
|       |   |-- bad1.txt
|       |   `-- badbad
|       |       `-- bad2.txt
|       `-- bar
|           |-- 1-good.dat
|           `-- 2-good.dat
`-- tmp
    |-- 000.png
    |-- 001.png
    `-- output.gif

会得到

pruning tmp
pruning foo/baz/bad
foo/baz/bar/2-good.dat
foo/baz/bar/1-good.dat

如果你取消注释输出 'checking ...' 的那条 print 语句,就能清楚地看到被剪除的目录没有被遍历。

撰写回答