Python按自定义字段模式解析文件名

import os


def find_nth(body, s_term, n):
    """Return ``body`` up to and including the n-th ``s_term`` from the right.

    The search runs over the reversed string, so occurrences are counted
    from the end of ``body``.  Intended for single-character separators.
    When fewer than ``n`` occurrences exist the search index ends up at -1
    and only the first character of ``body`` is returned.
    """
    reversed_body = body[::-1]
    start = reversed_body.find(s_term)
    while start >= 0 and n > 1:
        start = reversed_body.find(s_term, start + len(s_term))
        n -= 1
    return reversed_body[start:][::-1]


def _parse_line(line):
    """Split one path line into (household, lastn, firstn, acct, doctype, date).

    Expected layout: ``.../Household/LastName.FirstName.Account.Doctype.Date.ext``.
    """
    # Everything before the household directory (ends with the 2nd '/' from the right).
    leading = find_nth(line, '/', 2)
    after_leading = line[len(leading):]
    household = after_leading[:after_leading.find('/')]

    # Last name sits between the '/' after the household and the first '.'.
    tail = line[line.find(household):]
    lastn = tail[tail.find('/') + 1:tail.find('.')]

    # First name sits between the first and second '.' of the whole line.
    first_dot = line.find('.')
    firstn = line[first_dot + 1:line.find('.', first_dot + 1)]

    # Each remaining field is located by searching for the fields already found.
    name_prefix = '{}.{}.'.format(lastn, firstn)
    acct_start = line.find(name_prefix) + len(name_prefix)
    acct = line[acct_start:line.find('.', acct_start)]

    acct_prefix = '{}.{}.{}.'.format(lastn, firstn, acct)
    doctype_beg = line[line.find(acct_prefix) + len(acct_prefix):]
    doctype = doctype_beg[:doctype_beg.find('.')]

    full_prefix = '{}/{}.{}.{}.{}.'.format(household, lastn, firstn, acct, doctype)
    date_beg = line[line.find(full_prefix) + len(full_prefix):]
    date = date_beg[:date_beg.find('.')]

    return household, lastn, firstn, acct, doctype, date


def main():
    """Parse every line of the client document list and emit the fields.

    Each record is printed (as before) and also written to the output file,
    which was previously opened but left empty.
    """
    # open() does not expand "~", so resolve the home directory explicitly.
    in_path = os.path.expanduser('~/Desktop/client_docs.csv')
    out_path = os.path.expanduser('~/Desktop/client_docs_parsed.txt')
    with open(in_path, "r") as source, open(out_path, "w") as output:
        for line in source:
            # Same layout the original print statement produced:
            # " field " " field " ...
            record = ' '.join('" %s "' % field for field in _parse_line(line))
            print(record)
            output.write(record + '\n')


if __name__ == "__main__":
    main()

3条回答

网友

1楼 · 编辑于 2024-06-10 09:03:39

据我所知，我相信这将是一个解决方案，它不依赖于以前编译的文件列表

import csv
import os
import os.path

# Replace this with the directory where the household directories are stored.
directory = "home"

headerRow = ["Household", "Lastname", "Firstname", "Account", "Doctype",
             "Date", "Extension"]

# "wb" is the csv-module idiom for Python 2, which this script targets.
# The with-block guarantees the file is flushed and closed even on error.
with open("Output.csv", "wb") as output:
    csvf = csv.writer(output)
    csvf.writerow(headerRow)

    # Only the first level of the walk holds the household directories.
    # Walking deeper would pair nested sub-directory names with the top-level
    # path and make os.listdir fail.  The default covers a missing directory,
    # for which os.walk yields nothing.
    root, households, files = next(os.walk(directory), (directory, [], []))
    for household in households:
        for filename in os.listdir(os.path.join(root, household)):
            # One record per filename within the household; the filename is
            # split on "." to obtain the individual detail fields.
            csvf.writerow([household] + filename.split("."))

这段代码使用os库来“遍历”家庭目录列表，然后为每个“家庭”收集一份文件清单。接着利用这个清单在csv文件中生成记录，并以句点作为分隔符将文件名拆分开。

它使用csv库生成输出，这看起来有点像

^{pr2}$

如果不需要扩展名，则可以通过更改行来修改它：

# Writes the household followed by every dot-separated piece of the filename.
csvf.writerow([household] + filename.split("."))

到

# Keep every dot-separated piece except the last one (the extension).
# Slicing keeps the result a list, so it can be concatenated with [household].
csvf.writerow([household] + filename.split(".")[:-1])

它告诉程序只使用文件名中最后一部分之前的内容（即去掉扩展名），然后再从headerRow中删除“Extension”字符串。

希望这有帮助

网友

2楼 · 编辑于 2024-06-10 09:03:39

现在还不清楚问题是什么，但同时，这里有一些东西可以让你开始：

#!/usr/bin/env python

import os
import csv

# Treat each line of "f1" as a '.'-delimited record, then break the leading
# path component into directory and last name.
with open("f1", "rb") as fin:
    reader = csv.reader(fin, delimiter='.')
    for row in reader:
        # csv.reader yields an empty list for blank lines; skip them instead
        # of failing on row[0].
        if not row:
            continue
        # split path
        row = list(os.path.split(row[0])) + row[1:]
        print(','.join(row))

输出：

^{pr2}$

另一种解释是，您希望将每个字段存储在一个变量中，而路径中多出的目录层级把事情搞乱了……

这是row在for循环中的样子：

['/Household/LastName', 'FirstName', 'Account', 'Doctype', 'Date', 'extension']

因此，解决办法可能是从后往前处理。

将row[-1]赋给extension，row[-2]赋给date，依此类推。

网友

3楼 · 编辑于 2024-06-10 09:03:39

{>rstrip()

def find_nth(body, s_term, n):
    """Return ``body`` up to and including the n-th ``s_term`` from the right.

    Debug-instrumented variant: every intermediate value is printed so the
    reversed-string search can be followed step by step.  If the search
    index ends at -1 (fewer than ``n`` occurrences), the final slice yields
    only the first character of ``body``.
    NOTE(review): the reversed body is searched for the un-reversed
    ``s_term``, so this presumably only works for single-character terms.
    """
    # Search the reversed string so occurrences are counted from the end.
    start = body[::-1].find(s_term)
    print '                        '
    print 'body[::-1]\n',body[::-1]
    print '\nstart == %s' % start
    # Step to the next occurrence until the n-th is reached or none is left.
    while start >= 0 and n > 1:
        start = body[::-1].find(s_term, start+len(s_term))
        print 'n == %s    start == %s' % (n,start)
        n -= 1
    print '\n (body[::-1])[start:]\n',(body[::-1])[start:]
    print '\n((body[::-1])[start:])[::-1]\n',((body[::-1])[start:])[::-1]
    print '       -\n'
    # Slice the reversed string from the match onward, then flip it back.
    return ((body[::-1])[start:])[::-1]


def cool_find_nth(body, s_term, n):
    """Return the text before the n-th ``s_term`` from the right, plus ``s_term``.

    ``s_term`` must be a single character.  The string is split from the
    right at most ``n`` times; the leftmost piece is returned with the
    separator appended again.
    """
    assert len(s_term) == 1
    pieces = body.rsplit(s_term, n)
    leftmost = pieces[0]
    return leftmost + s_term


# Sample input: seven words separated by six '/' characters.
ss = 'One / Two / Three / Four / Five / Six / End'
print 'the string\n%s\n' % ss

# Run the instrumented implementation; its trace is printed while it runs.
print ('================================\n'
       "find_nth(ss, '/', 3)\n%s" % find_nth(ss, '/', 3) )

print '================================='
# Same query through the rsplit-based version, for comparison.
print "cool_find_nth(ss, '/', 3)\n%s" % cool_find_nth(ss, '/', 3)

结果

^{pr2}$

编辑1

这是另一个非常实用的工具：regex

import re

# Matches the tail of a path shaped like
#   /<household>/<last>.<first>.<account>.<doctype>.<date>.<extension>
# and captures the first six fields.  Any field may be empty; the extension
# must be non-empty and end the string.  Raw literals keep "\." and "\Z"
# from being read as (invalid) string escapes.
reg = re.compile(r'/'
                 r'([^/.]*?)/'
                 r'([^/.]*?)\.'
                 r'([^/.]*?)\.'
                 r'([^/.]*?)\.'
                 r'([^/.]*?)\.'
                 r'([^/.]*?)\.'
                 r'[^/.]+\Z')

def main():
    """Parse two sample path lines with ``reg`` and print the extracted fields.

    The sample list stands in for reading the lines from a file.  A line
    that does not match the expected layout is reported and skipped.
    """
    li = ['/Household/LastName.FirstName.Account.Doctype.Date.extension',
          '- /Volumes/HD/Organized Files/Cosby, Bill/Cosby.Bill..Profile.2006.doc']
    for line in li:
        print("line == %r" % line)
        match = reg.search(line)
        if match is None:
            # search() returns None when the layout does not fit; calling
            # .groups() on it would raise AttributeError.
            print("no match\n")
            continue
        household, lastn, firstn, acct, doctype, date = match.groups('')
        print('household == %r\n'
              'lastn     == %r\n'
              'firstn    == %r\n'
              'acct      == %r\n'
              'doctype   == %r\n'
              'date      == %r\n'
              % (household, lastn, firstn, acct, doctype, date))


if __name__ == "__main__":
    main()

结果

line == '/Household/LastName.FirstName.Account.Doctype.Date.extension'
household == 'Household'
lastn     == 'LastName'
firstn    == 'FirstName'
acct      == 'Account'
doctype   == 'Doctype'
date      == 'Date'

line == '- /Volumes/HD/Organized Files/Cosby, Bill/Cosby.Bill..Profile.2006.doc'
household == 'Cosby, Bill'
lastn     == 'Cosby'
firstn    == 'Bill'
acct      == ''
doctype   == 'Profile'
date      == '2006'

编辑2

我想知道当我发布最后一次编辑时我的大脑在哪里。以下功能也可以实现：

# Split on either a path separator or a dot in a single pass.
rig = re.compile('[/.]')
# Drop the last piece and keep the six pieces that precede it.
rig.split(line)[-7:-1]

编辑1

编辑2

相关问题更多 >

编程相关推荐

热门问题

热门文章