Python:如何搜索所有文件类型中的文本字符串
我正在尝试写一个程序,这个程序可以在用户指定的文件夹里搜索所有文件(包括文件名和文件内容),查找特定的字符串,然后把这些文件移动到用户指定的新文件夹。
更新:好的,我对我的代码做了一些修改。现在的工作方式是:首先用 os.path.walk() 获取一个文件列表。然后,程序会在这个列表中的每个文件里查找用户指定的字符串。最开始只检查文件名,如果找到匹配的,就把这些文件名放到一个单独的列表里。接下来,我们开始查看文件内容,使用文件扩展名来决定如何通过 win32com.client 打开文件。最后,仍然在原始列表中的文件会被认为是纯文本文件,然后就会打开并进行搜索。
不过,不知道为什么,程序只移动了纯文本文件。如果有人能找出原因,那就太好了。:)
################
#Import required modules
import fileinput
from shutil import move
from os.path import abspath, join, splitext, split
from os import mkdir, walk, remove
import win32com.client
################
#Create lists to hold file names
file_list = list()
file_move_list = list()
#Define file extensions which need to be converted
excel_set = [".xls", ".xlsx", ".xlsm", ".xlsb"]
msword_set = [".doc", ".docx"]
################
#Define functions
def getFileList( searchdirectory ):
#Get a list of all items in the directory to search
for (dirpath, dirnames, filenames) in walk( searchdirectory ):
for path in [ abspath( join( dirpath, filename ) ) for filename in filenames ]:
file_list.append( path )
def searchFiles( readfilelist, movefilelist, searchstring ):
#Get plain text from each file and search for searchstring
for filename in readfilelist:
ext = splitext( filename )[1]
#Check filenames
if searchstring in filename:
movefilelist.append( filename )
readfilelist.remove( filename )
#Check if file is a pdf
elif ext == ".pdf":
content = getPDFContent( filename )
if searchstring in content:
movefilelist.append( filename )
#Check if file is a word document
elif ext in msword_set:
app = win32com.client.Dispatch('Word.Application')
doc = app.Documents.Open( filename )
if searchstring in doc.Content.Text:
movefilelist.append( filename )
app.Quit()
#Check if file is an excel workbook/spreadsheet
elif ext in excel_set:
app = win32com.client.Dispatch( 'Excel.Application' )
fileDir, fileName = split( filename )
nameOnly = splitext( fileName )
newName = nameOnly[0] + ".csv"
outCSV = join( fileDir, newName )
workbook = app.Workbooks.Open( filename )
workbook.SaveAs(outCSV, FileFormat=24) # 24 is csv format
workbook.Close(False)
for line in open( outCSV, mode='r' ):
if searchstring in line:
movefilelist.append( filename )
app.Quit()
remove( outCSV )
#Assume all other files are plain text
else:
for line in open( filename, mode='r' ):
if searchstring in line:
movefilelist.append( filename )
readfilelist.remove( filename )
def moveFiles( movelist, destinationdirectory ):
mkdir( destinationdirectory )
for path in movelist:
#Move the files to the destination folder
move( path, destinationdirectory )
print( 'Done' )
def getPDFContent( filename ):
content = ""
pdf = pyPdf.PdfFileReader( file( filename, "rb" ) )
# Extract text from each page and add to content
for i in range( 0, pdf.getNumPages() ):
content += pdf.getPage(i).extractText() + " \n"
return content
################
#Run as main
if __name__=='__main__':
search_directory = input( 'Enter the path of the directory you wish to search through: ' )
search_string = input( 'Enter the search term: ' )
destination_directory = input( 'Enter the name of the new directory which will contain the moved files: ' )
getFileList( search_directory )
searchFiles( file_list, file_move_list, search_string )
moveFiles( file_move_list, destination_directory )
任何帮助都非常感谢。(顺便说一下,我使用的是 Python 3.2.1)
4 个回答
0
对于Windows系统,可以考虑这个:
os.system('findstr /C:"text to search for" *.*')
这个基本上能满足你想要的所有功能。
1
如果有人需要这段代码,我已经把它搞定了。确保你安装了 PyPDF2
和 win32com.client
这两个库。
################
#Import required modules
import fileinput
from shutil import move
from os.path import abspath, join, splitext, split
from os import mkdir, walk, remove
import win32com.client
import PyPDF2 as pyPdf
################
#Create lists to hold file names
file_list = list()
file_move_list = list()
#Define file extensions which need to be converted
excel_set = [".xls", ".xlsx", ".xlsm", ".xlsb"]
msword_set = [".doc", ".docx"]
################
#Define functions
def getFileList( searchdirectory ):
#Get a list of all items in the directory to search
for (dirpath, dirnames, filenames) in walk( searchdirectory ):
for path in [ abspath( join( dirpath, filename ) ) for filename in filenames ]:
file_list.append( path )
def searchFiles( readfilelist, movefilelist, searchstring ):
#Get plain text from each file and search for searchstring
for filename in readfilelist:
ext = splitext( filename )[1]
#Check filenames
if searchstring in filename:
movefilelist.append( filename )
readfilelist.remove( filename )
#Check if file is a pdf
elif ext == ".pdf":
content = getPDFContent( filename )
if searchstring in content:
movefilelist.append( filename )
#Check if file is a word document
elif ext in msword_set:
app = win32com.client.Dispatch('Word.Application')
doc = app.Documents.Open( filename )
if searchstring in doc.Content.Text:
movefilelist.append( filename )
app.Quit()
#Check if file is an excel workbook/spreadsheet
elif ext in excel_set:
app = win32com.client.Dispatch( 'Excel.Application' )
fileDir, fileName = split( filename )
nameOnly = splitext( fileName )
newName = nameOnly[0] + ".csv"
outCSV = join( fileDir, newName )
workbook = app.Workbooks.Open( filename )
workbook.SaveAs(outCSV, FileFormat=24) # 24 is csv format
workbook.Close(False)
for line in open( outCSV, mode='r' ):
if searchstring in line:
movefilelist.append( filename )
app.Quit()
remove( outCSV )
#Assume all other files are plain text
elif ext == ".txt":
txtFile = open(filename, mode='r')
for line in txtFile:
if searchstring in line:
movefilelist.append( filename )
txtFile.close()
else:
print(filename + " is not reconized")
#readfilelist.remove( filename )
def moveFiles( movelist, destinationdirectory ):
mkdir( destinationdirectory )
for path in movelist:
#Move the files to the destination folder
move( path, destinationdirectory )
print( 'Done' )
def getPDFContent( filename ):
content = ""
fd = file(filename, 'rb')
pdf = pyPdf.PdfFileReader( fd )
# Extract text from each page and add to content
for i in range( 0, pdf.getNumPages() ):
content += pdf.getPage(i).extractText() + " \n"
fd.close()
return content
################
#Run as main
if __name__=='__main__':
search_directory = input( 'Enter the path of the directory you wish to search through, in this format "C:\Users\admin\folder" : ' )
search_string = input( 'Enter the search term in quotes: ' )
destination_directory = input( 'Enter the name of the new directory which will contain the moved files, in this format"C:\Users\admin\folder" : ' )
getFileList( search_directory )
searchFiles( file_list, file_move_list, search_string )
moveFiles( file_move_list, destination_directory )