Python扫描文件夹和子文件夹,仅处理指定的文件

2024-06-16 12:53:02 发布

您现在位置:Python中文网/ 问答频道 /正文

我正在做PDF拼合(flattening),难点在于源PDF存放在“c:\root folder\folder1\subfolders1\subfolders”这样的目录中,每个文件夹和子文件夹里都有PDF;拼合之后,文件要保存到另一个位置,但目录结构要与 c:\folder1\subfolders1\subfolders 保持一致。 文件夹结构示例(原帖此处为图片:enter image description here)

我遇到的困难在于PDF文件名的格式是这样的(06032021_toibhoc_mp_01_1_col_r1.PDF、06032021_toibhoc_PP_01_1_col_r1.PDF、06032021_toind_mp_01_1_col_r1.PDF)

文件名语法:Date_Techcode_mp_pageno_edition_number_col_page_revisionnumber

我只需要关注日期和技术代码(即06032021_toibhoc),因为这一部分在每个文件中都是唯一的。 案例1: 我想检查文件名是否以“明天的日期”或“今天的日期”开头,并且只处理这些文件。 案例2: 我想通过JSON文件检查技术代码。我把需要拼合的技术代码保存在一个JSON文件中,程序应该把文件名中的技术代码与JSON文件进行比较:只有当文件名中的技术代码存在于JSON文件中时才处理该文件,否则不做处理,直接把文件复制到目标位置

json文件的示例:{"toiac_mp","toiac_pp","su_mp","rjk_mp","rjk_pp","bar_mp","cap_mp","cap_pp"}

案例3:- 我的工作时间是下午3:00到凌晨2:00,因为我想在设定的工作时间后更改

我尝试使用下面的代码来处理文件夹和子文件夹中的每个文件

import win32com.client, win32com.client.makepy, os, winerror
from win32com.client.dynamic import ERRORS_BAD_CONTEXT
import ghostscript,locale,time,datetime
import threading


def convert_to_1_3(dirpath,out_file,org_name):
    """Open a PDF in Acrobat over COM and save it through its JavaScript object.

    Args:
        dirpath: Directory that contains the source PDF.
        out_file: Path handed to the JavaScript ``SaveAs`` call as the target.
        org_name: File name of the source PDF inside ``dirpath``.

    Returns:
        None. Every exception is printed instead of raised, so callers cannot
        tell from the return value whether the output file was written.
    """
    # Register E_NOTIMPL once; appending unconditionally added one duplicate
    # entry to the shared list for every converted file.
    if winerror.E_NOTIMPL not in ERRORS_BAD_CONTEXT:
        ERRORS_BAD_CONTEXT.append(winerror.E_NOTIMPL)
    # Bound before the try block so the cleanup in ``finally`` never touches
    # an unassigned name when creating the COM objects fails.
    avDoc = None
    try:
        src = os.path.join(dirpath,org_name)
        win32com.client.makepy.GenerateFromTypeLibSpec('Acrobat')
        # Not used afterwards. NOTE(review): presumably created so that an
        # Acrobat application instance exists while the document is open.
        adobe = win32com.client.DispatchEx('AcroExch.App')
        avDoc = win32com.client.DispatchEx('AcroExch.AVDoc')
        if avDoc.Open(src," "):
            pdDoc = avDoc.GetPDDoc()
            jObject = pdDoc.GetJSObject()
            # "com.adobe.acrobat.ps" is the conversion id passed to SaveAs;
            # the author also noted "com.Callas.preflight.pdfa" as an option.
            jObject.SaveAs(out_file, "com.adobe.acrobat.ps")
        else:
            print("Could not open " + src)
    except Exception as e:
        print(str(e))
    finally:
        if avDoc is not None:
            try:
                avDoc.Close(True)
            except Exception as e:
                print(str(e))

def ps2pdf(ps_input_path, pdf_output_path):
    """Run Ghostscript's ``pdfwrite`` device on a file with compatibility level 1.3.

    Args:
        ps_input_path: Path of the input file handed to Ghostscript.
        pdf_output_path: Path of the PDF file Ghostscript is told to write.

    Raises:
        Whatever the ``ghostscript`` binding raises when the run fails.
    """
    args1 = ["ps2pdf", # actual value doesn't matter
            # Boolean switches use the documented -dNAME form; -s is meant
            # for -sNAME=string definitions such as DEVICE and OutputFile.
            "-dBATCH",
            "-dNOPAUSE",
            "-dSAFER",
            "-sDEVICE=pdfwrite",
            "-dCompatibilityLevel=1.3",
            "-sOutputFile=" + pdf_output_path,
            ps_input_path]
    # The arguments are handed over as bytes in the locale's preferred encoding.
    encoding = locale.getpreferredencoding()
    args1 = [a.encode(encoding) for a in args1]
    try:
        ghostscript.Ghostscript(*args1)
    finally:
        # Release the interpreter even when the run above raised.
        ghostscript.cleanup()

def write_log(file, start_time=None):
    """Append a timestamp line and an elapsed-seconds line to a log file.

    Args:
        file: Path of the log file; opened in append mode.
        start_time: ``time.time()`` value captured when the measured work
            began. When omitted, the clock starts inside this call (the
            previous behaviour), so the logged duration is close to zero.
    """
    if start_time is None:
        start_time = time.time()
    stamp = datetime.datetime.now().strftime("%d%m%Y-%H:%M:%S")
    elapsed = time.time() - start_time
    with open(file,'a+') as out:
        out.write(stamp + "\n--- %s seconds ---\n" % elapsed)

def main(input_file,pdf_file):
    """Mirror the tree under ``input_file`` into ``pdf_file`` as converted PDFs.

    Every directory found below ``input_file`` is recreated below
    ``pdf_file``. For each ``.pdf`` file (any letter case) whose name does not
    end in ``qxd.pdf``, ``convert_to_1_3`` is asked to export ``<stem>.ps`` and
    ``ps2pdf`` then turns that file into the output PDF, which keeps the
    source file's name. All other files are left untouched.

    Args:
        input_file: Root directory to scan.
        pdf_file: Root directory that receives the mirrored tree.

    Errors are printed per directory / per file so that one failure does not
    stop the remaining files from being processed.
    """
    for dirpath, dirnames, filenames in os.walk(input_file):
        structure = os.path.normpath(
            os.path.join(pdf_file, os.path.relpath(dirpath, input_file)))
        try:
            # makedirs also creates missing parents of the output root.
            os.makedirs(structure, exist_ok=True)
        except OSError as e:
            print(str(e))
            continue
        for file in filenames:
            stem, ext = os.path.splitext(file)
            if ext.lower() != '.pdf' or file.lower().endswith('qxd.pdf'):
                continue
            # Built from the stem so it can never equal the source name;
            # replacing the substring "pdf" left names such as "X.PDF"
            # unchanged and the cleanup below then deleted the source file.
            ps_name = stem + '.ps'
            ps_path = os.path.join(dirpath, ps_name)
            try:
                # The bare name is passed on purpose, as before.
                # NOTE(review): presumably Acrobat resolves it relative to the
                # opened document's folder, where ps2pdf looks for it - confirm.
                convert_to_1_3(dirpath, ps_name, file)
                if os.path.isfile(ps_path):
                    ps2pdf(ps_path, os.path.join(structure, file))
                else:
                    print("No PostScript file produced for " + os.path.join(dirpath, file))
            except Exception as e:
                print(str(e))
            finally:
                if os.path.isfile(ps_path):
                    os.remove(ps_path)

if __name__=="__main__":
    # Raw string: "\l" is not a valid escape sequence and newer Python
    # versions warn about it. The resulting path text is unchanged.
    log_file=r"\PDF_1.3\log.txt"
    # Root folder that is scanned and root folder that receives the output.
    inputpath = '/121rawfile'
    outputpath = 'c:/'
    main(inputpath,outputpath)
    write_log(log_file)

Tags: 文件path代码clientpdftimeosmp
1条回答
网友
1楼 · 发布于 2024-06-16 12:53:02

要处理这类文件名(06032021_toibhoc_mp_01_1_col_r1.pdf、06032021_toibhoc_PP_01_1_col_r1.pdf、06032021_toind_mp_01_col_r1.pdf):

首先,您可以读取配置文件

def read_config(path='/path/filename.config'):#use your path name
    """Return the tech codes listed in a comma-separated config file.

    Each comma-separated entry on every line is stripped of surrounding
    whitespace, quotes and braces/brackets; empty entries are dropped.

    Args:
        path: Location of the config file. Defaults to the placeholder path
            this function always used.

    Returns:
        A list of tech-code strings; empty when the file has no entries
        (previously an empty file produced ``None``).
    """
    tech = []
    with open(path, 'r') as lookupfile:
        for line in lookupfile:
            for word in line.split(","):
                # Slicing off exactly one character per side kept a leading
                # "{" and dropped the newline instead of the closing quote.
                code = word.strip(' \t\r\n{}[]"\'')
                if code:
                    tech.append(code)
    return tech

然后可以使用datetime获取日期,并使用fnmatch.fnmatch匹配文件名。要同时检查“今天”和“明天”的文件,可以把这两个模式一起使用

# Illustrative fragment: `techcode` and `file` are expected to be supplied by
# the surrounding loops. Dates are formatted as DDMMYYYY.
tom_date=(datetime.today()+timedelta(days=1)).strftime("%d%m%Y")
today_date=(datetime.now()).strftime("%d%m%Y")
# One wildcard pattern per accepted date: "<date>_<techcode>*".
patterns=(tom_date+'_'+techcode+'*',today_date+'_'+techcode+'*')
# The list is non-empty (truthy) when the name matches at least one pattern.
if [pat for pat in patterns if fnmatch.fnmatch(file,pat)]:
       # do what you want

如果我没有理解错,您的完整代码大致如下:

from win32com.client.dynamic import Dispatch,ERRORS_BAD_CONTEXT,winerror
import ghostscript,locale,time,fnmatch,shutil ,os
from datetime import datetime,timedelta
def convert_to_1_3(old_pdf_file,PS_filename):
    """Open a PDF in Acrobat over COM and save it to ``PS_filename``.

    Args:
        old_pdf_file: Path of the source PDF; made absolute before opening.
        PS_filename: Path passed to ``PDDoc.Save`` as the target file.
            NOTE(review): ``PDDoc.Save`` presumably writes PDF data whatever
            the target's extension is - confirm the ``.ps`` name is intended.

    Returns:
        None. Every exception is printed instead of raised, so callers cannot
        tell from the return value whether the output file was written.
    """
    # Register E_NOTIMPL once; appending unconditionally added one duplicate
    # entry to the shared list for every converted file.
    if winerror.E_NOTIMPL not in ERRORS_BAD_CONTEXT:
        ERRORS_BAD_CONTEXT.append(winerror.E_NOTIMPL)
    # Same combined value (0x05) as before; the 0x01 bit is now named after
    # the save flag instead of the unrelated "needs save" document flag.
    PDSaveFull = 0x01
    PDSaveLinearized = 0x04
    SAVEFLAG = PDSaveFull|PDSaveLinearized
    # Bound before the try block so the cleanup in ``finally`` never touches
    # an unassigned name when creating the COM object fails.
    avDoc = None
    try:
        src = os.path.abspath(old_pdf_file)
        avDoc = Dispatch('AcroExch.AVDoc')
        if avDoc.Open(src," "):
            pdDoc = avDoc.GetPDDoc()
            pdDoc.Save(SAVEFLAG,PS_filename)
        else:
            print("Could not open " + src)
    except Exception as e:
        print(str(e))
    finally:
        if avDoc is not None:
            try:
                avDoc.Close(-1)
            except Exception as e:
                print(str(e))
       
def ps2pdf(ps_input_path, pdf_output_path):
    """Run Ghostscript's ``pdfwrite`` device on a file with compatibility level 1.3.

    Args:
        ps_input_path: Path of the input file handed to Ghostscript.
        pdf_output_path: Path of the PDF file Ghostscript is told to write.

    Raises:
        Whatever the ``ghostscript`` binding raises when the run fails.
    """
    args1 = ["ps2pdf", # actual value doesn't matter
            # Boolean switches use the documented -dNAME form; -s is meant
            # for -sNAME=string definitions such as DEVICE and OutputFile.
            "-dBATCH",
            "-dNOPAUSE",
            "-dSAFER",
            "-sDEVICE=pdfwrite",
            "-dCompatibilityLevel=1.3",
            "-sOutputFile=" + pdf_output_path,
            ps_input_path]
    # The arguments are handed over as bytes in the locale's preferred encoding.
    encoding = locale.getpreferredencoding()
    args1 = [a.encode(encoding) for a in args1]
    try:
        ghostscript.Ghostscript(*args1)
    finally:
        # Release the interpreter even when the run above raised.
        ghostscript.cleanup()

def read_config(path='/PDF_Flattening/tech_code.config'):
    """Return the tech codes listed in a comma-separated config file.

    Each comma-separated entry on every line is stripped of surrounding
    whitespace, quotes and braces/brackets; empty entries are dropped.

    Args:
        path: Location of the config file. Defaults to the path this
            function always read.

    Returns:
        A list of tech-code strings; empty when the file has no entries
        (previously an empty file produced ``None``).
    """
    tech = []
    with open(path, 'r') as lookupfile:
        for line in lookupfile:
            for word in line.split(","):
                # Slicing off exactly one character per side kept a leading
                # "{" and dropped the newline instead of the closing quote.
                code = word.strip(' \t\r\n{}[]"\'')
                if code:
                    tech.append(code)
    return tech

def main(input_file, output_root='c:\\'):
    """Convert today's/tomorrow's configured PDFs and copy every other file.

    The tree under ``input_file`` is mirrored below ``output_root``. A file
    whose target already exists is skipped. A ``.pdf`` file (any letter case)
    that does not end in ``qxd.pdf`` and whose name matches
    ``<DDMMYYYY>_<techcode>*`` for today's or tomorrow's date and any tech
    code from ``read_config()`` is run through ``convert_to_1_3`` and
    ``ps2pdf``; every other file is copied unchanged.

    Args:
        input_file: Root directory to scan.
        output_root: Root directory of the mirrored tree. Defaults to the
            ``c:\\`` root that was previously hard-coded.

    Errors are printed per directory / per file so that one failure does not
    stop the remaining files from being processed.
    """
    try:
        # Read once up front instead of re-reading the file for every name.
        techcodes = read_config() or []
    except OSError as e:
        print(str(e))
        return
    now = datetime.now()
    dates = ((now + timedelta(days=1)).strftime("%d%m%Y"), now.strftime("%d%m%Y"))
    patterns = [date + '_' + techcode + '*' for techcode in techcodes for date in dates]
    for dirpath, dirnames, filenames in os.walk(input_file):
        structure = os.path.normpath(
            os.path.join(output_root, os.path.relpath(dirpath, input_file)))
        try:
            os.makedirs(structure, exist_ok=True)
        except OSError as e:
            print(str(e))
            continue
        for file in filenames:
            source = os.path.join(dirpath, file)
            target = os.path.join(structure, file)
            if os.path.isfile(target):
                continue
            stem, ext = os.path.splitext(file)
            needs_conversion = (
                ext.lower() == '.pdf'
                and not file.lower().endswith('qxd.pdf')
                and any(fnmatch.fnmatch(file, pat) for pat in patterns))
            try:
                if needs_conversion:
                    # Built from the stem so it can never equal the source
                    # name, which the cleanup below would otherwise delete.
                    ps_path = os.path.join(dirpath, stem + '.ps')
                    try:
                        convert_to_1_3(source, ps_path)
                        ps2pdf(ps_path, target)
                    finally:
                        if os.path.isfile(ps_path):
                            os.remove(ps_path)
                else:
                    # Copy only what was not converted; the former for/else
                    # had no break, so the copy always ran and overwrote the
                    # freshly converted output with the original file.
                    shutil.copy2(source, target)
            except Exception as e:
                print(str(e))

if __name__ == "__main__":
    # Root folder to scan when the file is run as a script.
    inputpath = "/121rawfile"
    main(inputpath)

                      

相关问题 更多 >