使用pdfminer将PDF按页拆分

2024-04-24 11:06:12 发布

您现在位置:Python中文网/ 问答频道 /正文

我正在尝试逐页提取pdf并将结果存储在字典中,如下所示:

from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from cStringIO import StringIO
import re

def convert_pdf_to_txt(path):
    """Run every page of the PDF at *path* through pdfminer's text converter.

    Returns a dict keyed by 1-based page number. Each value is the content
    of the output buffer right after that page was processed, with runs of
    spaces collapsed to a single space.

    NOTE: one StringIO buffer is shared by all pages and is never cleared,
    so the value stored for page N is the accumulated text of pages 1..N,
    not the text of page N alone.
    """
    resource_manager = PDFResourceManager()
    buffer = StringIO()
    converter = TextConverter(resource_manager, buffer, codec='utf-8', laparams=LAParams())
    pdf_file = file(path, 'rb')
    page_interpreter = PDFPageInterpreter(resource_manager, converter)

    pages = PDFPage.get_pages(pdf_file, set(), maxpages=0, password="", caching=True, check_extractable=True)

    pages_text = {}
    for number, page in enumerate(pages, 1):
        page_interpreter.process_page(page)
        # getvalue() returns everything written to the buffer so far.
        pages_text[number] = re.sub(' +', ' ', buffer.getvalue())
    return pages_text

# Print the entry stored under key 3 of the dict returned for 'Aak.pdf'.
print convert_pdf_to_txt('Aak.pdf')[3]

但无论我取哪一页,得到的结果都包含之前所有页面的内容。请问该如何解决这个问题?


Tags: from import pdfpage password pdfminer caching codec
1条回答
网友
1楼 · 发布于 2024-04-24 11:06:12

这应该行得通。

from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from cStringIO import StringIO

import os

def set_interpreter():
    """Build a fresh resource manager, output buffer, converter and interpreter.

    Returns a dict with three entries:
        'retstr'      -- the new StringIO buffer handed to the TextConverter
        'device'      -- the TextConverter created on that buffer
        'interpreter' -- a PDFPageInterpreter created on that device
    """
    resource_manager = PDFResourceManager()
    output = StringIO()
    converter = TextConverter(resource_manager, output, codec='utf-8', laparams=LAParams())
    return {
        'retstr': output,
        'device': converter,
        'interpreter': PDFPageInterpreter(resource_manager, converter),
    }

def convert_pdf_to_txt(path):
    """Extract the text of each page of the PDF at *path* separately.

    Every page is processed with a fresh buffer/device/interpreter obtained
    from set_interpreter(), so each buffer receives the output of a single
    page only. The text of page N (0-based) is written to the relative file
    'pagetext_N.txt' and is also collected in the returned list.

    path: path of the PDF file to read.

    Returns a list of page texts in page order; index N matches the
    content written to 'pagetext_N.txt'.
    """
    password = ""
    maxpages = 0
    caching = True
    pagenos = set()
    page_texts = []

    # 'with' guarantees the PDF is closed even if a page fails to process.
    with open(path, 'rb') as fp:
        pages = PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password,
                                  caching=caching, check_extractable=True)
        for page_number, page in enumerate(pages):
            si = set_interpreter()
            retstr = si['retstr']
            device = si['device']
            try:
                si['interpreter'].process_page(page)
                # Read the buffer before closing it; a closed buffer cannot
                # be read any more.
                text = retstr.getvalue()
            finally:
                # Release this page's device and buffer; a new pair is
                # created for the next page.
                device.close()
                retstr.close()

            with open('pagetext_%d.txt' % page_number, 'w+') as fpp:
                fpp.write(text)
            page_texts.append(text)

    # The original ended with `return text`, a name that was never defined.
    return page_texts

# Resolve 'filename.pdf' to a full path via the directory of its real path,
# pass that path to convert_pdf_to_txt and print what the call returns.
print convert_pdf_to_txt(os.path.dirname(os.path.realpath('filename.pdf')) + "/filename.pdf")

相关问题 更多 >