根据大纲拆分PDF
我想用pyPdf这个工具来根据PDF文件的目录结构,把一个PDF文件拆分开来。目录里的每个目标都指向PDF中的不同页面。
举个例子,目录结构如下:
main --> points to page 1 sect1 --> points to page 1 sect2 --> points to page 15 sect3 --> points to page 22
在pyPdf里,逐页查看文档或查看文档目录中的每个目标是很简单的;不过,我就是搞不清楚怎么找到每个目标所指向的页面号码。
有没有人知道怎么找到目录中每个目标对应的页面号码呢?
6 个回答
1
这是对@darrell类的小更新,目的是让它能够解析UTF-8格式的目录。我把这个更新放在这里作为回答,因为在评论里说可能不太好理解。
问题出在pyPdf.pdf.Destination.title
这个地方,它可能会有两种返回形式:
pyPdf.generic.TextStringObject
pyPdf.generic.ByteStringObject
所以从_setup_outline_page_ids()
这个函数返回的title
对象也会有这两种类型。如果目录标题里包含了ASCII以外的字符,就会出现UnicodeDecodeError
的错误。
为了解决这个问题,我添加了以下代码:
if isinstance(title, pyPdf.generic.TextStringObject):
title = title.encode('utf-8')
这是整个类的代码:
class PdfOutline(pyPdf.PdfFileReader):
def getDestinationPageNumbers(self):
def _setup_outline_page_ids(outline, _result=None):
if _result is None:
_result = {}
for obj in outline:
if isinstance(obj, pyPdf.pdf.Destination):
_result[(id(obj), obj.title)] = obj.page.idnum
elif isinstance(obj, list):
_setup_outline_page_ids(obj, _result)
return _result
def _setup_page_id_to_num(pages=None, _result=None, _num_pages=None):
if _result is None:
_result = {}
if pages is None:
_num_pages = []
pages = self.trailer["/Root"].getObject()["/Pages"].getObject()
t = pages["/Type"]
if t == "/Pages":
for page in pages["/Kids"]:
_result[page.idnum] = len(_num_pages)
_setup_page_id_to_num(page.getObject(), _result, _num_pages)
elif t == "/Page":
_num_pages.append(1)
return _result
outline_page_ids = _setup_outline_page_ids(self.getOutlines())
page_id_to_page_numbers = _setup_page_id_to_num()
result = {}
for (_, title), page_idnum in outline_page_ids.iteritems():
if isinstance(title, pyPdf.generic.TextStringObject):
title = title.encode('utf-8')
result[title] = page_id_to_page_numbers.get(page_idnum, '???')
return result
1
这正是我想要的。Darrell对PdfFileReader的改进应该加入到PyPDF2里。
我写了一个小程序,利用PyPDF2和sejda-console来根据书签拆分PDF。在我的情况下,有几个一级章节我想要保持在一起。这个脚本让我可以做到这一点,并给生成的文件起个有意义的名字。
import operator
import os
import subprocess
import sys
import time
import PyPDF2 as pyPdf
# need to have sejda-console installed
# change this to point to your installation
sejda = 'C:\\sejda-console-1.0.0.M2\\bin\\sejda-console.bat'
class Darrell(pyPdf.PdfFileReader):
...
if __name__ == '__main__':
t0= time.time()
# get the name of the file to split as a command line arg
pdfname = sys.argv[1]
# open up the pdf
pdf = Darrell(open(pdfname, 'rb'))
# build list of (pagenumbers, newFileNames)
splitlist = [(1,'FrontMatter')] # Customize name of first section
template = '%-5s %s'
print template % ('Page', 'Title')
print '-'*72
for t,p in sorted(pdf.getDestinationPageNumbers().iteritems(),
key=operator.itemgetter(1)):
# Customize this to get it to split where you want
if t.startswith('Chapter') or \
t.startswith('Preface') or \
t.startswith('References'):
print template % (p+1, t)
# this customizes how files are renamed
new = t.replace('Chapter ', 'Chapter')\
.replace(': ', '-')\
.replace(': ', '-')\
.replace(' ', '_')
splitlist.append((p+1, new))
# call sejda tools and split document
call = sejda
call += ' splitbypages'
call += ' -f "%s"'%pdfname
call += ' -o ./'
call += ' -n '
call += ' '.join([str(p) for p,t in splitlist[1:]])
print '\n', call
subprocess.call(call)
print '\nsejda-console has completed.\n\n'
# rename the split files
for p,t in splitlist:
old ='./%i_'%p + pdfname
new = './' + t + '.pdf'
print 'renaming "%s"\n to "%s"...'%(old, new),
try:
os.remove(new)
except OSError:
pass
try:
os.rename(old, new)
print' succeeded.\n'
except:
print' failed.\n'
print '\ndone. Spliting took %.2f seconds'%(time.time() - t0)
11
我搞明白了:
class Darrell(pyPdf.PdfFileReader):
def getDestinationPageNumbers(self):
def _setup_outline_page_ids(outline, _result=None):
if _result is None:
_result = {}
for obj in outline:
if isinstance(obj, pyPdf.pdf.Destination):
_result[(id(obj), obj.title)] = obj.page.idnum
elif isinstance(obj, list):
_setup_outline_page_ids(obj, _result)
return _result
def _setup_page_id_to_num(pages=None, _result=None, _num_pages=None):
if _result is None:
_result = {}
if pages is None:
_num_pages = []
pages = self.trailer["/Root"].getObject()["/Pages"].getObject()
t = pages["/Type"]
if t == "/Pages":
for page in pages["/Kids"]:
_result[page.idnum] = len(_num_pages)
_setup_page_id_to_num(page.getObject(), _result, _num_pages)
elif t == "/Page":
_num_pages.append(1)
return _result
outline_page_ids = _setup_outline_page_ids(self.getOutlines())
page_id_to_page_numbers = _setup_page_id_to_num()
result = {}
for (_, title), page_idnum in outline_page_ids.iteritems():
result[title] = page_id_to_page_numbers.get(page_idnum, '???')
return result
pdf = Darrell(open(PATH-TO-PDF, 'rb'))
template = '%-5s %s'
print template % ('page', 'title')
for p,t in sorted([(v,k) for k,v in pdf.getDestinationPageNumbers().iteritems()]):
print template % (p+1,t)