如何获取书签的页码
from typing import List
from PyPDF2 import PdfFileReader
from PyPDF2.generic import Destination
def get_outlines(pdf_filepath: str) -> List[Destination]:
    """Get the bookmarks (outline entries) of a PDF file.

    Args:
        pdf_filepath: Path of the PDF file to read.

    Returns:
        The value returned by ``PdfFileReader.getOutlines()`` for the file.
        NOTE(review): PyPDF2 appears to represent child bookmarks as nested
        lists, so the result may not be a flat list of ``Destination``
        objects -- confirm against the PyPDF2 version in use.
    """
    # Read the outlines before the ``with`` block closes the file.
    with open(pdf_filepath, "rb") as fp:
        pdf_file_reader = PdfFileReader(fp)
        return pdf_file_reader.getOutlines()
# Example usage: print the outline entries of a sample PDF.
print(get_outlines("PDF-export-example.pdf"))
`pyPdf.pdf.Destination` 有很多属性,但我找不到任何与书签页码相关的信息。我该如何获取书签的页码呢?
举个例子,`outlines[1].page.idnum` 返回的数字大约是 PDF 文档中实际页码的三倍。我猜这个数字指向的是某种比页面更小的对象:对整个 PDF 文档的目录逐项读取 `.page.idnum`,得到的一组数字与文档中的“真实”页码并不成线性关系,只是大致为其三倍。
更新:这个问题和这个链接中的内容是一样的:根据目录拆分 PDF,不过我不太明白作者在他的自答中做了什么。对我来说,看起来太复杂了,无法使用。
4 个回答
3
在2019年,对于那些想要更快方法的人,可以使用:
from PyPDF2 import PdfFileReader
def printPageNumberFrom(filename):
    """Print the 1-based page number of every bookmark in a PDF file.

    Nested outline entries (lists) are walked recursively, so
    sub-bookmarks are printed as well instead of being passed to
    ``getDestinationPageNumber`` as if they were a single destination.

    Args:
        filename: Path of the PDF file to read.
    """
    def print_page_numbers(pdf, bookmarks):
        # Print one page number per bookmark, descending into nested lists.
        for b in bookmarks:
            if isinstance(b, list):
                print_page_numbers(pdf, b)
                continue
            print(pdf.getDestinationPageNumber(b) + 1)  # page count starts from 0

    # All reading happens while the file is still open.
    with open(filename, "rb") as f:
        pdf = PdfFileReader(f)
        print_page_numbers(pdf, pdf.getOutlines())
7
根据 vjayky 和 Giulio D 的建议,下面的代码会递归处理嵌套的书签。
需要 PyPDF2 版本 ≥ 1.25。
from PyPDF2 import PdfFileReader
def printBookmarksPageNumbers(pdf):
    """Print every bookmark title together with its 1-based page number.

    Each nesting level of the outline is printed with four extra spaces of
    indentation.

    Args:
        pdf: Reader object providing ``getOutlines()`` and
            ``getDestinationPageNumber()`` (e.g. a ``PdfFileReader``).
    """
    def reviewAndPrintBookmarks(bookmarks, indent=0):
        for b in bookmarks:
            # A nested list is a deeper outline level: recurse with more indent.
            if isinstance(b, list):
                reviewAndPrintBookmarks(b, indent + 4)
                continue
            pg_num = pdf.getDestinationPageNumber(b) + 1  # page count starts from 0
            print("%s%s: Page %s" % (" " * indent, b.title, pg_num))

    reviewAndPrintBookmarks(pdf.getOutlines())
# Print the bookmarks while the PDF file is still open.
with open('document.pdf', "rb") as f:
    pdf = PdfFileReader(f)
    printBookmarksPageNumbers(pdf)
如果使用的 PyPDF2 版本低于 1.25,请改用下面的写法:
from PyPDF2 import PdfFileReader
def printBookmarksPageNumbers(pdf):
    """Print every bookmark title together with its 1-based page number.

    This variant does not call ``getDestinationPageNumber``; it builds its
    own lookup from page object ids to page indices instead. Each nesting
    level of the outline is printed with four extra spaces of indentation.

    Args:
        pdf: Reader object providing ``getNumPages()``, ``getPage()`` and
            ``getOutlines()`` (e.g. a ``PdfFileReader``).

    Raises:
        KeyError: If a bookmark refers to a page id that is not among the
            ids of the reader's pages.
    """
    # Map page ids to page numbers
    pg_id_to_num = {
        pdf.getPage(pg_num).indirectRef.idnum: pg_num
        for pg_num in range(pdf.getNumPages())
    }

    def reviewAndPrintBookmarks(bookmarks, indent=0):
        for b in bookmarks:
            # A nested list is a deeper outline level: recurse with more indent.
            if isinstance(b, list):
                reviewAndPrintBookmarks(b, indent + 4)
                continue
            pg_num = pg_id_to_num[b.page.idnum] + 1  # page count starts from 0
            print("%s%s: Page %s" % (" " * indent, b.title, pg_num))

    reviewAndPrintBookmarks(pdf.getOutlines())
# Print the bookmarks while the PDF file is still open.
with open('document.pdf', "rb") as f:
    pdf = PdfFileReader(f)
    printBookmarksPageNumbers(pdf)
示例输出(两种方法的结果):
Bookmark 1: Page 1
    Bookmark 1.1: Page 2
    Bookmark 1.2: Page 3
Bookmark 2: Page 4
Bookmark 3: Page 5
    Bookmark 3.1: Page 6
10
正如@theta提到的,"根据大纲拆分PDF"这个帖子里有提取页码所需的代码。如果你觉得这很复杂,我复制了一部分代码,这部分代码可以把页面ID映射到页码,并把它做成了一个函数。下面是一个可以工作的例子,它会打印出书签o[0]的页码:
from PyPDF2 import PdfFileReader
def _setup_page_id_to_num(pdf, pages=None, _result=None, _num_pages=None):
    """Map the object id of each page-tree node to a 0-based page index.

    The tree is walked depth-first. Every kid of a ``/Pages`` node is
    recorded with the number of ``/Page`` leaves seen so far, so a leaf maps
    to its own index and an intermediate ``/Pages`` node maps to the index
    of the first page visited beneath it.

    Args:
        pdf: Reader whose ``trailer["/Root"]`` leads to the ``/Pages`` tree.
        pages: Page-tree node to walk. ``None`` starts from the document's
            root ``/Pages`` node and resets the page counter.
        _result: Accumulator dict shared between recursive calls.
        _num_pages: Accumulator list that receives one entry per ``/Page``
            leaf; its length is the index of the next page.

    Returns:
        The dict mapping ``idnum`` to page index.
    """
    if _result is None:
        _result = {}
    if pages is None:
        _num_pages = []
        pages = pdf.trailer["/Root"].getObject()["/Pages"].getObject()
    elif _num_pages is None:
        # Starting from an explicit subtree without a counter: count from 0
        # instead of failing on len(None).
        _num_pages = []

    t = pages["/Type"]
    if t == "/Pages":
        for page in pages["/Kids"]:
            # Record the index before descending; the leaf bumps the counter.
            _result[page.idnum] = len(_num_pages)
            _setup_page_id_to_num(pdf, page.getObject(), _result, _num_pages)
    elif t == "/Page":
        _num_pages.append(1)
    return _result
# main
# Use a context manager so the file handle is closed once reading is done.
with open('document.pdf', 'rb') as f:
    p = PdfFileReader(f)
    # map page ids to page numbers
    pg_id_num_map = _setup_page_id_to_num(p)
    o = p.getOutlines()
    # The map is 0-based; add 1 to get the page number of the first bookmark.
    pg_num = pg_id_num_map[o[0].page.idnum] + 1
print(pg_num)
可能对@theta来说已经太晚了,但这可能对其他人有帮助 :) 顺便说一下,这是我在stackoverflow上的第一篇帖子,如果我没有遵循通常的格式,请多多包涵。
进一步扩展:如果你还想获取书签在页面上的确切位置,下面的代码可以帮到你:
from PyPDF2 import PdfFileReader
import PyPDF2 as pyPdf
def _setup_page_id_to_num(pdf, pages=None, _result=None, _num_pages=None):
    """Map the object id of each page-tree node to a 0-based page index.

    The tree is walked depth-first. Every kid of a ``/Pages`` node is
    recorded with the number of ``/Page`` leaves seen so far, so a leaf maps
    to its own index and an intermediate ``/Pages`` node maps to the index
    of the first page visited beneath it.

    Args:
        pdf: Reader whose ``trailer["/Root"]`` leads to the ``/Pages`` tree.
        pages: Page-tree node to walk. ``None`` starts from the document's
            root ``/Pages`` node and resets the page counter.
        _result: Accumulator dict shared between recursive calls.
        _num_pages: Accumulator list that receives one entry per ``/Page``
            leaf; its length is the index of the next page.

    Returns:
        The dict mapping ``idnum`` to page index.
    """
    if _result is None:
        _result = {}
    if pages is None:
        _num_pages = []
        pages = pdf.trailer["/Root"].getObject()["/Pages"].getObject()
    elif _num_pages is None:
        # Starting from an explicit subtree without a counter: count from 0
        # instead of failing on len(None).
        _num_pages = []

    t = pages["/Type"]
    if t == "/Pages":
        for page in pages["/Kids"]:
            # Record the index before descending; the leaf bumps the counter.
            _result[page.idnum] = len(_num_pages)
            _setup_page_id_to_num(pdf, page.getObject(), _result, _num_pages)
    elif t == "/Page":
        _num_pages.append(1)
    return _result
def outlines_pg_zoom_info(outlines, pg_id_num_map, result=None):
    """Collect title, position and page number for every bookmark.

    Args:
        outlines: A ``Destination`` or an arbitrarily nested list of them.
            Anything else is ignored.
        pg_id_num_map: Dict from page object id to 0-based page index.
        result: Dict to add entries to; a new one is created when ``None``.

    Returns:
        ``result``. Each bookmark is stored under the first
        whitespace-separated word of its title, as a dict with the keys
        ``title``, ``top``, ``left`` and the 1-based ``page``. Bookmarks
        whose titles share that first word overwrite each other.
    """
    if result is None:
        result = {}
    if isinstance(outlines, list):
        for outline in outlines:
            outlines_pg_zoom_info(outline, pg_id_num_map, result)
    elif isinstance(outlines, pyPdf.pdf.Destination):
        title = outlines['/Title']
        result[title.split()[0]] = dict(
            title=title,
            # NOTE(review): not every destination type seems to carry /Top
            # and /Left (e.g. "fit" style destinations); .get() stores None
            # for those instead of raising KeyError -- confirm.
            top=outlines.get('/Top'),
            left=outlines.get('/Left'),
            page=pg_id_num_map[outlines.page.idnum] + 1,
        )
    return result
# main
pdf_name = 'document.pdf'
# Use a context manager so the file handle is closed once reading is done.
with open(pdf_name, 'rb') as f:
    pdf = PdfFileReader(f)
    # map page ids to page numbers
    pg_id_num_map = _setup_page_id_to_num(pdf)
    outlines = pdf.getOutlines()
    bookmarks_info = outlines_pg_zoom_info(outlines, pg_id_num_map)
print(bookmarks_info)
注意:我的书签标题以章节编号开头(例如:1.1 引言),所以我用章节编号作为键来保存书签信息。如果你的书签格式不同,请修改代码的这一部分:
elif type(outlines) == pyPdf.pdf.Destination:
title = outlines['/Title']
result[title.split()[0]] = dict(title=outlines['/Title'], top=outlines['/Top'], \
left=outlines['/Left'], page=(pg_id_num_map[outlines.page.idnum]+1))