如何在Python中为PDF每页顶部添加超链接?
我们正在一个网站上发布扫描和OCR处理过的文档,需要在每一页上添加一个链接,这样通过搜索引擎找到这些页面的人就能轻松访问相关文档的父索引。
我一直在尝试用Python和pypdf来实现这个功能,但到目前为止还没有成功。我的计划是为每个文档创建一个只包含超链接的页面(每个文档的链接需要不同),然后把这个页面合并到文档中的每一页。我尝试过在StackOverflow上找到的代码(大多是针对PyPDF2的,我知道这个库已经不再维护了,而且要迁移到pypdf需要做大量修改),但都没有成功。
例如,我尝试了pypdf文档页面上的示例代码:
from pypdf import PdfWriter, PdfReader
# Use the first page of the prepared link-only PDF as the stamp.
stamp = PdfReader("bg.pdf").pages[0]
# Clone the whole source document into the writer so its pages can be edited.
writer = PdfWriter(clone_from="source.pdf")
# Merge the stamp into every page; the loop body must be indented to parse.
for page in writer.pages:
    page.merge_page(stamp, over=False)  # here set to False for watermarking
# Write the result once, after all pages have been processed.
writer.write("out.pdf")
结果生成了损坏的PDF文件。
2 个回答
-1
这段内容包含了一些代码,具体的代码如下:
from PyPDF2 import PdfWriter, PdfReader
from PyPDF2.generic import (
    ArrayObject,
    DictionaryObject,
    FloatObject,
    NameObject,
    NumberObject,
    TextStringObject,
)


# Function to add hyperlink to each page
def add_hyperlink_to_page(page, hyperlink_text, hyperlink_url, x=100, y=100):
    """Attach a clickable URI link annotation to ``page`` in place.

    The annotation only defines a clickable rectangle; it does not draw any
    visible text on the page.

    Args:
        page: The PyPDF2 page object to modify.
        hyperlink_text: Human-readable description of the link, stored in the
            annotation's ``/Contents`` entry.
        hyperlink_url: Target URL opened when the rectangle is clicked.
        x: Left edge of the clickable rectangle, in PDF user-space units.
        y: Bottom edge of the clickable rectangle, in PDF user-space units.

    Returns:
        None. ``page`` is mutated.
    """
    # Every value must be a PyPDF2 PdfObject: DictionaryObject.update()
    # rejects plain Python lists and dicts.
    link_annotation = DictionaryObject()
    link_annotation.update({
        NameObject("/Type"): NameObject("/Annot"),
        NameObject("/Subtype"): NameObject("/Link"),
        # The clickable area is a fixed 100 x 20 box anchored at (x, y).
        NameObject("/Rect"): ArrayObject([
            FloatObject(x),
            FloatObject(y),
            FloatObject(x + 100),
            FloatObject(y + 20),
        ]),
        NameObject("/Border"): ArrayObject([
            NumberObject(0),
            NumberObject(0),
            NumberObject(0),
        ]),  # No border
        NameObject("/Contents"): TextStringObject(hyperlink_text),
        NameObject("/A"): DictionaryObject({
            NameObject("/Type"): NameObject("/Action"),
            NameObject("/S"): NameObject("/URI"),
            NameObject("/URI"): TextStringObject(hyperlink_url),
        }),
    })

    # Append to an existing annotation array instead of replacing it, so
    # annotations already on the page survive.
    if "/Annots" in page:
        # NOTE(review): indexing is expected to resolve an indirect reference
        # to the underlying array - confirm for the PyPDF2 version in use.
        page["/Annots"].append(link_annotation)
    else:
        page[NameObject("/Annots")] = ArrayObject([link_annotation])
# Open the source PDF file
with open("source.pdf", "rb") as source_file:
    reader = PdfReader(source_file)
    writer = PdfWriter()
    # Iterate over the pages directly; no index is needed.
    for page in reader.pages:
        # Add the hyperlink to the current page
        add_hyperlink_to_page(page, "Parent Index", "https://your-parent-index-url.com")
        # Add the modified page to the output PDF
        writer.add_page(page)
    # Write the output PDF to a file. This stays inside the outer ``with``
    # so the source stream is still open while the writer serialises pages
    # that came from the reader.
    with open("output.pdf", "wb") as output_file:
        writer.write(output_file)
以上就是具体的代码。这里的代码是用Python(而不是JavaScript)写的,用来为PDF的每一页添加超链接。你可以把它想象成一个小工具,帮助你完成特定的任务。具体的代码内容需要查看以下代码
from PyPDF2 import PdfWriter, PdfReader
from PyPDF2.generic import (
    ArrayObject,
    DictionaryObject,
    FloatObject,
    NameObject,
    NumberObject,
    TextStringObject,
)


# Function to add hyperlink to each page
def add_hyperlink_to_page(page, hyperlink_text, hyperlink_url, x=100, y=100):
    """Attach a clickable URI link annotation to ``page`` in place.

    The annotation only defines a clickable rectangle; it does not draw any
    visible text on the page.

    Args:
        page: The PyPDF2 page object to modify.
        hyperlink_text: Human-readable description of the link, stored in the
            annotation's ``/Contents`` entry.
        hyperlink_url: Target URL opened when the rectangle is clicked.
        x: Left edge of the clickable rectangle, in PDF user-space units.
        y: Bottom edge of the clickable rectangle, in PDF user-space units.

    Returns:
        None. ``page`` is mutated.
    """
    # Every value must be a PyPDF2 PdfObject: DictionaryObject.update()
    # rejects plain Python lists and dicts.
    link_annotation = DictionaryObject()
    link_annotation.update({
        NameObject("/Type"): NameObject("/Annot"),
        NameObject("/Subtype"): NameObject("/Link"),
        # The clickable area is a fixed 100 x 20 box anchored at (x, y).
        NameObject("/Rect"): ArrayObject([
            FloatObject(x),
            FloatObject(y),
            FloatObject(x + 100),
            FloatObject(y + 20),
        ]),
        NameObject("/Border"): ArrayObject([
            NumberObject(0),
            NumberObject(0),
            NumberObject(0),
        ]),  # No border
        NameObject("/Contents"): TextStringObject(hyperlink_text),
        NameObject("/A"): DictionaryObject({
            NameObject("/Type"): NameObject("/Action"),
            NameObject("/S"): NameObject("/URI"),
            NameObject("/URI"): TextStringObject(hyperlink_url),
        }),
    })

    # Append to an existing annotation array instead of replacing it, so
    # annotations already on the page survive.
    if "/Annots" in page:
        # NOTE(review): indexing is expected to resolve an indirect reference
        # to the underlying array - confirm for the PyPDF2 version in use.
        page["/Annots"].append(link_annotation)
    else:
        page[NameObject("/Annots")] = ArrayObject([link_annotation])
# Open the source PDF file
with open("source.pdf", "rb") as source_file:
    reader = PdfReader(source_file)
    writer = PdfWriter()
    # Iterate over the pages directly; no index is needed.
    for page in reader.pages:
        # Add the hyperlink to the current page
        add_hyperlink_to_page(page, "Parent Index", "https://your-parent-index-url.com")
        # Add the modified page to the output PDF
        writer.add_page(page)
    # Write the output PDF to a file. This stays inside the outer ``with``
    # so the source stream is still open while the writer serialises pages
    # that came from the reader.
    with open("output.pdf", "wb") as output_file:
        writer.write(output_file)
才能了解它的作用。
1
这对我来说非常有效:
前提条件:
pip install pypdf==4.1.0
pip install fpdf2==2.7.8
代码:
import fpdf # pip install fpdf2
from fpdf.enums import XPos, YPos
import pypdf
from pypdf import PdfReader, PdfWriter
def generate_overlay(target_path: str, text: str, link: str) -> None:
    """Create a one-page PDF whose header is a clickable text link.

    Args:
        target_path: Where the generated overlay PDF is written.
        text: Visible link text drawn in the page header.
        link: URL attached to the header cell.
    """

    class PDF(fpdf.FPDF):
        def header(self) -> None:
            """Draw the link cell in bold blue Helvetica 12 pt."""
            self.set_font("helvetica", "B", 12)
            self.set_text_color(0, 0, 255)  # Blue color for the link
            # Measure with ``self`` rather than the enclosing ``pdf`` variable,
            # so the width always reflects the font state of the document
            # actually being rendered.
            link_width = self.get_string_width(text)
            link_height = 10
            self.cell(
                link_width,
                link_height,
                text=text,
                new_x=XPos.RIGHT,
                new_y=YPos.TOP,
                align="C",
                link=link,
            )

    pdf = PDF()
    # NOTE(review): fpdf2 is expected to call header() from add_page() - confirm.
    pdf.add_page()
    pdf.output(target_path)
def stamp(original_path: str, stamp_path: str, out_path: str) -> None:
    """Merge the first page of one PDF into every page of another.

    Args:
        original_path: PDF whose pages receive the stamp.
        stamp_path: PDF whose first page is used as the stamp.
        out_path: Destination of the stamped document.
    """
    overlay_page = PdfReader(stamp_path).pages[0]
    writer = PdfWriter(clone_from=original_path)
    for target_page in writer.pages:
        # NOTE(review): over=False should place the stamp beneath the
        # existing page content - confirm against the pypdf documentation.
        target_page.merge_page(overlay_page, over=False)
    writer.write(out_path)
if __name__ == "__main__":
    # Report library versions so the output can be compared with the
    # versions this example was written for.
    print(f"pypdf=={pypdf.__version__}")
    print(f"fpdf2=={fpdf.__version__}")
    stamp_path = "stamp.pdf"
    generate_overlay(stamp_path, "py-pdf.github.io", "https://py-pdf.github.io")
    # Keyword arguments make the roles explicit: "GeoTopo.pdf" is the document
    # being stamped and ``stamp_path`` is the generated overlay. Passing them
    # positionally in the other order stamps the overlay file instead.
    stamp(original_path="GeoTopo.pdf", stamp_path=stamp_path, out_path="out.pdf")
它应该输出:
pypdf==4.1.0
fpdf2==2.7.8