使用pdfminer通过URL解析PDF文件的Python方法
我正在尝试解析一个PDF文件,但不想先把它从网站上下载下来。把这个文件保存到硬盘上再解析时一切正常,但运行下面这个直接从网址读取的脚本时,却在以下这两行出了问题:
if not document.is_extractable:
raise PDFTextExtractionNotAllowed
我觉得我在整合网址的时候搞错了。
import sys
import getopt
import urllib2
import datetime
import re
from pdfminer.pdfparser import PDFParser
from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter, PDFConverter, LTContainer, LTText, LTTextBox, LTImage
from pdfminer.layout import LAParams
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter, process_pdf
from urllib2 import Request
# Define a PDF parser function
def parsePDF(url):
    """Download the PDF at *url* into memory and return its extracted text.

    The document is never written to disk: the HTTP response body is wrapped
    in an in-memory file object and handed to pdfminer.

    Args:
        url: Address of the PDF document to fetch and parse.

    Returns:
        The text written by ``TextConverter`` (using the UTF-8 codec) for
        every page of the document.

    Raises:
        PDFTextExtractionNotAllowed: If the document does not allow text
            extraction.
    """
    # Imported locally because the file-level imports do not provide them.
    from io import BytesIO
    from pdfminer.pdfdocument import PDFDocument, PDFTextExtractionNotAllowed
    from pdfminer.pdfpage import PDFPage

    # Read the whole response body, then release the connection. urllib2
    # responses are not context managers, hence the explicit try/finally.
    response = urllib2.urlopen(Request(url))
    try:
        pdf_bytes = response.read()
    finally:
        response.close()

    # Wrap the raw bytes in a file-like object for the parser.
    memory_file = BytesIO(pdf_bytes)
    try:
        # Create a PDF parser object associated with the in-memory file.
        parser = PDFParser(memory_file)
        # Create a PDF document object that stores the document structure.
        document = PDFDocument(parser)
        # Check if the document allows text extraction. If not, abort.
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed

        # Define the parameters of the PDF device object.
        rsrcmgr = PDFResourceManager()
        retstr = BytesIO()
        laparams = LAParams()
        codec = 'utf-8'
        # Create the PDF device object. TextConverter is the device that
        # actually writes the extracted text to ``retstr``; the PDFDevice
        # base class does not accept an output stream.
        device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
        try:
            # Create a PDF interpreter object.
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            # Process each page contained in the document.
            for page in PDFPage.create_pages(document):
                interpreter.process_page(page)
        finally:
            device.close()
        return retstr.getvalue()
    finally:
        memory_file.close()
# URL of the remote PDF that the script is meant to parse.
url = 'http://www.city.pittsburgh.pa.us/police/blotter/blotter_monday.pdf'
2 个回答
2
在上面的回答基础上,我找到了一种小技巧,效果非常好!以下是我版本的函数:
def pdf_from_url_to_txt(url):
    """Fetch the PDF at *url* and return its text without saving the file.

    Args:
        url: Address of the PDF document to fetch.

    Returns:
        The text that ``TextConverter`` wrote for all pages of the document.
    """
    # The response is used as a context manager so the connection is
    # released even if reading the body fails.
    with urllib.request.urlopen(url) as response:
        pdf_bytes = response.read()

    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    try:
        codec = 'utf-8'
        laparams = LAParams()
        device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
        fp = BytesIO(pdf_bytes)
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            password = ""
            maxpages = 0
            caching = True
            pagenos = set()
            for page in PDFPage.get_pages(fp,
                                          pagenos,
                                          maxpages=maxpages,
                                          password=password,
                                          caching=caching,
                                          check_extractable=True):
                interpreter.process_page(page)
        finally:
            # Close the input and the device even when a page fails to parse.
            fp.close()
            device.close()
        # Read the result only after the device is closed, as the original
        # did; named ``text`` so the builtin ``str`` is not shadowed.
        text = retstr.getvalue()
    finally:
        retstr.close()
    return text
2
根据你自己的回答以及此链接中提供的函数,下面这段代码可以直接从网址上的PDF文件中提取出文本字符串,而不需要先下载这个文件:
import urllib2
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from cStringIO import StringIO
def pdf_from_url_to_txt(url):
    """Fetch the PDF at *url* and return its text without saving the file.

    Args:
        url: Address of the PDF document to fetch.

    Returns:
        The text that ``TextConverter`` wrote (using the UTF-8 codec) for all
        pages of the document.
    """
    # Open the url provided as an argument to the function and read the
    # content. urllib2 responses are not context managers, so the connection
    # is released with an explicit try/finally.
    response = urllib2.urlopen(urllib2.Request(url))
    try:
        pdf_data = response.read()
    finally:
        response.close()

    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    try:
        codec = 'utf-8'
        laparams = LAParams()
        device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
        # Wrap the downloaded data in a file-like object for pdfminer.
        fp = StringIO(pdf_data)
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            password = ""
            maxpages = 0
            caching = True
            pagenos = set()
            for page in PDFPage.get_pages(fp,
                                          pagenos,
                                          maxpages=maxpages,
                                          password=password,
                                          caching=caching,
                                          check_extractable=True):
                interpreter.process_page(page)
        finally:
            # Close the input and the device even when a page fails to parse.
            fp.close()
            device.close()
        # Read the result only after the device is closed, as the original
        # did; named ``text`` so the builtin ``str`` is not shadowed.
        text = retstr.getvalue()
    finally:
        retstr.close()
    return text