PDFMiner - 遍历页面并将其转换为文本（text）

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Collect the outline (table of contents) titles of a PDF using PDFMiner.

Usage: pass the path of a ``.pdf`` file as the first command-line argument.

The outline titles are accumulated in ``contents``.  When the outline cannot
be read, the script falls back to walking the document page by page.
"""
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
from pdfminer.converter import LTChar, TextConverter
from pdfminer.layout import LAParams
from subprocess import call
from cStringIO import StringIO
import re
import sys
import os

argNum = len(sys.argv)
if argNum < 2:
    # Exit with a usage message instead of an IndexError traceback.
    sys.exit("usage: %s FILE.pdf" % sys.argv[0])
pdfLoc = str(sys.argv[1])  # CLI arguments


def convert_pdf_to_txt(path):
    """Return the text of every page of the PDF at *path* as one string.

    Each page is run through a PDFPageInterpreter that feeds a TextConverter
    (codec ``'utf-8'``, default ``LAParams``) writing into an in-memory buffer.

    Args:
        path: Filesystem path of the PDF to read.

    Returns:
        The accumulated contents of the converter's output buffer.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    try:
        # The context manager guarantees the file is closed even when a page
        # fails to process.
        with open(path, 'rb') as fp:
            pages = PDFPage.get_pages(
                fp,
                set(),
                maxpages=0,
                password="",
                caching=True,
                check_extractable=True,
            )
            for page in pages:
                interpreter.process_page(page)
        text = retstr.getvalue()
    finally:
        device.close()
        retstr.close()
    return text


if pdfLoc.endswith(".pdf"):
    titles = []
    with open(pdfLoc, 'rb') as fp:  # open a pdf document for reading
        parser = PDFParser(fp)
        document = PDFDocument(parser)
        try:
            # Get the outlines (contents) of the document.
            for (level, title, dest, a, se) in document.get_outlines():
                # Get raw titles, stripped of formatting: the pattern removes
                # everything up to and including the last whitespace it can
                # reach, leaving only the trailing token.
                titles.append(re.sub(r".*\s", "", title))
        except Exception:
            # If pdfMiner can't get contents then manually get contents from
            # text conversion.  `Exception` keeps this best-effort fallback
            # without also swallowing KeyboardInterrupt/SystemExit.
            #contents = convert_pdf_to_txt(pdfLoc)
            #startToCpos = contents.find("TABLE OF CONTENTS")
            #endToCpos = contents.rfind(". . .")
            #contents = contents[startToCpos:endToCpos+8]

            # PDFPage.create_pages is a classmethod that yields one PDFPage
            # per page of an already-parsed document; PDFPage does not need
            # to be instantiated by hand.
            for pageNumber, page in enumerate(PDFPage.create_pages(document)):
                if pageNumber == 42:
                    print("Hello")

            #for line in s:
            #    print line
            #    if (re.search("(\.\s){2,}", line) and not re.search("NOTES|SCOPE", line)):
            #        line = re.sub("(\.\s){2,}", "", line)
            #        line = re.sub("(\s?)*[0-9]*\n", "\n", line)
            #        line = re.sub("^\s", "", line)
            #        print line,
    # Titles gathered before any failure are kept, one per line.
    contents = "".join(title + "\n" for title in titles)
    #contents = contents.lower()
    #contents = re.sub("“", "\"", contents)
    #contents = re.sub("”", "\"", contents)
    #contents = re.sub("ﬁ", "f", contents)
    #contents = re.sub(r"(TABLE OF CONTENTS|LIST OF TABLES|SCOPE|REFERENCED DOCUMENTS|Identification|System (o|O)verview|Document (o|O)verview|Title|Page|Table|Tab)(\n)?|\.\s?|Section|[0-9]", "", contents)
    #contents = re.sub(r"This document contains proprietary information and may not be reproduced in any form whatsoever, nor may be used by or its contents divulged to third\nparties without written permission from the ownerAll rights reservedNumber: STP SMEDate: -Jul-Issue: A of CMC STPNHIndustriesCLASSIFICATION\nNATO UNCLASSIFIED AGUSTAEUROCOPTEREUROCOPTER DEUTSCHLAND FOKKER", "", contents)
    #contents = re.sub(r"(\r?\n){2,}", "", contents)
    #contents = contents.lstrip()
    #contents = contents.rstrip()
    #print contents
else:
    print("Not a valid PDF file")

2条回答

网友

1楼 · 编辑于 2024-05-15 04:36:32

建议尝试使用 PyPDF2。它用起来要简单得多，也不像 PDFMiner 那样带有许多（对您的需求而言）不必要的功能。下面用非常简单的方式实现了您想要的效果。

from PyPDF2 import PdfFileReader

# Print the extracted text of every page of a PDF, decrypting it with an
# empty password first when it is encrypted.
# NOTE: `pdf_fp` (the path of the PDF) is not defined in this snippet and
# must be set before this code runs.
with open(pdf_fp, 'rb') as pdf_file:
    # The reader is only used inside the `with` block so the file is still
    # open while pages are read, and is closed once the block exits.
    PDF = PdfFileReader(pdf_file)

    if PDF.isEncrypted:
        decrypt = PDF.decrypt('')
        if decrypt == 0:
            print("Password Protected PDF: " + pdf_fp)
            raise Exception("Nope")
        elif decrypt in (1, 2):
            print("Successfully Decrypted PDF")

    for page in PDF.pages:
        print(page.extractText())
        # page.extractText() is the unicode string of the contents of the page.
        # And I am assuming you know how to play with a string and use regex.
        # If you find what you want just break like so:
        #     if some_condition == True:
        #         break

网友

2楼 · 编辑于 2024-05-15 04:36:32

也许我有点晚了，你已经解决了这个问题，但是，为了将来的参考：

经过大量搜索，我想起了这个链接，并从中摘录以下部分（相关内容已加粗）：

Python decided to do methods in a way that makes the instance to which the method belongs be passed automatically, but not received automatically: the first parameter of methods is the instance the method is called on. That makes methods entirely the same as functions, and leaves the actual name to use up to you (although self is the convention, and people will generally frown at you when you use something else.) self is not special to the code, it's just another object.

相关问题更多 >

编程相关推荐

热门问题

热门文章