在PDF文件中使用Python搜索
这里是PDF结构的一部分:
5 0 obj
<< /Length 56 >>
stream
BT /F1 12 Tf 100 700 Td 15 TL (JavaScript example) Tj ET
endstream
endobj
6 0 obj
<<
/Type /Font
/Subtype /Type1
/Name /F1
/BaseFont /Helvetica
/Encoding /MacRomanEncoding
>>
endobj
7 0 obj
<<
/Type /Action
/S /JavaScript
我想检查文件中是否存在“javascript”这个词。问题在于,“javascript”中的字符可能全部或部分以十六进制转义(#xx)的形式书写,比如“javascript”、“Jav#61Script”或者“J#61v#61Script”等等。
那么,考虑到所有这些可能的写法,我该怎样才能找到“javascript”这个词呢?
1 个回答
2
逐个读取字符,在此过程中把遇到的十六进制转义(例如 #61)转换成对应的字符,并统一转换成小写字母。然后把结果和“javascript”进行比较。
这里有个想法:
import string
import os
import re
def _pdf_char_pattern(value):
    """Return a regex fragment matching one byte literally or as ``#xx``."""
    literal = bytes(bytearray([value]))
    # re.IGNORECASE lets the literal match either case, but the hex code of
    # the other case is a different number, so both codes must be listed.
    codes = sorted(set(bytearray(literal.lower() + literal.upper())))
    options = [re.escape(literal)]
    options.extend(("#%02x" % code).encode("ascii") for code in codes)
    return b"(?:" + b"|".join(options) + b")"


def pdf_find_str(pdfname, str, chunk_size=2 * 1024 * 1024):
    """Return the offset of the first occurrence of a string in a PDF file.

    The search ignores ASCII case and understands ``#xx`` hex escapes: every
    character of the target may appear literally or as ``#`` followed by two
    hex digits (upper or lower case), in any mix.  ``JavaScript``,
    ``Jav#61Script`` and ``#4A#41VASCRIPT`` therefore all match
    ``"javascript"``.

    Args:
        pdfname: Path of the file to scan; it is read in binary mode.
        str: Text to look for, as text or bytes.  It is matched literally,
            not as a regular expression.  The name shadows the builtin and
            is kept only so existing callers keep working.
        chunk_size: Number of bytes read from the file per iteration.

    Returns:
        The offset, in the raw file, of the first byte of the first match,
        or -1 when the target does not occur.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be positive")

    needle = str if isinstance(str, bytes) else str.encode("utf-8")
    pattern = re.compile(
        b"".join(_pdf_char_pattern(value) for value in bytearray(needle)),
        re.IGNORECASE,
    )
    # A match is at most 3 bytes per target character (every one written as
    # "#xx"), so carrying one byte less than that across reads is enough to
    # catch a match that straddles two chunks.
    overlap = 3 * len(needle) - 1

    # The context manager closes the file on every exit path, including the
    # early return taken when a match is found.
    with open(pdfname, "rb") as pdf:
        consumed = 0  # raw bytes read before the current chunk
        tail = b""
        while True:
            data = pdf.read(chunk_size)
            if not data:
                return -1
            window = tail + data
            match = pattern.search(window)
            if match:
                # The window starts len(tail) bytes before the current chunk.
                return consumed - len(tail) + match.start()
            consumed += len(data)
            tail = window[-overlap:] if overlap > 0 else b""
# Usage on a single file:
# if pdf_find_str("Consciousness Explained.pdf", "javascript") != -1:
#     print('Contains "javascript"')

# Recursively scan every ".pdf" file under the "Books" directory and report
# each file in which "javascript" is found, together with the match position.
for root, dirs, files in os.walk("Books"):
    for filename in files:
        if not filename.endswith(".pdf"):
            continue
        position = pdf_find_str(os.path.join(root, filename), "javascript")
        if position != -1:
            # A single pre-formatted argument prints the same text whether
            # print is a statement (Python 2) or a function (Python 3).
            print("%s ( %s )" % (filename, position))
# Note: how the reported position relates to raw file offsets when the match
# contains #xx escapes depends on pdf_find_str.