在 PyCharm 中运行 Python OCR 代码时出错（pdf2image 报 PDFInfoNotInstalledError）

2024-03-28 21:30:50 发布

您现在位置:Python中文网/ 问答频道 /正文

我正在尝试使用以下代码:

https://www.geeksforgeeks.org/python-reading-contents-of-pdf-using-ocr-optical-character-recognition/

# Import libraries 
from PIL import Image
import pytesseract
import sys
from pdf2image import convert_from_path
import os

# OCR a PDF: render every page to a JPEG with pdf2image, then run Tesseract
# on each JPEG and append the recognized text to a single output file.

# Path of the PDF to run OCR on.
PDF_file = "/Users/user1/Desktop/pdf1.pdf"

# Directory holding the poppler command-line tools (pdfinfo, pdftoppm).
# None means "look them up on PATH". pdf2image raises
# PDFInfoNotInstalledError when it cannot find `pdfinfo`, so either install
# poppler (e.g. with a package manager) or point this at its bin directory.
POPPLER_PATH = None

# ---------------------------------------------------------------------------
# Part #1: Converting PDF to images
# ---------------------------------------------------------------------------

# Render all pages of the PDF at 500 DPI; the result is a list of PIL images.
pages = convert_from_path(PDF_file, 500, poppler_path=POPPLER_PATH)

# 1-based counter used to build the per-page image filenames.
image_counter = 1

for page in pages:
    # PDF page n -> page_n.jpg (written to the current working directory).
    filename = "page_" + str(image_counter) + ".jpg"

    # Save the rendered page to disk.
    page.save(filename, 'JPEG')

    # Advance to the next page number.
    image_counter = image_counter + 1

# ---------------------------------------------------------------------------
# Part #2: Recognizing text from the images using OCR
# ---------------------------------------------------------------------------

# Total number of pages written above (the counter ends one past the last).
filelimit = image_counter - 1

# Text file that receives the OCR output of every page.
outfile = "/Users/user1/Desktop/ocr/pdf1.txt"

# Make sure the destination directory exists so open() below cannot fail
# with FileNotFoundError on a fresh machine.
os.makedirs(os.path.dirname(outfile), exist_ok=True)

# Append mode so the text of all pages accumulates in the same file.
# The `with` block guarantees the file is closed even if OCR raises, and the
# explicit encoding keeps non-ASCII OCR output from failing on write.
with open(outfile, "a", encoding="utf-8") as f:
    for i in range(1, filelimit + 1):
        # Same naming scheme as in Part #1: page_1.jpg ... page_n.jpg
        filename = "page_" + str(i) + ".jpg"

        # Recognize the text in the image; close the image handle right after.
        with Image.open(filename) as page_image:
            text = str(pytesseract.image_to_string(page_image))

        # A word broken across lines appears as "Geeks-\nforGeeks"; dropping
        # every '-\n' joins the two halves back together.
        text = text.replace('-\n', '')

        # Write the processed text of this page.
        f.write(text)

但我得到了以下错误:

Traceback (most recent call last):
  File "/Users/user1/PycharmProjects/project1/venv/lib/python3.8/site-packages/pdf2image/pdf2image.py", line 409, in pdfinfo_from_path
    proc = Popen(command, env=env, stdout=PIPE, stderr=PIPE)
  File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/subprocess.py", line 854, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/subprocess.py", line 1702, in _execute_child
    raise child_exception_type(errno_num, err_msg, err_filename)
FileNotFoundError: [Errno 2] No such file or directory: 'pdfinfo'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/user1/PycharmProjects/project1/non_parseable.py", line 16, in <module>
    pages = convert_from_path(PDF_file, 500)
  File "/Users/user1/PycharmProjects/project1/venv/lib/python3.8/site-packages/pdf2image/pdf2image.py", line 89, in convert_from_path
    page_count = pdfinfo_from_path(pdf_path, userpw, poppler_path=poppler_path)["Pages"]
  File "/Users/user1/PycharmProjects/project1/venv/lib/python3.8/site-packages/pdf2image/pdf2image.py", line 430, in pdfinfo_from_path
    raise PDFInfoNotInstalledError(
pdf2image.exceptions.PDFInfoNotInstalledError: Unable to get page count. Is poppler installed and in PATH?

Process finished with exit code 1

请问我该如何修复这个错误？


Tags: of, the, to, path, text, in, from, image