base64字符串python上的pytesseract

2024-06-17 14:54:24 发布

您现在位置:Python中文网/ 问答频道 /正文

对于png图像,我有一大堆base64格式的图像字符串。它们是电话号码(请参阅http://www.trulia.com/profile/gerald-drexler-broker-neillsville-wi-10703037/overview作为我的工作示例,使用号码中的src标记)。我想让他们通过pytesseract来提取数字。在

我从这里的答案中得到了一些指导:Loading Base64 String into Python Image Library

我尝试了几个公式,但我似乎不知道如何正确地将字符串加载到PIL中以运行pytesseract。下面是一个尝试的例子:

from PIL import Image
import base64
import pytesseract
import cStringIO

imgstring = 'data: image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAGcAAAAVCAYAAABbq/AzAAAACXBIWXMAAA7EAAAOxAGVKw4bAAADiUlEQVRoge3YTWgeVRQG4IcQJJQSQihBNIQiXYUSpJSgIF1IkSKllCJFQggutBQRQRfFH3RTRFwVERdBRHcuREVEupASRIP4C7VoFSmWSq2gtRGjNm21Ls79+k0mM3cmLrpxXhjm++aec973/sy55w4dOnTo0KFDhw4FLOIe3IWrmWtbwWcfLuDhinj7avzvTjyLDXp24F2cxx84gQMVdgfxHVbwFe5tqaEObXxynDCAQ/g2af+8hjM3ftcwIzoPQ5iouObEQG1INvP4PvlVBT+IYxVxhlJ7Vad6GMICHsQkNqWO/I7Zgt1s6twejGEvfhMLrI2GKjT5NHHCs6l/dyTtM8nnzkL/msbvGhbFTOdwDIfT7514GyN4ryb403glE+8JvN/AWcY8Xi/8P45HK+J+0FJDFZp8mjgHxRu1o2TzkH62aDN+YFS8tlMZQdP4S6yCMhZqgr8oVlAdtuFvDGdsyngHL6Xfw0L31pq4Qy00VCHn04bzlmSzsWQznmw2lJ7XjZ8BbMclfJ0R/CRexS8ZmzJGcb/+nnFcrJ6B1P4lroiJb8I4nhGD8FzhGZwu2Z5OHJtbaFiv7jacvTG6qWSzKdmMZbjX4D78kGnfisvYUtNeN/PDYr8Yxo2iCDiH5ws2ZxN/HY7ob8insKvQtj09Hyz5DOkXLm00rEd3G04iZR3Vn8xpkdKuWjtptW+O1HAyI/Y1vJFpzwYvYUakx17nTjb4Dor0MIH9YgPtpZxJ1emjl6YnGzTMWF2NzbTQ3ZZzRKTfc6KIOSoKiMu4oeRbO36DWK4g62GLGJTbM8LXg2/EKtuIpXRfzthfSe3LOJPub+EpkUr+ERNXTMkT6X6mQcObuLnw/NcWuttyLuGBUoxZkcovZXhWYQA/6efDMh7Hh/ikbcAGTCe+Jf38++M6/Iur7k98ZnWqk/5/oX7SexouJu7edbGF7v/KOYBH8HKGoxJjqqu1cVES5g5s1L+Wh3GbyNvjYm+5oH+QnEq8oxW+t4rSfkqkiDHsFmltvmC3R2zcu5LdbnHm2N9SQxWafJo4iYwwIhbTtCiXF1S/AI3bwqfWnnNeEAepJuRK6VOi4vlZnGmKK+4QPq6JuVmcZ86KBXI+2R6wtoNz4iS+Ik7txYFv0lCFNj45TuJguqL/ZeMxa/eaHhonZ07/C8H1wonE26EFPhKfSK4H9ia+Dh06dOjw/8G/sXcmUir28IcAAAAASUVORK5CYII='
imgstring = imgstring.split('base64,')[-1].strip()
pic = cStringIO.StringIO()
image_string = cStringIO.StringIO(base64.b64decode(imgstring))
image = Image.open(image_string)
image.save('pic.png', image.format, quality = 100)
picture = Image.open('pic.png', mode='r')
picture.load()
picture.seek(0)

print pytesseract.image_to_string(Image.open(picture))

在我看来,我必须努力做到这一点,但即使在保存、加载等之后,我仍然得到一个AttributeError: read

什么是最有效的方法来加载这些到内存中,让pytesseract把它们吃掉?我甚至还没到过tesseract的阶段,我也不知道它有多快或多慢,但我有数以百万计的这些要处理。在

^{pr2}$

Tags: 字符串图像imageimportstringpilpngopen
2条回答

巴布亚新几内亚的透明度似乎引起了一些问题。覆盖在白色背景上可以解决这个问题。在

from PIL import Image
import base64
import pytesseract
import cStringIO

imgstring = 'data: image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAGcAAAAVCAYAAABbq/AzAAAACXBIWXMAAA7EAAAOxAGVKw4bAAADiUlEQVRoge3YTWgeVRQG4IcQJJQSQihBNIQiXYUSpJSgIF1IkSKllCJFQggutBQRQRfFH3RTRFwVERdBRHcuREVEupASRIP4C7VoFSmWSq2gtRGjNm21Ls79+k0mM3cmLrpxXhjm++aec973/sy55w4dOnTo0KFDhw4FLOIe3IWrmWtbwWcfLuDhinj7avzvTjyLDXp24F2cxx84gQMVdgfxHVbwFe5tqaEObXxynDCAQ/g2af+8hjM3ftcwIzoPQ5iouObEQG1INvP4PvlVBT+IYxVxhlJ7Vad6GMICHsQkNqWO/I7Zgt1s6twejGEvfhMLrI2GKjT5NHHCs6l/dyTtM8nnzkL/msbvGhbFTOdwDIfT7514GyN4ryb403glE+8JvN/AWcY8Xi/8P45HK+J+0FJDFZp8mjgHxRu1o2TzkH62aDN+YFS8tlMZQdP4S6yCMhZqgr8oVlAdtuFvDGdsyngHL6Xfw0L31pq4Qy00VCHn04bzlmSzsWQznmw2lJ7XjZ8BbMclfJ0R/CRexS8ZmzJGcb/+nnFcrJ6B1P4lroiJb8I4nhGD8FzhGZwu2Z5OHJtbaFiv7jacvTG6qWSzKdmMZbjX4D78kGnfisvYUtNeN/PDYr8Yxo2iCDiH5ws2ZxN/HY7ob8insKvQtj09Hyz5DOkXLm00rEd3G04iZR3Vn8xpkdKuWjtptW+O1HAyI/Y1vJFpzwYvYUakx17nTjb4Dor0MIH9YgPtpZxJ1emjl6YnGzTMWF2NzbTQ3ZZzRKTfc6KIOSoKiMu4oeRbO36DWK4g62GLGJTbM8LXg2/EKtuIpXRfzthfSe3LOJPub+EpkUr+ERNXTMkT6X6mQcObuLnw/NcWuttyLuGBUoxZkcovZXhWYQA/6efDMh7Hh/ikbcAGTCe+Jf38++M6/Iur7k98ZnWqk/5/oX7SexouJu7edbGF7v/KOYBH8HKGoxJjqqu1cVES5g5s1L+Wh3GbyNvjYm+5oH+QnEq8oxW+t4rSfkqkiDHsFmltvmC3R2zcu5LdbnHm2N9SQxWafJo4iYwwIhbTtCiXF1S/AI3bwqfWnnNeEAepJuRK6VOi4vlZnGmKK+4QPq6JuVmcZ86KBXI+2R6wtoNz4iS+Ik7txYFv0lCFNj45TuJguqL/ZeMxa/eaHhonZ07/C8H1wonE26EFPhKfSK4H9ia+Dh06dOjw/8G/sXcmUir28IcAAAAASUVORK5CYII='
imgstring = imgstring.split('base64,')[-1].strip()
pic = cStringIO.StringIO()
image_string = cStringIO.StringIO(base64.b64decode(imgstring))
image = Image.open(image_string)

# Overlay on white background, see http://stackoverflow.com/a/7911663/1703216
bg = Image.new("RGB", image.size, (255,255,255))
bg.paste(image,image)

print pytesseract.image_to_string(bg)

# Save the image passed to pytesseract for debugging purposes
bg.save('pic.png')

适用于Python 3.*

from PIL import Image
import base64
import pytesseract
import io

imgstring = 'data: image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAGcAAAAVCAYAAABbq/AzAAAACXBIWXMAAA7EAAAOxAGVKw4bAAADiUlEQVRoge3YTWgeVRQG4IcQJJQSQihBNIQiXYUSpJSgIF1IkSKllCJFQggutBQRQRfFH3RTRFwVERdBRHcuREVEupASRIP4C7VoFSmWSq2gtRGjNm21Ls79+k0mM3cmLrpxXhjm++aec973/sy55w4dOnTo0KFDhw4FLOIe3IWrmWtbwWcfLuDhinj7avzvTjyLDXp24F2cxx84gQMVdgfxHVbwFe5tqaEObXxynDCAQ/g2af+8hjM3ftcwIzoPQ5iouObEQG1INvP4PvlVBT+IYxVxhlJ7Vad6GMICHsQkNqWO/I7Zgt1s6twejGEvfhMLrI2GKjT5NHHCs6l/dyTtM8nnzkL/msbvGhbFTOdwDIfT7514GyN4ryb403glE+8JvN/AWcY8Xi/8P45HK+J+0FJDFZp8mjgHxRu1o2TzkH62aDN+YFS8tlMZQdP4S6yCMhZqgr8oVlAdtuFvDGdsyngHL6Xfw0L31pq4Qy00VCHn04bzlmSzsWQznmw2lJ7XjZ8BbMclfJ0R/CRexS8ZmzJGcb/+nnFcrJ6B1P4lroiJb8I4nhGD8FzhGZwu2Z5OHJtbaFiv7jacvTG6qWSzKdmMZbjX4D78kGnfisvYUtNeN/PDYr8Yxo2iCDiH5ws2ZxN/HY7ob8insKvQtj09Hyz5DOkXLm00rEd3G04iZR3Vn8xpkdKuWjtptW+O1HAyI/Y1vJFpzwYvYUakx17nTjb4Dor0MIH9YgPtpZxJ1emjl6YnGzTMWF2NzbTQ3ZZzRKTfc6KIOSoKiMu4oeRbO36DWK4g62GLGJTbM8LXg2/EKtuIpXRfzthfSe3LOJPub+EpkUr+ERNXTMkT6X6mQcObuLnw/NcWuttyLuGBUoxZkcovZXhWYQA/6efDMh7Hh/ikbcAGTCe+Jf38++M6/Iur7k98ZnWqk/5/oX7SexouJu7edbGF7v/KOYBH8HKGoxJjqqu1cVES5g5s1L+Wh3GbyNvjYm+5oH+QnEq8oxW+t4rSfkqkiDHsFmltvmC3R2zcu5LdbnHm2N9SQxWafJo4iYwwIhbTtCiXF1S/AI3bwqfWnnNeEAepJuRK6VOi4vlZnGmKK+4QPq6JuVmcZ86KBXI+2R6wtoNz4iS+Ik7txYFv0lCFNj45TuJguqL/ZeMxa/eaHhonZ07/C8H1wonE26EFPhKfSK4H9ia+Dh06dOjw/8G/sXcmUir28IcAAAAASUVORK5CYII='
imgstring = imgstring.split('base64,')[-1].strip()
pic = io.StringIO()
image_string = io.BytesIO(base64.b64decode(imgstring))
image = Image.open(image_string)

# Overlay on white background, see http://stackoverflow.com/a/7911663/1703216
bg = Image.new("RGB", image.size, (255,255,255))
bg.paste(image,image)

print(pytesseract.image_to_string(bg))

# Save the image passed to pytesseract for debugging purposes
bg.save('pic.png')

相关问题 更多 >