当试图从网页上的图片中提取元数据时，总是返回{}，为什么？

import exifread
import colorama
import urllib2
import urllib
import random
import time
import bs4
import sys


def _print_exif(url):
    """Download *url* to a randomly named ``.jpg`` file and print its EXIF tags.

    The file is written to the current working directory and is left on disk.
    ``exifread.process_file`` returns an empty dict when it finds no EXIF data.
    """
    name = str(random.random()) + ".jpg"
    urllib.urlretrieve(url, name)
    # Close the handle as soon as the tags are parsed instead of leaking it.
    with open(name, "rb") as f:
        tags = exifread.process_file(f)
    print (tags)


def get_images(target):
    """Fetch the page at *target* and print the EXIF tags of every ``<img>`` on it.

    target: URL of the page to scan.
    """
    colorama.init()
    print(colorama.Fore.LIGHTGREEN_EX + "[*] Retrieving Meta Data from Target's Page...")
    req = urllib2.Request(target)
    resp = urllib2.urlopen(req)
    page = resp.read()
    soup = bs4.BeautifulSoup(page, "html.parser")
    for img in soup.find_all("img"):
        src = img.get("src")
        if not src:
            # <img> without a src attribute: nothing to download.
            continue
        # "http" in src also covers "https", so no separate check is needed.
        if "www" in src or "http" in src:
            _print_exif(src)
        else:
            # NOTE: a relative src is appended to the page URL as-is. This only
            # yields a valid image URL when target is the directory the src is
            # relative to; otherwise the downloaded file is not the image.
            _print_exif(target + src)


def main():
    """Prompt for a target URL, scan it, then pause briefly before exiting."""
    target = raw_input("Enter the target: ")
    print ("\n")
    get_images(target)
    time.sleep(5)
    sys.exit()


if __name__ == "__main__":
    main()

1条回答

网友

1楼 · 发布于 2024-04-24 20:17:14

问题在于您没有传入基础 URL：除非 src 属性本身已经是绝对 URL，否则需要先把主机地址（基础 URL）与 src 拼接起来，才能得到可下载的图片地址。

下面的代码演示了一个工作示例，我使用requests代替urllib，但逻辑是相同的：

import os
import sys
from urlparse import urljoin

import bs4
import exifread
import requests

def get_images(target, base):
    """Download every ``<img>`` on the page at *target* and print its EXIF tags.

    target: URL of the page to scan.
    base: URL that relative ``src`` values are resolved against.

    Each image is saved in the current working directory under the basename
    of its resolved URL, and the tags dict returned by
    ``exifread.process_file`` is printed.
    """
    page = requests.get(target).content
    soup = bs4.BeautifulSoup(page, "html.parser")
    # src=True skips <img> tags that have no src attribute.
    for img in soup.find_all("img", src=True):
        src = img.get("src")
        if src.startswith("www."):
            # Scheme-less host: requests rejects URLs without a scheme.
            src = "http://" + src
        else:
            # urljoin leaves absolute URLs untouched and resolves relative
            # ones against base.
            src = urljoin(base, src)
        name = os.path.basename(src)
        if not name:
            # URL ends in "/": there is no file name to save under.
            continue
        with open(name, "wb+") as f:
            f.write(requests.get(src).content)
            # Rewind so exifread parses from the start of the file.
            f.seek(0)
            # The second positional parameter of process_file is stop_tag,
            # not a file mode, so only the file object is passed.
            tags = exifread.process_file(f)
            print (tags)


def main():
    """Run get_images against the exiv2 sample page."""
    # Relative src values found on the page are joined onto this base.
    site_root = "http://www.exiv2.org/"
    sample_page = site_root + "sample.html"
    get_images(sample_page, site_root)


# Call main() only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()

您将在包含以下内容的页面上获取一个图像的exif数据：

PIL示例：

^{pr2}$

os.remove(name) 会删除没有 exif 数据的文件；如果不希望删除这些文件，请去掉这一行。

相关问题更多 >

编程相关推荐

热门问题

热门文章