基于聚类对应关系对图像进行排序

import os import re import shutil srcdir = '/home/username/pictures/' # if not os.path.isdir(srcdir): print("Error, %s is not a valid directory!" % srcdir) return None pts_cls # is the list of pairs (image_id, cluster_id) filelist = [(srcdir+fn) for fn in os.listdir(srcdir) if re.search(r'\.jpg$', fn, re.IGNORECASE)] filelist.sort(key=lambda var:[int(x) if x.isdigit() else x for x in re.findall(r'[^0-9]|[0-9]+', var)]) for f in filelist: fbname = os.path.splitext(os.path.basename(f))[0] for e,cls in enumerate(pts_cls): # for each (img_id, clst_id) pair if str(cls[0])==fbname: # check if image_id corresponds to file basename on disk) if cls[1]==-1: # if cluster_id is -1 (->noise) outdir = srcdir+'cluster_'+'Noise'+'/' else: outdir = srcdir+'cluster_'+str(cls[1])+'/' if not os.path.isdir(outdir): os.makedirs(outdir) dstf = outdir+os.path.basename(f) if os.path.isfile(dstf)==False: shutil.copy2(f,dstf)

1条回答

网友

1楼 · 发布于 2024-05-13 05:36:29

我觉得您把事情复杂化了。由于图像名称是唯一的（每张图像只能有一个image_id），您可以放心地把pts_cls转换为dict，然后直接进行快速查找，而不必每次都遍历整个配对列表。另外，您在不需要正则表达式的地方使用了正则表达式，还把路径拼接起来，只是为了稍后再把它拆开。

此外，如果源目录中的某张图像不在pts_cls中，您的代码就会出错，因为它的outdir永远不会被赋值（更糟的情况是，outdir会沿用上一次循环中得到的值）。

我把它简化成：

import os
import shutil

# Copy each `.jpg` in src_dir that has an entry in pts_cls into
# src_dir/cluster_<cluster_id>/ (or src_dir/cluster_Noise/ for cluster -1).
src_dir = "/home/username/pictures/"

if not os.path.isdir(src_dir):
    print("Error, %s is not a valid directory!" % src_dir)
    # `return` is only valid inside a function. SystemExit ends the script
    # with status 1 without relying on the interactive `exit()` helper that
    # the `site` module installs.
    raise SystemExit(1)

pts_cls = []  # is the list of pairs (image_id, cluster_id), load from wherever...

# convert your pts_cls into a dict - since there cannot be any images in multiple clusters
# base image name is perfectly ok to use as a key for blazingly fast lookups later.
# Keys are stringified because they are looked up by file base name (a str);
# an integer image_id would otherwise never match.
cluster_map = {str(image_id): cluster_id for image_id, cluster_id in pts_cls}

# get only `.jpg` files; store base name and file name, no need for a full path at this time
files = [(fn[:-4], fn) for fn in os.listdir(src_dir) if fn.lower().endswith(".jpg")]
# no need for sorting based on your code

for name, file_name in files:  # loop through all files
    if name not in cluster_map:  # proceed with the file only if in pts_cls
        continue
    cls = cluster_map[name]  # get our cluster value
    # get our `cluster_<cluster_id>` or `cluster_Noise` (if cluster == -1) target path
    target_dir = os.path.join(src_dir, "cluster_" + str(cls if cls != -1 else "Noise"))
    target_file = os.path.join(target_dir, file_name)  # get the final target path
    if os.path.exists(target_file):  # never overwrite an existing copy
        continue
    # exist_ok=True already tolerates an existing directory, so no isdir() pre-check
    os.makedirs(target_dir, exist_ok=True)
    shutil.copy(os.path.join(src_dir, file_name), target_file)  # copy

更新：如果有多个聚类ID需要对应“特殊”的文件夹名称（例如-1对应Noise），您可以创建一个类似cluster_targets = {-1: "Noise"}的映射，其中键是聚类ID，值是对应的特殊名称。然后可以将给target_dir赋值的那一行替换为：target_dir = os.path.join(src_dir, "cluster_" + str(cluster_targets.get(cls, cls)))

更新#2：由于image_id值是整数，而文件名是字符串，我建议您在构建cluster_map字典时把image_id部分转换为字符串。这样比较的双方类型一致，不会有类型不匹配的风险：

# Stringify every image_id so the dict keys have the same type as the file
# base names (strings) they are looked up with.
cluster_map = {str(k): v for k, v in pts_cls}

如果您确定src_dir中*.jpg文件的名称全部由整数构成，也可以反过来，在生成files列表时就把文件名转换为整数——只需将fn[:-4]替换为int(fn[:-4])。但我不建议这样做，因为您永远无法确定文件会被如何命名。

相关问题更多 >

编程相关推荐

热门问题

热门文章