Find and delete duplicate files
I am writing a Python program to find and delete duplicate files from a folder.
I have multiple copies of the same mp3 files, as well as some other files. I am using the SHA-1 algorithm.
How can I find these duplicate files and delete them?
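(For reference, computing a file's SHA-1 digest with hashlib looks roughly like this - a minimal sketch only, reading in chunks so large mp3 files don't have to fit in memory:)

import hashlib

def sha1_of_file(path, chunk_size=65536):
    """Return the SHA-1 hex digest of a file, reading it in chunks."""
    sha1 = hashlib.sha1()
    with open(path, 'rb') as f:
        for chunk in iter(lambda: f.read(chunk_size), b''):
            sha1.update(chunk)
    return sha1.hexdigest()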
10 Answers
25
import hashlib
import os

def remove_duplicates(dir):
    unique = set()
    for filename in os.listdir(dir):
        # os.listdir() returns bare names, so build the full path first
        full_path = os.path.join(dir, filename)
        if os.path.isfile(full_path):
            with open(full_path, 'rb') as f:
                filehash = hashlib.md5(f.read()).hexdigest()
            if filehash not in unique:
                unique.add(filehash)
            else:
                os.remove(full_path)
EDIT:
If you are interested in mp3 files, you may also want to take a look at this topic: How to detect duplicate mp3 files with different bitrates and/or different ID3 tags?
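As a quick, hedged illustration of that idea: mp3 copies that differ only in their leading ID3v2 tag can be matched by hashing the file with the tag skipped (the ID3v2 header is 10 bytes: 'ID3', version, flags, and a 4-byte sync-safe size). This is only a sketch - it ignores ID3v1 tags at the end of the file and differing bitrates, which the linked topic covers:

import hashlib

def audio_hash(path):
    """SHA-1 of an mp3's contents, with any leading ID3v2 tag skipped."""
    sha1 = hashlib.sha1()
    with open(path, 'rb') as f:
        header = f.read(10)
        if len(header) == 10 and header[:3] == b'ID3':
            # ID3v2 tag size is stored in 4 sync-safe bytes (7 bits each)
            size = (header[6] << 21) | (header[7] << 14) | (header[8] << 7) | header[9]
            f.seek(10 + size)   # jump past the tag to the audio frames
        else:
            f.seek(0)           # no ID3v2 tag, hash the whole file
        for chunk in iter(lambda: f.read(65536), b''):
            sha1.update(chunk)
    return sha1.hexdigest()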
47
Recursive folders version:
This version uses the file size and a hash of the contents to find duplicates. You can pass it multiple paths; it will scan all paths recursively and report all duplicates found.
import sys
import os
import hashlib

def chunk_reader(fobj, chunk_size=1024):
    """Generator that reads a file in chunks of bytes"""
    while True:
        chunk = fobj.read(chunk_size)
        if not chunk:
            return
        yield chunk

def check_for_duplicates(paths, hash=hashlib.sha1):
    hashes = {}
    for path in paths:
        for dirpath, dirnames, filenames in os.walk(path):
            for filename in filenames:
                full_path = os.path.join(dirpath, filename)
                hashobj = hash()
                for chunk in chunk_reader(open(full_path, 'rb')):
                    hashobj.update(chunk)
                file_id = (hashobj.digest(), os.path.getsize(full_path))
                duplicate = hashes.get(file_id, None)
                if duplicate:
                    print "Duplicate found: %s and %s" % (full_path, duplicate)
                else:
                    hashes[file_id] = full_path

if sys.argv[1:]:
    check_for_duplicates(sys.argv[1:])
else:
    print "Please pass the paths to check as parameters to the script"
129
Fastest algorithm - a 100x performance increase compared to the answers above (really :))
The other approaches are very nice, but they miss an important property of duplicate files: they have the same file size. Calculating the expensive hash only on files with the same size saves a tremendous amount of CPU; performance comparisons at the end, here's the explanation first.
Piggybacking on @nosklo's solid solution, and borrowing @Raffi's idea of taking a fast hash of just the beginning of each file, and calculating a full hash only on collisions of the fast hash, here are the steps:
- Build a hash table of the files, where the file size is the key.
- For files of the same size, create a hash table with the hash of their first 1024 bytes; elements without a collision are unique.
- For files with the same hash on the first 1k bytes, calculate the hash on the full contents - files with matching ones are not unique.
The code:
#!/usr/bin/env python3
from collections import defaultdict
import hashlib
import os
import sys

def chunk_reader(fobj, chunk_size=1024):
    """Generator that reads a file in chunks of bytes"""
    while True:
        chunk = fobj.read(chunk_size)
        if not chunk:
            return
        yield chunk

def get_hash(filename, first_chunk_only=False, hash=hashlib.sha1):
    hashobj = hash()
    file_object = open(filename, 'rb')

    if first_chunk_only:
        hashobj.update(file_object.read(1024))
    else:
        for chunk in chunk_reader(file_object):
            hashobj.update(chunk)
    hashed = hashobj.digest()

    file_object.close()
    return hashed

def check_for_duplicates(paths, hash=hashlib.sha1):
    hashes_by_size = defaultdict(list)  # dict of size_in_bytes: [full_path_to_file1, full_path_to_file2, ]
    hashes_on_1k = defaultdict(list)    # dict of (hash1k, size_in_bytes): [full_path_to_file1, full_path_to_file2, ]
    hashes_full = {}                    # dict of full_file_hash: full_path_to_file_string

    for path in paths:
        for dirpath, dirnames, filenames in os.walk(path):
            # get all files that have the same size - they are the collision candidates
            for filename in filenames:
                full_path = os.path.join(dirpath, filename)
                try:
                    # if the target is a symlink (soft one), this will
                    # dereference it - change the value to the actual target file
                    full_path = os.path.realpath(full_path)
                    file_size = os.path.getsize(full_path)
                    hashes_by_size[file_size].append(full_path)
                except (OSError,):
                    # not accessible (permissions, etc.) - pass on
                    continue

    # For all files with the same file size, get their hash on the first 1024 bytes only
    for size_in_bytes, files in hashes_by_size.items():
        if len(files) < 2:
            continue    # this file size is unique, no need to spend CPU cycles on it

        for filename in files:
            try:
                small_hash = get_hash(filename, first_chunk_only=True)
                # the key is the hash on the first 1024 bytes plus the size - to
                # avoid collisions on equal hashes in the first part of the file
                # credits to @Futal for the optimization
                hashes_on_1k[(small_hash, size_in_bytes)].append(filename)
            except (OSError,):
                # the file access might've changed till the exec point got here
                continue

    # For all files with the same hash on the first 1024 bytes, get their hash on the full
    # file - collisions will be duplicates
    for __, files_list in hashes_on_1k.items():
        if len(files_list) < 2:
            continue    # this hash of first 1k file bytes is unique, no need to spend CPU cycles on it

        for filename in files_list:
            try:
                full_hash = get_hash(filename, first_chunk_only=False)
                duplicate = hashes_full.get(full_hash)
                if duplicate:
                    print("Duplicate found: {} and {}".format(filename, duplicate))
                else:
                    hashes_full[full_hash] = filename
            except (OSError,):
                # the file access might've changed till the exec point got here
                continue

if __name__ == "__main__":
    if sys.argv[1:]:
        check_for_duplicates(sys.argv[1:])
    else:
        print("Please pass the paths to check as parameters to the script")
And now, the fun part - the performance comparison.
Baseline -
- a directory with 1047 files, 32 mp4 and 1015 jpg, 5445.998 MiB in total - i.e. my phone's camera auto-upload directory :)
- a small (but fully functional) processor - 1600 BogoMIPS, 1.2 GHz, 32 KB L1 + 256 KB L2 cache, /proc/cpuinfo:
Processor : Feroceon 88FR131 rev 1 (v5l)
BogoMIPS : 1599.07
(i.e. my low-end NAS :), running Python 2.7.11.
So, the output of @nosklo's very handy solution:
root@NAS:InstantUpload# time ~/scripts/checkDuplicates.py
Duplicate found: ./IMG_20151231_143053 (2).jpg and ./IMG_20151231_143053.jpg
Duplicate found: ./IMG_20151125_233019 (2).jpg and ./IMG_20151125_233019.jpg
Duplicate found: ./IMG_20160204_150311.jpg and ./IMG_20160204_150311 (2).jpg
Duplicate found: ./IMG_20160216_074620 (2).jpg and ./IMG_20160216_074620.jpg
real 5m44.198s
user 4m44.550s
sys 0m33.530s
And here is the version that filters by size first, then by a small hash, and finally by a full hash when collisions are found:
root@NAS:InstantUpload# time ~/scripts/checkDuplicatesSmallHash.py . "/i-data/51608399/photo/Todor phone"
Duplicate found: ./IMG_20160216_074620 (2).jpg and ./IMG_20160216_074620.jpg
Duplicate found: ./IMG_20160204_150311.jpg and ./IMG_20160204_150311 (2).jpg
Duplicate found: ./IMG_20151231_143053 (2).jpg and ./IMG_20151231_143053.jpg
Duplicate found: ./IMG_20151125_233019 (2).jpg and ./IMG_20151125_233019.jpg
real 0m1.398s
user 0m1.200s
sys 0m0.080s
Both versions were run 3 times each, to get the average of the time needed.
So v1 is (user + sys) 284 seconds, the other one 2 seconds; quite a difference, huh :) With this gain, one could go with SHA512, or even fancier digests - the performance penalty is mitigated by the reduced amount of computation needed.
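For example, a standalone SHA-512 file hasher would look like this (a sketch; equivalently, since get_hash() above already accepts a hash constructor, the full-content pass could simply be called with hash=hashlib.sha512):

import hashlib

def sha512_file(path, chunk_size=65536):
    """SHA-512 hex digest of a whole file, read in chunks."""
    digest = hashlib.sha512()
    with open(path, 'rb') as f:
        for chunk in iter(lambda: f.read(chunk_size), b''):
            digest.update(chunk)
    return digest.hexdigest()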
Negatives:
- More disk access than the other versions - every file is stat-ed once for its size (that's cheap, but still disk IO), and every duplicate is opened twice (once for the small first-1k-bytes hash, and once for the full-contents hash)
- Higher memory consumption, since the hash tables are kept in memory while it runs