import hashlib
def compare_common_files_by_hash(directory_one, directory_two):
    """Report, per shared file name, whether the SHA-256 digests match.

    Only names present in both directories are examined; names that exist in
    just one directory are ignored. Names that are not regular files in both
    directories (for example shared sub-directories) are skipped because they
    cannot be hashed.

    Args:
        directory_one: Path of the first directory.
        directory_two: Path of the second directory.

    Returns:
        None. One line per compared file is printed to stdout, in sorted
        file-name order.
    """
    def _sha256_of_file(path, chunk_size=65536):
        # Stream the file in chunks so large files are never held in memory
        # at once, and close the handle deterministically.
        digest = hashlib.sha256()
        with open(path, 'rb') as handle:
            for chunk in iter(lambda: handle.read(chunk_size), b''):
                digest.update(chunk)
        return digest.hexdigest()

    # Sorting makes the report order stable (set iteration order is arbitrary).
    common_files = sorted(set(os.listdir(directory_one)) & set(os.listdir(directory_two)))
    for filename in common_files:
        path_one = os.path.join(directory_one, filename)
        path_two = os.path.join(directory_two, filename)
        if not (os.path.isfile(path_one) and os.path.isfile(path_two)):
            continue
        if _sha256_of_file(path_one) == _sha256_of_file(path_two):
            print(f'The file - {filename} is identical in the directories {directory_one} and {directory_two}')
        else:
            print(f'The file - {filename} is different in the directories {directory_one} and {directory_two}')
import os
def compare_common_files_by_size(directory_one, directory_two):
    """Report, per shared name, whether the two entries have the same size.

    Only ``st_size`` from ``os.stat`` is compared, so two files with different
    contents but equal length are reported as identical.

    Args:
        directory_one: Path of the first directory.
        directory_two: Path of the second directory.

    Returns:
        None. One line per shared name is printed to stdout, in sorted order.
    """
    # Sorting makes the report order stable (set iteration order is arbitrary).
    common_files = sorted(set(os.listdir(directory_one)) & set(os.listdir(directory_two)))
    for filename in common_files:
        size_one = os.stat(os.path.join(directory_one, filename)).st_size
        size_two = os.stat(os.path.join(directory_two, filename)).st_size
        if size_one == size_two:
            print(f'The file - {filename} is identical in the directories {directory_one} and {directory_two}')
        else:
            print(f'The file - {filename} is different in the directories {directory_one} and'
                  f' {directory_two}')
import os
def compare_common_files_by_metadata(directory_one, directory_two):
    """Report, per shared name, whether size and modification time both match.

    Entries count as identical only when ``st_size`` and ``st_mtime`` from
    ``os.stat`` are both equal; file contents are never read, so a file that
    was re-saved without changes is reported as different.

    Args:
        directory_one: Path of the first directory.
        directory_two: Path of the second directory.

    Returns:
        None. One line per shared name is printed to stdout, in sorted order.
    """
    # Sorting makes the report order stable (set iteration order is arbitrary).
    common_files = sorted(set(os.listdir(directory_one)) & set(os.listdir(directory_two)))
    for filename in common_files:
        stat_one = os.stat(os.path.join(directory_one, filename))
        stat_two = os.stat(os.path.join(directory_two, filename))
        same_metadata = (stat_one.st_size, stat_one.st_mtime) == (stat_two.st_size, stat_two.st_mtime)
        if same_metadata:
            print(f'The file - {filename} is identical in the directories {directory_one} and {directory_two}')
        else:
            print(f'The file - {filename} is different in the directories {directory_one} and'
                  f' {directory_two}')
import os
def compare_common_files_by_lines(directory_one, directory_two):
    """Print every CSV row that appears in only one of two same-named files.

    For each ``.csv`` name present in both directories, the rows of both files
    are loaded into sets and their symmetric difference is printed. Because
    sets are used, duplicate rows and row order are not taken into account.

    Args:
        directory_one: Path of the first directory.
        directory_two: Path of the second directory.

    Returns:
        None. Differing non-empty rows are printed to stdout, largest first.
    """
    # Sorting makes the report order stable (set iteration order is arbitrary).
    common_files = sorted(set(os.listdir(directory_one)) & set(os.listdir(directory_two)))
    for filename in common_files:
        if not filename.endswith('.csv'):
            continue
        path_one = os.path.join(directory_one, filename)
        path_two = os.path.join(directory_two, filename)
        # newline='' is what the csv module expects; the context manager closes
        # both handles even if parsing fails.
        with open(path_one, 'r', encoding='ISO-8859-1', newline='') as file_01, \
                open(path_two, 'r', encoding='ISO-8859-1', newline='') as file_02:
            csv_file_01 = set(map(tuple, csv.reader(file_01)))
            csv_file_02 = set(map(tuple, csv.reader(file_02)))
        different = csv_file_01 ^ csv_file_02
        for row in sorted(different, reverse=True):
            if row:  # blank CSV lines parse to empty tuples; skip them
                print(f'This row: \n {row} \n was different between the file {filename} in the directories'
                      f' {directory_one} and {directory_two}')
import filecmp
def compare_common_files(directory_one, directory_two):
    """Report, per shared name, whether ``filecmp.cmp`` finds equal contents.

    ``shallow=False`` is passed so the comparison is based on file contents
    rather than on the ``os.stat`` signature alone.

    Args:
        directory_one: Path of the first directory.
        directory_two: Path of the second directory.

    Returns:
        None. One line per shared name is printed to stdout, in sorted order.
    """
    # Sorting makes the report order stable (set iteration order is arbitrary).
    common_files = sorted(set(os.listdir(directory_one)) & set(os.listdir(directory_two)))
    for filename in common_files:
        file_01 = os.path.join(directory_one, filename)
        file_02 = os.path.join(directory_two, filename)
        if filecmp.cmp(file_01, file_02, shallow=False):
            print(f'The file - {filename} is identical in the directories - {directory_one} and {directory_two}')
        else:
            print(f'The file - {filename} is different in the directories - {directory_one} and {directory_two}')
import filecmp
def directory_recursive(directory_one, directory_two):
    """Print the differences between two directory trees using ``filecmp.dircmp``.

    For the top-level pair and then, recursively, for every sub-directory
    common to both sides, this prints the files whose comparison differs and
    the names found on only one side.

    Args:
        directory_one: Path of the left directory.
        directory_two: Path of the right directory.

    Returns:
        None. Findings are printed to stdout.
    """
    def _report(files):
        # Report one dircmp level, then descend into the shared sub-directories.
        for filename in files.diff_files:
            print(f'The file - {filename} is different in the directories - {files.left} and {files.right}')
        for filename in files.left_only:
            print(f'The file - {filename} - was only found in the directory {files.left}')
        for filename in files.right_only:
            print(f'The file - {filename} - was only found in the directory {files.right}')
        # dircmp.subdirs maps each common sub-directory name to its own dircmp.
        for subdir_name in sorted(files.subdirs):
            _report(files.subdirs[subdir_name])

    _report(filecmp.dircmp(directory_one, directory_two))
import csv
def get_csv_file_lines(file):
    """Lazily yield the rows of a UTF-8 encoded CSV file.

    Args:
        file: Path of the CSV file to read.

    Yields:
        Each row as a list of strings, as produced by ``csv.reader``.
    """
    # newline='' lets the csv module do its own newline handling, so line
    # breaks embedded in quoted fields are returned unchanged.
    with open(file, 'r', encoding='utf-8', newline='') as csv_file:
        yield from csv.reader(csv_file)
def compare_csv_files_line_by_line(csv_file_one, csv_file_two):
    """Print each row position at which two CSV files differ.

    Rows are read in lockstep through ``get_csv_file_lines``. When one file
    has fewer rows than the other, the missing rows are reported as ``None``
    instead of raising ``StopIteration`` or being silently ignored.

    Args:
        csv_file_one: Path of the first CSV file.
        csv_file_two: Path of the second CSV file.

    Returns:
        None. A report block is printed to stdout for every differing row.
    """
    from itertools import zip_longest  # local import: only this function needs it

    rows_one = get_csv_file_lines(csv_file_one)
    rows_two = get_csv_file_lines(csv_file_two)
    # zip_longest pads the shorter file with None so trailing rows still show up.
    for line_one, line_two in zip_longest(rows_one, rows_two):
        if line_two != line_one:
            print('File names being compared:')
            print(f'csv_file_one: {csv_file_one}')
            print(f'csv_file_two: {csv_file_two}')
            print('The following rows have difference in the files being compared.')
            print('csv_file_one:', line_one)
            print('csv_file_two:', line_two)
            print('\n')
import fs
import os
import boto3
import hashlib
def create_temp_memory_filesystem():
    """Create an in-memory filesystem containing a ``hidden_dir`` directory.

    Returns:
        A ``(mem_fs, virtual_disk)`` tuple: the filesystem opened from
        ``'mem://'`` and the object returned by ``makedir('hidden_dir')``.
        The caller is responsible for closing ``mem_fs``.
    """
    mem_fs = fs.open_fs('mem://')
    virtual_disk = mem_fs.makedir('hidden_dir')
    return mem_fs, virtual_disk
def query_s3_file_by_name(filename, memory_filesystem, temp_directory):
    """Copy the S3 object whose key equals *filename* into a memory filesystem.

    The object's bytes are written unchanged to
    ``<temp_directory>/s3_<filename>`` inside *memory_filesystem*. Nothing is
    written when no object has that exact key.

    Args:
        filename: Exact S3 object key to look for.
        memory_filesystem: Filesystem object whose ``open`` is used for writing.
        temp_directory: Directory inside *memory_filesystem* to write into.

    Returns:
        None.
    """
    # NOTE(review): credentials and bucket name are hard-coded placeholders.
    s3 = boto3.resource('s3', aws_access_key_id='your_access_key_id',
                        aws_secret_access_key='your_secret_access_key')
    bucket = s3.Bucket('your_bucket_name')
    # The prefix filter avoids listing the whole bucket; the equality check is
    # still required because other keys may share the same prefix.
    for obj in bucket.objects.filter(Prefix=filename):
        if obj.key == filename:
            body = obj.get()['Body'].read()
            # Binary mode with raw bytes: writing str(body) would store the
            # "b'...'" repr and make every later comparison fail.
            with memory_filesystem.open(f'{temp_directory}/s3_{filename}', 'wb') as f:
                f.write(body)
            break
def compare_local_files_to_s3_files(local_csv_files):
    """Compare local ``.csv`` files with same-named S3 objects by SHA-256.

    Each local CSV is hashed, the matching S3 object is fetched into a
    temporary in-memory filesystem through ``query_s3_file_by_name``, hashed,
    reported, and then removed from the memory filesystem again.

    Args:
        local_csv_files: Path of the local directory holding the CSV files.

    Returns:
        None. One line per fetched object is printed to stdout.
    """
    virtual_disk = create_temp_memory_filesystem()
    # NOTE(review): relies on str() of the sub-filesystem containing
    # '/<directory name>' - confirm against the fs library version in use.
    directory_name = str(virtual_disk[1]).split('/')[1]
    try:
        for filename in sorted(os.listdir(local_csv_files)):
            if not filename.endswith('.csv'):
                continue
            with open(os.path.join(local_csv_files, filename), 'rb') as local_file:
                local_file_hash = hashlib.sha256(local_file.read()).hexdigest()
            query_s3_file_by_name(filename, virtual_disk[0], directory_name)
            virtual_files = virtual_disk[0].opendir(directory_name)
            for file_name in virtual_files.listdir('/'):
                # Read through the memory filesystem: the built-in open() would
                # look for this name on the real disk and fail.
                with virtual_files.open(file_name, 'rb') as remote_file:
                    s3_file_hash = hashlib.sha256(remote_file.read()).hexdigest()
                if local_file_hash == s3_file_hash:
                    print(f'The file - {filename} is identical in both the local file system and the S3 bucket.')
                else:
                    print(f'The file - {filename} is different between the local file system and the S3 bucket.')
                # Empty the scratch directory so the next file starts clean.
                virtual_files.remove(file_name)
    finally:
        # Release the memory filesystem even if a comparison raised.
        virtual_disk[0].close()
使用filecmp将本地文件系统连接到S3存储桶
此示例与上面的示例相同,只是我使用filecmp.cmp而不是hashlib进行比较操作
import fs
import os
import boto3
import filecmp
def create_temp_memory_filesystem():
    """Create an in-memory filesystem containing a ``hidden_dir`` directory.

    Returns:
        A ``(mem_fs, virtual_disk)`` tuple: the filesystem opened from
        ``'mem://'`` and the object returned by ``makedir('hidden_dir')``.
        The caller is responsible for closing ``mem_fs``.
    """
    mem_fs = fs.open_fs('mem://')
    virtual_disk = mem_fs.makedir('hidden_dir')
    return mem_fs, virtual_disk
def query_s3_file_by_name(filename, memory_filesystem, temp_directory):
    """Copy the S3 object whose key equals *filename* into a memory filesystem.

    The object's bytes are written unchanged to
    ``<temp_directory>/s3_<filename>`` inside *memory_filesystem*. Nothing is
    written when no object has that exact key.

    Args:
        filename: Exact S3 object key to look for.
        memory_filesystem: Filesystem object whose ``open`` is used for writing.
        temp_directory: Directory inside *memory_filesystem* to write into.

    Returns:
        None.
    """
    # NOTE(review): credentials and bucket name are hard-coded placeholders.
    s3 = boto3.resource('s3', aws_access_key_id='your_access_key_id',
                        aws_secret_access_key='your_secret_access_key')
    bucket = s3.Bucket('your_bucket_name')
    # The prefix filter avoids listing the whole bucket; the equality check is
    # still required because other keys may share the same prefix.
    for obj in bucket.objects.filter(Prefix=filename):
        if obj.key == filename:
            body = obj.get()['Body'].read()
            # Binary mode with raw bytes: writing str(body) would store the
            # "b'...'" repr and make every later comparison fail.
            with memory_filesystem.open(f'{temp_directory}/s3_{filename}', 'wb') as f:
                f.write(body)
            break
def compare_local_files_to_s3_files(local_csv_files):
    """Compare local ``.csv`` files with same-named S3 objects via ``filecmp``.

    Each matching S3 object is fetched into a temporary in-memory filesystem
    through ``query_s3_file_by_name``. Because ``filecmp.cmp`` only works on
    real OS paths, the fetched bytes are spooled to a temporary file on disk
    for the comparison and that file is deleted afterwards.

    Args:
        local_csv_files: Path of the local directory holding the CSV files.

    Returns:
        None. One line per fetched object is printed to stdout.
    """
    import tempfile  # local import: only this function needs it

    virtual_disk = create_temp_memory_filesystem()
    # NOTE(review): relies on str() of the sub-filesystem containing
    # '/<directory name>' - confirm against the fs library version in use.
    directory_name = str(virtual_disk[1]).split('/')[1]
    try:
        for filename in sorted(os.listdir(local_csv_files)):
            if not filename.endswith('.csv'):
                continue
            local_file = os.path.join(local_csv_files, filename)
            query_s3_file_by_name(filename, virtual_disk[0], directory_name)
            virtual_files = virtual_disk[0].opendir(directory_name)
            for file_name in virtual_files.listdir('/'):
                with virtual_files.open(file_name, 'rb') as remote_file:
                    remote_bytes = remote_file.read()
                # filecmp cannot see the memory filesystem, so hand it a real file.
                with tempfile.NamedTemporaryFile(delete=False) as spooled:
                    spooled.write(remote_bytes)
                try:
                    comparison = filecmp.cmp(local_file, spooled.name, shallow=False)
                finally:
                    os.remove(spooled.name)
                if comparison:
                    print(f'The file - {filename} is identical in both the local file system and the S3 bucket.')
                else:
                    print(f'The file - {filename} is different between the local file system and the S3 bucket.')
                # Empty the scratch directory so the next file starts clean.
                virtual_files.remove(file_name)
    finally:
        # Release the memory filesystem even if a comparison raised.
        virtual_disk[0].close()
使用hashlib将本地文件系统连接到Google云存储桶
此示例与上面的S3 hashlib代码示例类似,但它使用了一个Google云存储桶
import fs
import os
import hashlib
from google.cloud import storage
def create_temp_memory_filesystem():
    """Create an in-memory filesystem containing a ``hidden_dir`` directory.

    Returns:
        A ``(mem_fs, virtual_disk)`` tuple: the filesystem opened from
        ``'mem://'`` and the object returned by ``makedir('hidden_dir')``.
        The caller is responsible for closing ``mem_fs``.
    """
    mem_fs = fs.open_fs('mem://')
    virtual_disk = mem_fs.makedir('hidden_dir')
    return mem_fs, virtual_disk
def query_google_cloud_storage_file_by_name(filename, memory_filesystem, temp_directory):
    """Copy the Cloud Storage blob named *filename* into a memory filesystem.

    The blob's bytes are written unchanged to ``<temp_directory>/<filename>``
    inside *memory_filesystem*. Nothing is written when no blob has that
    exact name.

    Args:
        filename: Exact blob name to look for.
        memory_filesystem: Filesystem object whose ``open`` is used for writing.
        temp_directory: Directory inside *memory_filesystem* to write into.

    Returns:
        None.
    """
    # NOTE(review): credential path and bucket name are hard-coded placeholders.
    client = storage.Client.from_service_account_json('path_to_your_credentials.json')
    bucket = client.get_bucket('your_bucket_name')
    # The prefix narrows the listing; the equality check is still required
    # because other blobs may share the same prefix.
    for blob in bucket.list_blobs(prefix=filename):
        if blob.name == filename:
            # download_to_filename() returns None and writes to the local disk,
            # so the old code stored the text 'None'. Fetch the bytes instead.
            # NOTE(review): download_as_bytes needs a reasonably recent
            # google-cloud-storage release - confirm the installed version.
            with memory_filesystem.open(f'{temp_directory}/{filename}', 'wb') as f:
                f.write(blob.download_as_bytes())
            break
def compare_local_files_to_google_storage_files(local_csv_files):
    """Compare local ``.csv`` files with same-named Cloud Storage blobs by SHA-256.

    Each local CSV is hashed, the matching blob is fetched into a temporary
    in-memory filesystem through ``query_google_cloud_storage_file_by_name``,
    hashed, reported, and then removed from the memory filesystem again.

    Args:
        local_csv_files: Path of the local directory holding the CSV files.

    Returns:
        None. One line per fetched blob is printed to stdout.
    """
    virtual_disk = create_temp_memory_filesystem()
    # NOTE(review): relies on str() of the sub-filesystem containing
    # '/<directory name>' - confirm against the fs library version in use.
    directory_name = str(virtual_disk[1]).split('/')[1]
    try:
        for filename in sorted(os.listdir(local_csv_files)):
            if not filename.endswith('.csv'):
                continue
            with open(os.path.join(local_csv_files, filename), 'rb') as local_file:
                local_file_hash = hashlib.sha256(local_file.read()).hexdigest()
            query_google_cloud_storage_file_by_name(filename, virtual_disk[0], directory_name)
            virtual_files = virtual_disk[0].opendir(directory_name)
            for file_name in virtual_files.listdir('/'):
                # Read through the memory filesystem rather than the real disk.
                with virtual_files.open(file_name, 'rb') as remote_file:
                    gs_file_hash = hashlib.sha256(remote_file.read()).hexdigest()
                if local_file_hash == gs_file_hash:
                    print(f'The file - {filename} is identical in both the local file system and the Google Cloud bucket.')
                else:
                    print(f'The file - {filename} is different between the local file system and the Google Cloud bucket.')
                # Empty the scratch directory so the next file starts clean.
                virtual_files.remove(file_name)
    finally:
        # Release the memory filesystem even if a comparison raised.
        virtual_disk[0].close()
使用filecmp将本地文件系统连接到Google云存储桶
此示例与上面的S3 filecmp代码示例类似,但它使用Google云存储桶
import fs
import os
import filecmp
from google.cloud import storage
def create_temp_memory_filesystem():
    """Create an in-memory filesystem containing a ``hidden_dir`` directory.

    Returns:
        A ``(mem_fs, virtual_disk)`` tuple: the filesystem opened from
        ``'mem://'`` and the object returned by ``makedir('hidden_dir')``.
        The caller is responsible for closing ``mem_fs``.
    """
    mem_fs = fs.open_fs('mem://')
    virtual_disk = mem_fs.makedir('hidden_dir')
    return mem_fs, virtual_disk
def query_google_cloud_storage_file_by_name(filename, memory_filesystem, temp_directory):
    """Copy the Cloud Storage blob named *filename* into a memory filesystem.

    The blob's bytes are written unchanged to ``<temp_directory>/<filename>``
    inside *memory_filesystem*. Nothing is written when no blob has that
    exact name.

    Args:
        filename: Exact blob name to look for.
        memory_filesystem: Filesystem object whose ``open`` is used for writing.
        temp_directory: Directory inside *memory_filesystem* to write into.

    Returns:
        None.
    """
    # NOTE(review): credential path and bucket name are hard-coded placeholders.
    client = storage.Client.from_service_account_json('path_to_your_credentials.json')
    bucket = client.get_bucket('your_bucket_name')
    # The prefix narrows the listing; the equality check is still required
    # because other blobs may share the same prefix.
    for blob in bucket.list_blobs(prefix=filename):
        if blob.name == filename:
            # download_to_filename() returns None and writes to the local disk,
            # so the old code stored the text 'None'. Fetch the bytes instead.
            # NOTE(review): download_as_bytes needs a reasonably recent
            # google-cloud-storage release - confirm the installed version.
            with memory_filesystem.open(f'{temp_directory}/{filename}', 'wb') as f:
                f.write(blob.download_as_bytes())
            break
def compare_local_files_to_google_storage_files(local_csv_files):
    """Compare local ``.csv`` files with same-named Cloud Storage blobs via ``filecmp``.

    Each matching blob is fetched into a temporary in-memory filesystem through
    ``query_google_cloud_storage_file_by_name``. Because ``filecmp.cmp`` only
    works on real OS paths, the fetched bytes are spooled to a temporary file
    on disk for the comparison and that file is deleted afterwards.

    Args:
        local_csv_files: Path of the local directory holding the CSV files.

    Returns:
        None. One line per fetched blob is printed to stdout.
    """
    import tempfile  # local import: only this function needs it

    virtual_disk = create_temp_memory_filesystem()
    # NOTE(review): relies on str() of the sub-filesystem containing
    # '/<directory name>' - confirm against the fs library version in use.
    directory_name = str(virtual_disk[1]).split('/')[1]
    try:
        for filename in sorted(os.listdir(local_csv_files)):
            if not filename.endswith('.csv'):
                continue
            local_file = os.path.join(local_csv_files, filename)
            query_google_cloud_storage_file_by_name(filename, virtual_disk[0], directory_name)
            virtual_files = virtual_disk[0].opendir(directory_name)
            for file_name in virtual_files.listdir('/'):
                with virtual_files.open(file_name, 'rb') as remote_file:
                    remote_bytes = remote_file.read()
                # filecmp cannot see the memory filesystem, so hand it a real file.
                with tempfile.NamedTemporaryFile(delete=False) as spooled:
                    spooled.write(remote_bytes)
                try:
                    comparison = filecmp.cmp(local_file, spooled.name, shallow=False)
                finally:
                    os.remove(spooled.name)
                if comparison:
                    print(f'The file - {filename} is identical in both the local file system and the Google Cloud bucket.')
                else:
                    print(f'The file - {filename} is different between the local file system and the Google Cloud bucket.')
                # Empty the scratch directory so the next file starts clean.
                virtual_files.remove(file_name)
    finally:
        # Release the memory filesystem even if a comparison raised.
        virtual_disk[0].close()
import filecmp
# Paths of the two files to compare
file1 = "/home/geeks/Desktop/gfg/data.txt"
file2 = "/home/geeks/Desktop/gfg/gfg.txt"

# First pass: shallow=True (the filecmp.cmp default), which starts from the
# os.stat() signature, i.e. the metadata, of both files.
# Second pass: shallow=False, which compares the contents of both files.
for shallow in (True, False):
    comp = filecmp.cmp(file1, file2, shallow=shallow)
    # Print the result of comparison
    print(comp)
有多种方法可以比较两个存储库(服务器文件系统和本地文件系统)之间的.csv文件
方法1:使用hashlib
此方法使用Python模块hashlib。我使用哈希算法sha256计算文件的哈希摘要,并比较两个目录中文件名完全相同的文件的哈希值。这种方法工作得很好,但它会忽略没有同时存在于两个目录中的任何文件
方法2:使用os.stat的st_size
此方法使用Python模块os。在本例中,我比较了文件的大小。此方法可以正常工作,但如果文件的数据发生了更改而文件大小没有变化,它会把该文件误判为相同
方法3:使用os.stat的st_size和st_mtime
此方法同样使用Python模块os。在本例中,我不仅比较了文件的大小,还比较了上次修改的时间。此方法可以正常工作,但会把某些文件误判为不同。在测试中,我保存了一个没有任何数据修改的文件,st_mtime仍将该文件标记为不同,而实际上它的内容并没有变化
方法4:使用set()
本例使用Pythonset()确定具有相同名称的两个csv文件之间的行间差异。此方法将输出两个csv文件之间的精确更改
方法5:使用filecmp.cmp
此方法使用Python模块filecmp。在本例中,我使用filecmp.cmp,并将shallow设置为False。将此参数设置为False会指示filecmp比较文件的内容,而不是文件大小等元数据(后者是filecmp.cmp的默认行为)。此方法与使用hashlib的方法1一样有效
方法6:使用filecmp.dircmp
此方法还使用Python模块filecmp。在本例中,我使用了filecmp.dircmp,它不仅允许我识别两个目录之间不常见的文件,还可以找到名称相似但内容不同的文件
方法7:逐行比较
此示例逐行比较2个csv文件,并输出不同的行。可以将输出添加到Python字典或JSON文件中,以便进行二次查询
使用hashlib将本地文件系统连接到S3存储桶
下面的示例是一个真实的用例,用于比较本地文件系统和远程S3存储桶之间的文件。我本来打算使用AWS S3创建的object.e_tag,但该标记可能有问题,不应该在哈希比较操作中使用。我决定查询S3并将单个文件加载到内存文件系统中,在每次比较操作期间可以查询并清空该文件系统。这种方法工作得很好,对我的系统性能没有负面影响
使用filecmp将本地文件系统连接到S3存储桶
此示例与上面的示例相同,只是我使用filecmp.cmp而不是hashlib进行比较操作
使用hashlib将本地文件系统连接到Google云存储桶
此示例与上面的S3 hashlib代码示例类似,但它使用了一个Google云存储桶
使用filecmp将本地文件系统连接到Google云存储桶
此示例与上面的S3 filecmp代码示例类似,但它使用Google云存储桶
shallow(可选):布尔值True或False。此参数的默认值为True。如果其值为True,则仅比较文件的元数据;如果为False,则比较文件的内容。
https://www.geeksforgeeks.org/python-filecmp-cmp-method/#:~:text=cmp()%20method%20in%20Python,size%2C%20date%20modified%20etc.)
问题是
filecmp.dircmp
执行了一个浅比较:浅比较意味着
filecmp
将检查文件A和文件B的os.stat签名是否相等。如果相等,则返回True;如果不相等,则再比较A和B的内容,内容相同则返回True,否则返回False。要忽略os.stat,可以使用
filecmp.cmpfiles(dir1, dir2, common, shallow=False)
。请注意filecmp.cmpfiles
的工作原理如下:你可以阅读更多关于它的内容
此外,您还可以遍历dir1和dir2中的所有文件,并对每个文件运行
filecmp.cmp(f1, f2, shallow=False)
。您可以阅读有关filecmp.cmp
的更多信息。如果您对shallow的工作原理有疑问,那么这个答案可能会对您有所帮助
相关问题 更多 >
编程相关推荐