Azure Data Lake Gen 2及Python在数据湖文件夹中复制文件

0 投票
1 回答
159 浏览
提问于 2025-04-14 15:57

根据这里提供的示例:https://learn.microsoft.com/en-us/azure/storage/blobs/data-lake-storage-directory-file-acl-python?tabs=account-key,我已经能够连接到我的Azure数据湖,并查看目录和文件结构,做一些简单的操作,一切都很好。
接下来的步骤是:我想把一个大文件上传到数据湖里。
问题是:我需要在数据湖中有两个文件的副本,一个是原始的副本,另一个可能会被修改。我想避免重复上传文件。希望能先上传一次文件,然后再复制到第二个位置。
在上面的示例中,我看到了如何上传文件、移动文件和删除文件,但没有看到如何复制文件。

希望能得到你的帮助。

KD


** 新代码 ** 新尝试 ** 新异常 ** 我根据谷歌搜索和浏览更新了我的代码

import datetime 
import time
import os, uuid
from azure.identity import DefaultAzureCredential
from azure.storage.blob  import BlobServiceClient, BlobClient, ContainerClient
from azure.storage.blob import ResourceTypes, AccountSasPermissions, BlobSasPermissions
from azure.storage.blob import generate_account_sas , generate_blob_sas
import  azure.core.exceptions


# Copy one blob to a second location inside the same container by handing the
# service a SAS-authorized URL of the source blob.

# --- Configuration ----------------------------------------------------------
storage_account_name = "XXdatarepos"
# NOTE(review): the account key is hard-coded here; prefer loading it from an
# environment variable or a secret store before sharing/committing this file.
storage_account_key = "qvoVHq5NP9EtzKcmH1mm9kXXXXXXXXXXXXXXXXXX**XX**XXAStWgFLpA=="
container_name = "iadata"
# Raw string: "\X" and "\S" are invalid escape sequences in a normal literal.
local_dir_name = r"C:\XXX\SupplierManagement"
target_dir_name = "Inbox/Test/"
target_dir_name2 = "Repos/SupplierXXX/Data/fooMar9/"
file_name = "compare_all.xlsx"
local_filePath = os.path.join(local_dir_name, file_name)

# Blob names of the already-uploaded source and of the copy to create.
target_blob = target_dir_name + file_name
target_blob2 = target_dir_name2 + file_name

account_url = "https://" + storage_account_name + ".blob.core.windows.net"

# --- Clients ----------------------------------------------------------------
# NOTE(review): connection_verify=False turns off TLS certificate verification
# and is what triggers urllib3's InsecureRequestWarning; remove it once the
# proxy/CA setup allows verified connections.
blob_service_client = BlobServiceClient(account_url, credential=storage_account_key, connection_verify=False)
containers = blob_service_client.list_containers(name_starts_with=container_name, include_metadata=True)
container_client = blob_service_client.get_container_client(container_name)
cc_url = container_client.url

# --- SAS token for reading the source blob -----------------------------------
sas_resource_types = ResourceTypes(service=True, object=True, container=True)
# `expiry` must be an absolute point in time, not a duration. Passing a bare
# timedelta produced "se=4:00:00" in the token, which the service rejects as a
# malformed signature field (surfacing as CannotVerifyCopySource on copy).
sas_expiry = datetime.datetime.now(datetime.timezone.utc) + datetime.timedelta(hours=4)
try:
    # An account SAS is not scoped to a container or blob, so the former
    # container_name= / blob_name= arguments are dropped.
    sas_token_target = generate_account_sas(
        account_name=blob_service_client.account_name,
        account_key=storage_account_key,
        resource_types=sas_resource_types,
        permission=AccountSasPermissions(read=True),
        expiry=sas_expiry,
    )
except Exception as error:
    print(error)
    print(type(error).__name__)
    # Re-raise: continuing without a token would only fail later with a
    # NameError on sas_token_target and hide the real cause.
    raise

# The client's .url carries the SAS query string, so the service can read the
# source on our behalf.
source_blob_client = BlobClient(
    account_url=account_url,
    container_name=container_client.container_name,
    blob_name=target_blob,
    credential=sas_token_target,
)

target_blob_client =   container_client.get_blob_client(target_blob2)

# NOTE(review): requires_sync=True makes the call wait for the copy to finish;
# the synchronous copy operation has a service-side source size limit — confirm
# it fits the "large file" use case, otherwise drop requires_sync and poll.
target_blob_client.start_copy_from_url(source_blob_client.url, requires_sync=True)

copy_properties = target_blob_client.get_blob_properties().copy

if copy_properties.status != "success":
    # Only a copy that is still pending can be aborted.
    if copy_properties.status == "pending":
        target_blob_client.abort_copy(copy_properties.id)
    raise Exception(
        f"Unable to copy blob {target_blob} with status {copy_properties.status}"
    )

** 新异常 ** 其中关于不安全请求(InsecureRequestWarning)的警告是这次新出现的

ipdb> C:\Users\ne098406\.conda\envs\python_3.7_XXX\lib\site-packages\urllib3\connectionpool.py:847: InsecureRequestWarning: Unverified HTTPS request is being made. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/latest/advanced-usage.html#ssl-warnings
  InsecureRequestWarning)
azure.core.exceptions.ResourceNotFoundError: Server failed to authenticate the request. Make sure the value of Authorization header is formed correctly including the signature.
RequestId:3d2343a2-001e-00a6-5d2c-7ae40b000000
Time:2024-03-19T18:40:46.7150943Z
ErrorCode:CannotVerifyCopySource
Content: <?xml version="1.0" encoding="utf-8"?><Error><Code>CannotVerifyCopySource</Code><Message>Server failed to authenticate the request. Make sure the value of Authorization header is formed correctly including the signature.
RequestId:3d2343a2-001e-00a6-5d2c-7ae40b000000
Time:2024-03-19T18:40:46.7150943Z</Message></Error>
None
> c:\kbd\testazure\testblobcopymar18.py(57)<module>()
     55 target_blob_client =   container_client.get_blob_client(target_blob2)
     56 
---> 57 target_blob_client.start_copy_from_url(source_blob_client.url, requires_sync=True)
     58 
     59 copy_properties = target_blob_client.get_blob_properties().copy


ipdb> --Return--
None
> c:\kbd\testazure\testblobcopymar18.py(57)<module>()
     55 target_blob_client =   container_client.get_blob_client(target_blob2)
     56 
---> 57 target_blob_client.start_copy_from_url(source_blob_client.url, requires_sync=True)
     58 
     59 copy_properties = target_blob_client.get_blob_properties().copy 

源blob客户端的URL如下

ipdb> 
ipdb> source_blob_client.url
'https://iadatarepos.blob.core.windows.net/iadata/Inbox/Test/compare_all.xlsx?se=4%3A00%3A00&sp=r&sv=2023-11-03&ss=b&srt=sco&sig=an0cyq5YK%2BL0woSLQIUzWUoz9V1GbHHQyJnOQipxDCI%3D'

如果我把这个URL放到Chrome里(不带单引号),我得到的结果是: AuthenticationFailed 服务器未能验证请求。请确保Authorization头的值正确形成,包括签名。请求ID:4f30a97b-801e-003e-75cd-7ac46a000000 时间:2024-03-20T13:53:05.9201944Z 签名字段格式不正确。

希望能得到你的帮助。

KBD

1 个回答

0

你可以使用下面的 Python 代码,在 ADLS Gen2 中把文件从一个文件夹复制到另一个文件夹:

from azure.storage.blob import BlobServiceClient

def copy_files_to_adls(account_url, sas_token, source_container, source_directory, destination_container, destination_directory):
    """Start a server-side copy of every blob under a source directory.

    Args:
        account_url: Blob endpoint of the storage account.
        sas_token: Credential handed to ``BlobServiceClient`` (the caller in
            this file passes an account key despite the parameter name).
        source_container: Container holding the blobs to copy.
        source_directory: Directory (name prefix) inside ``source_container``.
        destination_container: Container receiving the copies.
        destination_directory: Directory inside ``destination_container``;
            the path of each blob relative to ``source_directory`` is kept.
    """
    blob_service_client = BlobServiceClient(account_url=account_url, credential=sas_token)
    source_container_client = blob_service_client.get_container_client(source_container)

    # Terminate the prefixes with "/" so that "data" does not also match
    # "data2/..."; an empty directory means the container root.
    source_prefix = source_directory.strip("/")
    if source_prefix:
        source_prefix += "/"
    destination_prefix = destination_directory.strip("/")
    if destination_prefix:
        destination_prefix += "/"

    # NOTE(review): on accounts with hierarchical namespace the listing may
    # also return directory entries — confirm whether they need to be skipped.
    for blob in source_container_client.list_blobs(name_starts_with=source_prefix):
        relative_name = blob.name[len(source_prefix):]
        if not relative_name:
            continue

        # The client's URL includes the SAS query string when the credential is
        # a SAS token. NOTE(review): with an account key the URL is unsigned;
        # this presumably relies on same-account copy authorization — verify.
        source_blob_url = source_container_client.get_blob_client(blob.name).url
        destination_blob_path = destination_prefix + relative_name
        blob_service_client.get_blob_client(destination_container, destination_blob_path).start_copy_from_url(source_blob_url)

    # start_copy_from_url only schedules the copies; they may still be pending.
    print("Files copied successfully!")


# Placeholder settings: replace the <...> values before running.
account_url = "https://adlsc.blob.core.windows.net"
account_name = "<accountname>"
account_key = "<accountKey>"

# Where to copy from and where to copy to.
source_container = "<sourceContainer>"
source_directory = "<sourceDirectory>"
destination_container = "<destinationContainer>"
destination_directory = "<destinationDirectory>"

# The account key is supplied as the function's `sas_token` credential.
copy_files_to_adls(
    account_url=account_url,
    sas_token=account_key,
    source_container=source_container,
    source_directory=source_directory,
    destination_container=destination_container,
    destination_directory=destination_directory,
)

在这里输入图片描述

文件成功复制,并且名字保持不变。

在这里输入图片描述

撰写回答