使用 Python 在 Azure Data Lake Gen 2 的文件夹之间复制文件
根据这里提供的示例:https://learn.microsoft.com/en-us/azure/storage/blobs/data-lake-storage-directory-file-acl-python?tabs=account-key,我已经能够连接到我的Azure数据湖,并查看目录和文件结构,做一些简单的操作,一切都很好。
接下来的步骤是:我想把一个大文件上传到数据湖里。
问题是:我需要在数据湖中有两个文件的副本,一个是原始的副本,另一个可能会被修改。我想避免重复上传文件。希望能先上传一次文件,然后再复制到第二个位置。
在上面的示例中,我看到了如何上传文件、移动文件和删除文件,但没有看到如何复制文件。
希望能得到你的帮助。
KD
**新代码 / 新尝试 / 新异常**:我根据谷歌搜索和浏览到的资料更新了我的代码。
import datetime
import time
import os, uuid
from azure.identity import DefaultAzureCredential
from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient
from azure.storage.blob import ResourceTypes, AccountSasPermissions, BlobSasPermissions
from azure.storage.blob import generate_account_sas , generate_blob_sas
import azure.core.exceptions
# Copy an already-uploaded blob to a second path in the same container using a
# server-side copy, so the file is only uploaded once.
#
# NOTE(review): the account key is hard-coded here; in real code load it from
# an environment variable or a secret store instead of committing it.
storage_account_name = "XXdatarepos"
storage_account_key = "qvoVHq5NP9EtzKcmH1mm9kXXXXXXXXXXXXXXXXXX**XX**XXAStWgFLpA=="
container_name = "iadata"
# Raw string: "\X" and "\S" are not valid escape sequences in a normal literal.
local_dir_name = r"C:\XXX\SupplierManagement"
target_dir_name = "Inbox/Test/"
target_dir_name2 = "Repos/SupplierXXX/Data/fooMar9/"
file_name = "compare_all.xlsx"

local_filePath = os.path.join(local_dir_name, file_name)
target_blob = target_dir_name + file_name    # source of the copy
target_blob2 = target_dir_name2 + file_name  # destination of the copy
account_url = "https://" + storage_account_name + ".blob.core.windows.net"

# Create the BlobServiceClient object.
# NOTE(review): connection_verify=False disables TLS certificate verification
# and is what triggers urllib3's InsecureRequestWarning; remove it once the
# environment's CA certificates are configured.
blob_service_client = BlobServiceClient(
    account_url,
    credential=storage_account_key,
    connection_verify=False,
)
containers = blob_service_client.list_containers(
    name_starts_with=container_name, include_metadata=True
)
container_client = blob_service_client.get_container_client(container_name)
cc_url = container_client.url

# Create a read-only SAS token scoped to the source blob.
# The expiry must be an absolute point in time. Passing a bare timedelta
# produced "se=4:00:00" in the URL, which the service rejects as a malformed
# signature (CannotVerifyCopySource / AuthenticationFailed).
sas_expiry = datetime.datetime.now(datetime.timezone.utc) + datetime.timedelta(hours=4)
sas_token_target = generate_blob_sas(
    account_name=blob_service_client.account_name,
    container_name=container_client.container_name,
    blob_name=target_blob,
    account_key=storage_account_key,
    permission=BlobSasPermissions(read=True),
    expiry=sas_expiry,
)

# The source client carries the SAS so that its .url is readable by the copy.
source_blob_client = BlobClient(
    account_url=account_url,
    container_name=container_client.container_name,
    blob_name=target_blob,
    credential=sas_token_target,
)
target_blob_client = container_client.get_blob_client(target_blob2)

# NOTE(review): per the Azure docs the synchronous copy (requires_sync=True)
# has a source size limit (256 MiB at the time of writing - confirm); for
# larger files drop requires_sync and poll the copy status instead.
target_blob_client.start_copy_from_url(source_blob_client.url, requires_sync=True)

copy_properties = target_blob_client.get_blob_properties().copy
if copy_properties.status != "success":
    # Only a pending copy can be aborted; aborting a finished or failed copy
    # raises its own error and would hide the real status.
    if copy_properties.status == "pending":
        target_blob_client.abort_copy(copy_properties.id)
    raise Exception(
        f"Unable to copy blob {target_blob} with status {copy_properties.status}"
    )
**新异常**:其中关于不安全请求(InsecureRequestWarning)的警告是新出现的。
ipdb> C:\Users\ne098406\.conda\envs\python_3.7_XXX\lib\site-packages\urllib3\connectionpool.py:847: InsecureRequestWarning: Unverified HTTPS request is being made. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/latest/advanced-usage.html#ssl-warnings
InsecureRequestWarning)
azure.core.exceptions.ResourceNotFoundError: Server failed to authenticate the request. Make sure the value of Authorization header is formed correctly including the signature.
RequestId:3d2343a2-001e-00a6-5d2c-7ae40b000000
Time:2024-03-19T18:40:46.7150943Z
ErrorCode:CannotVerifyCopySource
Content: <?xml version="1.0" encoding="utf-8"?><Error><Code>CannotVerifyCopySource</Code><Message>Server failed to authenticate the request. Make sure the value of Authorization header is formed correctly including the signature.
RequestId:3d2343a2-001e-00a6-5d2c-7ae40b000000
Time:2024-03-19T18:40:46.7150943Z</Message></Error>
None
> c:\kbd\testazure\testblobcopymar18.py(57)<module>()
55 target_blob_client = container_client.get_blob_client(target_blob2)
56
---> 57 target_blob_client.start_copy_from_url(source_blob_client.url, requires_sync=True)
58
59 copy_properties = target_blob_client.get_blob_properties().copy
ipdb> --Return--
None
> c:\kbd\testazure\testblobcopymar18.py(57)<module>()
55 target_blob_client = container_client.get_blob_client(target_blob2)
56
---> 57 target_blob_client.start_copy_from_url(source_blob_client.url, requires_sync=True)
58
59 copy_properties = target_blob_client.get_blob_properties().copy
源blob客户端的URL如下
ipdb>
ipdb> source_blob_client.url
'https://iadatarepos.blob.core.windows.net/iadata/Inbox/Test/compare_all.xlsx?se=4%3A00%3A00&sp=r&sv=2023-11-03&ss=b&srt=sco&sig=an0cyq5YK%2BL0woSLQIUzWUoz9V1GbHHQyJnOQipxDCI%3D'
如果我把这个URL放到Chrome里(不带单引号),我得到的结果是:
AuthenticationFailed
服务器未能验证请求。请确保Authorization头的值正确形成,包括签名。请求ID:4f30a97b-801e-003e-75cd-7ac46a000000 时间:2024-03-20T13:53:05.9201944Z
签名字段格式不正确。
希望能得到你的帮助。
KBD
1 个回答
0
你可以使用下面的 Python 代码,在 ADLS Gen2 中把文件从一个文件夹复制到另一个文件夹:
from azure.storage.blob import BlobServiceClient
def copy_files_to_adls(account_url, sas_token, source_container, source_directory, destination_container, destination_directory):
    """Copy every blob under a source directory into a destination directory.

    Each blob keeps its name (and any sub-path) relative to
    ``source_directory``; only the leading directory is swapped for
    ``destination_directory``. The copies are started server side, so no data
    is downloaded or re-uploaded by this process.

    Args:
        account_url: Blob endpoint of the storage account.
        sas_token: Credential handed to ``BlobServiceClient`` (the caller in
            this file passes the account key).
        source_container: Container holding the blobs to copy.
        source_directory: Directory (name prefix) to copy from; an empty
            value lists the whole container.
        destination_container: Container that receives the copies.
        destination_directory: Directory (name prefix) to copy into.
    """
    blob_service_client = BlobServiceClient(account_url=account_url, credential=sas_token)
    source_container_client = blob_service_client.get_container_client(source_container)

    # Normalise both directories so the joins below never produce "//".
    source_prefix = source_directory.strip("/")
    destination_prefix = destination_directory.strip("/")
    # A trailing "/" keeps a sibling such as "dir2/" from matching "dir".
    list_prefix = f"{source_prefix}/" if source_prefix else None
    prefix_length = len(list_prefix) if list_prefix else 0

    # NOTE(review): on hierarchical-namespace accounts the listing may also
    # return directory entries - confirm and filter them out if needed.
    for blob in source_container_client.list_blobs(name_starts_with=list_prefix):
        relative_name = blob.name[prefix_length:]
        if destination_prefix:
            destination_blob_path = f"{destination_prefix}/{relative_name}"
        else:
            destination_blob_path = relative_name
        # The client URL carries the SAS token when one was used as credential.
        source_blob_url = source_container_client.get_blob_client(blob.name).url
        blob_service_client.get_blob_client(destination_container, destination_blob_path).start_copy_from_url(source_blob_url)
    print("Files copied successfully!")
# Connection settings and folder names; replace every <placeholder> value
# before running.
account_name = '<accountname>'
account_key = "<accountKey>"
account_url = 'https://adlsc.blob.core.windows.net'

source_container = '<sourceContainer>'
source_directory = '<sourceDirectory>'

destination_container = '<destinationContainer>'
destination_directory = '<destinationDirectory>'

# The account key is what gets passed as the function's credential argument.
copy_files_to_adls(
    account_url=account_url,
    sas_token=account_key,
    source_container=source_container,
    source_directory=source_directory,
    destination_container=destination_container,
    destination_directory=destination_directory,
)
文件成功复制,并且名字保持不变。