How can I solve the deep-copy problem in this code?

-1 votes
0 answers
45 views
Asked on 2025-04-12 08:33

I am trying to use this model: I first downloaded it from GitHub and then made some modifications to it.
I defined the model as shown below and tried to deep-copy it, but I ran into the error quoted further down.

import copy

model = ViViTBackbone(
    t=24,
    h=112,
    w=112,
    patch_t=4,
    patch_h=7,
    patch_w=7,
    num_classes=500,
    dim=1024,
    depth=6,
    heads=10,
    mlp_dim=8,
    model=3
)
target_model = copy.deepcopy(model)

However, I get the following error:

---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
Cell In[11], line 2
      1 import copy
----> 2 target_model = copy.deepcopy(model)

File ~/anaconda3/envs/el_dorado/lib/python3.8/copy.py:172, in deepcopy(x, memo, _nil)
    170                 y = x
    171             else:
--> 172                 y = _reconstruct(x, memo, *rv)
    174 # If is its own copy, don't memoize.
    175 if y is not x:

File ~/anaconda3/envs/el_dorado/lib/python3.8/copy.py:270, in _reconstruct(x, memo, func, args, state, listiter, dictiter, deepcopy)
    268 if state is not None:
    269     if deep:
--> 270         state = deepcopy(state, memo)
    271     if hasattr(y, '__setstate__'):
    272         y.__setstate__(state)

File ~/anaconda3/envs/el_dorado/lib/python3.8/copy.py:146, in deepcopy(x, memo, _nil)
    144 copier = _deepcopy_dispatch.get(cls)
    145 if copier is not None:
--> 146     y = copier(x, memo)
    147 else:
    148     if issubclass(cls, type):

File ~/anaconda3/envs/el_dorado/lib/python3.8/copy.py:230, in _deepcopy_dict(x, memo, deepcopy)
    228 memo[id(x)] = y
    229 for key, value in x.items():
--> 230     y[deepcopy(key, memo)] = deepcopy(value, memo)
    231 return y

File ~/anaconda3/envs/el_dorado/lib/python3.8/copy.py:153, in deepcopy(x, memo, _nil)
    151 copier = getattr(x, "__deepcopy__", None)
    152 if copier is not None:
--> 153     y = copier(memo)
    154 else:
    155     reductor = dispatch_table.get(cls)

File ~/anaconda3/envs/el_dorado/lib/python3.8/site-packages/torch/_tensor.py:86, in Tensor.__deepcopy__(self, memo)
     84     return handle_torch_function(Tensor.__deepcopy__, (self,), self, memo)
     85 if not self.is_leaf:
---> 86     raise RuntimeError(
     87         "Only Tensors created explicitly by the user "
     88         "(graph leaves) support the deepcopy protocol at the moment.  "
     89         "If you were attempting to deepcopy a module, this may be because "
     90         "of a torch.nn.utils.weight_norm usage, "
     91         "see https://github.com/pytorch/pytorch/pull/103001"
     92     )
     93 if id(self) in memo:
     94     return memo[id(self)]

RuntimeError: Only Tensors created explicitly by the user (graph leaves) support the deepcopy protocol at the moment.  If you were attempting to deepcopy a module, this may be because of a torch.nn.utils.weight_norm usage, see https://github.com/pytorch/pytorch/pull/103001
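
If I am reading the message correctly, deepcopy fails because the module stores a tensor that is not a graph leaf. The minimal sketch below should trigger the same RuntimeError (ToyModule is hypothetical, only meant to illustrate what I think is going on):

import copy

import torch
from torch import nn

class ToyModule(nn.Module):
    def __init__(self):
        super().__init__()
        # .repeat() on a Parameter returns a non-leaf tensor; storing it
        # directly on the module makes copy.deepcopy hit the same error
        self.pos = nn.Parameter(torch.randn(1, 4)).repeat(1, 2)

copy.deepcopy(ToyModule())  # expected to raise the "graph leaves" RuntimeError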

So the deep copy does not succeed. Can you help me? The original code is below.
(Reference GitHub link: https://github.com/drv-agwl/ViViT-pytorch/blob/master/models.py )

from torch import nn, einsum
import torch
from einops.layers.torch import Rearrange
from einops import rearrange, repeat

class PreNorm(nn.Module):
    def __init__(self, dim, fn):
        super().__init__()
        self.norm = nn.LayerNorm(dim)
        self.fn = fn

    def forward(self, x, **kwargs):
        return self.fn(self.norm(x), **kwargs)


class FSAttention(nn.Module):
    """Factorized Self-Attention"""

    def __init__(self, dim, heads=8, dim_head=64, dropout=0.):
        super().__init__()
        inner_dim = dim_head * heads
        project_out = not (heads == 1 and dim_head == dim)

        self.heads = heads
        self.scale = dim_head ** -0.5

        self.attend = nn.Softmax(dim=-1)
        self.to_qkv = nn.Linear(dim, inner_dim * 3, bias=False)

        self.to_out = nn.Sequential(
            nn.Linear(inner_dim, dim),
            nn.Dropout(dropout)
        ) if project_out else nn.Identity()

    def forward(self, x):
        b, n, _, h = *x.shape, self.heads
        qkv = self.to_qkv(x).chunk(3, dim=-1)
        q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h=h), qkv)

        dots = einsum('b h i d, b h j d -> b h i j', q, k) * self.scale

        attn = self.attend(dots)

        out = einsum('b h i j, b h j d -> b h i d', attn, v)
        out = rearrange(out, 'b h n d -> b n (h d)')
        return self.to_out(out)


class FDAttention(nn.Module):
    """Factorized Dot-product Attention"""

    def __init__(self, dim, nt, nh, nw, heads=8, dim_head=64, dropout=0.):
        super().__init__()
        inner_dim = dim_head * heads
        project_out = not (heads == 1 and dim_head == dim)

        self.nt = nt
        self.nh = nh
        self.nw = nw

        self.heads = heads
        self.scale = dim_head ** -0.5

        self.attend = nn.Softmax(dim=-1)
        self.to_qkv = nn.Linear(dim, inner_dim * 3, bias=False)

        self.to_out = nn.Sequential(
            nn.Linear(inner_dim, dim),
            nn.Dropout(dropout)
        ) if project_out else nn.Identity()

    def forward(self, x):
        b, n, d, h = *x.shape, self.heads

        qkv = self.to_qkv(x).chunk(3, dim=-1)
        q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h=h), qkv)
        qs, qt = q.chunk(2, dim=1)
        ks, kt = k.chunk(2, dim=1)
        vs, vt = v.chunk(2, dim=1)

        # Attention over spatial dimension
        qs = qs.view(b, h // 2, self.nt, self.nh * self.nw, -1)
        ks, vs = ks.view(b, h // 2, self.nt, self.nh * self.nw, -1), vs.view(b, h // 2, self.nt, self.nh * self.nw, -1)
        spatial_dots = einsum('b h t i d, b h t j d -> b h t i j', qs, ks) * self.scale
        sp_attn = self.attend(spatial_dots)
        spatial_out = einsum('b h t i j, b h t j d -> b h t i d', sp_attn, vs)

        # Attention over temporal dimension
        qt = qt.view(b, h // 2, self.nh * self.nw, self.nt, -1)
        kt, vt = kt.view(b, h // 2, self.nh * self.nw, self.nt, -1), vt.view(b, h // 2, self.nh * self.nw, self.nt, -1)
        temporal_dots = einsum('b h s i d, b h s j d -> b h s i j', qt, kt) * self.scale
        temporal_attn = self.attend(temporal_dots)
        temporal_out = einsum('b h s i j, b h s j d -> b h s i d', temporal_attn, vt)

        # return self.to_out(out)


class FeedForward(nn.Module):
    def __init__(self, dim, hidden_dim, dropout=0.):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(dim, hidden_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, dim),
            nn.Dropout(dropout)
        )

    def forward(self, x):
        return self.net(x)


class FSATransformerEncoder(nn.Module):
    """Factorized Self-Attention Transformer Encoder"""

    def __init__(self, dim, depth, heads, dim_head, mlp_dim, nt, nh, nw, dropout=0.):
        super().__init__()
        self.layers = nn.ModuleList([])
        self.nt = nt
        self.nh = nh
        self.nw = nw

        for _ in range(depth):
            self.layers.append(nn.ModuleList(
                [PreNorm(dim, FSAttention(dim, heads=heads, dim_head=dim_head, dropout=dropout)),
                 PreNorm(dim, FSAttention(dim, heads=heads, dim_head=dim_head, dropout=dropout)),
                 PreNorm(dim, FeedForward(dim, mlp_dim, dropout=dropout))
                 ]))

    def forward(self, x):

        b = x.shape[0]
        x = torch.flatten(x, start_dim=0, end_dim=1)  # extract spatial tokens from x

        for sp_attn, temp_attn, ff in self.layers:
            sp_attn_x = sp_attn(x) + x  # Spatial attention

            # Reshape tensors for temporal attention
            sp_attn_x = sp_attn_x.chunk(b, dim=0)
            sp_attn_x = [temp[None] for temp in sp_attn_x]
            sp_attn_x = torch.cat(sp_attn_x, dim=0).transpose(1, 2)
            sp_attn_x = torch.flatten(sp_attn_x, start_dim=0, end_dim=1)

            temp_attn_x = temp_attn(sp_attn_x) + sp_attn_x  # Temporal attention

            x = ff(temp_attn_x) + temp_attn_x  # MLP

            # Again reshape tensor for spatial attention
            x = x.chunk(b, dim=0)
            x = [temp[None] for temp in x]
            x = torch.cat(x, dim=0).transpose(1, 2)
            x = torch.flatten(x, start_dim=0, end_dim=1)

        # Reshape vector to [b, nt*nh*nw, dim]
        x = x.chunk(b, dim=0)
        x = [temp[None] for temp in x]
        x = torch.cat(x, dim=0)
        x = torch.flatten(x, start_dim=1, end_dim=2)
        return x


class FDATransformerEncoder(nn.Module):
    """Factorized Dot-product Attention Transformer Encoder"""

    def __init__(self, dim, depth, heads, dim_head, mlp_dim, nt, nh, nw, dropout=0.):
        super().__init__()
        self.layers = nn.ModuleList([])
        self.nt = nt
        self.nh = nh
        self.nw = nw

        for _ in range(depth):
            self.layers.append(
                PreNorm(dim, FDAttention(dim, nt, nh, nw, heads=heads, dim_head=dim_head, dropout=dropout)))

    def forward(self, x):
        for attn in self.layers:
            x = attn(x) + x

        return x


class ViViTBackbone(nn.Module):
    """ Model-3 backbone of ViViT """

    def __init__(self, t, h, w, patch_t, patch_h, patch_w, num_classes, dim, depth, heads, mlp_dim, dim_head=3,
                 channels=3, mode='tubelet', device='cuda', emb_dropout=0., dropout=0., model=3):
        super().__init__()

        assert t % patch_t == 0 and h % patch_h == 0 and w % patch_w == 0, "Video dimensions should be divisible by " \
                                                                           "tubelet size "

        self.T = t
        self.H = h
        self.W = w
        self.channels = channels
        self.t = patch_t
        self.h = patch_h
        self.w = patch_w
        self.mode = mode
        self.device = device

        self.nt = self.T // self.t
        self.nh = self.H // self.h
        self.nw = self.W // self.w

        tubelet_dim = self.t * self.h * self.w * channels

        # x.shape: torch.Size([64, 3, 32, 64, 64]) -> torch.Size([64, 32, 64, 64, 3])
        self.to_tubelet_embedding = nn.Sequential(
            Rearrange('b c (t pt) (h ph) (w pw) -> b t (h w) (pt ph pw c)', pt=self.t, ph=self.h, pw=self.w),
            nn.Linear(tubelet_dim, dim)
        )

        # repeat same spatial position encoding temporally
        self.pos_embedding = nn.Parameter(torch.randn(1, 1, self.nh * self.nw, dim)).repeat(1, self.nt, 1, 1)

        self.dropout = nn.Dropout(emb_dropout)

        if model == 3:
            self.transformer = FSATransformerEncoder(dim, depth, heads, dim_head, mlp_dim,
                                                     self.nt, self.nh, self.nw, dropout)
        elif model == 4:
            assert heads % 2 == 0, "Number of heads should be even"
            self.transformer = FDATransformerEncoder(dim, depth, heads, dim_head, mlp_dim,
                                                     self.nt, self.nh, self.nw, dropout)

        self.to_latent = nn.Identity()

        self.mlp_head = nn.Sequential(
            nn.LayerNorm(dim),
            nn.Linear(dim, num_classes)
        )

    def forward(self, x):
        """ x is a video: (b, C, T, H, W) """
        tokens = self.to_tubelet_embedding(x)

        self.pos_embedding = self.pos_embedding.to(tokens.device)
        
        tokens += self.pos_embedding
        tokens = self.dropout(tokens)

        x = self.transformer(tokens) # output dimension: [b, nt*nh*nw, dim]
        
        b, _, _ = x.shape
        x = x.view(b, self.nt, self.nh*self.nw, -1).mean(dim=2)  # average over the spatial dimension

        x = self.to_latent(x)
        
        return x

I made some modifications to the output, and I also tried using clone(), but that seems difficult because I am using something called Hydra. A deep copy should still be possible, though.
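
My current guess is that the problem comes from this line in __init__:

self.pos_embedding = nn.Parameter(torch.randn(1, 1, self.nh * self.nw, dim)).repeat(1, self.nt, 1, 1)

because .repeat() turns the leaf parameter into a non-leaf tensor that is stored directly on the module, and the self.pos_embedding = self.pos_embedding.to(tokens.device) reassignment in forward looks like it does the same thing. Below is a minimal sketch of the change I have been considering, which keeps the parameter a leaf and only repeats it inside forward; this is just my attempt, not a verified fix:

# in __init__: keep only the spatial positional encoding as a leaf parameter
self.pos_embedding = nn.Parameter(torch.randn(1, 1, self.nh * self.nw, dim))

# in forward: repeat it temporally on the fly instead of reassigning self.pos_embedding
pos = self.pos_embedding.repeat(1, self.nt, 1, 1).to(tokens.device)
tokens = tokens + pos

I expect this would leave only leaf tensors on the module so that copy.deepcopy can handle it, but I am not sure it is the intended way to handle the positional encoding, so any advice is appreciated.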

0 Answers

No answers yet
