How can I fix the deepcopy problem in this code?
I am trying to use this model: I downloaded it from GitHub and then made a few modifications.
I defined the model as shown below and tried to deep-copy it, but I ran into the error shown further down.
import copy
model = ViViTBackbone(
    t=24,
    h=112,
    w=112,
    patch_t=4,
    patch_h=7,
    patch_w=7,
    num_classes=500,
    dim=1024,
    depth=6,
    heads=10,
    mlp_dim=8,
    model=3
)
target_model = copy.deepcopy(model)
However, I got the following error:
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
Cell In[11], line 2
1 import copy
----> 2 target_model = copy.deepcopy(model)
File ~/anaconda3/envs/el_dorado/lib/python3.8/copy.py:172, in deepcopy(x, memo, _nil)
170 y = x
171 else:
--> 172 y = _reconstruct(x, memo, *rv)
174 # If is its own copy, don't memoize.
175 if y is not x:
File ~/anaconda3/envs/el_dorado/lib/python3.8/copy.py:270, in _reconstruct(x, memo, func, args, state, listiter, dictiter, deepcopy)
268 if state is not None:
269 if deep:
--> 270 state = deepcopy(state, memo)
271 if hasattr(y, '__setstate__'):
272 y.__setstate__(state)
File ~/anaconda3/envs/el_dorado/lib/python3.8/copy.py:146, in deepcopy(x, memo, _nil)
144 copier = _deepcopy_dispatch.get(cls)
145 if copier is not None:
--> 146 y = copier(x, memo)
147 else:
148 if issubclass(cls, type):
File ~/anaconda3/envs/el_dorado/lib/python3.8/copy.py:230, in _deepcopy_dict(x, memo, deepcopy)
228 memo[id(x)] = y
229 for key, value in x.items():
--> 230 y[deepcopy(key, memo)] = deepcopy(value, memo)
231 return y
File ~/anaconda3/envs/el_dorado/lib/python3.8/copy.py:153, in deepcopy(x, memo, _nil)
151 copier = getattr(x, "__deepcopy__", None)
152 if copier is not None:
--> 153 y = copier(memo)
154 else:
155 reductor = dispatch_table.get(cls)
File ~/anaconda3/envs/el_dorado/lib/python3.8/site-packages/torch/_tensor.py:86, in Tensor.__deepcopy__(self, memo)
84 return handle_torch_function(Tensor.__deepcopy__, (self,), self, memo)
85 if not self.is_leaf:
---> 86 raise RuntimeError(
87 "Only Tensors created explicitly by the user "
88 "(graph leaves) support the deepcopy protocol at the moment. "
89 "If you were attempting to deepcopy a module, this may be because "
90 "of a torch.nn.utils.weight_norm usage, "
91 "see https://github.com/pytorch/pytorch/pull/103001"
92 )
93 if id(self) in memo:
94 return memo[id(self)]
RuntimeError: Only Tensors created explicitly by the user (graph leaves) support the deepcopy protocol at the moment. If you were attempting to deepcopy a module, this may be because of a torch.nn.utils.weight_norm usage, see https://github.com/pytorch/pytorch/pull/103001
So the deep copy does not succeed. Could you help me? The original code is below.
(Reference GitHub link: https://github.com/drv-agwl/ViViT-pytorch/blob/master/models.py )
from torch import nn, einsum
import torch
from einops.layers.torch import Rearrange
from einops import rearrange, repeat


class PreNorm(nn.Module):
    def __init__(self, dim, fn):
        super().__init__()
        self.norm = nn.LayerNorm(dim)
        self.fn = fn

    def forward(self, x, **kwargs):
        return self.fn(self.norm(x), **kwargs)


class FSAttention(nn.Module):
    """Factorized Self-Attention"""

    def __init__(self, dim, heads=8, dim_head=64, dropout=0.):
        super().__init__()
        inner_dim = dim_head * heads
        project_out = not (heads == 1 and dim_head == dim)

        self.heads = heads
        self.scale = dim_head ** -0.5

        self.attend = nn.Softmax(dim=-1)
        self.to_qkv = nn.Linear(dim, inner_dim * 3, bias=False)

        self.to_out = nn.Sequential(
            nn.Linear(inner_dim, dim),
            nn.Dropout(dropout)
        ) if project_out else nn.Identity()

    def forward(self, x):
        b, n, _, h = *x.shape, self.heads
        qkv = self.to_qkv(x).chunk(3, dim=-1)
        q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h=h), qkv)

        dots = einsum('b h i d, b h j d -> b h i j', q, k) * self.scale
        attn = self.attend(dots)
        out = einsum('b h i j, b h j d -> b h i d', attn, v)
        out = rearrange(out, 'b h n d -> b n (h d)')
        return self.to_out(out)
class FDAttention(nn.Module):
    """Factorized Dot-product Attention"""

    def __init__(self, dim, nt, nh, nw, heads=8, dim_head=64, dropout=0.):
        super().__init__()
        inner_dim = dim_head * heads
        project_out = not (heads == 1 and dim_head == dim)

        self.nt = nt
        self.nh = nh
        self.nw = nw

        self.heads = heads
        self.scale = dim_head ** -0.5

        self.attend = nn.Softmax(dim=-1)
        self.to_qkv = nn.Linear(dim, inner_dim * 3, bias=False)

        self.to_out = nn.Sequential(
            nn.Linear(inner_dim, dim),
            nn.Dropout(dropout)
        ) if project_out else nn.Identity()

    def forward(self, x):
        b, n, d, h = *x.shape, self.heads
        qkv = self.to_qkv(x).chunk(3, dim=-1)
        q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h=h), qkv)

        qs, qt = q.chunk(2, dim=1)
        ks, kt = k.chunk(2, dim=1)
        vs, vt = v.chunk(2, dim=1)

        # Attention over spatial dimension
        qs = qs.view(b, h // 2, self.nt, self.nh * self.nw, -1)
        ks, vs = ks.view(b, h // 2, self.nt, self.nh * self.nw, -1), vs.view(b, h // 2, self.nt, self.nh * self.nw, -1)
        spatial_dots = einsum('b h t i d, b h t j d -> b h t i j', qs, ks) * self.scale
        sp_attn = self.attend(spatial_dots)
        spatial_out = einsum('b h t i j, b h t j d -> b h t i d', sp_attn, vs)

        # Attention over temporal dimension
        qt = qt.view(b, h // 2, self.nh * self.nw, self.nt, -1)
        kt, vt = kt.view(b, h // 2, self.nh * self.nw, self.nt, -1), vt.view(b, h // 2, self.nh * self.nw, self.nt, -1)
        temporal_dots = einsum('b h s i d, b h s j d -> b h s i j', qt, kt) * self.scale
        temporal_attn = self.attend(temporal_dots)
        temporal_out = einsum('b h s i j, b h s j d -> b h s i d', temporal_attn, vt)

        # return self.to_out(out)
class FeedForward(nn.Module):
    def __init__(self, dim, hidden_dim, dropout=0.):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(dim, hidden_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, dim),
            nn.Dropout(dropout)
        )

    def forward(self, x):
        return self.net(x)
class FSATransformerEncoder(nn.Module):
    """Factorized Self-Attention Transformer Encoder"""

    def __init__(self, dim, depth, heads, dim_head, mlp_dim, nt, nh, nw, dropout=0.):
        super().__init__()
        self.layers = nn.ModuleList([])
        self.nt = nt
        self.nh = nh
        self.nw = nw

        for _ in range(depth):
            self.layers.append(nn.ModuleList(
                [PreNorm(dim, FSAttention(dim, heads=heads, dim_head=dim_head, dropout=dropout)),
                 PreNorm(dim, FSAttention(dim, heads=heads, dim_head=dim_head, dropout=dropout)),
                 PreNorm(dim, FeedForward(dim, mlp_dim, dropout=dropout))
                 ]))

    def forward(self, x):
        b = x.shape[0]
        x = torch.flatten(x, start_dim=0, end_dim=1)  # extract spatial tokens from x

        for sp_attn, temp_attn, ff in self.layers:
            sp_attn_x = sp_attn(x) + x  # Spatial attention

            # Reshape tensors for temporal attention
            sp_attn_x = sp_attn_x.chunk(b, dim=0)
            sp_attn_x = [temp[None] for temp in sp_attn_x]
            sp_attn_x = torch.cat(sp_attn_x, dim=0).transpose(1, 2)
            sp_attn_x = torch.flatten(sp_attn_x, start_dim=0, end_dim=1)

            temp_attn_x = temp_attn(sp_attn_x) + sp_attn_x  # Temporal attention

            x = ff(temp_attn_x) + temp_attn_x  # MLP

            # Again reshape tensor for spatial attention
            x = x.chunk(b, dim=0)
            x = [temp[None] for temp in x]
            x = torch.cat(x, dim=0).transpose(1, 2)
            x = torch.flatten(x, start_dim=0, end_dim=1)

        # Reshape vector to [b, nt*nh*nw, dim]
        x = x.chunk(b, dim=0)
        x = [temp[None] for temp in x]
        x = torch.cat(x, dim=0)
        x = torch.flatten(x, start_dim=1, end_dim=2)
        return x
class FDATransformerEncoder(nn.Module):
    """Factorized Dot-product Attention Transformer Encoder"""

    def __init__(self, dim, depth, heads, dim_head, mlp_dim, nt, nh, nw, dropout=0.):
        super().__init__()
        self.layers = nn.ModuleList([])
        self.nt = nt
        self.nh = nh
        self.nw = nw

        for _ in range(depth):
            self.layers.append(
                PreNorm(dim, FDAttention(dim, nt, nh, nw, heads=heads, dim_head=dim_head, dropout=dropout)))

    def forward(self, x):
        for attn in self.layers:
            x = attn(x) + x
        return x
class ViViTBackbone(nn.Module):
    """ Model-3 backbone of ViViT """

    def __init__(self, t, h, w, patch_t, patch_h, patch_w, num_classes, dim, depth, heads, mlp_dim, dim_head=3,
                 channels=3, mode='tubelet', device='cuda', emb_dropout=0., dropout=0., model=3):
        super().__init__()

        assert t % patch_t == 0 and h % patch_h == 0 and w % patch_w == 0, "Video dimensions should be divisible by " \
                                                                           "tubelet size "

        self.T = t
        self.H = h
        self.W = w
        self.channels = channels
        self.t = patch_t
        self.h = patch_h
        self.w = patch_w
        self.mode = mode
        self.device = device

        self.nt = self.T // self.t
        self.nh = self.H // self.h
        self.nw = self.W // self.w

        tubelet_dim = self.t * self.h * self.w * channels

        # x.shape: torch.Size([64, 3, 32, 64, 64]) -> torch.Size([64, 32, 64, 64, 3])
        self.to_tubelet_embedding = nn.Sequential(
            Rearrange('b c (t pt) (h ph) (w pw) -> b t (h w) (pt ph pw c)', pt=self.t, ph=self.h, pw=self.w),
            nn.Linear(tubelet_dim, dim)
        )

        # repeat same spatial position encoding temporally
        self.pos_embedding = nn.Parameter(torch.randn(1, 1, self.nh * self.nw, dim)).repeat(1, self.nt, 1, 1)
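        # NOTE: .repeat() returns a new, non-leaf tensor, so pos_embedding ends up stored as a
        # plain attribute with a grad_fn rather than as a registered nn.Parameter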
        self.dropout = nn.Dropout(emb_dropout)

        if model == 3:
            self.transformer = FSATransformerEncoder(dim, depth, heads, dim_head, mlp_dim,
                                                     self.nt, self.nh, self.nw, dropout)
        elif model == 4:
            assert heads % 2 == 0, "Number of heads should be even"
            self.transformer = FDATransformerEncoder(dim, depth, heads, dim_head, mlp_dim,
                                                     self.nt, self.nh, self.nw, dropout)

        self.to_latent = nn.Identity()

        self.mlp_head = nn.Sequential(
            nn.LayerNorm(dim),
            nn.Linear(dim, num_classes)
        )

    def forward(self, x):
        """ x is a video: (b, C, T, H, W) """
        tokens = self.to_tubelet_embedding(x)

        self.pos_embedding = self.pos_embedding.to(tokens.device)
        tokens += self.pos_embedding
        tokens = self.dropout(tokens)

        x = self.transformer(tokens)  # output dimension: [b, nt*nh*nw, dim]
        b, _, _ = x.shape
        x = x.view(b, self.nt, self.nh*self.nw, -1).mean(dim=2)  # average over the spatial dimension
        x = self.to_latent(x)
        return x
I made some modifications to the output and also tried using clone(), but that turned out to be awkward because I am using something called Hydra. Still, a deep copy should in principle be possible.
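For reference, the clone()-based workaround I tried looked roughly like this (a minimal sketch, not my exact code, since my actual setup builds the model through Hydra): rebuild a second model with the same hyperparameters and copy the registered tensors one by one instead of deep-copying the module.

import torch

# Rough sketch of the clone()-based copy attempt (hyperparameters as above)
target_model = ViViTBackbone(
    t=24, h=112, w=112,
    patch_t=4, patch_h=7, patch_w=7,
    num_classes=500, dim=1024, depth=6,
    heads=10, mlp_dim=8, model=3,
)
with torch.no_grad():
    target_model.load_state_dict(
        {k: v.clone() for k, v in model.state_dict().items()}
    )
# Note: pos_embedding is a plain tensor attribute (not in state_dict()),
# so this copies only the registered parameters and buffers.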