Is it feasible to use a ResNet as the student network and CLIP as the teacher network? Their output tensors have different shapes — is it feasible for me to reshape one to match the other?
My code is as follows:
# Student branch: ResNet encoder -> conv decoder -> reshape so the student
# features match the CLIP teacher's token feature map of shape [B, 50, 768].
z = self.encoder_q(img)  # encoder_q is the ResNet student network
print("z0的shape=")
print(z.shape)  # [16, 128]; 16 is the batch size
z = z.unsqueeze(2)  # add a length-1 spatial dim so the conv decoder can consume it
print("z1的shape=")
print(z.shape)  # [16, 128, 1]
z = self.decoder(z)  # a single convolutional layer
print("z2的shape=")
print(z.shape)  # [16, 38400, 1]; note 38400 = 50 * 768
# BUG FIX: the original `z.view(8, 50, 768)` cannot work for batch size 16,
# since 16 * 38400 != 8 * 50 * 768 (view would raise a RuntimeError).
# Derive the batch dimension from the tensor itself instead of hard-coding it.
z = z.view(z.size(0), 50, 768)  # [16, 50, 768], same layout as the teacher output

# Teacher branch: frozen CLIP image encoder producing token features.
self.feature_model.eval()
with torch.no_grad():
    x = normalize_clip(unnormalize(im1))  # re-normalize the input with CLIP's statistics
    print("x0的shape=")
    print(x.shape)
    x_tgt = self.feature_model.encode_image_featuremap(x)
    print("x1的shape=")
    print(x_tgt.shape)
    x_tgt = self.feature_model.visual.ln_post(x_tgt)
    x_tgt = x_tgt.detach()  # teacher provides targets only; no gradient into CLIP
# NOTE(review): ln_tgt appears to be a trainable LayerNorm applied to the
# detached targets — kept outside no_grad so its parameters can learn; confirm.
x_tgt = self.ln_tgt(x_tgt)
print("x2的shape=")
print(x_tgt.shape)  # [16, 50, 768]

# Feature-distillation loss between student features z and the detached
# CLIP teacher features x_tgt; reduce to a scalar for backprop.
loss_FD = self.loss_feat(z, x_tgt)
loss1 = loss_FD.mean()
Is it feasible to use a ResNet as the student network and CLIP as the teacher network? Their output tensors have different shapes — is it feasible for me to reshape one to match the other?
My code is as follows:
# Student branch: ResNet encoder -> conv decoder -> reshape so the student
# features match the CLIP teacher's token feature map of shape [B, 50, 768].
z = self.encoder_q(img)  # encoder_q is the ResNet student network
print("z0的shape=")
print(z.shape)  # [16, 128]; 16 is the batch size
z = z.unsqueeze(2)  # add a length-1 spatial dim so the conv decoder can consume it
print("z1的shape=")
print(z.shape)  # [16, 128, 1]
z = self.decoder(z)  # a single convolutional layer
print("z2的shape=")
print(z.shape)  # [16, 38400, 1]; note 38400 = 50 * 768
# BUG FIX: the original `z.view(8, 50, 768)` cannot work for batch size 16,
# since 16 * 38400 != 8 * 50 * 768 (view would raise a RuntimeError).
# Derive the batch dimension from the tensor itself instead of hard-coding it.
z = z.view(z.size(0), 50, 768)  # [16, 50, 768], same layout as the teacher output

# Teacher branch: frozen CLIP image encoder producing token features.
self.feature_model.eval()
with torch.no_grad():
    x = normalize_clip(unnormalize(im1))  # re-normalize the input with CLIP's statistics
    print("x0的shape=")
    print(x.shape)
    x_tgt = self.feature_model.encode_image_featuremap(x)
    print("x1的shape=")
    print(x_tgt.shape)
    x_tgt = self.feature_model.visual.ln_post(x_tgt)
    x_tgt = x_tgt.detach()  # teacher provides targets only; no gradient into CLIP
# NOTE(review): ln_tgt appears to be a trainable LayerNorm applied to the
# detached targets — kept outside no_grad so its parameters can learn; confirm.
x_tgt = self.ln_tgt(x_tgt)
print("x2的shape=")
print(x_tgt.shape)  # [16, 50, 768]