        # If it's a list, take them
        blocks_to_take = range(total_block_len - n, total_block_len) if isinstance(n, int) else n
        for block_chunk in self.blocks:
            for blk in block_chunk[i:]:  # Passing the nn.Identity()
                x = blk(x)
                if i in blocks_to_take:
                    output.append(x)
                i += 1
        assert len(output) == len(blocks_to_take), f"only {len(output)} / {len(blocks_to_take)} blocks found"
        return output

    def get_intermediate_layers(
        self,
        x: torch.Tensor,
        n: Union[int, Sequence] = 1,  # Layers or n last layers to take
        reshape: bool = False,
        return_class_token: bool = False,
        norm=True,
    ) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor]]]:
        if self.chunked_blocks:
            outputs = self._get_intermediate_layers_chunked(x, n)
        else:
            outputs = self._get_intermediate_layers_not_chunked(x, n)
        if norm:
            outputs = [self.norm(out) for out in outputs]
        class_tokens = [out[:, 0] for out in outputs]
        outputs = [out[:, 1 + self.num_register_tokens:] for out in outputs]
        if reshape:
            B, _, w, h = x.shape
            outputs = [
                out.reshape(B, w // self.patch_size, h // self.patch_size, -1).permute(0, 3, 1, 2).contiguous()
                for out in outputs
            ]
        if return_class_token:
            return tuple(zip(outputs, class_tokens))
        return tuple(outputs)

    def forward(self, *args, is_training=False, **kwargs):
        ret = self.forward_features(*args, **kwargs)
        if is_training:
            return ret
        else:
            return self.head(ret["x_norm_clstoken"])


def init_weights_vit_timm(module: nn.Module, name: str = ""):
    """ViT weight initialization, original timm impl (for reproducibility)"""
    if isinstance(module, nn.Linear):
        trunc_normal_(module.weight, std=0.02)
        if module.bias is not None:
            nn.init.zeros_(module.bias)


def vit_small(patch_size=16, num_register_tokens=0, **kwargs):
    model = DinoVisionTransformer(
        patch_size=patch_size,
        embed_dim=384,
        depth=12,
        num_heads=6,
        mlp_ratio=4,
        block_fn=partial(Block, attn_class=MemEffAttention),
        num_register_tokens=num_register_tokens,
        **kwargs,
    )
    return model


def vit_base(patch_size=16, num_register_tokens=0, **kwargs):
    model = DinoVisionTransformer(
        patch_size=patch_size,
        embed_dim=768,
        depth=12,
        num_heads=12,
        mlp_ratio=4,
        block_fn=partial(Block, attn_class=MemEffAttention),
        num_register_tokens=num_register_tokens,
        **kwargs,
    )
    return model


def vit_large(patch_size=16, num_register_tokens=0, **kwargs):
    model = DinoVisionTransformer(
        patch_size=patch_size,
        embed_dim=1024,
        depth=24,
        num_heads=16,
        mlp_ratio=4,
        block_fn=partial(Block, attn_class=MemEffAttention),
        num_register_tokens=num_register_tokens,
        **kwargs,
    )
    return model


def vit_giant2(patch_size=16, num_register_tokens=0, **kwargs):
    """
    Close to ViT-giant, with embed-dim 1536 and 24 heads => embed-dim per head 64
    """
    model = DinoVisionTransformer(
        patch_size=patch_size,
        embed_dim=1536,
        depth=40,
        num_heads=24,
        mlp_ratio=4,
        block_fn=partial(Block, attn_class=MemEffAttention),
        num_register_tokens=num_register_tokens,
        **kwargs,
    )
    return model


def DINOv2(model_name):
    model_zoo = {
        "vits": vit_small,
        "vitb": vit_base,
        "vitl": vit_large,
        "vitg": vit_giant2,
    }

    return model_zoo[model_name](
        img_size=518,
        patch_size=14,
        init_values=1.0,
        ffn_layer="mlp" if model_name != "vitg" else "swiglufused",
        block_chunks=0,
        num_register_tokens=0,
        interpolate_antialias=False,
        interpolate_offset=0.1,
    )
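# --- Usage sketch (not part of the original file): how the DINOv2 factory above
# is typically driven. The dummy input and n=4 are illustrative assumptions, and
# pretrained-weight loading is omitted.
import torch

encoder = DINOv2("vits")  # ViT-S/14, img_size=518, no register tokens
encoder.eval()

x = torch.randn(1, 3, 518, 518)  # H and W must be multiples of patch_size=14

with torch.no_grad():
    # Patch tokens of the last 4 blocks, reshaped into (B, C, H/14, W/14)
    # feature maps: here each tensor is (1, 384, 37, 37).
    feats = encoder.get_intermediate_layers(x, n=4, reshape=True)

print([tuple(f.shape) for f in feats])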
    npyImageplaneZ = np.full([intHeight, intWidth, 1], fltFocal, np.float32)
    npyImageplane = np.concatenate([npyImageplaneX, npyImageplaneY, npyImageplaneZ], 2)

    npyDepth = npyDistance / np.linalg.norm(npyImageplane, 2, 2) * fltFocal
    return npyDepth


class Hypersim(Dataset):
    def __init__(self, filelist_path, mode, size=(518, 518)):
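# --- Illustrative sketch (not in the original file): why the conversion above
# divides by the ray length. Hypersim stores per-pixel distance to the camera
# center, not planar depth; depth = distance * f / ||(x, y, f)|| projects each
# ray onto the optical axis. The X/Y image-plane construction below mirrors the
# Z plane above and is an assumption, since those lines fall outside this excerpt.
import numpy as np

intWidth, intHeight, fltFocal = 1024, 768, 886.81

npyImageplaneX = (
    np.linspace((-0.5 * intWidth) + 0.5, (0.5 * intWidth) - 0.5, intWidth)
    .reshape(1, intWidth, 1).repeat(intHeight, 0).astype(np.float32)
)
npyImageplaneY = (
    np.linspace((-0.5 * intHeight) + 0.5, (0.5 * intHeight) - 0.5, intHeight)
    .reshape(intHeight, 1, 1).repeat(intWidth, 1).astype(np.float32)
)
npyImageplaneZ = np.full([intHeight, intWidth, 1], fltFocal, np.float32)
npyImageplane = np.concatenate([npyImageplaneX, npyImageplaneY, npyImageplaneZ], 2)

npyDistance = np.full((intHeight, intWidth), 5.0, np.float32)  # dummy 5 m distance map
npyDepth = npyDistance / np.linalg.norm(npyImageplane, 2, 2) * fltFocal

# Depth equals distance at the principal point and shrinks toward the corners.
print(npyDepth[intHeight // 2, intWidth // 2], npyDepth[0, 0])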
import numpy as np
from sklearn.cluster import MeanShift

norms = np.linalg.norm(normals, axis=1, keepdims=True)
normals_normalized = normals / (norms + 1e-8)

# Mean Shift clustering on normal vectors
ms = MeanShift(bandwidth=0.1, bin_seeding=True)
labels = ms.fit_predict(normals_normalized)
labels_image = labels.reshape(H, W)
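# --- Follow-up sketch (illustrative): pick the largest cluster as the dominant
# surface orientation, e.g. to isolate the ground plane. `labels_image` is the
# (H, W) label map produced above; the thresholding here is an assumption.
ids, counts = np.unique(labels_image, return_counts=True)
dominant = ids[np.argmax(counts)]
dominant_mask = labels_image == dominant  # boolean (H, W) mask of the dominant plane

print(f"{counts.max()} / {labels_image.size} pixels in the dominant cluster")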