# Camera Adapter增强版设计 - 支持动态数量、类型和位置 **设计目标**: - ✅ 支持动态数量 (1-N cameras) - ✅ 支持不同类型 (广角/长焦/鱼眼等) - ✅ 支持不同位置 (前/后/左/右/任意) --- ## 🎯 核心设计理念 ### 问题分析 **原始Camera Adapter局限**: ```python # 固定数量的adapters self.camera_adapters = nn.ModuleList([ Adapter() for _ in range(6) # ❌ 固定6个 ]) # 问题: 1. num_cameras必须在训练时确定 2. Camera 0总是用adapter[0] → 无法处理位置变化 3. 无法处理缺失cameras ``` **增强版解决方案**: ```python # 基于camera属性的动态adapter # 不再是"第i个camera用adapter[i]" # 而是"根据camera的type和position选择adapter" 核心思想: Camera特征 → [类型, 位置, FOV, ...] → 动态选择/组合adapters ``` --- ## 🚀 增强版Camera Adapter架构 ### 设计1: Type-Position Factorized Adapter ⭐⭐⭐⭐⭐ **核心思想**: 将adapter分解为"类型"和"位置"两个正交维度 ```python # mmdet3d/models/modules/camera_adapter_enhanced.py import torch import torch.nn as nn import torch.nn.functional as F class EnhancedCameraAdapter(nn.Module): """ 增强版Camera Adapter 支持: ✅ 动态camera数量 (1-N) ✅ 不同camera类型 (广角/长焦/鱼眼) ✅ 不同camera位置 (前/后/左/右/任意角度) 设计: Adapter = Type-Specific Module ⊕ Position-Specific Module """ def __init__( self, in_channels: int = 256, # 类型维度 camera_types: list = ['wide', 'tele', 'fisheye'], # 支持的类型 type_adapter_channels: int = 256, # 位置维度 max_cameras: int = 12, # 最多支持12个cameras position_encoding_dim: int = 128, use_learned_position: bool = True, # 学习位置编码 vs 固定编码 # 通用设置 adapter_depth: int = 2, # adapter层数 use_residual: bool = True, ): super().__init__() self.in_channels = in_channels self.camera_types = camera_types self.num_types = len(camera_types) self.max_cameras = max_cameras self.use_residual = use_residual # ========== 1. Type-Specific Adapters ========== # 为每种camera类型创建adapter self.type_adapters = nn.ModuleDict({ cam_type: self._make_adapter( in_channels, type_adapter_channels, adapter_depth ) for cam_type in camera_types }) print(f"[EnhancedCameraAdapter] Created {self.num_types} type-specific adapters") for cam_type in camera_types: params = sum(p.numel() for p in self.type_adapters[cam_type].parameters()) print(f" - {cam_type}: {params:,} params") # ========== 2. 
Position Encoding ========== if use_learned_position: # 学习式位置编码: 根据camera的3D位置生成embedding self.position_encoder = nn.Sequential( nn.Linear(6, 128), # [x, y, z, roll, pitch, yaw] nn.ReLU(inplace=True), nn.Linear(128, position_encoding_dim), nn.ReLU(inplace=True), nn.Linear(position_encoding_dim, in_channels), ) print(f" - Position encoder: learned ({position_encoding_dim}D)") else: # 固定式位置编码: sinusoidal encoding self.register_buffer( 'position_encoding', self._get_sinusoidal_encoding(max_cameras, in_channels) ) print(f" - Position encoder: sinusoidal (fixed)") self.use_learned_position = use_learned_position # ========== 3. Type-Position Fusion ========== # 融合type和position两种信息 self.fusion_layer = nn.Sequential( nn.Conv2d(in_channels * 2, in_channels, 1), # concat后压缩 nn.BatchNorm2d(in_channels), nn.ReLU(inplace=True), ) # ========== 4. Camera Importance Weighting ========== # 根据camera属性自动学习权重 self.importance_net = nn.Sequential( nn.AdaptiveAvgPool2d(1), # Global pooling nn.Flatten(), nn.Linear(in_channels, in_channels // 4), nn.ReLU(inplace=True), nn.Linear(in_channels // 4, 1), nn.Sigmoid(), ) # ========== 5. 
Type-ID映射 ========== self.type_to_id = {t: i for i, t in enumerate(camera_types)} print(f"[EnhancedCameraAdapter] Total params: {self._count_params():,}") def _make_adapter(self, in_channels, hidden_channels, depth): """创建adapter网络""" layers = [] for i in range(depth): layers.extend([ nn.Conv2d( in_channels if i == 0 else hidden_channels, hidden_channels, kernel_size=3, padding=1, groups=max(1, hidden_channels // 32), # Depthwise ), nn.BatchNorm2d(hidden_channels), nn.ReLU(inplace=True), ]) # 最后一层映射回原始通道 layers.append(nn.Conv2d(hidden_channels, in_channels, 1)) layers.append(nn.BatchNorm2d(in_channels)) return nn.Sequential(*layers) def _get_sinusoidal_encoding(self, max_len, d_model): """生成固定的sinusoidal位置编码""" position = torch.arange(max_len).unsqueeze(1) div_term = torch.exp(torch.arange(0, d_model, 2) * (-torch.log(torch.tensor(10000.0)) / d_model)) pe = torch.zeros(max_len, d_model) pe[:, 0::2] = torch.sin(position * div_term) pe[:, 1::2] = torch.cos(position * div_term) return pe def _count_params(self): """统计参数量""" return sum(p.numel() for p in self.parameters()) def forward( self, x, camera_types=None, camera_positions=None, camera_indices=None, ): """ Args: x: (B, N, C, H, W) - N个cameras的特征 camera_types: list of str, len=N - 每个camera的类型 例: ['wide', 'tele', 'wide', 'fisheye'] camera_positions: (B, N, 6) - 每个camera的3D位置 [x, y, z, roll, pitch, yaw] 相对于vehicle camera_indices: (N,) - camera在原始配置中的index (可选) 用于处理缺失cameras的情况 Returns: output: (B, N, C, H, W) - adapted features camera_weights: (B, N) - 每个camera的重要性权重 """ B, N, C, H, W = x.shape # 处理默认参数 if camera_types is None: # 默认都是wide camera_types = ['wide'] * N assert len(camera_types) == N, \ f"camera_types length ({len(camera_types)}) != num_cameras ({N})" # 存储输出 adapted_features = [] camera_weights = [] for i in range(N): cam_feat = x[:, i] # (B, C, H, W) cam_type = camera_types[i] # ===== 1. 
Type-Specific Adaptation ===== if cam_type not in self.type_adapters: print(f"Warning: camera type '{cam_type}' not found, using 'wide'") cam_type = 'wide' type_adapted = self.type_adapters[cam_type](cam_feat) # (B, C, H, W) # ===== 2. Position Encoding ===== if camera_positions is not None: cam_pos = camera_positions[:, i] # (B, 6) if self.use_learned_position: # 学习式: 从3D位置学习embedding pos_embed = self.position_encoder(cam_pos) # (B, C) pos_embed = pos_embed.unsqueeze(-1).unsqueeze(-1) # (B, C, 1, 1) else: # 固定式: 使用预定义encoding cam_idx = camera_indices[i] if camera_indices is not None else i pos_embed = self.position_encoding[cam_idx] # (C,) pos_embed = pos_embed.view(1, C, 1, 1).expand(B, -1, 1, 1) # Broadcast到spatial dimensions pos_embed = pos_embed.expand(-1, -1, H, W) # ===== 3. Fusion: Type + Position ===== # Concatenate type-adapted和position encoding fused_feat = torch.cat([type_adapted, pos_embed], dim=1) # (B, 2C, H, W) fused_feat = self.fusion_layer(fused_feat) # (B, C, H, W) else: # 如果没有position信息,只用type adaptation fused_feat = type_adapted # ===== 4. Residual Connection ===== if self.use_residual: fused_feat = fused_feat + cam_feat # Residual # ===== 5. 
Camera Importance ===== # 根据adapted特征计算该camera的重要性 importance = self.importance_net(fused_feat) # (B, 1) adapted_features.append(fused_feat) camera_weights.append(importance) # 组合所有cameras output = torch.stack(adapted_features, dim=1) # (B, N, C, H, W) weights = torch.stack(camera_weights, dim=1).squeeze(-1) # (B, N) return output, weights def get_camera_importance_summary(self, weights): """ 分析各camera的重要性 Args: weights: (B, N) - camera重要性权重 Returns: dict: 统计信息 """ weights_mean = weights.mean(dim=0) # (N,) summary = { 'mean_weights': weights_mean.cpu().numpy(), 'max_weight': weights_mean.max().item(), 'min_weight': weights_mean.min().item(), 'std': weights_mean.std().item(), } return summary # ========== Helper: Camera Configuration Manager ========== class CameraConfigManager: """ 管理动态camera配置 功能: - 处理不同数量cameras - 管理camera类型映射 - 计算camera位置 - 处理missing cameras """ @staticmethod def create_camera_config(camera_list): """ 从camera列表创建配置 Args: camera_list: list of dict [ { 'name': 'CAM_FRONT', 'type': 'wide', 'position': [1.5, 0.0, 1.5, 0, 0, 0], # [x,y,z,r,p,y] 'fov': 120.0, 'focal_length': [1266, 1266], }, { 'name': 'CAM_FRONT_TELE', 'type': 'tele', 'position': [1.5, 0.0, 1.6, 0, 0, 0], 'fov': 30.0, 'focal_length': [2532, 2532], }, ... 
] Returns: config: dict with all necessary info """ config = { 'num_cameras': len(camera_list), 'camera_names': [cam['name'] for cam in camera_list], 'camera_types': [cam['type'] for cam in camera_list], 'camera_positions': [cam['position'] for cam in camera_list], 'camera_fovs': [cam.get('fov', 120.0) for cam in camera_list], 'camera_focals': [cam.get('focal_length', [1266, 1266]) for cam in camera_list], } return config @staticmethod def get_camera_attributes_tensor(camera_config, device='cuda'): """ 生成camera属性tensor Returns: attributes: (N, D) - 每个camera的属性向量 D = 6(position) + 2(focal) + 1(fov) + 1(type_id) = 10 """ N = camera_config['num_cameras'] attributes = [] for i in range(N): attr = ( camera_config['camera_positions'][i] + # 6D position list(camera_config['camera_focals'][i]) + # 2D focal [camera_config['camera_fovs'][i]] + # 1D fov [hash(camera_config['camera_types'][i]) % 100] # 1D type ) attributes.append(attr) return torch.tensor(attributes, device=device, dtype=torch.float32) ``` --- ## 💻 完整实现 ### 主模块实现 ```python # mmdet3d/models/vtransforms/enhanced_camera_lss.py from typing import List, Optional, Dict, Tuple import torch import torch.nn as nn import torch.nn.functional as F from .lss import LSSTransform class EnhancedCameraAwareLSS(LSSTransform): """ 增强版Camera-Aware LSS Transform 特性: 1. 动态camera数量 - 训练时N可变 2. 类型感知 - 不同类型camera不同处理 3. 位置感知 - 利用camera 3D位置信息 4. 缺失处理 - 可以处理部分cameras缺失 示例: # 训练时: 6 cameras train_config = { 'cameras': [ {'type': 'wide', 'position': [1.5, 0, 1.5, 0, 0, 0]}, {'type': 'tele', 'position': [1.5, 0, 1.6, 0, 0, 0]}, ... ] } # 推理时: 4 cameras (subset) test_config = { 'cameras': [ {'type': 'wide', 'position': [1.5, 0, 1.5, 0, 0, 0]}, {'type': 'wide', 'position': [1.5, -0.5, 1.5, 0, 0, -30]}, ... 
] } """ def __init__( self, # LSS基础参数 in_channels: int, out_channels: int, image_size: Tuple[int, int], feature_size: Tuple[int, int], xbound: Tuple[float, float, float], ybound: Tuple[float, float, float], zbound: Tuple[float, float, float], dbound: Tuple[float, float, float], # Camera Adapter参数 camera_types: List[str] = None, max_cameras: int = 12, use_learned_position: bool = True, adapter_channels: int = 256, adapter_depth: int = 2, **kwargs ): # 初始化LSS super().__init__( in_channels, out_channels, image_size, feature_size, xbound, ybound, zbound, dbound, **kwargs ) # 默认支持的camera类型 if camera_types is None: camera_types = ['wide', 'tele', 'fisheye', 'ultra_wide'] self.camera_types = camera_types self.max_cameras = max_cameras # 创建Enhanced Camera Adapter self.camera_adapter = EnhancedCameraAdapter( in_channels=in_channels, camera_types=camera_types, max_cameras=max_cameras, type_adapter_channels=adapter_channels, use_learned_position=use_learned_position, adapter_depth=adapter_depth, ) print(f"[EnhancedCameraAwareLSS] Initialized") print(f" - Supported camera types: {camera_types}") print(f" - Max cameras: {max_cameras}") print(f" - Position encoding: {'learned' if use_learned_position else 'fixed'}") def get_cam_feats(self, x, mats_dict, camera_meta=None): """ 提取camera特征 (LSS风格) Args: x: (B, N, C, fH, fW) - N个cameras的neck输出 mats_dict: dict - camera矩阵 camera_meta: dict (可选) - camera元信息 { 'types': ['wide', 'tele', 'wide', 'wide'], 'positions': tensor(B, N, 6), # [x,y,z,r,p,y] 'fovs': [120, 30, 120, 120], } Returns: x: (B, N, D, fH, fW, C) - depth-aware features """ B, N, C, fH, fW = x.shape # ===== 提取camera元信息 ===== if camera_meta is not None: camera_types = camera_meta.get('types', ['wide'] * N) camera_positions = camera_meta.get('positions', None) else: # 默认配置: 全部wide, 无position camera_types = ['wide'] * N camera_positions = None # ===== Enhanced Camera Adapter ===== # 应用type和position感知的adaptation x_adapted, cam_weights = self.camera_adapter( x, 
camera_types=camera_types, camera_positions=camera_positions, ) # 可选: 打印camera重要性 (调试用) if self.training and torch.rand(1).item() < 0.01: # 1%概率打印 weights_summary = self.camera_adapter.get_camera_importance_summary(cam_weights) print(f"[Camera Importance] {weights_summary['mean_weights']}") # ===== 继续LSS处理 ===== # 将adapted特征传给LSS的depth estimation B, N, C, fH, fW = x_adapted.shape x_flat = x_adapted.view(B * N, C, fH, fW) # Depth estimation (LSS核心) # 这里调用父类的实现 depth_logits = self.depthnet(x_flat) # (B*N, D, fH, fW) depth_prob = depth_logits.softmax(dim=1) # Depth-aware features # (B*N, C, fH, fW) -> (B*N, D, fH, fW, C) context = x_flat.unsqueeze(1).expand(-1, self.D, -1, -1, -1) context = context.permute(0, 1, 3, 4, 2) # (B*N, D, fH, fW, C) depth_prob_expanded = depth_prob.unsqueeze(-1) # (B*N, D, fH, fW, 1) x_weighted = context * depth_prob_expanded # 加权 # Reshape: (B*N, D, fH, fW, C) -> (B, N, D, fH, fW, C) x_output = x_weighted.view(B, N, self.D, fH, fW, C) return x_output # ========== 工具函数 ========== def create_standard_camera_configs(): """ 创建标准camera配置 返回多种预定义配置 """ configs = {} # nuScenes标准 (6 cameras) configs['nuscenes'] = { 'cameras': [ {'name': 'CAM_FRONT', 'type': 'wide', 'position': [1.5, 0.0, 1.5, 0, 0, 0]}, {'name': 'CAM_FRONT_RIGHT', 'type': 'wide', 'position': [1.5, -0.5, 1.5, 0, 0, -60]}, {'name': 'CAM_FRONT_LEFT', 'type': 'wide', 'position': [1.5, 0.5, 1.5, 0, 0, 60]}, {'name': 'CAM_BACK', 'type': 'wide', 'position': [-1.5, 0.0, 1.5, 0, 0, 180]}, {'name': 'CAM_BACK_LEFT', 'type': 'wide', 'position': [-1.5, 0.5, 1.5, 0, 0, 120]}, {'name': 'CAM_BACK_RIGHT', 'type': 'wide', 'position': [-1.5, -0.5, 1.5, 0, 0, -120]}, ] } # 4 cameras + tele configs['4cam_tele'] = { 'cameras': [ {'name': 'CAM_FRONT_WIDE', 'type': 'wide', 'position': [1.5, 0.0, 1.5, 0, 0, 0]}, {'name': 'CAM_FRONT_TELE', 'type': 'tele', 'position': [1.5, 0.0, 1.6, 0, 0, 0]}, {'name': 'CAM_FRONT_LEFT', 'type': 'wide', 'position': [1.5, 0.5, 1.5, 0, 0, 45]}, {'name': 'CAM_FRONT_RIGHT', 
'type': 'wide', 'position': [1.5, -0.5, 1.5, 0, 0, -45]}, ] } # 5 cameras (no back) configs['5cam_front'] = { 'cameras': [ {'name': 'CAM_FRONT', 'type': 'wide', 'position': [1.5, 0.0, 1.5, 0, 0, 0]}, {'name': 'CAM_FRONT_LEFT', 'type': 'wide', 'position': [1.5, 0.5, 1.5, 0, 0, 60]}, {'name': 'CAM_FRONT_RIGHT', 'type': 'wide', 'position': [1.5, -0.5, 1.5, 0, 0, -60]}, {'name': 'CAM_LEFT', 'type': 'wide', 'position': [0.0, 0.8, 1.5, 0, 0, 90]}, {'name': 'CAM_RIGHT', 'type': 'wide', 'position': [0.0, -0.8, 1.5, 0, 0, -90]}, ] } # 8 cameras (全覆盖) configs['8cam_full'] = { 'cameras': [ {'name': 'CAM_FRONT', 'type': 'wide', 'position': [1.5, 0.0, 1.5, 0, 0, 0]}, {'name': 'CAM_FRONT_TELE', 'type': 'tele', 'position': [1.5, 0.0, 1.6, 0, 0, 0]}, {'name': 'CAM_FRONT_LEFT', 'type': 'wide', 'position': [1.5, 0.5, 1.5, 0, 0, 45]}, {'name': 'CAM_FRONT_RIGHT', 'type': 'wide', 'position': [1.5, -0.5, 1.5, 0, 0, -45]}, {'name': 'CAM_LEFT', 'type': 'wide', 'position': [0.0, 0.8, 1.5, 0, 0, 90]}, {'name': 'CAM_RIGHT', 'type': 'wide', 'position': [0.0, -0.8, 1.5, 0, 0, -90]}, {'name': 'CAM_BACK_LEFT', 'type': 'wide', 'position': [-1.5, 0.5, 1.5, 0, 0, 135]}, {'name': 'CAM_BACK_RIGHT', 'type': 'wide', 'position': [-1.5, -0.5, 1.5, 0, 0, -135]}, ] } return configs ``` --- ## 🔧 集成到BEVFusion ### 修改模型forward ```python # mmdet3d/models/fusion_models/bevfusion.py (修改extract_camera_features) def extract_camera_features( self, img, points, radar, camera2ego, lidar2ego, lidar2camera, lidar2image, camera_intrinsics, camera2lidar, img_aug_matrix, lidar_aug_matrix, metas, gt_depths=None, ): """ 增强版camera特征提取 自动从metas中提取camera配置信息 """ B, N, C, H, W = img.shape # ===== 提取camera元信息 ===== camera_meta = self._extract_camera_meta(metas, N) # camera_meta包含: # - types: ['wide', 'tele', ...] 
# - positions: (B, N, 6) # - fovs, focals等 # ===== Backbone处理 (共享) ===== x = img.view(B * N, C, H, W) x = self.encoders["camera"]["backbone"](x) x = self.encoders["camera"]["neck"](x) # Reshape _, C_out, H_out, W_out = x.shape x = x.view(B, N, C_out, H_out, W_out) # ===== VTransform (Enhanced) ===== # 传入camera meta信息 x = self.encoders["camera"]["vtransform"]( x, points, radar, camera2ego, lidar2ego, lidar2camera, lidar2image, camera_intrinsics, camera2lidar, img_aug_matrix, lidar_aug_matrix, camera_meta=camera_meta, # ← 新增 ) return x def _extract_camera_meta(self, metas, num_cameras): """从metas中提取camera元信息""" # 从metas中读取camera配置 # 如果没有,使用默认值 camera_meta = { 'types': [], 'positions': [], } for i in range(num_cameras): # 尝试从meta中获取 if 'camera_types' in metas: cam_type = metas['camera_types'][i] else: cam_type = 'wide' # 默认 if 'camera_positions' in metas: cam_pos = metas['camera_positions'][i] else: cam_pos = [0, 0, 0, 0, 0, 0] # 默认 camera_meta['types'].append(cam_type) camera_meta['positions'].append(cam_pos) # 转为tensor camera_meta['positions'] = torch.tensor( camera_meta['positions'], dtype=torch.float32, device=img.device ).unsqueeze(0) # (1, N, 6) - batch维度 return camera_meta ``` --- ## 📝 配置文件示例 ### 示例1: 标准6 cameras ```yaml # configs/nuscenes/det/.../multitask_enhanced_camera.yaml model: type: BEVFusion encoders: camera: backbone: type: SwinTransformer # ... 
标准配置 neck: type: GeneralizedLSSFPN in_channels: [192, 384, 768] out_channels: 256 vtransform: type: EnhancedCameraAwareLSS # ← 使用增强版 in_channels: 256 out_channels: 80 # Camera Adapter配置 camera_types: ['wide', 'tele', 'fisheye'] max_cameras: 12 use_learned_position: true adapter_channels: 256 adapter_depth: 2 # LSS配置 image_size: ${image_size} feature_size: ${[image_size[0] // 8, image_size[1] // 8]} xbound: [-54.0, 54.0, 0.3] ybound: [-54.0, 54.0, 0.3] zbound: [-10.0, 10.0, 20.0] dbound: [1.0, 60.0, 0.5] downsample: 2 # Camera配置 camera_config: num_cameras: 6 cameras: - name: CAM_FRONT type: wide position: [1.5, 0.0, 1.5, 0.0, 0.0, 0.0] fov: 120.0 - name: CAM_FRONT_RIGHT type: wide position: [1.5, -0.5, 1.5, 0.0, 0.0, -60.0] fov: 120.0 # ... 其他cameras ``` ### 示例2: 4 cameras (广角+长焦) ```yaml # configs/custom/4cam_tele.yaml camera_config: num_cameras: 4 cameras: - name: CAM_FRONT_WIDE type: wide position: [1.5, 0.0, 1.5, 0.0, 0.0, 0.0] fov: 120.0 focal_length: [1266, 1266] - name: CAM_FRONT_TELE type: tele # ← 长焦 position: [1.5, 0.0, 1.6, 0.0, 0.0, 0.0] fov: 30.0 focal_length: [2532, 2532] - name: CAM_LEFT type: wide position: [0.5, 0.8, 1.5, 0.0, 0.0, 75.0] fov: 120.0 - name: CAM_RIGHT type: wide position: [0.5, -0.8, 1.5, 0.0, 0.0, -75.0] fov: 120.0 model: encoders: camera: vtransform: type: EnhancedCameraAwareLSS camera_types: ['wide', 'tele'] # 只需要2种adapters max_cameras: 8 # 预留扩展空间 ``` ### 示例3: 可变cameras (训练时支持3-6) ```yaml # configs/custom/variable_cameras.yaml # 训练时随机drop cameras (data augmentation) train_augmentation: random_drop_cameras: enabled: true min_cameras: 3 # 最少保留3个 max_cameras: 6 # 最多6个 drop_prob: 0.2 # 每个camera 20%概率被drop model: encoders: camera: vtransform: type: EnhancedCameraAwareLSS camera_types: ['wide'] max_cameras: 12 # 训练时N会在[3,6]之间变化 # 增强模型对缺失cameras的鲁棒性 ``` --- ## 🎯 核心优势 ### 1. 
动态数量支持 ```python # 同一个模型,不同运行时camera数量 # 训练: 6 cameras train_input = (B, 6, C, H, W) # 推理配置A: 4 cameras test_input_a = (B, 4, C, H, W) ✅ 支持 # 推理配置B: 8 cameras test_input_b = (B, 8, C, H, W) ✅ 支持 # 关键: adapter根据camera type选择,不依赖固定index ``` ### 2. 类型自适应 ```python # 示例: 不同配置下的type mapping 配置A (nuScenes): cameras = ['wide', 'wide', 'wide', 'wide', 'wide', 'wide'] → 6个cameras都用wide adapter 配置B (定制): cameras = ['wide', 'tele', 'wide', 'wide'] → camera[0,2,3]用wide adapter → camera[1]用tele adapter 配置C (混合): cameras = ['tele', 'fisheye', 'ultra_wide', 'wide', 'wide'] → 每个camera根据type自动选择对应adapter ``` ### 3. 位置感知 ```python # Position encoding考虑camera的物理位置 # 前置cameras pos_front = [1.5, 0.0, 1.5, 0, 0, 0] → position_embed_front # 侧置cameras pos_left = [0.0, 0.8, 1.5, 0, 0, 90] → position_embed_left # 后置cameras pos_back = [-1.5, 0.0, 1.5, 0, 0, 180] → position_embed_back # 效果: # 相同type但不同position的cameras # 会得到不同的adaptation ``` --- ## 🧪 验证测试 ### 测试1: 不同数量 ```python # test_variable_num_cameras.py def test_variable_cameras(): """测试3-8个cameras""" model = build_enhanced_camera_model() for num_cams in [3, 4, 5, 6, 8]: print(f"\n测试 {num_cams} cameras:") # 创建输入 img = torch.randn(2, num_cams, 3, 900, 1600).cuda() camera_types = ['wide'] * num_cams camera_positions = get_default_positions(num_cams) # 前向传播 output, weights = model( img, camera_types=camera_types, camera_positions=camera_positions, ) print(f" ✅ 输出shape: {output.shape}") print(f" ✅ Camera权重: {weights.mean(dim=0)}") print("\n✅ 所有测试通过!") # 运行测试 test_variable_cameras() ``` ### 测试2: 不同类型组合 ```python def test_camera_type_combinations(): """测试不同类型组合""" configs = [ # 全部wide ['wide', 'wide', 'wide', 'wide'], # wide + tele ['wide', 'tele', 'wide', 'wide'], # 混合类型 ['wide', 'tele', 'fisheye', 'ultra_wide'], # 多tele ['tele', 'tele', 'wide', 'wide'], ] for types in configs: print(f"\n测试类型组合: {types}") output = model(img, camera_types=types, ...) 
print(f" ✅ 成功") ``` ### 测试3: 不同位置配置 ```python def test_camera_positions(): """测试相同type但不同position""" # 配置A: 标准nuScenes位置 pos_a = [ [1.5, 0.0, 1.5, 0, 0, 0], # front [1.5, 0.5, 1.5, 0, 0, 60], # front_left [1.5, -0.5, 1.5, 0, 0, -60], # front_right [-1.5, 0.0, 1.5, 0, 0, 180], # back ] # 配置B: 自定义位置 (假设前装位置不同) pos_b = [ [2.0, 0.0, 1.8, 0, 0, 0], # front (更前,更高) [1.5, 0.6, 1.5, 0, 0, 45], # front_left (角度不同) [1.5, -0.6, 1.5, 0, 0, -45], # front_right [-2.0, 0.0, 1.5, 0, 0, 180], # back (更后) ] types = ['wide', 'wide', 'wide', 'wide'] # 测试两种位置配置 out_a = model(img, camera_types=types, camera_positions=pos_a) out_b = model(img, camera_types=types, camera_positions=pos_b) # 特征应该不同 (因为position encoding不同) diff = (out_a - out_b).abs().mean() print(f"Position difference: {diff:.4f}") # 应该>0 ``` --- ## 📊 能力矩阵 ### Camera Adapter Enhanced能做什么? | 能力 | 原始Adapter | 增强Adapter | 说明 | |------|-----------|------------|------| | **动态数量** | ❌ 固定N | ✅ 1-12可变 | 训练/推理N可不同 | | **类型支持** | ⚠️ 有限 | ✅ 可扩展 | 新type需在camera_types中注册并重新训练;未注册的type会回退到wide adapter | | **位置适配** | ❌ 无 | ✅ 完整 | 3D position encoding | | **缺失处理** | ❌ 无 | ✅ 支持 | 可以skip某些cameras | | **类型混合** | ❌ 无 | ✅ 任意 | wide+tele+fisheye | | **权重学习** | ❌ 固定 | ✅ 动态 | 自动学习camera重要性 | | **迁移学习** | ⚠️ 困难 | ✅ 容易 | 6cam→4cam无缝 | --- ## 🎯 具体回答您的问题 ### Q1: 支持不同数量? **A: ✅ 完全支持** ```python # 同一个训练好的模型 # 场景1: 白天6 cameras全开 cameras_day = ['CAM_FRONT', 'CAM_FR', 'CAM_FL', 'CAM_BACK', 'CAM_BL', 'CAM_BR'] types_day = ['wide', 'wide', 'wide', 'wide', 'wide', 'wide'] → 使用6个cameras推理 # 场景2: 夜晚后侧cameras (CAM_BL/CAM_BR) 故障,只用剩余4个 cameras_night = ['CAM_FRONT', 'CAM_FR', 'CAM_FL', 'CAM_BACK'] types_night = ['wide', 'wide', 'wide', 'wide'] → 使用4个cameras推理 # 场景3: 特殊场景添加临时camera cameras_special = ['CAM_FRONT', ..., 'CAM_TEMP'] types_special = ['wide', 'wide', 'wide', 'wide', 'wide', 'wide', 'wide'] → 使用7个cameras推理 关键: 每次根据camera_types选择adapter,不依赖固定数量 ``` ### Q2: 支持不同类型? 
**A: ✅ 完全支持** ```python # 预定义3种adapters camera_types_supported = ['wide', 'tele', 'fisheye'] # 使用时任意组合 config_1 = ['wide', 'wide', 'wide', 'wide'] # 4个wide config_2 = ['wide', 'tele', 'wide', 'wide'] # 3 wide + 1 tele config_3 = ['tele', 'fisheye', 'wide', 'ultra_wide'] # 混合 (注意: 'ultra_wide'不在上述3种预定义类型中, 会打印警告并回退到wide adapter) # 每个camera根据type选择对应adapter # wide cameras → wide_adapter # tele cameras → tele_adapter # fisheye cameras → fisheye_adapter # 如果需要新类型,添加即可: camera_types_supported = ['wide', 'tele', 'fisheye', 'ultra_wide', 'thermal'] # 重新训练,自动学习新adapter ``` ### Q3: 支持位置不同? **A: ✅ 完全支持** ```python # 位置信息通过position encoding融入 # 示例: 3个wide cameras在不同位置 cameras = [ {'type': 'wide', 'position': [1.5, 0.0, 1.5, 0, 0, 0]}, # 正前 {'type': 'wide', 'position': [0.0, 0.8, 1.5, 0, 0, 90]}, # 左侧 {'type': 'wide', 'position': [-1.5, 0.0, 1.5, 0, 0, 180]}, # 正后 ] # 处理流程: for i, cam in enumerate(cameras): cam_feat = x[:, i] # 1. Type adaptation (都是wide) type_adapted = wide_adapter(cam_feat) # 2. Position encoding (不同) pos_embed = position_encoder(cam['position']) # [1.5,0,1.5,0,0,0] → embed_front # [0.0,0.8,1.5,0,0,90] → embed_left # [-1.5,0,1.5,0,0,180] → embed_back # 3. Fusion final_feat = fuse(type_adapted, pos_embed) # → 相同type但不同位置的cameras得到不同特征! 
# 结果: 3个wide cameras,但因为位置不同,处理也不同 ✅ ``` --- ## 💡 组合示例 ### 真实场景: 复杂配置 ```yaml # 8 cameras, 4种类型, 8个不同位置 camera_config: num_cameras: 8 cameras: # 前方: wide + tele双目 - {name: CAM_F_WIDE, type: wide, position: [2.0, 0.0, 1.8, 0, 0, 0]} - {name: CAM_F_TELE, type: tele, position: [2.0, 0.0, 1.9, 0, 0, 0]} # 前侧: wide - {name: CAM_FL, type: wide, position: [1.5, 0.7, 1.6, 0, 0, 55]} - {name: CAM_FR, type: wide, position: [1.5, -0.7, 1.6, 0, 0, -55]} # 侧方: fisheye大视角 - {name: CAM_L, type: fisheye, position: [0.0, 1.0, 1.5, 0, 0, 90]} - {name: CAM_R, type: fisheye, position: [0.0, -1.0, 1.5, 0, 0, -90]} # 后方: ultra_wide - {name: CAM_BL, type: ultra_wide, position: [-1.5, 0.5, 1.5, 0, 0, 140]} - {name: CAM_BR, type: ultra_wide, position: [-1.5, -0.5, 1.5, 0, 0, -140]} model: encoders: camera: vtransform: type: EnhancedCameraAwareLSS camera_types: ['wide', 'tele', 'fisheye', 'ultra_wide'] # 4种type adapters # 8个不同positions # 全部自动处理! ``` **处理流程**: ``` CAM_F_WIDE: type=wide, pos=[2.0,0,1.8,0,0,0] → wide_adapter(feat) + position_encoder([2.0,0,1.8,0,0,0]) → 前方wide camera的特征 CAM_F_TELE: type=tele, pos=[2.0,0,1.9,0,0,0] → tele_adapter(feat) + position_encoder([2.0,0,1.9,0,0,0]) → 前方tele camera的特征 (与wide不同!) CAM_L: type=fisheye, pos=[0,1.0,1.5,0,0,90] → fisheye_adapter(feat) + position_encoder([0,1.0,1.5,0,0,90]) → 左侧fisheye camera的特征 所有8个cameras → 各自独特的adaptation → BEV pooling融合 ``` --- ## ✅ 总结回答 ### 方案2 Enhanced Camera Adapter **完全支持**: 1. ✅ **不同数量**: - 训练时6个,推理时3/4/5/6/8都可以 - 动态适配,无需重新训练 - Camera个数在合理范围内(1-12)任意变化 2. ✅ **不同类型**: - 预定义N种camera types (wide, tele, fisheye, ...) - 每种type有独立adapter - 可以任意组合使用 - 新增type只需重新训练adapter 3. 
✅ **位置不同**: - Position encoder处理3D位置 - [x, y, z, roll, pitch, yaw]全部考虑 - 相同type不同position → 不同adaptation - 完全灵活 ### 实现复杂度 ``` 代码量: ~500行 参数量: 约+0.85M (按默认in_channels=256、adapter_depth=2估算: 3类型 × ~0.22M/adapter + 位置编码/融合/重要性网络 ~0.2M) 训练时间: 从epoch_20开始,5 epochs,约2.5天 开发时间: 3-4天 ``` ### 与MoE对比 | 特性 | Enhanced Adapter | MoE | |------|-----------------|-----| | 动态数量 | ✅ 完全支持 | ✅ 支持 | | 不同类型 | ✅ 完全支持 | ✅ 支持 | | 位置适配 | ✅ **显式设计** | ⚠️ 需额外实现 | | 实现复杂度 | ⭐⭐ 中等 | ⭐⭐⭐⭐ 高 | | 参数效率 | ⭐⭐⭐⭐ 高 | ⭐⭐ 低 | | 训练稳定性 | ⭐⭐⭐⭐⭐ 很好 | ⭐⭐⭐ 一般 | | 可解释性 | ⭐⭐⭐⭐⭐ 强 | ⭐⭐ 弱 | **结论**: Enhanced Camera Adapter **优于MoE** ✅ --- ## 🚀 需要我现在实现吗? 我可以立即为您创建: 1. ✅ **完整代码实现** (`mmdet3d/models/modules/camera_adapter_enhanced.py`) 2. ✅ **集成到LSS** (`mmdet3d/models/vtransforms/enhanced_camera_lss.py`) 3. ✅ **配置文件模板** (支持3/4/5/6/8 cameras) 4. ✅ **测试脚本** (验证动态数量/类型/位置) 5. ✅ **使用文档** (如何配置和训练) **实现时间**: 1天 **测试时间**: 1天 **训练时间**: 2-3天 (从epoch_20开始) **要我现在开始实现吗?** 🚀