314 lines
14 KiB
Python
314 lines
14 KiB
Python
|
|
#!/usr/bin/env python3
|
|||
|
|
"""
|
|||
|
|
Phase 4A Task-specific GCA 配置分析
|
|||
|
|
分析 multitask_BEV2X_phase4a_stage1_task_gca.yaml 的网络架构和特征尺寸
|
|||
|
|
"""
|
|||
|
|
|
|||
|
|
import yaml
|
|||
|
|
import math
|
|||
|
|
|
|||
|
|
def _cell_count(lower, upper, step):
    """Return how many whole cells of size ``step`` fit between two bounds.

    The quotient is truncated, matching the original ``int(...)`` behavior
    (e.g. 100 / 0.167 -> 598). A tiny tolerance is added first so that a
    quotient that should be an exact integer but lands just below it in
    floating point (0.3 / 0.1 == 2.9999999999999996) is not reported one
    cell short.
    """
    return int((upper - lower) / step + 1e-6)


def _scope_size(scope):
    """Return ``(width, height, resolution)`` for a grid-transform scope.

    ``scope[0]`` is read as ``[lower, upper, step]`` for the x axis and its
    step is returned as the resolution. ``scope[1]``, when present, is read
    the same way for the height (assumed to be the y axis -- confirm against
    the grid-transform implementation). If only one axis is listed, the grid
    is treated as square.
    """
    x_lower, x_upper, x_step = scope[0][:3]
    width = _cell_count(x_lower, x_upper, x_step)
    if len(scope) > 1:
        y_lower, y_upper, y_step = scope[1][:3]
        height = _cell_count(y_lower, y_upper, y_step)
    else:
        height = width
    return width, height, x_step


def _print_heading(title):
    """Print a section title followed by a 40-character rule."""
    print(title)
    print("-" * 40)


def analyze_phase4a_gca_config(config_path=None, config=None):
    """Print a report of the Phase 4A task-specific GCA configuration.

    The report covers input specs, encoders, fuser, decoder, the
    task-specific GCA block, both task heads, derived feature-map sizes, a
    rough FP32 memory estimate and the training hyper-parameters.

    Args:
        config_path: Path of the YAML file to read. Defaults to the Phase 4A
            stage-1 task-GCA config under ``configs/``. Ignored for loading
            when ``config`` is given.
        config: An already-loaded configuration mapping. Useful because the
            file is otherwise parsed with plain ``yaml.safe_load``, which
            does not resolve ``_base_`` inheritance; keys that live only in
            a base config would be missing.

    Returns:
        None. Everything is written to standard output.

    Raises:
        OSError: If the configuration file cannot be opened.
        KeyError: If a required key is missing from the configuration.
    """
    print("=" * 80)
    print("🚀 BEVFusion Phase 4A - Task-specific GCA 架构分析")
    print("=" * 80)

    # Load the configuration unless the caller supplied one.
    if config is None:
        if config_path is None:
            config_path = (
                "configs/nuscenes/det/transfusion/secfpn/camera+lidar/"
                "swint_v0p075/multitask_BEV2X_phase4a_stage1_task_gca.yaml"
            )
        with open(config_path, 'r', encoding='utf-8') as f:
            config = yaml.safe_load(f)
    elif config_path is None:
        config_path = "<in-memory config>"

    print(f"📁 配置文件: {config_path}")
    print()

    # ========== 1. Input specification ==========
    _print_heading("📊 1. 输入规格分析")

    voxel_size = config['voxel_size']
    point_cloud_range = config['point_cloud_range']
    # Fallback used when the key is absent from this file (the original
    # comment notes it is inherited from the base config).
    image_size = config.get('image_size', [256, 704])

    print(f"🔹 LiDAR体素尺寸: {voxel_size}")
    print(f"🔹 点云范围: {point_cloud_range}")
    print(f"🔹 图像尺寸: {image_size}")
    print()

    # Voxel-grid extent: point_cloud_range is indexed as
    # [x_min, y_min, z_min, x_max, y_max, z_max].
    bev_width = _cell_count(point_cloud_range[0], point_cloud_range[3], voxel_size[0])
    bev_height = _cell_count(point_cloud_range[1], point_cloud_range[4], voxel_size[1])
    bev_depth = _cell_count(point_cloud_range[2], point_cloud_range[5], voxel_size[2])

    print(f"🔹 BEV网格尺寸: {bev_width}×{bev_height}×{bev_depth}")
    print(f" ├── 宽度: {bev_width} ({point_cloud_range[0]}m ~ {point_cloud_range[3]}m)")
    print(f" ├── 高度: {bev_height} ({point_cloud_range[1]}m ~ {point_cloud_range[4]}m)")
    print(f" └── 深度: {bev_depth} ({point_cloud_range[2]}m ~ {point_cloud_range[5]}m)")
    print()

    # ========== 2. Encoders ==========
    _print_heading("🏗️ 2. 编码器架构分析")

    model = config['model']
    encoders = model['encoders']

    # Camera encoder
    print("📷 Camera Encoder:")
    camera = encoders['camera']
    backbone = camera['backbone']

    print(f" ├── Backbone: {backbone['type']}")
    print(f" ├── embed_dims: {backbone['embed_dims']}")
    print(f" ├── depths: {backbone['depths']} (总层数: {sum(backbone['depths'])})")
    print(f" ├── num_heads: {backbone['num_heads']}")
    print(f" ├── out_indices: {backbone['out_indices']}")

    neck = camera['neck']
    print(f" ├── Neck: {neck['type']}")
    print(f" │ ├── in_channels: {neck['in_channels']}")
    print(f" │ ├── out_channels: {neck['out_channels']}")
    print(f" │ └── num_outs: {neck['num_outs']}")

    vtransform = camera['vtransform']
    print(f" └── VTransform: {vtransform['type']}")
    print(f" ├── in_channels: {vtransform['in_channels']}")
    print(f" ├── out_channels: {vtransform['out_channels']}")
    print(f" ├── image_size: {vtransform['image_size']}")
    print(f" ├── feature_size: {vtransform['feature_size']}")
    print(f" └── downsample: {vtransform['downsample']}")
    print()

    # LiDAR encoder
    print("🔍 LiDAR Encoder:")
    lidar = encoders['lidar']
    voxelize = lidar['voxelize']

    print(" ├── Voxelize:")
    print(f" │ ├── max_num_points: {voxelize['max_num_points']}")
    print(f" │ ├── voxel_size: {voxelize['voxel_size']}")
    print(f" │ ├── max_voxels: {voxelize['max_voxels']}")
    print(f" │ └── point_cloud_range: {voxelize['point_cloud_range']}")

    backbone_lidar = lidar['backbone']
    print(f" └── Backbone: {backbone_lidar['type']}")
    print(f" ├── in_channels: {backbone_lidar['in_channels']}")
    print(f" ├── output_channels: {backbone_lidar['output_channels']}")
    print(f" ├── sparse_shape: {backbone_lidar['sparse_shape']}")
    print(f" └── encoder_channels: {backbone_lidar['encoder_channels']}")
    print()

    # Fuser
    print("🔗 Fuser:")
    fuser = model['fuser']
    print(f" ├── Type: {fuser['type']}")
    print(f" ├── in_channels: {fuser['in_channels']} (Camera + LiDAR)")
    print(f" └── out_channels: {fuser['out_channels']}")
    print()

    # ========== 3. Decoder ==========
    _print_heading("📈 3. 解码器架构分析")

    decoder = model['decoder']

    backbone_dec = decoder['backbone']
    print("🔧 Decoder Backbone:")
    print(f" ├── Type: {backbone_dec['type']}")
    print(f" ├── in_channels: {backbone_dec['in_channels']}")
    print(f" ├── out_channels: {backbone_dec['out_channels']}")
    print(f" └── layer_nums: {backbone_dec['layer_nums']}")

    neck_dec = decoder['neck']
    print("🔧 Decoder Neck:")
    print(f" ├── Type: {neck_dec['type']}")
    print(f" ├── in_channels: {neck_dec['in_channels']}")
    print(f" ├── out_channels: {neck_dec['out_channels']}")
    print(f" └── upsample_strides: {neck_dec['upsample_strides']}")
    print()

    # ========== 4. Task-specific GCA ==========
    _print_heading("🎯 4. Task-specific GCA 机制")

    task_gca = model['task_specific_gca']
    print("✨ Task-specific GCA 配置:")
    print(f" ├── enabled: {task_gca['enabled']}")
    print(f" ├── in_channels: {task_gca['in_channels']} (原始BEV通道数)")
    print(f" ├── reduction: {task_gca['reduction']}")
    print(f" ├── use_max_pool: {task_gca['use_max_pool']}")
    print(f" ├── object_reduction: {task_gca['object_reduction']} (检测GCA)")
    print(f" └── map_reduction: {task_gca['map_reduction']} (分割GCA)")
    print()

    # Static explanatory text; the channel count here is literal, not read
    # from the configuration.
    print("📋 GCA机制说明:")
    print(" • 检测GCA: 从512通道中选择对检测最有益的特征")
    print(" - 强化: 物体边界、中心点、空间关系")
    print(" - 抑制: 语义纹理、全局语义")
    print(" • 分割GCA: 从512通道中选择对分割最有益的特征")
    print(" - 强化: 语义纹理、连续性、全局语义")
    print(" - 抑制: 物体边界(精确)、中心点")
    print(" • 结果: 各取所需,性能最大化 ✅")
    print()

    # ========== 5. Task heads ==========
    _print_heading("🎯 5. 任务头部分析")

    heads = model['heads']

    # Object head (3D detection)
    object_head = heads['object']
    print("🚗 3D检测头 (Object Head):")
    print(f" ├── in_channels: {object_head['in_channels']}")
    print(f" ├── train_cfg.grid_size: {object_head['train_cfg']['grid_size']}")
    print(f" └── test_cfg.grid_size: {object_head['test_cfg']['grid_size']}")
    print()

    # Map head (BEV segmentation)
    map_head = heads['map']
    map_classes = config.get('map_classes')
    # Fall back to 6 (the count the original report hard-coded) when the
    # class list is not present in this file.
    num_map_classes = len(map_classes) if map_classes else 6

    print("🗺️ BEV分割头 (Map Head):")
    print(f" ├── Type: {map_head['type']}")
    print(f" ├── in_channels: {map_head['in_channels']}")
    print(f" ├── classes: {config.get('map_classes', '6 classes')}")
    print(f" ├── loss: {map_head['loss']}")
    print(f" ├── deep_supervision: {map_head['deep_supervision']}")
    print(f" ├── use_dice_loss: {map_head['use_dice_loss']}")
    print(f" ├── dice_weight: {map_head['dice_weight']}")
    print(f" ├── focal_alpha: {map_head['focal_alpha']}")
    print(f" ├── focal_gamma: {map_head['focal_gamma']}")
    print(f" ├── decoder_channels: {map_head['decoder_channels']}")
    print(f" ├── use_internal_gca: {map_head['use_internal_gca']}")
    print(f" ├── adaptive_multiscale: {map_head['adaptive_multiscale']}")
    print(f" └── adaptive_dilation_rates: {map_head['adaptive_dilation_rates']}")

    # Grid transform: width from the first axis entry, height from the
    # second one (previously the height always silently equalled the width).
    grid_transform = map_head['grid_transform']
    input_scope = grid_transform['input_scope']
    output_scope = grid_transform['output_scope']

    input_width, input_height, input_res = _scope_size(input_scope)
    output_width, output_height, output_res = _scope_size(output_scope)

    print(" └── Grid Transform:")
    print(f" ├── 输入范围: {input_scope}")
    print(f" │ ├── 分辨率: {input_res}m/pixel")
    print(f" │ └── 尺寸: {input_width}×{input_height}")
    print(f" ├── 输出范围: {output_scope}")
    print(f" │ ├── 分辨率: {output_res}m/pixel")
    print(f" │ └── 尺寸: {output_width}×{output_height}")
    print(f" └── 缩放倍数: {input_res/output_res:.1f}x 上采样")
    print()

    # ========== 6. Feature sizes ==========
    # NOTE: channel counts in this section are literal text carried over
    # from the original report; they are not read from the configuration.
    _print_heading("📏 6. 特征尺寸计算")

    print("🌊 特征流尺寸变化:")
    print()

    # Camera feature flow
    img_h, img_w = image_size[0], image_size[1]
    print("📷 Camera特征流:")
    print(f" ├── 输入图像: {img_h}×{img_w}×3")
    print(f" ├── Swin输出特征: Stage1: {img_h//4}×{img_w//4}×192")
    print(f" │ Stage2: {img_h//8}×{img_w//8}×384")
    print(f" │ Stage3: {img_h//16}×{img_w//16}×768")
    print(f" │ Stage4: {img_h//32}×{img_w//32}×768")
    print(f" ├── LSS FPN输出: {vtransform['feature_size'][0]}×{vtransform['feature_size'][1]}×256")
    print(f" └── VTransform输出: {bev_width}×{bev_height}×80")
    print()

    # LiDAR feature flow
    print("🔍 LiDAR特征流:")
    print(f" ├── 稀疏输入: {bev_width}×{bev_height}×{bev_depth}")
    print(f" └── SparseEncoder输出: {bev_width}×{bev_height}×128")
    print()

    # Fusion and decoding
    print("🔗 融合与解码:")
    print(" ├── ConvFuser输入: Camera(80ch) + LiDAR(128ch) = 208ch")
    print(f" ├── ConvFuser输出: {bev_width}×{bev_height}×256")
    print(f" ├── SECOND Backbone: {bev_width}×{bev_height}×256 → {bev_width//2}×{bev_height//2}×256")
    print(f" ├── SECONDFPN输出: {bev_width}×{bev_height}×512 (融合多尺度)")
    print(f" └── Task-specific GCA: {bev_width}×{bev_height}×512 → 任务特定特征")
    print()

    # Segmentation head, layer by layer
    print("🗺️ 分割头特征尺寸:")
    print(f" ├── 输入BEV: {bev_width}×{bev_height}×512")
    print(" ├── 4层渐进上采样 (512→256→256→128→128):")
    layer_channels = [512, 256, 256, 128, 128]
    stages = list(zip(layer_channels, layer_channels[1:]))
    for idx, (c_in, c_out) in enumerate(stages, start=1):
        branch = "└──" if idx == len(stages) else "├──"
        print(f" │ {branch} Layer{idx}: {bev_width}×{bev_height}×{c_in} → {bev_width}×{bev_height}×{c_out}")
    print(f" ├── 自适应多尺度融合: {map_head['adaptive_multiscale']}")
    print(f" ├── 空洞率: {map_head['adaptive_dilation_rates']}")
    print(f" └── 最终输出: {output_width}×{output_height}×{num_map_classes} ({num_map_classes}类别)")
    print()

    # ========== 7. Memory estimate ==========
    _print_heading("💾 7. 内存占用估计")

    bev_pixels = bev_width * bev_height
    seg_pixels = output_width * output_height

    # FP32 feature maps: 4 bytes per value, reported in GiB.
    bytes_per_gib = 1024 ** 3
    bev_512ch = bev_pixels * 512 * 4 / bytes_per_gib
    bev_256ch = bev_pixels * 256 * 4 / bytes_per_gib
    seg_output = seg_pixels * num_map_classes * 4 / bytes_per_gib

    # Labels use the sizes computed above so they cannot drift from the
    # numbers they describe.
    print(f"🔹 BEV特征图 ({bev_width}×{bev_height}×512ch): {bev_512ch:.2f} GB")
    print(f"🔹 BEV特征图 ({bev_width}×{bev_height}×256ch): {bev_256ch:.2f} GB")
    print(f"🔹 分割输出 ({output_width}×{output_height}×{num_map_classes}ch): {seg_output:.2f} GB")
    print(f"🔹 总计主要特征: {bev_512ch + bev_256ch + seg_output:.2f} GB")
    print()

    # ========== 8. Key innovations ==========
    _print_heading("🚀 8. Phase 4A 关键创新点")

    innovations = [
        "✨ Task-specific GCA: 检测和分割各自选择最优特征",
        "🎯 避免统一特征选择的折中问题",
        "🔧 自适应多尺度融合 (adaptive_multiscale)",
        "📏 渐进式4层解码器 (512→256→256→128→128)",
        "🎨 动态空洞率学习 (adaptive_dilation_rates: [1,3,6,12])",
        f"📊 分割分辨率: {output_width}×{output_height} @ {output_res}m/pixel",
        f"🏗️ 检测使用完整{bev_width}×{bev_height} BEV特征",
    ]

    for innovation in innovations:
        print(f" {innovation}")
    print()

    # ========== 9. Training summary ==========
    _print_heading("⚙️ 9. 训练配置摘要")

    print(f"🔹 训练轮数: {config['max_epochs']}")
    print(f"🔹 学习率: {config['optimizer']['lr']}")
    print(f"🔹 权重衰减: {config['optimizer']['weight_decay']}")
    print(f"🔹 梯度裁剪: {config['optimizer_config']['grad_clip']['max_norm']}")
    print(f"🔹 FP16初始scale: {config['fp16']['loss_scale']['init_scale']}")
    print(f"🔹 验证间隔: {config['evaluation']['interval']}")
    print(f"🔹 数据采样: val.load_interval = {config['data']['val']['load_interval']}")
    print()

    print("=" * 80)
    print("✅ Phase 4A Task-specific GCA 配置分析完成")
    print("=" * 80)
|
|||
|
|
|
|||
|
|
if __name__ == "__main__":
|
|||
|
|
analyze_phase4a_gca_config()
|