bev-project/ANALYZE_PHASE4A_GCA_CONFIG.py

314 lines
14 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
Phase 4A Task-specific GCA 配置分析
分析 multitask_BEV2X_phase4a_stage1_task_gca.yaml 的网络架构和特征尺寸
"""
import yaml
import math
# Config analysed when the caller supplies neither a path nor a config mapping.
_DEFAULT_CONFIG_PATH = (
    "configs/nuscenes/det/transfusion/secfpn/camera+lidar/swint_v0p075/"
    "multitask_BEV2X_phase4a_stage1_task_gca.yaml"
)

_FP32_BYTES = 4          # bytes per float32 element
_GIB = 1024 ** 3         # bytes per GiB, used for the memory estimate
_DEFAULT_MAP_CLASSES = 6  # class count assumed when 'map_classes' is absent


def _grid_cells(lower, upper, step):
    """Return how many whole cells of size ``step`` fit between two bounds.

    The quotient is rounded to 6 decimals before truncation so that binary
    floating-point error (e.g. ``0.3 / 0.1 == 2.9999999999999996``) does not
    silently drop a cell, while genuinely fractional results (598.8) still
    truncate to the whole-cell count.
    """
    return int(round((upper - lower) / step, 6))


def _scope_size(scope):
    """Return ``(resolution, width, height)`` for a grid-transform scope.

    ``scope[0]`` is read as ``[min, max, step]`` of the x axis. ``scope[1]``
    is read the same way for the y axis (assumed layout
    ``[[xmin, xmax, xstep], [ymin, ymax, ystep]]`` -- TODO confirm against the
    grid-transform implementation). If only one row is present the grid is
    treated as square.
    """
    x_bound = scope[0]
    y_bound = scope[1] if len(scope) > 1 else x_bound
    resolution = x_bound[2]
    width = _grid_cells(x_bound[0], x_bound[1], resolution)
    height = _grid_cells(y_bound[0], y_bound[1], y_bound[2])
    return resolution, width, height


def _section(title):
    """Print a section title followed by a 40-character rule."""
    print(title)
    print("-" * 40)


def analyze_phase4a_gca_config(config_path=_DEFAULT_CONFIG_PATH, config=None):
    """Print a report of the Phase 4A task-specific GCA configuration.

    The report covers input specs, encoders, fuser, decoder, the task-specific
    GCA settings, both task heads, derived feature-map sizes, a rough FP32
    memory estimate and the training hyper-parameters.

    Args:
        config_path: Path of the YAML config to read when ``config`` is None.
            Defaults to the Phase 4A stage-1 task-GCA config.
        config: Optional already-loaded config mapping. When given, no file is
            read. Useful because ``yaml.safe_load`` parses only the single
            file and does not merge any base configs; keys that live in a
            parent config would otherwise be missing.

    Returns:
        None. The report is written to stdout.

    Raises:
        FileNotFoundError: If ``config_path`` does not exist (file mode only).
        ValueError: If the YAML file is empty.
        KeyError: If a required key is missing from the config.
    """
    print("=" * 80)
    print("🚀 BEVFusion Phase 4A - Task-specific GCA 架构分析")
    print("=" * 80)

    # Load the config file unless the caller handed us a parsed mapping.
    if config is None:
        with open(config_path, 'r', encoding='utf-8') as f:
            config = yaml.safe_load(f)
        if config is None:
            raise ValueError(f"config file is empty: {config_path}")
        print(f"📁 配置文件: {config_path}")
    else:
        print("📁 配置文件: <in-memory config>")
    print()

    # ========== 1. Input specification ==========
    _section("📊 1. 输入规格分析")
    voxel_size = config['voxel_size']
    point_cloud_range = config['point_cloud_range']
    # Fallback used when the key is not present in this file.
    image_size = config.get('image_size', [256, 704])
    print(f"🔹 LiDAR体素尺寸: {voxel_size}")
    print(f"🔹 点云范围: {point_cloud_range}")
    print(f"🔹 图像尺寸: {image_size}")
    print()

    # Voxel-grid size per axis: (max - min) / voxel size.
    bev_width, bev_height, bev_depth = (
        _grid_cells(
            point_cloud_range[axis],
            point_cloud_range[axis + 3],
            voxel_size[axis],
        )
        for axis in range(3)
    )
    print(f"🔹 BEV网格尺寸: {bev_width}×{bev_height}×{bev_depth}")
    print(f" ├── 宽度: {bev_width} ({point_cloud_range[0]}m ~ {point_cloud_range[3]}m)")
    print(f" ├── 高度: {bev_height} ({point_cloud_range[1]}m ~ {point_cloud_range[4]}m)")
    print(f" └── 深度: {bev_depth} ({point_cloud_range[2]}m ~ {point_cloud_range[5]}m)")
    print()

    # ========== 2. Encoders ==========
    _section("🏗️ 2. 编码器架构分析")
    model = config['model']
    encoders = model['encoders']

    # Camera encoder
    print("📷 Camera Encoder:")
    camera = encoders['camera']
    backbone = camera['backbone']
    print(f" ├── Backbone: {backbone['type']}")
    print(f" ├── embed_dims: {backbone['embed_dims']}")
    print(f" ├── depths: {backbone['depths']} (总层数: {sum(backbone['depths'])})")
    print(f" ├── num_heads: {backbone['num_heads']}")
    print(f" ├── out_indices: {backbone['out_indices']}")
    neck = camera['neck']
    print(f" ├── Neck: {neck['type']}")
    print(f" │ ├── in_channels: {neck['in_channels']}")
    print(f" │ ├── out_channels: {neck['out_channels']}")
    print(f" │ └── num_outs: {neck['num_outs']}")
    vtransform = camera['vtransform']
    print(f" └── VTransform: {vtransform['type']}")
    print(f" ├── in_channels: {vtransform['in_channels']}")
    print(f" ├── out_channels: {vtransform['out_channels']}")
    print(f" ├── image_size: {vtransform['image_size']}")
    print(f" ├── feature_size: {vtransform['feature_size']}")
    print(f" └── downsample: {vtransform['downsample']}")
    print()

    # LiDAR encoder
    print("🔍 LiDAR Encoder:")
    lidar = encoders['lidar']
    voxelize = lidar['voxelize']
    print(" ├── Voxelize:")
    print(f" │ ├── max_num_points: {voxelize['max_num_points']}")
    print(f" │ ├── voxel_size: {voxelize['voxel_size']}")
    print(f" │ ├── max_voxels: {voxelize['max_voxels']}")
    print(f" │ └── point_cloud_range: {voxelize['point_cloud_range']}")
    backbone_lidar = lidar['backbone']
    print(f" └── Backbone: {backbone_lidar['type']}")
    print(f" ├── in_channels: {backbone_lidar['in_channels']}")
    print(f" ├── output_channels: {backbone_lidar['output_channels']}")
    print(f" ├── sparse_shape: {backbone_lidar['sparse_shape']}")
    print(f" └── encoder_channels: {backbone_lidar['encoder_channels']}")
    print()

    # Fuser
    print("🔗 Fuser:")
    fuser = model['fuser']
    print(f" ├── Type: {fuser['type']}")
    print(f" ├── in_channels: {fuser['in_channels']} (Camera + LiDAR)")
    print(f" └── out_channels: {fuser['out_channels']}")
    print()

    # ========== 3. Decoder ==========
    _section("📈 3. 解码器架构分析")
    decoder = model['decoder']
    backbone_dec = decoder['backbone']
    print("🔧 Decoder Backbone:")
    print(f" ├── Type: {backbone_dec['type']}")
    print(f" ├── in_channels: {backbone_dec['in_channels']}")
    print(f" ├── out_channels: {backbone_dec['out_channels']}")
    print(f" └── layer_nums: {backbone_dec['layer_nums']}")
    neck_dec = decoder['neck']
    print("🔧 Decoder Neck:")
    print(f" ├── Type: {neck_dec['type']}")
    print(f" ├── in_channels: {neck_dec['in_channels']}")
    print(f" ├── out_channels: {neck_dec['out_channels']}")
    print(f" └── upsample_strides: {neck_dec['upsample_strides']}")
    print()

    # ========== 4. Task-specific GCA ==========
    _section("🎯 4. Task-specific GCA 机制")
    task_gca = model['task_specific_gca']
    print("✨ Task-specific GCA 配置:")
    print(f" ├── enabled: {task_gca['enabled']}")
    print(f" ├── in_channels: {task_gca['in_channels']} (原始BEV通道数)")
    print(f" ├── reduction: {task_gca['reduction']}")
    print(f" ├── use_max_pool: {task_gca['use_max_pool']}")
    print(f" ├── object_reduction: {task_gca['object_reduction']} (检测GCA)")
    print(f" └── map_reduction: {task_gca['map_reduction']} (分割GCA)")
    print()
    print("📋 GCA机制说明:")
    print(" • 检测GCA: 从512通道中选择对检测最有益的特征")
    print(" - 强化: 物体边界、中心点、空间关系")
    print(" - 抑制: 语义纹理、全局语义")
    print(" • 分割GCA: 从512通道中选择对分割最有益的特征")
    print(" - 强化: 语义纹理、连续性、全局语义")
    print(" - 抑制: 物体边界(精确)、中心点")
    print(" • 结果: 各取所需,性能最大化 ✅")
    print()

    # ========== 5. Task heads ==========
    _section("🎯 5. 任务头部分析")
    heads = model['heads']

    # Object head (3D detection)
    object_head = heads['object']
    print("🚗 3D检测头 (Object Head):")
    print(f" ├── in_channels: {object_head['in_channels']}")
    print(f" ├── train_cfg.grid_size: {object_head['train_cfg']['grid_size']}")
    print(f" └── test_cfg.grid_size: {object_head['test_cfg']['grid_size']}")
    print()

    # Map head (BEV segmentation)
    map_head = heads['map']
    map_classes = config.get('map_classes')
    # One class count shared by the output-shape line and the memory estimate.
    num_map_classes = len(map_classes) if map_classes else _DEFAULT_MAP_CLASSES
    print("🗺️ BEV分割头 (Map Head):")
    print(f" ├── Type: {map_head['type']}")
    print(f" ├── in_channels: {map_head['in_channels']}")
    print(f" ├── classes: {config.get('map_classes', '6 classes')}")
    print(f" ├── loss: {map_head['loss']}")
    print(f" ├── deep_supervision: {map_head['deep_supervision']}")
    print(f" ├── use_dice_loss: {map_head['use_dice_loss']}")
    print(f" ├── dice_weight: {map_head['dice_weight']}")
    print(f" ├── focal_alpha: {map_head['focal_alpha']}")
    print(f" ├── focal_gamma: {map_head['focal_gamma']}")
    print(f" ├── decoder_channels: {map_head['decoder_channels']}")
    print(f" ├── use_internal_gca: {map_head['use_internal_gca']}")
    print(f" ├── adaptive_multiscale: {map_head['adaptive_multiscale']}")
    print(f" └── adaptive_dilation_rates: {map_head['adaptive_dilation_rates']}")

    # Grid transform: width comes from the x bound, height from the y bound.
    grid_transform = map_head['grid_transform']
    input_scope = grid_transform['input_scope']
    output_scope = grid_transform['output_scope']
    input_res, input_width, input_height = _scope_size(input_scope)
    output_res, output_width, output_height = _scope_size(output_scope)
    print(" └── Grid Transform:")
    print(f" ├── 输入范围: {input_scope}")
    print(f" │ ├── 分辨率: {input_res}m/pixel")
    print(f" │ └── 尺寸: {input_width}×{input_height}")
    print(f" ├── 输出范围: {output_scope}")
    print(f" │ ├── 分辨率: {output_res}m/pixel")
    print(f" │ └── 尺寸: {output_width}×{output_height}")
    print(f" └── 缩放倍数: {input_res/output_res:.1f}x 上采样")
    print()

    # ========== 6. Feature sizes ==========
    _section("📏 6. 特征尺寸计算")
    print("🌊 特征流尺寸变化:")
    print()

    # Camera feature flow. Swin stage i runs at stride 4 * 2**i and outputs
    # embed_dims * 2**i channels, so the channel counts follow the config.
    print("📷 Camera特征流:")
    print(f" ├── 输入图像: {image_size[0]}×{image_size[1]}×3")
    embed_dims = backbone['embed_dims']
    stage_lines = [
        f"Stage{idx + 1}: {image_size[0]//stride}×{image_size[1]//stride}"
        f"×{embed_dims * 2 ** idx}"
        for idx, stride in enumerate((4, 8, 16, 32))
    ]
    print(f" ├── Swin输出特征: {stage_lines[0]}")
    for stage_line in stage_lines[1:]:
        print(f" │ {stage_line}")
    print(f" ├── LSS FPN输出: {vtransform['feature_size'][0]}×{vtransform['feature_size'][1]}×256")
    print(f" └── VTransform输出: {bev_width}×{bev_height}×80")
    print()

    # LiDAR feature flow
    print("🔍 LiDAR特征流:")
    print(f" ├── 稀疏输入: {bev_width}×{bev_height}×{bev_depth}")
    print(f" └── SparseEncoder输出: {bev_width}×{bev_height}×128")
    print()

    # Fusion and decoding
    print("🔗 融合与解码:")
    print(" ├── ConvFuser输入: Camera(80ch) + LiDAR(128ch) = 208ch")
    print(f" ├── ConvFuser输出: {bev_width}×{bev_height}×256")
    print(f" ├── SECOND Backbone: {bev_width}×{bev_height}×256 → {bev_width//2}×{bev_height//2}×256")
    print(f" ├── SECONDFPN输出: {bev_width}×{bev_height}×512 (融合多尺度)")
    print(f" └── Task-specific GCA: {bev_width}×{bev_height}×512 → 任务特定特征")
    print()

    # Segmentation head sizes
    print("🗺️ 分割头特征尺寸:")
    print(f" ├── 输入BEV: {bev_width}×{bev_height}×512")
    print(" ├── 4层渐进上采样 (512→256→256→128→128):")
    layer_channels = (512, 256, 256, 128, 128)
    last_layer = len(layer_channels) - 1
    for idx in range(1, len(layer_channels)):
        branch = "└──" if idx == last_layer else "├──"
        print(
            f" │ {branch} Layer{idx}: "
            f"{bev_width}×{bev_height}×{layer_channels[idx - 1]} → "
            f"{bev_width}×{bev_height}×{layer_channels[idx]}"
        )
    print(f" ├── 自适应多尺度融合: {map_head['adaptive_multiscale']}")
    print(f" ├── 空洞率: {map_head['adaptive_dilation_rates']}")
    print(f" └── 最终输出: {output_width}×{output_height}×{num_map_classes} ({num_map_classes}类别)")
    print()

    # ========== 7. Memory estimate ==========
    _section("💾 7. 内存占用估计")
    # NOTE(review): the estimate uses the full voxel-grid width/height; if the
    # encoders downsample the BEV map the real tensors are smaller -- confirm.
    bev_pixels = bev_width * bev_height
    seg_pixels = output_width * output_height
    bev_512ch = bev_pixels * 512 * _FP32_BYTES / _GIB
    bev_256ch = bev_pixels * 256 * _FP32_BYTES / _GIB
    seg_output = seg_pixels * num_map_classes * _FP32_BYTES / _GIB
    # Labels are built from the computed sizes so they cannot drift from the
    # numbers printed next to them.
    print(f"🔹 BEV特征图 ({bev_width}×{bev_height}×512ch): {bev_512ch:.2f} GB")
    print(f"🔹 BEV特征图 ({bev_width}×{bev_height}×256ch): {bev_256ch:.2f} GB")
    print(f"🔹 分割输出 ({output_width}×{output_height}×{num_map_classes}ch): {seg_output:.2f} GB")
    print(f"🔹 总计主要特征: {bev_512ch + bev_256ch + seg_output:.2f} GB")
    print()

    # ========== 8. Key innovations ==========
    _section("🚀 8. Phase 4A 关键创新点")
    innovations = [
        "✨ Task-specific GCA: 检测和分割各自选择最优特征",
        "🎯 避免统一特征选择的折中问题",
        "🔧 自适应多尺度融合 (adaptive_multiscale)",
        "📏 渐进式4层解码器 (512→256→256→128→128)",
        "🎨 动态空洞率学习 (adaptive_dilation_rates: [1,3,6,12])",
        f"📊 分割分辨率: {output_width}×{output_height} @ {output_res}m/pixel",
        f"🏗️ 检测使用完整{bev_width}×{bev_height} BEV特征",
    ]
    for innovation in innovations:
        print(f" {innovation}")
    print()

    # ========== 9. Training summary ==========
    _section("⚙️ 9. 训练配置摘要")
    optimizer = config['optimizer']
    print(f"🔹 训练轮数: {config['max_epochs']}")
    print(f"🔹 学习率: {optimizer['lr']}")
    print(f"🔹 权重衰减: {optimizer['weight_decay']}")
    print(f"🔹 梯度裁剪: {config['optimizer_config']['grad_clip']['max_norm']}")
    print(f"🔹 FP16初始scale: {config['fp16']['loss_scale']['init_scale']}")
    print(f"🔹 验证间隔: {config['evaluation']['interval']}")
    print(f"🔹 数据采样: val.load_interval = {config['data']['val']['load_interval']}")
    print()
    print("=" * 80)
    print("✅ Phase 4A Task-specific GCA 配置分析完成")
    print("=" * 80)
# Script entry point: run the report only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    analyze_phase4a_gca_config()