# BEVFusion 模型优化启动计划 **开始时间**: 2025-10-30 **Baseline**: Epoch 23 (NDS 0.6941, mAP 0.6446, mIoU 0.4130) **目标**: 准备Orin部署的优化模型 --- ## 🎯 优化目标 ### 最终部署目标 ``` 硬件: NVIDIA Orin 270T 推理时间: <80ms (理想<60ms) 吞吐量: >12 FPS (理想>16 FPS) 功耗: <60W (理想<45W) 精度损失: <3% ``` ### 优化路线 ``` 原始模型: 110M参数, 450 GFLOPs, 90ms@A100 ↓ 剪枝模型: 60M参数, 250 GFLOPs, 50ms@A100 (-45%) ↓ INT8模型: 15M参数, 62 GFLOPs, 40ms@A100 (-56%) ↓ TensorRT: 15M参数, 优化kernel, 30ms@A100 (-67%) ↓ Orin部署: 50-60ms推理, 16+ FPS, <50W 目标达成✅ ``` --- ## 📋 三阶段优化计划 ### 阶段1: 模型分析(1-2天,立即开始) #### 任务清单 - [ ] 分析模型参数量和FLOPs - [ ] Profiling推理性能瓶颈 - [ ] 敏感度分析(哪些层可剪枝) - [ ] 确定剪枝策略 #### 需要的工具 ```python tools/analysis/ ├── model_complexity.py # 模型复杂度分析 ├── profile_inference.py # 推理性能profiling ├── sensitivity_analysis.py # 敏感度分析 └── layer_statistics.py # 层统计信息 ``` --- ### 阶段2: 模型剪枝(3-5天) #### 目标 ``` 参数量: 110M → 60M (-45%) FLOPs: 450G → 250G (-44%) 精度损失: <1.5% ``` #### 剪枝策略 ``` 1. SwinTransformer Backbone - 通道剪枝: 减少20-30% channels - 层数剪枝: 可选择减少attention层 2. FPN Neck - 通道剪枝: 减少25-30% channels 3. Decoder - 通道剪枝: 减少20% channels 4. Detection/Segmentation Heads - 谨慎剪枝: 减少10-15% (影响精度) ``` #### 剪枝工具 - Torch-Pruning (推荐) - torch.nn.utils.prune (内置) --- ### 阶段3: 量化训练(4-6天) #### 目标 ``` 模型大小: 441MB (FP32) → 110MB (INT8) (-75%) 推理速度: 2-3倍提升 精度损失: <2% (累计<3%) ``` #### 量化策略 ``` 1. PTQ (Post-Training Quantization) - 快速验证可行性 - 预期精度损失: 2-3% 2. 
QAT (Quantization-Aware Training) - 训练恢复精度 - 5个epochs, lr=1e-6 - 预期精度恢复: 1-2% ``` --- ## 🚀 立即行动:阶段1启动 ### Step 1: 模型复杂度分析 创建分析脚本: ```python # tools/analysis/model_complexity.py import torch import torch.nn as nn from thop import profile, clever_format from mmcv import Config from mmdet3d.models import build_model def analyze_model_complexity(config_file, checkpoint_file=None): """分析模型复杂度""" # 加载配置 cfg = Config.fromfile(config_file) # 构建模型 model = build_model(cfg.model) model.eval() if checkpoint_file: checkpoint = torch.load(checkpoint_file, map_location='cpu') model.load_state_dict(checkpoint['state_dict']) # 统计参数量 total_params = sum(p.numel() for p in model.parameters()) trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad) print("=" * 80) print("模型参数统计") print("=" * 80) print(f"总参数量: {total_params:,} ({total_params/1e6:.2f}M)") print(f"可训练参数: {trainable_params:,} ({trainable_params/1e6:.2f}M)") print(f"模型大小 (FP32): {total_params * 4 / 1024 / 1024:.2f} MB") print() # 分模块统计 print("=" * 80) print("各模块参数统计") print("=" * 80) module_params = {} for name, module in model.named_children(): params = sum(p.numel() for p in module.parameters()) module_params[name] = params print(f"{name:30s}: {params:12,} ({params/total_params*100:5.2f}%)") print() # FLOPs统计(需要dummy input) print("=" * 80) print("计算量统计 (需要dummy input)") print("=" * 80) # 创建dummy inputs batch_size = 1 dummy_images = torch.randn(batch_size, 6, 3, 256, 704) # 6个相机视角 dummy_points = torch.randn(batch_size, 40000, 5) # 点云 try: flops, params = profile(model, inputs=(dummy_images, dummy_points)) flops, params = clever_format([flops, params], "%.3f") print(f"FLOPs: {flops}") print(f"Params: {params}") except Exception as e: print(f"FLOPs计算失败: {e}") print("可能需要修改model forward以支持profile") return model, total_params, module_params if __name__ == '__main__': import sys if len(sys.argv) < 2: print("用法: python model_complexity.py <config_file> [checkpoint_file]") sys.exit(1) config_file = sys.argv[1] 
checkpoint_file = sys.argv[2] if len(sys.argv) > 2 else None model, total_params, module_params = analyze_model_complexity( config_file, checkpoint_file ) print("\n分析完成!") ``` ### Step 2: 推理性能Profiling ```python # tools/analysis/profile_inference.py import torch import time import numpy as np from mmcv import Config from mmdet3d.models import build_model from mmdet3d.datasets import build_dataloader, build_dataset def profile_inference(config_file, checkpoint_file, num_samples=100): """Profiling推理性能""" # 加载配置和模型 cfg = Config.fromfile(config_file) model = build_model(cfg.model).cuda() checkpoint = torch.load(checkpoint_file) model.load_state_dict(checkpoint['state_dict']) model.eval() # 构建数据集 dataset = build_dataset(cfg.data.val) data_loader = build_dataloader( dataset, samples_per_gpu=1, workers_per_gpu=0, dist=False, shuffle=False ) # 预热 print("预热GPU...") with torch.no_grad(): for i, data in enumerate(data_loader): if i >= 10: break _ = model(return_loss=False, rescale=True, **data) # 性能测试 print(f"\n开始profiling (测试{num_samples}个样本)...") times = [] with torch.no_grad(): for i, data in enumerate(data_loader): if i >= num_samples: break torch.cuda.synchronize() start = time.time() _ = model(return_loss=False, rescale=True, **data) torch.cuda.synchronize() end = time.time() times.append((end - start) * 1000) # ms if (i + 1) % 10 == 0: print(f" 已处理: {i+1}/{num_samples}") # 统计 times = np.array(times) print("\n" + "=" * 80) print("推理性能统计") print("=" * 80) print(f"平均推理时间: {np.mean(times):.2f} ms") print(f"中位数: {np.median(times):.2f} ms") print(f"最小值: {np.min(times):.2f} ms") print(f"最大值: {np.max(times):.2f} ms") print(f"标准差: {np.std(times):.2f} ms") print(f"P95: {np.percentile(times, 95):.2f} ms") print(f"P99: {np.percentile(times, 99):.2f} ms") print(f"\n吞吐量: {1000/np.mean(times):.2f} FPS") print("=" * 80) return times if __name__ == '__main__': import sys if len(sys.argv) < 3: print("用法: python profile_inference.py <config_file> <checkpoint_file> [num_samples]") sys.exit(1) config_file = sys.argv[1] 
checkpoint_file = sys.argv[2] num_samples = int(sys.argv[3]) if len(sys.argv) > 3 else 100 times = profile_inference(config_file, checkpoint_file, num_samples) print("\nProfiling完成!") ``` ### Step 3: 敏感度分析 ```python # tools/analysis/sensitivity_analysis.py import torch import torch.nn as nn import copy from tqdm import tqdm from mmcv import Config from mmdet3d.models import build_model from mmdet3d.datasets import build_dataloader, build_dataset from mmdet3d.apis import single_gpu_test def prune_layer_channels(model, layer_name, ratio=0.5): """临时剪枝指定层的通道""" # 这里简化处理,实际需要根据层类型处理 pruned_model = copy.deepcopy(model) # 找到目标层并剪枝 for name, module in pruned_model.named_modules(): if name == layer_name: if isinstance(module, nn.Conv2d): # 简化:只保留前50%的通道 out_channels = module.out_channels keep_channels = int(out_channels * (1 - ratio)) # 这里需要实际的剪枝实现 pass return pruned_model def evaluate_model(model, data_loader): """快速评估模型""" model.eval() results = [] with torch.no_grad(): for data in tqdm(data_loader, desc="Evaluating"): result = model(return_loss=False, rescale=True, **data) results.extend(result) # 简化:返回平均分数(实际需要计算mAP/NDS) return len(results) # 占位符 def analyze_sensitivity(config_file, checkpoint_file, prune_ratio=0.5): """分析各层剪枝敏感度""" print("加载模型...") cfg = Config.fromfile(config_file) model = build_model(cfg.model).cuda() checkpoint = torch.load(checkpoint_file) model.load_state_dict(checkpoint['state_dict']) # 构建数据集(使用少量样本快速测试) print("构建数据集...") cfg.data.val.ann_file = cfg.data.val.ann_file # 使用mini val set dataset = build_dataset(cfg.data.val) data_loader = build_dataloader( dataset, samples_per_gpu=1, workers_per_gpu=0, dist=False, shuffle=False ) # Baseline性能 print("\n评估baseline性能...") baseline_score = evaluate_model(model, data_loader) print(f"Baseline score: {baseline_score}") # 分析各层敏感度 sensitivities = {} print(f"\n开始敏感度分析 (剪枝比例: {prune_ratio})...") for name, module in tqdm(model.named_modules()): # 只分析Conv2d层 if not isinstance(module, nn.Conv2d): continue if 
module.out_channels < 64: # 跳过小层 continue print(f"\n测试层: {name}") # 临时剪枝该层 pruned_model = prune_layer_channels(model, name, prune_ratio) # 评估 pruned_score = evaluate_model(pruned_model, data_loader) # 计算敏感度 sensitivity = baseline_score - pruned_score sensitivities[name] = sensitivity print(f" 剪枝后score: {pruned_score}") print(f" 敏感度: {sensitivity:.4f}") del pruned_model # 排序并保存 sorted_sens = sorted(sensitivities.items(), key=lambda x: x[1]) print("\n" + "=" * 80) print("敏感度排序 (从低到高,低敏感度=易剪枝)") print("=" * 80) for name, sens in sorted_sens[:20]: # 显示前20个 print(f"{name:60s}: {sens:.4f}") return sensitivities if __name__ == '__main__': import sys if len(sys.argv) < 3: print("用法: python sensitivity_analysis.py <config_file> <checkpoint_file>") sys.exit(1) config_file = sys.argv[1] checkpoint_file = sys.argv[2] sensitivities = analyze_sensitivity(config_file, checkpoint_file) # 保存结果 import json with open('sensitivity_results.json', 'w') as f: json.dump(sensitivities, f, indent=2) print("\n敏感度分析完成!结果已保存到 sensitivity_results.json") ``` --- ## 📊 立即执行的命令 ### 1. 模型复杂度分析(5分钟) ```bash cd /workspace/bevfusion # 创建分析目录 mkdir -p tools/analysis mkdir -p analysis_results # 创建并运行分析脚本 python tools/analysis/model_complexity.py \ configs/nuscenes/det/transfusion/secfpn/camera+lidar/swint_v0p075/multitask_enhanced_phase1_HIGHRES.yaml \ runs/enhanced_from_epoch19/epoch_23.pth \ > analysis_results/model_complexity.txt cat analysis_results/model_complexity.txt ``` ### 2. 推理性能Profiling(15分钟) ```bash # Profiling推理性能 python tools/analysis/profile_inference.py \ configs/nuscenes/det/transfusion/secfpn/camera+lidar/swint_v0p075/multitask_enhanced_phase1_HIGHRES.yaml \ runs/enhanced_from_epoch19/epoch_23.pth \ 100 \ > analysis_results/inference_profile.txt cat analysis_results/inference_profile.txt ``` ### 3. 
敏感度分析(1-2小时,可选) ```bash # 敏感度分析(使用mini val set快速测试) python tools/analysis/sensitivity_analysis.py \ configs/nuscenes/det/transfusion/secfpn/camera+lidar/swint_v0p075/multitask_enhanced_phase1_HIGHRES.yaml \ runs/enhanced_from_epoch19/epoch_23.pth \ > analysis_results/sensitivity_analysis.txt ``` --- ## 📋 预期分析结果 基于BEVFusion架构,预期结果: ### 模型复杂度 ``` 总参数量: ~110M - Camera Encoder (SwinT): ~47M (43%) ← 最大模块 - LiDAR Encoder: ~19M (17%) - Fuser: ~2M (2%) - Decoder: ~16M (14%) - Detection Head: ~18M (16%) - Segmentation Head: ~8M (7%) FLOPs: ~450 GFLOPs 模型大小: ~441 MB (FP32) ``` ### 推理性能 (A100) ``` 平均推理时间: ~90ms - Camera branch: ~40ms (44%) ← 最大瓶颈 - LiDAR branch: ~17ms (19%) - Fusion + Decoder: ~15ms (17%) - Heads: ~18ms (20%) 吞吐量: ~11 FPS ``` ### 优化潜力 ``` 1. Camera Encoder剪枝 - 潜力: 减少40-50%参数 - 加速: 20-30% - 敏感度: 中等 2. Decoder简化 - 潜力: 减少30-40%参数 - 加速: 10-15% - 敏感度: 低 3. INT8量化 - 加速: 2-3倍 - 精度损失: <2% ``` --- ## 🎯 今天的目标 ### 必须完成 - [ ] 创建分析工具脚本 - [ ] 运行模型复杂度分析 - [ ] 运行推理性能profiling - [ ] 生成分析报告 ### 可选 - [ ] 敏感度分析(如果时间允许) - [ ] 确定剪枝策略 - [ ] 准备剪枝工具 --- ## 📅 后续7天计划 ``` Day 1 (今天): ✓ 模型分析 ✓ Profiling ✓ 确定优化策略 Day 2-3: → 实施剪枝 → 剪枝模型微调(3 epochs) Day 4: → 评估剪枝模型 → PTQ量化测试 Day 5-6: → QAT量化训练(5 epochs) Day 7: → 评估量化模型 → 生成优化报告 → 准备TensorRT转换 ``` --- ## 🚀 立即开始 **当前Stage 1训练正在进行**(GPU 0-3),**可以并行进行模型分析**(GPU 4-7或CPU) ### 创建分析工具 ```bash cd /workspace/bevfusion mkdir -p tools/analysis mkdir -p analysis_results # 创建分析脚本(见上面的Python代码) # 然后运行分析 ``` --- **状态**: 🚀 准备开始模型优化 **重点**: 先分析,再优化 **并行**: 不影响Stage 1训练