#!/bin/bash
# Full verification script for the FP16 training status.
#
# Locates the newest phase4a_stage1_fp16*.log under /workspace/bevfusion and
# reports on it. Later sections of this script read LOG_FILE, which is set here.

# Abort early if the workspace is missing; otherwise the glob below would
# silently search whatever directory the script happened to be started from.
cd /workspace/bevfusion || {
  echo "❌ 无法进入目录 /workspace/bevfusion" >&2
  exit 1
}

# Pick the most recently modified matching log without parsing `ls` output.
LOG_FILE=""
for candidate in phase4a_stage1_fp16*.log; do
  # An unmatched glob is left as the literal pattern; skip it.
  [ -e "$candidate" ] || continue
  if [ -z "$LOG_FILE" ] || [ "$candidate" -nt "$LOG_FILE" ]; then
    LOG_FILE=$candidate
  fi
done

echo "================================================================================"
echo "FP16训练状态验证"
echo "================================================================================"
echo ""

echo "【1. 日志文件】"
if [ -n "$LOG_FILE" ]; then
  echo " ✓ 日志: $LOG_FILE"
  # Fifth column of `ls -lh` is the human-readable file size.
  echo " ✓ 大小: $(ls -lh "$LOG_FILE" | awk '{print $5}')"
else
  # Nothing to verify without a log; every later section depends on it.
  echo " ❌ 未找到FP16训练日志"
  exit 1
fi
echo ""

echo "【2. FP16 Hook检查】"
# Number of log lines that mention Fp16OptimizerHook. `grep -c` prints nothing
# when the file cannot be read, so fall back to 0 to keep the numeric tests
# (here and in the final summary) well-formed.
FP16_HOOK_COUNT=$(grep -c "Fp16OptimizerHook" "$LOG_FILE")
FP16_HOOK_COUNT=${FP16_HOOK_COUNT:-0}
if [ "$FP16_HOOK_COUNT" -gt 0 ]; then
  echo " ✅ Fp16OptimizerHook已加载 (出现${FP16_HOOK_COUNT}次)"
  # Show at most the first five matching lines as evidence.
  grep "Fp16OptimizerHook" "$LOG_FILE" | head -5
else
  echo " ❌ 未找到Fp16OptimizerHook"
fi
echo ""

# Format `nvidia-smi --format=csv,noheader,nounits` rows read from stdin
# ("<index>, <used MiB>") as one indented line per GPU.
# The CSV separator is a comma followed by a space, so split on that rather
# than on whitespace; otherwise the comma stays glued to the GPU index.
format_gpu_memory_csv() {
  awk -F', *' '{print " GPU " $1 ": " $2 " MB (" $2/1024 " GB)"}'
}

echo "【3. 显存使用对比】"
echo " 训练日志报告的显存 (memory字段):"
# Last five "memory: N" values from the training log, shown in MB and GB.
grep -oP "memory: \K[0-9]+" "$LOG_FILE" | tail -5 | awk '{print " " $1 " MB (" $1/1024 " GB)"}'
echo ""
echo " nvidia-smi实际显存:"
if command -v nvidia-smi >/dev/null 2>&1; then
  nvidia-smi --query-gpu=index,memory.used --format=csv,noheader,nounits | format_gpu_memory_csv
else
  # Keep the report going on hosts without the NVIDIA tools installed.
  echo " ⚠️ 未找到nvidia-smi命令"
fi
echo ""

# Print the per-iteration "time:" value of the last five iterations recorded in
# log file $1, one per line.
# The lookbehind keeps "time: " from also matching inside a longer field name
# such as "data_time: " (assumed to appear on the same log line — the fix is a
# no-op if it does not).
recent_iter_times() {
  grep -oP '(?<![A-Za-z0-9_])time: \K[0-9.]+' "$1" | tail -5
}

echo "【4. 训练速度】"
echo " 最近5个iteration的速度:"
recent_iter_times "$LOG_FILE" | awk '{print " " $1 " s/iter"}'
echo ""

# Annotate grad_norm values read from stdin (one per line).
# Non-finite values (nan, inf) are reported as the dynamic loss-scaling warm-up
# phase; anything else is reported as stable.
label_grad_norms() {
  awk '{
    if ($1 == "nan" || $1 == "inf") {
      print " " $1 " (FP16 dynamic scaling启动阶段)"
    } else {
      print " " $1 " (FP16已稳定)"
    }
  }'
}

echo "【5. Grad Norm检查 (FP16特征)】"
echo " 最近5个iteration的grad_norm:"
# Take the last five log lines carrying a grad_norm field, then pull the value.
grep "grad_norm:" "$LOG_FILE" | tail -5 | grep -oP "grad_norm: \K[a-z0-9.]+" | label_grad_norms
echo ""

# Report whether config file $1 exists and has a top-level "fp16:" key.
# When the key is present, the key line and the two lines after it are shown.
report_fp16_config() {
  local config_path=$1
  if [ ! -f "$config_path" ]; then
    echo " ❌ 配置文件不存在"
    return
  fi
  echo " 配置文件: $config_path"
  # Anchored at column 0 so only a top-level key counts.
  if grep -q "^fp16:" "$config_path"; then
    echo " ✅ fp16字段存在:"
    grep -A 2 "^fp16:" "$config_path"
  else
    echo " ❌ fp16字段不存在"
  fi
}

echo "【6. 配置文件验证】"
CONFIG_FILE="configs/nuscenes/det/transfusion/secfpn/camera+lidar/swint_v0p075/multitask_BEV2X_phase4a_stage1_fp16.yaml"
report_fp16_config "$CONFIG_FILE"
echo ""

# Print the newest training-iteration line of log file $1, trimmed to the span
# from "Epoch" through the "memory: N" field. Prints nothing if no such line
# exists.
# Any epoch number is accepted, so the report keeps following the run after it
# moves past epoch 1.
latest_progress() {
  grep -E 'Epoch \[[0-9]+\]' "$1" | tail -1 | grep -oP 'Epoch.*memory: [0-9]+'
}

echo "【7. 当前训练进度】"
LATEST_ITER=$(latest_progress "$LOG_FILE")
if [ -n "$LATEST_ITER" ]; then
  # Indent after trimming; `grep -o` would otherwise drop a leading space.
  echo " $LATEST_ITER"
else
  echo " 无法获取"
fi
echo ""

# Print the overall verdict.
#   $1 - number of log lines mentioning Fp16OptimizerHook (empty counts as 0)
#   $2 - latest "memory:" value from the log in MB (may be empty)
# FP16 is judged correctly enabled when the hook was seen and the reported
# memory is below 22000 MB; the saving is computed against a 29000 MB FP32
# baseline, both taken from the constants this script has always used.
print_fp16_summary() {
  local hook_count=${1:-0}
  local memory=$2

  if ! [ "$hook_count" -gt 0 ]; then
    echo "❌ FP16未启用"
    echo " - 未检测到Fp16OptimizerHook"
  elif [ -z "$memory" ]; then
    # No memory figure logged yet: say so rather than printing an empty value
    # under the "memory not reduced" warning.
    echo "⚠️ FP16 Hook已加载,但日志中暂无memory字段"
    echo " - 无法判断显存占用,请等待训练输出更多日志"
  elif [ "$memory" -lt 22000 ]; then
    echo "✅ FP16已正确启用!"
    echo " - Fp16OptimizerHook: 已加载"
    echo " - 显存占用: ${memory}MB (< 22GB,符合FP16预期)"
    echo " - 对比FP32: ~29GB → ${memory}MB (节省约$(( (29000-memory)*100/29000 ))%)"
  else
    echo "⚠️ FP16 Hook已加载,但显存未明显下降"
    echo " - 当前显存: ${memory}MB"
    echo " - 建议检查配置文件"
  fi
}

echo "================================================================================"
echo "验证总结"
echo "================================================================================"

# Overall verdict, based on the hook count gathered earlier and the most recent
# memory value in the log.
MEMORY=$(grep -oP "memory: \K[0-9]+" "$LOG_FILE" | tail -1)
print_fp16_summary "$FP16_HOOK_COUNT" "$MEMORY"
echo "================================================================================"