bev-project/archive/scripts_old/CHECK_FP16_STATUS.sh

104 lines
3.4 KiB
Bash
Executable File
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/bin/bash
# Full verification script for the FP16 training status.
#
# This opening part switches to the project directory, picks the most recently
# modified phase4a_stage1_fp16*.log, prints the report banner and section 1
# (log file name and size). It exits with status 1 when the directory cannot
# be entered or when no matching log file exists.

# The log glob below and the config path used later are relative paths, so
# continuing after a failed cd would silently inspect the wrong directory.
cd /workspace/bevfusion || {
  echo " ❌ 无法进入目录 /workspace/bevfusion" >&2
  exit 1
}

# ls -t lists newest first; head keeps only that entry. The result is empty
# when the glob matches nothing (ls's error is discarded).
LOG_FILE=$(ls -t phase4a_stage1_fp16*.log 2>/dev/null | head -1)

echo "================================================================================"
echo "FP16训练状态验证"
echo "================================================================================"
echo ""
echo "【1. 日志文件】"
if [ -n "$LOG_FILE" ]; then
  echo " ✓ 日志: $LOG_FILE"
  # Column 5 of `ls -lh` is the human-readable size.
  echo " ✓ 大小: $(ls -lh "$LOG_FILE" | awk '{print $5}')"
else
  echo " ❌ 未找到FP16训练日志"
  exit 1
fi
# Section 2: count the log lines mentioning Fp16OptimizerHook and preview at
# most five of them. FP16_HOOK_COUNT stays set for later use in the script.
printf '\n%s\n' "【2. FP16 Hook检查】"
FP16_HOOK_COUNT=$(grep -c 'Fp16OptimizerHook' "$LOG_FILE")
if [ "$FP16_HOOK_COUNT" -gt 0 ]; then
  printf ' ✅ Fp16OptimizerHook已加载 (出现%s次)\n' "$FP16_HOOK_COUNT"
  # -m 5 makes grep stop after the fifth matching line.
  grep -m 5 'Fp16OptimizerHook' "$LOG_FILE"
else
  printf '%s\n' " ❌ 未找到Fp16OptimizerHook"
fi
# Section 3: print GPU memory as recorded in the training log next to the
# figures nvidia-smi reports, each in MB and converted to GB (value / 1024).
echo ""
echo "【3. 显存使用对比】"
echo " 训练日志报告的显存 (memory字段):"
# Last five "memory: <int>" values; \K drops the key so only the number prints.
grep -oP "memory: \K[0-9]+" "$LOG_FILE" | tail -5 \
  | awk '{print " " $1 " MB (" $1/1024 " GB)"}'
echo ""
echo " nvidia-smi实际显存:"
# nvidia-smi's CSV format separates fields with ", ". Split on the comma (plus
# any following spaces); default whitespace splitting would leave the comma
# attached to the GPU index and print "GPU 0,: ...".
nvidia-smi --query-gpu=index,memory.used --format=csv,noheader,nounits \
  | awk -F', *' '{print " GPU " $1 ": " $2 " MB (" $2/1024 " GB)"}'
# Section 4: per-iteration wall time for the five most recent iterations.
echo ""
echo "【4. 训练速度】"
echo " 最近5个iteration的速度:"
# \b anchors the key at a word boundary so only a standalone "time:" field is
# read. Without it the pattern also matches the tail of longer keys such as
# "data_time:" (if the logger emits one on the same line), and those values
# would be reported as iteration speed as well.
grep -oP '\btime: \K[0-9.]+' "$LOG_FILE" | tail -5 | awk '{print " " $1 " s/iter"}'
# Section 5: label the grad_norm of the five most recent iterations that log
# one. Non-finite values are reported as the dynamic loss-scaling warm-up
# phase; anything else is reported as stable.
echo ""
echo "【5. Grad Norm检查 (FP16特征)】"
echo " 最近5个iteration的grad_norm:"
grep "grad_norm:" "$LOG_FILE" | tail -5 | grep -oP "grad_norm: \K[a-z0-9.]+" | awk '{
  # "inf" is an overflowed gradient just like "nan"; it must not be reported
  # as a stable value.
  if ($1 == "nan" || $1 == "inf") {
    print " " $1 " (FP16 dynamic scaling启动阶段)"
  } else {
    print " " $1 " (FP16已稳定)"
  }
}'
# Section 6: confirm the training config file exists and carries a top-level
# "fp16:" key, echoing that key and the two lines below it when present.
printf '\n%s\n' "【6. 配置文件验证】"
CONFIG_FILE="configs/nuscenes/det/transfusion/secfpn/camera+lidar/swint_v0p075/multitask_BEV2X_phase4a_stage1_fp16.yaml"
if [[ ! -f "$CONFIG_FILE" ]]; then
  printf '%s\n' " ❌ 配置文件不存在"
elif ! grep -q '^fp16:' "$CONFIG_FILE"; then
  printf ' 配置文件: %s\n' "$CONFIG_FILE"
  printf '%s\n' " ❌ fp16字段不存在"
else
  printf ' 配置文件: %s\n' "$CONFIG_FILE"
  printf '%s\n' " ✅ fp16字段存在:"
  # -A 2 prints the matching line plus the two lines after it.
  grep -A 2 '^fp16:' "$CONFIG_FILE"
fi
# Section 7: show the most recent training-iteration line from the log,
# trimmed to the span from "Epoch" through the memory field.
echo ""
echo "【7. 当前训练进度】"
# Match any epoch number, not only epoch 1; otherwise the report keeps showing
# the last epoch-1 line once training has moved on to later epochs.
LATEST_ITER=$(grep -E "Epoch \[[0-9]+\]" "$LOG_FILE" | tail -1)
if [ -n "$LATEST_ITER" ]; then
  # Fall back to the raw line when it has no memory field, so the section is
  # never left blank.
  printf '%s\n' "$LATEST_ITER" | grep -oP "Epoch.*memory: [0-9]+" \
    || echo " $LATEST_ITER"
else
  echo " 无法获取"
fi
# Summary: combine the hook count with the latest logged memory figure into a
# single verdict.
#   hook seen, memory < 22000 MB  -> FP16 correctly enabled (with saving vs. 29000 MB)
#   hook seen, memory >= 22000 MB -> hook loaded but memory did not drop
#   hook seen, no memory value    -> hook loaded, memory cannot be judged yet
#   hook not seen                 -> FP16 not enabled
echo ""
echo "================================================================================"
echo "验证总结"
echo "================================================================================"
# Overall verdict. An empty/unset count is treated as 0 so the numeric test
# cannot fail with "integer expression expected".
if [ "${FP16_HOOK_COUNT:-0}" -gt 0 ]; then
  MEMORY=$(grep -oP "memory: \K[0-9]+" "$LOG_FILE" | tail -1)
  if [ -z "$MEMORY" ]; then
    # No memory value in the log: report that explicitly instead of claiming
    # that memory usage failed to drop and printing an empty "MB" figure.
    echo "⚠️ FP16 Hook已加载但日志中尚无显存记录"
    echo " - 无法判断显存是否下降,请待训练输出iteration日志后重试"
  elif [ "$MEMORY" -lt 22000 ]; then
    echo "✅ FP16已正确启用"
    echo " - Fp16OptimizerHook: 已加载"
    echo " - 显存占用: ${MEMORY}MB (< 22GB符合FP16预期)"
    # Integer percentage saved relative to the 29000 MB FP32 baseline.
    echo " - 对比FP32: ~29GB → ${MEMORY}MB (节省约$(( (29000-MEMORY)*100/29000 ))%)"
  else
    echo "⚠️ FP16 Hook已加载但显存未明显下降"
    echo " - 当前显存: ${MEMORY}MB"
    echo " - 建议检查配置文件"
  fi
else
  echo "❌ FP16未启用"
  echo " - 未检测到Fp16OptimizerHook"
fi
echo "================================================================================"