#!/bin/bash
#
# Phase 4A training monitor.
#
# Prints a one-shot status report for the Phase 4A BEV 2x training run:
# training process, GPU state, latest log progress, checkpoints, disk usage
# and errors found in the training log.
#
# Usage: bash monitor_phase4a.sh

#######################################
# Print the report banner, including the current local time.
# Outputs: four banner lines followed by a blank line on stdout.
#######################################
print_header() {
  local rule="========================================================================"

  echo "$rule"
  echo "Phase 4A BEV 2x 训练监控"
  echo "更新时间: $(date '+%Y-%m-%d %H:%M:%S')"
  echo "$rule"
  echo ""
}
print_header

# 1. Check the training process
#######################################
# Report whether a "python tools/train.py" process is running.
# Outputs: section title, then either the match count plus up to three
#          matching `ps aux` lines, or a not-running notice, then a blank
#          line. All on stdout.
#######################################
report_train_process() {
  local matches count

  echo "【1. 训练进程状态】"
  # Take ONE ps snapshot and derive both the count and the listing from it,
  # so the two can never disagree. `grep -v grep` drops the grep process
  # itself, whose command line also contains the search string.
  matches=$(ps aux | grep "python tools/train.py" | grep -v grep)
  if [[ -n "$matches" ]]; then
    count=$(grep -c '' <<<"$matches")
    echo "✅ 训练进程运行中 ($count 个进程)"
    head -n 3 <<<"$matches"
  else
    echo "❌ 训练进程未运行"
  fi
  echo ""
}
report_train_process

# 2. GPU status
#######################################
# Print one line per GPU with utilization, memory use and temperature, as
# reported by `nvidia-smi --query-gpu` in header-less, unit-less CSV form.
# Outputs: section title, one line per GPU (or a warning when nvidia-smi
#          cannot be found), then a blank line. All on stdout.
#######################################
report_gpu_status() {
  echo "【2. GPU状态】"
  # Without this guard a missing nvidia-smi yields only a shell
  # "command not found" error and an empty section.
  if command -v nvidia-smi >/dev/null 2>&1; then
    nvidia-smi \
      --query-gpu=index,utilization.gpu,memory.used,memory.total,temperature.gpu \
      --format=csv,noheader,nounits \
      | awk -F', ' '{printf " GPU %s: 利用率=%s%%, 显存=%s/%sMB, 温度=%s°C\n", $1, $2, $3, $4, $5}'
  else
    echo " ⚠️ nvidia-smi 不可用"
  fi
  echo ""
}
report_gpu_status

# 3. Latest training log
#######################################
# Print the first capture group of an ERE matched against a string.
# Arguments: $1 - text to search
#            $2 - extended regex containing one capture group
# Outputs:   the captured text on stdout; nothing when there is no match
#######################################
_first_capture() {
  if [[ "$1" =~ $2 ]]; then
    printf '%s' "${BASH_REMATCH[1]}"
  fi
}

#######################################
# Show recent "Epoch [N] ... loss:" lines from the training log and a
# summary parsed from the most recent one.
# Arguments: $1 - log file path (optional, default: phase4a_bev2x.log)
# Outputs:   section title; up to 5 progress lines found in the last 30 log
#            lines; a summary (epoch, iteration progress, loss, learning
#            rate, ETA) taken from the last progress line within the last
#            100 log lines; or a warning when the log file does not exist.
#######################################
report_training_progress() {
  local log_file="${1:-phase4a_bev2x.log}"
  local progress_re='Epoch \[[0-9]+\].*loss:'
  local latest epoch iter total loss lr eta percent

  echo "【3. 最新训练进度】"
  if [[ -f "$log_file" ]]; then
    tail -n 30 -- "$log_file" | grep -E "$progress_re" | tail -n 5
    echo ""

    # Extract the latest status line and split it into fields.
    latest=$(tail -n 100 -- "$log_file" | grep -E "$progress_re" | tail -n 1)
    if [[ -n "$latest" ]]; then
      # Bash regex matching yields exactly one (the first) match per field
      # and does not depend on PCRE support in grep.
      epoch=$(_first_capture "$latest" 'Epoch \[([0-9]+)')
      iter=$(_first_capture "$latest" '\]\[([0-9]+)/')
      total=$(_first_capture "$latest" '/([0-9]+)\]')
      loss=$(_first_capture "$latest" 'loss: ([0-9.]+)')
      lr=$(_first_capture "$latest" 'lr: ([0-9.e-]+)')
      eta=$(_first_capture "$latest" 'eta: ([^,]+)')

      if [[ -n "$epoch" ]]; then
        # Only compute a percentage from a usable iter/total pair: an empty
        # or zero total would make awk fail or divide by zero. Values are
        # passed with -v instead of being spliced into the awk program.
        if [[ -n "$iter" && -n "$total" ]] && (( 10#$total > 0 )); then
          percent="$(awk -v i="$iter" -v t="$total" \
            'BEGIN {printf "%.1f", (i / t) * 100}')%"
        else
          percent="N/A"
        fi
        echo " 当前Epoch: $epoch/20"
        echo " 进度: $iter/$total ($percent)"
        echo " Loss: $loss"
        echo " 学习率: $lr"
        echo " 预计剩余: $eta"
      fi
    fi
  else
    echo " ⚠️ 日志文件不存在"
  fi
  echo ""
}
report_training_progress

# 4. Checkpoint status
#######################################
# Report how many *.pth checkpoints exist in the output directory and list
# the five most recently modified ones with their sizes.
# Arguments: $1 - checkpoint directory (optional, default:
#                 runs/phase4a_bev2x)
# Outputs:   section title, checkpoint count, up to five "path size" lines
#            (newest first), or a warning when the directory is missing;
#            then a blank line. All on stdout.
#######################################
report_checkpoints() {
  local ckpt_dir="${1:-runs/phase4a_bev2x}"
  local -a ckpts=()
  local path

  echo "【4. Checkpoint状态】"
  if [[ -d "$ckpt_dir" ]]; then
    # Count via the glob itself rather than by parsing `ls` output. An
    # unmatched glob stays literal, so keep only entries that really exist
    # (-L keeps symlinks whose target is gone).
    for path in "$ckpt_dir"/*.pth; do
      if [[ -e "$path" || -L "$path" ]]; then
        ckpts+=("$path")
      fi
    done
    echo " 已保存checkpoint: ${#ckpts[@]} 个"
    if (( ${#ckpts[@]} > 0 )); then
      echo " 最新5个:"
      # The C locale pins the traditional long-listing layout so that the
      # size is field 5 and the name is field 9 for awk.
      LC_ALL=C ls -lth -- "${ckpts[@]}" 2>/dev/null | head -n 5 \
        | awk '{printf " %s %s\n", $9, $5}'
    fi
  else
    echo " ⚠️ 输出目录不存在"
  fi
  echo ""
}
report_checkpoints

# 5. Disk space
#######################################
# Print used / total space and the usage percentage of the filesystem that
# holds the given path.
# Arguments: $1 - path to inspect (optional, default: /workspace)
# Outputs:   section title, one usage line (or a warning when df reports
#            nothing for the path), then a blank line. All on stdout.
#######################################
report_disk_usage() {
  local mount_path="${1:-/workspace}"
  local usage_line

  echo "【5. 磁盘空间】"
  # -P keeps each filesystem on a single line, so long device names cannot
  # wrap and shift the columns. df's own error text is suppressed on
  # purpose: the failure is reported by the warning below instead.
  usage_line=$(df -hP -- "$mount_path" 2>/dev/null | tail -n 1)
  if [[ -n "$usage_line" ]]; then
    awk '{printf " 使用: %s / %s (%s)\n", $3, $2, $5}' <<<"$usage_line"
  else
    echo " ⚠️ 无法获取磁盘信息: $mount_path"
  fi
  echo ""
}
report_disk_usage

# 6. Error check
#######################################
# Count log lines mentioning error / exception / traceback
# (case-insensitive), ignoring lines that contain "ErrorAnalysisHook", and
# show the last three of them.
# Arguments: $1 - log file path (optional, default: phase4a_bev2x.log)
# Outputs:   section title; the match count and last three matching lines,
#            or a no-errors notice, or a warning when the log file does not
#            exist; then a blank line. All on stdout.
#######################################
report_errors() {
  local log_file="${1:-phase4a_bev2x.log}"
  # One pattern drives both the count and the listing, so every counted
  # line (tracebacks included) can also appear under the listing.
  local error_re='error|exception|traceback'
  local error_count

  echo "【6. 错误检查】"
  if [[ -f "$log_file" ]]; then
    error_count=$(grep -iE -- "$error_re" "$log_file" \
      | grep -vc "ErrorAnalysisHook")
    if (( error_count > 0 )); then
      echo " ⚠️ 发现 $error_count 个错误"
      echo " 最新错误:"
      grep -iE -- "$error_re" "$log_file" \
        | grep -v "ErrorAnalysisHook" \
        | tail -n 3
    else
      echo " ✅ 无错误"
    fi
  else
    echo " ⚠️ 日志文件不存在"
  fi
  echo ""
}
report_errors

#######################################
# Print the closing usage hints of the report.
# NOTE(review): the text announces a refresh every 5 minutes, but nothing in
# the visible script loops or sleeps — presumably the refresh is scheduled
# externally (e.g. watch/cron); confirm.
# Outputs: five footer lines on stdout.
#######################################
print_footer() {
  local rule="========================================================================"

  echo "$rule"
  echo "更新: 每5分钟自动刷新"
  echo "手动刷新: bash monitor_phase4a.sh"
  echo "实时日志: tail -f phase4a_bev2x.log"
  echo "$rule"
}
print_footer