bev-project/archive/scripts_old/MONITOR_TASK_GCA.sh

46 lines
1.8 KiB
Bash
Executable File

#!/bin/bash
# Monitoring script for task-specific GCA training.
#
# Usage: MONITOR_TASK_GCA.sh [LOG_DIR]
#   LOG_DIR  directory that holds the train_*.log files
#            (default: /data/runs/phase4a_stage1_task_gca)
#
# Exits with status 1 when LOG_DIR contains no train_*.log file.
echo "═══════════════════════════════════════════════════════════════════"
echo " Task-specific GCA训练监控"
echo "═══════════════════════════════════════════════════════════════════"
LOG_DIR="${1:-/data/runs/phase4a_stage1_task_gca}"

# Select the most recently modified train_*.log by comparing modification
# times with -nt rather than parsing `ls -t` output: the glob is expanded by
# the shell itself, so paths containing whitespace stay intact.
LATEST_LOG=""
for candidate in "${LOG_DIR}"/train_*.log; do
  # When nothing matches, the pattern is left as a literal string; the -f
  # test discards it (and any non-regular file that happens to match).
  [[ -f "$candidate" ]] || continue
  if [[ -z "$LATEST_LOG" || "$candidate" -nt "$LATEST_LOG" ]]; then
    LATEST_LOG="$candidate"
  fi
done

if [[ -z "$LATEST_LOG" ]]; then
  # Diagnostics go to stderr so they are not mixed into the report on stdout.
  echo "❌ 未找到训练日志" >&2
  exit 1
fi
echo "日志文件: $LATEST_LOG"
echo ""
# Check whether a training process is currently running.
echo "━━━ 训练进程状态 ━━━"
# pgrep -f matches the pattern against each process's full command line and
# never reports itself, so no `grep -v grep` filter is required (that filter
# would also hide a real training process whose arguments contain "grep").
# -a makes pgrep print "PID command line" for every match.
TRAIN_PROC=$(pgrep -af "tools/train.py")
if [[ -z "$TRAIN_PROC" ]]; then
  echo "❌ 训练进程未运行"
else
  echo "✅ 训练进程正在运行:"
  # Print only the first matching process: strip everything from the first
  # newline onwards instead of spawning `head`.
  printf '%s\n' "${TRAIN_PROC%%$'\n'*}"
fi
echo ""
# GPU section: query index, utilisation and memory figures per GPU in
# headerless CSV form.
printf '%s\n' "━━━ GPU状态 ━━━"
nvidia-smi \
  --query-gpu=index,utilization.gpu,memory.used,memory.total \
  --format=csv,noheader
printf '\n'

# Log section: show the last 100 lines of the selected training log.
printf '%s\n' "━━━ 最新训练日志 (最后100行) ━━━"
tail -n 100 "$LATEST_LOG"
printf '\n'

# Footer: follow-up commands the operator can copy. The heredoc delimiter is
# unquoted so that $LATEST_LOG is expanded; the rest is emitted verbatim.
cat <<EOF
═══════════════════════════════════════════════════════════════════
监控命令:
 实时日志: tail -f $LATEST_LOG
 关键指标: tail -f $LATEST_LOG | grep -E 'Epoch|loss/map/divider'
 GPU监控: nvidia-smi -l 5
═══════════════════════════════════════════════════════════════════
EOF