46 lines
1.8 KiB
Bash
Executable File
46 lines
1.8 KiB
Bash
Executable File
#!/bin/bash
#
# Monitoring script for the task-specific GCA training run.
#
# Prints, in order:
#   1. the newest train_*.log found in the log directory,
#   2. whether a process whose command line matches "tools/train.py" is running,
#   3. per-GPU utilization and memory as reported by nvidia-smi,
#   4. the last 100 lines of that log,
#   5. a few copy-paste commands for live monitoring.
#
# Usage: <this script> [LOG_DIR]
#   LOG_DIR  directory holding train_*.log files
#            (default: /data/runs/phase4a_stage1_task_gca)
#
# Exit status: 1 if no train_*.log exists in LOG_DIR, 0 otherwise.
#
# errexit is deliberately not enabled: every section is best-effort, so a
# missing tool (e.g. nvidia-smi) must not prevent the log tail from printing.

readonly RULE="═══════════════════════════════════════════════════════════════════"

echo "$RULE"
echo " Task-specific GCA训练监控"
echo "$RULE"

LOG_DIR="${1:-/data/runs/phase4a_stage1_task_gca}"

# Pick the most recently modified train_*.log. A glob plus the -nt test is
# used instead of parsing `ls -t`, which breaks on unusual file names.
LATEST_LOG=""
for candidate in "${LOG_DIR}"/train_*.log; do
  # When nothing matches, the pattern is left as a literal string; skip it.
  [[ -f "$candidate" ]] || continue
  if [[ -z "$LATEST_LOG" || "$candidate" -nt "$LATEST_LOG" ]]; then
    LATEST_LOG="$candidate"
  fi
done

if [[ -z "$LATEST_LOG" ]]; then
  echo "❌ 未找到训练日志" >&2
  exit 1
fi

echo "日志文件: $LATEST_LOG"
echo ""

# Check whether the training process is alive.
echo "━━━ 训练进程状态 ━━━"
# pgrep -f matches against the full command line and never reports itself,
# so no `grep -v grep` filtering is needed.
TRAIN_PID=$(pgrep -f "tools/train.py" | head -n 1)
if [[ -z "$TRAIN_PID" ]]; then
  echo "❌ 训练进程未运行"
else
  echo "✅ 训练进程正在运行:"
  # Same column layout as `ps aux`; tail drops the header row so only the
  # process line is shown.
  ps u -p "$TRAIN_PID" | tail -n +2
fi

echo ""
echo "━━━ GPU状态 ━━━"
if command -v nvidia-smi >/dev/null 2>&1; then
  nvidia-smi --query-gpu=index,utilization.gpu,memory.used,memory.total --format=csv,noheader
else
  echo "⚠️ 未找到 nvidia-smi，跳过GPU状态" >&2
fi

echo ""
echo "━━━ 最新训练日志 (最后100行) ━━━"
tail -n 100 -- "$LATEST_LOG"

echo ""
echo "$RULE"
echo "监控命令:"
echo " 实时日志: tail -f $LATEST_LOG"
echo " 关键指标: tail -f $LATEST_LOG | grep -E 'Epoch|loss/map/divider'"
echo " GPU监控: nvidia-smi -l 5"
echo "$RULE"