bev-project/monitor_all_tasks.sh

65 lines
2.1 KiB
Bash
Raw Normal View History

#!/bin/bash
# Monitor all parallel tasks: prints the tail of the training log, the tail of
# the evaluation log, per-GPU usage and process counts.

# Every relative path used below (phase4a_stage1_*.log, eval_results/) is
# resolved against this directory, so stop early instead of silently reporting
# on whatever directory the script happened to be started from.
cd /workspace/bevfusion || {
  echo "❌ 无法进入目录 /workspace/bevfusion" >&2
  exit 1
}
echo "========================================================================"
echo "BEVFusion并行任务监控 - $(date '+%Y-%m-%d %H:%M:%S')"
echo "========================================================================"
echo ""
#######################################
# Print the status section for task 1 (stage-1 training).
# Globals:
#   TRAIN_LOG (written) - most recently modified phase4a_stage1_*.log in the
#                         current directory, or empty when none exists; it is
#                         read again by the quick-command hints printed at the
#                         end of the script.
# Outputs:
#   Section header, the chosen log name and up to two "Epoch [" lines taken
#   from the last three lines of that log, or a placeholder message.
#######################################
show_train_status() {
  local candidate epoch_lines

  echo "=== 任务1: Stage 1训练 (GPU 0-3) ==="
  echo "----------------------------------------"

  # Pick the newest log with a glob and -nt instead of parsing `ls -t` output.
  TRAIN_LOG=""
  for candidate in phase4a_stage1_*.log; do
    # An unmatched glob stays literal; skip it.
    [ -e "$candidate" ] || continue
    if [ -z "$TRAIN_LOG" ] || [ "$candidate" -nt "$TRAIN_LOG" ]; then
      TRAIN_LOG=$candidate
    fi
  done

  if [ -n "$TRAIN_LOG" ]; then
    echo "日志: $TRAIN_LOG"
    # A pipeline's exit status is that of its last command (tail), which
    # succeeds even when grep matched nothing, so `pipeline || echo ...` can
    # never reach the fallback. Test the captured output instead.
    epoch_lines=$(tail -3 "$TRAIN_LOG" | grep "Epoch \[" | tail -2)
    if [ -n "$epoch_lines" ]; then
      printf '%s\n' "$epoch_lines"
    else
      echo "等待训练日志..."
    fi
  else
    echo "❌ 未找到训练日志"
  fi
  echo ""
}
show_train_status
#######################################
# Print the status section for task 2 (epoch-23 evaluation).
# Globals:
#   EVAL_LOG (written) - most recently modified eval.log found below
#                        eval_results/, or empty when none exists; it is read
#                        again by the quick-command hints printed at the end
#                        of the script.
# Outputs:
#   Section header, the chosen log name and its last ten lines, or a
#   placeholder message when the log is missing or still empty.
#######################################
show_eval_status() {
  local candidate

  echo "=== 任务2: Epoch 23评估 (GPU 4-7) ==="
  echo "----------------------------------------"

  # `find | head -1` returns whichever file the filesystem lists first; choose
  # the newest one explicitly so repeated runs show the current evaluation.
  EVAL_LOG=""
  while IFS= read -r -d '' candidate; do
    if [ -z "$EVAL_LOG" ] || [ "$candidate" -nt "$EVAL_LOG" ]; then
      EVAL_LOG=$candidate
    fi
  done < <(find eval_results -name "eval.log" -print0 2>/dev/null)

  if [ -n "$EVAL_LOG" ]; then
    echo "日志: $EVAL_LOG"
    # tail exits 0 on an empty file, so `tail ... || echo ...` never shows the
    # fallback; check for a non-empty file instead.
    if [ -s "$EVAL_LOG" ]; then
      tail -10 "$EVAL_LOG"
    else
      echo "评估刚启动..."
    fi
  else
    echo "评估尚未开始或日志未生成"
  fi
  echo ""
}
show_eval_status
#######################################
# Turn nvidia-smi CSV rows into one human-readable line per GPU.
# Input (stdin): rows of
#   index, name, memory.used, memory.total, utilization.gpu, temperature.gpu
# separated by ", " (the --format=csv,noheader layout requested below).
# Outputs: one formatted line per row; GPUs with index below 4 are labelled as
#   training, all others as evaluation.
#######################################
format_gpu_rows() {
  awk -F', ' '{
    gpu = $1; mem = $3; total = $4; util = $5; temp = $6;
    gsub(" MiB", "", mem); gsub(" MiB", "", total);
    # "total + 0" forces a numeric test. When the total is 0 or not a number
    # (for example "[N/A]") skip the division, which would otherwise be a
    # fatal division by zero, and report 0 percent.
    pct = (total + 0 > 0) ? int(mem / total * 100) : 0;
    if (gpu < 4) task = "训练"; else task = "评估";
    printf "GPU %s (%s): %s/%s (%d%%) | 利用率:%s | 温度:%s\n", gpu, task, mem"MB", total"MB", pct, util, temp;
  }'
}

echo "=== GPU使用情况 ==="
echo "----------------------------------------"
if command -v nvidia-smi >/dev/null 2>&1; then
  nvidia-smi --query-gpu=index,name,memory.used,memory.total,utilization.gpu,temperature.gpu --format=csv,noheader \
    | format_gpu_rows
else
  # Report the missing tool explicitly and keep going with the other sections.
  echo "❌ 未找到 nvidia-smi" >&2
fi
echo ""
#######################################
# Count running processes whose full command line contains "python" followed
# by the given script name.
# Arguments: $1 - extended regular expression for the script name
# Outputs:   the number of matching processes (0 when none match or when
#            pgrep is unavailable)
#######################################
count_python_procs() {
  local script_regex=$1
  local count
  # pgrep -f matches against the whole command line and never reports itself,
  # which replaces the ps | grep | grep -v grep | wc -l chain. pgrep exits
  # non-zero when nothing matches, hence the "|| true".
  count=$(pgrep -fc "python.*${script_regex}" 2>/dev/null) || true
  printf '%s\n' "${count:-0}"
}

echo "=== 进程统计 ==="
echo "----------------------------------------"
# The dot is escaped so that only a literal ".py" suffix matches.
TRAIN_PROCS=$(count_python_procs 'train\.py')
EVAL_PROCS=$(count_python_procs 'test\.py')
echo "训练进程: $TRAIN_PROCS"
echo "评估进程: $EVAL_PROCS"
echo "总计: $((TRAIN_PROCS + EVAL_PROCS))"
echo ""
#######################################
# Print copy-and-paste commands for following the logs and watching the GPUs.
# Globals:
#   TRAIN_LOG (read) - training log path set earlier in the script; may be empty
#   EVAL_LOG  (read) - evaluation log path set earlier in the script; may be empty
# Outputs:
#   The hint section followed by the closing banner. When a log path is empty
#   a placeholder is printed instead of a broken "tail -f  | ..." command.
#######################################
print_quick_commands() {
  echo "=== 快速命令 ==="
  echo "----------------------------------------"
  if [ -n "${TRAIN_LOG:-}" ]; then
    echo "训练日志: tail -f $TRAIN_LOG | grep 'Epoch \\['"
  else
    echo "训练日志: (未找到训练日志)"
  fi
  if [ -n "${EVAL_LOG:-}" ]; then
    echo "评估日志: tail -f $EVAL_LOG"
  else
    echo "评估日志: (评估尚未开始或日志未生成)"
  fi
  echo "GPU监控: watch -n 5 nvidia-smi"
  echo ""
  echo "========================================================================"
}
print_quick_commands