bev-project/archive_scripts/monitor_eval_and_training.sh

159 lines
5.6 KiB
Bash
Executable File

#!/bin/bash
# Monitor the Epoch 23 evaluation and the Stage 1 training side by side.

# Seconds between dashboard refreshes. Defaults to 30; export SLEEP_INTERVAL
# before launching the script to pick a different cadence.
SLEEP_INTERVAL="${SLEEP_INTERVAL:-30}"
# `sleep` returns immediately when handed a malformed interval, which would
# turn the refresh loop into a busy spin, so anything that is not a positive
# integer reverts to the default.
if ! [[ "$SLEEP_INTERVAL" =~ ^[1-9][0-9]*$ ]]; then
  SLEEP_INTERVAL=30
fi
clear
while true; do
# Dashboard header: wipe the previous frame, then print the title and the
# time at which this frame was rendered, followed by a blank line.
clear
header_rule="========================================================================"
printf '%s\n' \
  "$header_rule" \
  "BEVFusion 并行任务监控" \
  "更新时间: $(date '+%Y-%m-%d %H:%M:%S')" \
  "$header_rule" \
  ""
# GPU status: one formatted row per GPU reported by nvidia-smi.
echo "--- GPU状态 ---"
nvidia-smi --query-gpu=index,name,utilization.gpu,memory.used,memory.total,temperature.gpu,power.draw \
  --format=csv,noheader,nounits |
  while IFS=',' read -r idx name util mem_used mem_total temp power; do
    # The CSV fields are padded with blanks after each comma. Strip them with
    # parameter expansion rather than forking `echo | xargs` six times per row.
    idx=${idx//[[:space:]]/}
    util=${util//[[:space:]]/}
    mem_used=${mem_used//[[:space:]]/}
    mem_total=${mem_total//[[:space:]]/}
    temp=${temp//[[:space:]]/}
    power=${power//[[:space:]]/}

    # A row without a numeric index cannot be labelled; skip it instead of
    # letting the integer comparison below fail.
    [[ "$idx" =~ ^[0-9]+$ ]] || continue

    # Label each GPU by its role: indices 0-3 are shown as training, the rest
    # as evaluation.
    if (( idx <= 3 )); then
      task="[训练]"
    else
      task="[评估]"
    fi

    # Memory percentage is only defined for numeric readings with a non-zero
    # total; otherwise the integer division would abort the row.
    if [[ "$mem_used" =~ ^[0-9]+$ && "$mem_total" =~ ^[0-9]+$ ]] && (( mem_total > 0 )); then
      mem_pct=$(( mem_used * 100 / mem_total ))
    else
      mem_pct="N/A"
    fi

    # nvidia-smi can report placeholders such as [N/A] for metrics a device
    # does not expose. Show "N/A" for those instead of feeding them to a
    # numeric printf conversion, which would error out.
    [[ "$util" =~ ^[0-9]+$ ]] || util="N/A"
    [[ "$mem_used" =~ ^[0-9]+$ ]] || mem_used="N/A"
    [[ "$mem_total" =~ ^[0-9]+$ ]] || mem_total="N/A"
    [[ "$temp" =~ ^[0-9]+$ ]] || temp="N/A"
    if [[ "$power" =~ ^[0-9]+([.][0-9]+)?$ ]]; then
      # Keep the original one-decimal rendering for real readings.
      printf -v power '%.1f' "$power"
    else
      power="N/A"
    fi

    # Formatted output. %s with the same widths renders numeric values
    # exactly as the former %d/%f conversions did.
    printf "GPU %d %-12s: 利用率%3s%% | 显存%5s/%5s MB (%2s%%) | %2s°C | %5sW\n" \
      "$idx" "$task" "$util" "$mem_used" "$mem_total" "$mem_pct" "$temp" "$power"
  done
echo ""
# Stage 1 training status, read from the most recently modified
# phase4a_stage1_*.log in the current directory.
echo "--- Stage 1 训练 (GPU 0-3) ---"
TRAIN_LOG=$(ls -t phase4a_stage1_*.log 2>/dev/null | head -1)
if [ -f "$TRAIN_LOG" ]; then
  # Latest progress line among the last 100 lines of the log.
  LATEST_TRAIN=$(tail -100 "$TRAIN_LOG" | grep "Epoch \[" | tail -1)
  if [ -n "$LATEST_TRAIN" ]; then
    echo "最新进度: $LATEST_TRAIN"

    # Extract epoch and iteration counters. grep -o prints every match on its
    # own line, so keep only the first one; a multi-line value would break
    # the arithmetic below.
    EPOCH=$(grep -oP 'Epoch \[\K\d+' <<<"$LATEST_TRAIN" | head -n 1)
    ITER=$(grep -oP 'iter: \K\d+' <<<"$LATEST_TRAIN" | head -n 1)
    TOTAL_ITER=$(grep -oP 'iter: \d+/\K\d+' <<<"$LATEST_TRAIN" | head -n 1)

    # Fallback for lines that carry the counters as "Epoch [E][i/N]" with no
    # "iter:" token (the layout of mmcv-style text logs).
    # NOTE(review): confirm which of the two layouts the trainer emits.
    if [[ -z "$ITER" || -z "$TOTAL_ITER" ]]; then
      train_iter_re='Epoch \[[0-9]+\]\[([0-9]+)/([0-9]+)\]'
      if [[ "$LATEST_TRAIN" =~ $train_iter_re ]]; then
        ITER=${BASH_REMATCH[1]}
        TOTAL_ITER=${BASH_REMATCH[2]}
      fi
    fi

    # 10# forces base 10 so zero-padded counters are not read as octal; the
    # > 0 guard avoids a division-by-zero abort.
    if [[ -n "$EPOCH" && -n "$ITER" && -n "$TOTAL_ITER" ]] && (( 10#$TOTAL_ITER > 0 )); then
      PROGRESS=$(( 10#$ITER * 100 / 10#$TOTAL_ITER ))
      echo "Epoch $EPOCH 进度: $ITER/$TOTAL_ITER ($PROGRESS%)"
    fi

    # Extract the loss value.
    LOSS=$(echo "$LATEST_TRAIN" | grep -oP 'loss: \K[\d\.]+')
    if [ -n "$LOSS" ]; then
      echo "当前Loss: $LOSS"
    fi
  else
    # The log exists but has no progress line in its tail yet.
    echo "等待训练数据..."
  fi
else
  echo "未找到训练日志"
fi
echo ""
# Epoch 23 evaluation status, read from the most recently modified
# eval_epoch23_*.log in the current directory.
echo "--- Epoch 23 评估 (GPU 4-7) ---"
EVAL_LOG=$(ls -t eval_epoch23_*.log 2>/dev/null | head -1)
if [ -f "$EVAL_LOG" ]; then
  # Check whether the evaluation is running. pgrep prints one PID per line by
  # default; -d ' ' keeps every match on a single line so the status row
  # stays intact when several processes match.
  EVAL_PID=$(pgrep -d ' ' -f "test.py.*epoch_23")
  if [ -n "$EVAL_PID" ]; then
    echo "状态: ✅ 运行中 (PID: $EVAL_PID)"

    # Current stage, taken from the tail of the log.
    CURRENT_STAGE=$(tail -50 "$EVAL_LOG" | grep -E "阶段[123]" | tail -1)
    if [ -n "$CURRENT_STAGE" ]; then
      echo "$CURRENT_STAGE"
    fi

    # Latest progress line.
    PROGRESS_INFO=$(tail -100 "$EVAL_LOG" | grep -E "(Evaluating|Processing)" | tail -1)
    if [ -n "$PROGRESS_INFO" ]; then
      echo "$PROGRESS_INFO"
    fi

    # Elapsed time. The log's mtime moves forward on every write, so it
    # cannot serve as the start time. Prefer the launch timestamp embedded in
    # the log name (the launch command printed further down names the file
    # eval_epoch23_<YYYYmmdd_HHMMSS>.log).
    EVAL_NOW=$(date +%s)
    EVAL_START=""
    eval_stamp_re='_([0-9]{4})([0-9]{2})([0-9]{2})_([0-9]{2})([0-9]{2})([0-9]{2})\.log$'
    if [[ "$EVAL_LOG" =~ $eval_stamp_re ]]; then
      EVAL_START=$(date -d "${BASH_REMATCH[1]}-${BASH_REMATCH[2]}-${BASH_REMATCH[3]} ${BASH_REMATCH[4]}:${BASH_REMATCH[5]}:${BASH_REMATCH[6]}" +%s 2>/dev/null)
    fi
    if [[ "$EVAL_START" =~ ^[0-9]+$ ]] && (( EVAL_START <= EVAL_NOW )); then
      EVAL_DURATION=$((EVAL_NOW - EVAL_START))
    else
      # No usable timestamp in the name: fall back to the elapsed seconds of
      # the longest-running matching process.
      EVAL_DURATION=$(ps -o etimes= -p "${EVAL_PID// /,}" 2>/dev/null | sort -n | tail -n 1 | tr -d '[:space:]')
    fi
    # Print the duration only when one of the two sources produced a number.
    if [[ "$EVAL_DURATION" =~ ^[0-9]+$ ]]; then
      EVAL_HOURS=$((EVAL_DURATION / 3600))
      EVAL_MINS=$(((EVAL_DURATION % 3600) / 60))
      echo "运行时长: ${EVAL_HOURS}小时${EVAL_MINS}分钟"
    fi
  else
    # No matching process: either the run finished or it is not running.
    if grep -q "全部评估完成" "$EVAL_LOG" 2>/dev/null; then
      echo "状态: ✅ 已完成"
      # Result directory: second whitespace-separated field of the last
      # "输出目录:" line in the log.
      EVAL_DIR=$(grep "输出目录:" "$EVAL_LOG" | tail -1 | awk '{print $2}')
      if [ -d "$EVAL_DIR" ]; then
        echo "结果: $EVAL_DIR"
        # Show the first 30 lines of the summary when it exists.
        if [ -f "$EVAL_DIR/SUMMARY.txt" ]; then
          echo ""
          echo "--- 评估结果摘要 ---"
          head -30 "$EVAL_DIR/SUMMARY.txt"
        fi
      fi
    else
      echo "状态: ❌ 未运行"
      echo "启动命令: bash EVAL_EPOCH23_COMPLETE.sh"
    fi
  fi
else
  # No evaluation log at all: print the command that starts a run.
  echo "状态: ⏳ 未启动"
  echo "启动命令: nohup bash EVAL_EPOCH23_COMPLETE.sh > eval_epoch23_\$(date +%Y%m%d_%H%M%S).log 2>&1 &"
fi
echo ""
# System resources: CPU, memory and disk usage of the host.
echo "--- 系统资源 ---"

# CPU: first number after the colon on top's "Cpu(s)" summary line. Parsing
# relative to the colon (rather than taking the second whitespace-separated
# field) also works when top prints the value flush against the colon, as in
# "%Cpu(s):100.0 us". Everything from the first blank or '%' on is dropped.
CPU_USAGE=$(top -bn1 | awk '
  !seen && /Cpu\(s\)/ {
    sub(/^[^:]*:[ \t]*/, "")
    sub(/[ \t%].*$/, "")
    print
    seen = 1
  }')
echo "CPU使用率: ${CPU_USAGE}%"

# Memory: the "Mem" row of `free -h` is "<label> <total> <used> ...".
MEM_INFO=$(free -h | grep Mem)
MEM_TOTAL=""
MEM_USED=""
read -r _ MEM_TOTAL MEM_USED _ <<<"$MEM_INFO"
echo "内存使用: $MEM_USED / $MEM_TOTAL"

# Disk: usage of the filesystem holding /workspace. -P keeps each filesystem
# on a single output row, so the last line is always the complete data row:
# "<fs> <size> <used> <avail> <use%> <mount>".
DISK_INFO=$(df -hP /workspace | tail -1)
DISK_TOTAL=""
DISK_USED=""
DISK_PERCENT=""
read -r _ DISK_TOTAL DISK_USED _ DISK_PERCENT _ <<<"$DISK_INFO"
echo "磁盘使用: $DISK_USED / $DISK_TOTAL ($DISK_PERCENT)"
echo ""
# Dashboard footer: exit hint and the configured refresh interval, then wait
# before the next frame is drawn.
footer_rule="========================================================================"
printf '%s\n' \
  "$footer_rule" \
  "按 Ctrl+C 退出监控" \
  "刷新间隔: ${SLEEP_INTERVAL}" \
  "$footer_rule"
sleep "$SLEEP_INTERVAL"
done