#!/bin/bash
#
# Terminal dashboard that watches two jobs side by side: the Stage 1 training
# run and the Epoch 23 evaluation. Every refresh prints GPU status, the latest
# training progress line, the evaluation state and basic host resources.
#
# Usage:
#   <this script> [refresh_seconds]
#
# Environment:
#   SLEEP_INTERVAL  refresh period in seconds (default 30; the argument wins)
#   DISK_PATH       path whose filesystem usage is reported (default /workspace)
#
# Log files are looked up in the current directory:
#   phase4a_stage1_*.log   training log
#   eval_epoch23_*.log     evaluation log

# Only -u is enabled: most probes below (grep, pgrep, ...) legitimately return
# non-zero on "no match", so -e / pipefail would kill the dashboard.
set -u

SLEEP_INTERVAL=${1:-${SLEEP_INTERVAL:-30}}
# Fall back to the default when the override is not a plain number.
[[ $SLEEP_INTERVAL =~ ^[0-9]+([.][0-9]+)?$ ]] || SLEEP_INTERVAL=30
readonly SLEEP_INTERVAL

readonly DISK_PATH=${DISK_PATH:-/workspace}
# GPUs with an index up to this value are labelled as training GPUs, the rest
# as evaluation GPUs (matches the "GPU 0-3" / "GPU 4-7" section titles).
readonly TRAIN_GPU_LAST=3
# Command-line pattern (pgrep -f) that identifies the evaluation process.
readonly EVAL_PROCESS_PATTERN="test.py.*epoch_23"
# Length of the window used to measure CPU utilisation.
readonly CPU_SAMPLE_SECONDS=1
readonly RULE="========================================================================"

# Succeeds when $1 is a non-empty string of decimal digits.
is_uint() {
  [[ ${1-} =~ ^[0-9]+$ ]]
}

# Succeeds when $1 is an unsigned decimal number with an optional fraction.
is_number() {
  [[ ${1-} =~ ^[0-9]+([.][0-9]+)?$ ]]
}

# Prints $1 with leading and trailing whitespace removed.
trim() {
  local text=${1-}
  text=${text#"${text%%[![:space:]]*}"}
  text=${text%"${text##*[![:space:]]}"}
  printf '%s' "$text"
}

# Prints the most recently modified regular file among the arguments, or
# nothing when none of them exists. Callers pass an unquoted glob; a glob with
# no match stays literal and is rejected by the -f test.
newest_file() {
  local candidate newest=""
  for candidate in "$@"; do
    [[ -f "$candidate" ]] || continue
    if [[ -z "$newest" || "$candidate" -nt "$newest" ]]; then
      newest=$candidate
    fi
  done
  printf '%s' "$newest"
}

# Prints one line per GPU: index, role, utilisation, memory, temperature and
# power draw. Values nvidia-smi cannot report are shown as N/A.
show_gpu_status() {
  echo "--- GPU状态 ---"
  if ! command -v nvidia-smi >/dev/null 2>&1; then
    echo "未找到 nvidia-smi"
    return 0
  fi

  local idx util mem_used mem_total temp power task mem_pct power_fmt
  while IFS=',' read -r idx util mem_used mem_total temp power; do
    # Fields arrive padded ("0, 45, 1024, ..."); strip the whitespace.
    idx=${idx//[[:space:]]/}
    util=${util//[[:space:]]/}
    mem_used=${mem_used//[[:space:]]/}
    mem_total=${mem_total//[[:space:]]/}
    temp=${temp//[[:space:]]/}
    power=${power//[[:space:]]/}

    # Anything without a numeric index is not a GPU row (e.g. an error text).
    is_uint "$idx" || continue

    if (( 10#$idx <= TRAIN_GPU_LAST )); then
      task="[训练]"
    else
      task="[评估]"
    fi

    # Compute the percentage before the raw values may be replaced by N/A;
    # also avoids dividing by zero.
    if is_uint "$mem_used" && is_uint "$mem_total" && (( 10#$mem_total > 0 )); then
      mem_pct=$(( 10#$mem_used * 100 / 10#$mem_total ))
    else
      mem_pct="N/A"
    fi

    is_uint "$util" || util="N/A"
    is_uint "$mem_used" || mem_used="N/A"
    is_uint "$mem_total" || mem_total="N/A"
    is_uint "$temp" || temp="N/A"

    if is_number "$power"; then
      # Keep the raw value if printf cannot parse it (e.g. locale decimal mark).
      printf -v power_fmt '%.1f' "$power" 2>/dev/null || power_fmt=$power
    else
      power_fmt="N/A"
    fi

    printf 'GPU %d %-12s: 利用率%3s%% | 显存%5s/%5s MB (%2s%%) | %2s°C | %5sW\n' \
      "$(( 10#$idx ))" "$task" "$util" "$mem_used" "$mem_total" \
      "$mem_pct" "$temp" "$power_fmt"
  done < <(nvidia-smi \
    --query-gpu=index,utilization.gpu,memory.used,memory.total,temperature.gpu,power.draw \
    --format=csv,noheader,nounits 2>/dev/null)
}

# Prints the newest "Epoch [" line of the latest training log plus the epoch,
# iteration progress and loss parsed out of it.
show_training_status() {
  echo "--- Stage 1 训练 (GPU 0-3) ---"

  local log latest
  log=$(newest_file phase4a_stage1_*.log)
  if [[ ! -f "$log" ]]; then
    echo "未找到训练日志"
    return 0
  fi

  latest=$(tail -n 100 -- "$log" | grep 'Epoch \[' | tail -n 1)
  if [[ -z "$latest" ]]; then
    echo "等待训练数据..."
    return 0
  fi
  echo "最新进度: $latest"

  local re_epoch='Epoch \[([0-9]+)'
  local re_iter_kv='iter: ([0-9]+)/([0-9]+)'
  # NOTE(review): bracketed "Epoch [E][N/M]" is accepted as a second spelling
  # of the iteration counter - confirm against the real log format.
  local re_iter_bracket='Epoch \[[0-9]+\]\[([0-9]+)/([0-9]+)\]'
  # A "loss: " that is not the tail of a longer key such as "foo_loss: ".
  local re_loss_total='(^|[^[:alnum:]_/])loss: ([0-9][0-9.]*([eE][-+]?[0-9]+)?)'
  local re_loss_any='loss: ([0-9][0-9.]*([eE][-+]?[0-9]+)?)'
  local epoch="" iter="" total="" loss=""

  if [[ $latest =~ $re_epoch ]]; then
    epoch=${BASH_REMATCH[1]}
  fi
  if [[ $latest =~ $re_iter_kv || $latest =~ $re_iter_bracket ]]; then
    iter=${BASH_REMATCH[1]}
    total=${BASH_REMATCH[2]}
  fi
  # 10# forces base 10 so a zero-padded counter is not read as octal.
  if [[ -n "$epoch" && -n "$iter" && -n "$total" ]] && (( 10#$total > 0 )); then
    echo "Epoch $epoch 进度: $iter/$total ($(( 10#$iter * 100 / 10#$total ))%)"
  fi

  if [[ $latest =~ $re_loss_total ]]; then
    loss=${BASH_REMATCH[2]}
  elif [[ $latest =~ $re_loss_any ]]; then
    loss=${BASH_REMATCH[1]}
  fi
  if [[ -n "$loss" ]]; then
    echo "当前Loss: $loss"
  fi
}

# Prints the Unix start time encoded in a log name ending in
# "_YYYYmmdd_HHMMSS.log" (the naming used by the suggested launch command).
# Returns non-zero when the name has no such suffix or date cannot parse it.
log_start_epoch() {
  local name=${1##*/} re='_([0-9]{8})_([0-9]{6})\.log$' d t
  [[ $name =~ $re ]] || return 1
  d=${BASH_REMATCH[1]}
  t=${BASH_REMATCH[2]}
  date -d "${d:0:4}-${d:4:2}-${d:6:2} ${t:0:2}:${t:2:2}:${t:4:2}" +%s 2>/dev/null
}

# Prints the evaluation state: running (with stage, progress and elapsed
# time), finished (with result directory and summary), stopped, or not started.
show_eval_status() {
  echo "--- Epoch 23 评估 (GPU 4-7) ---"

  local log
  log=$(newest_file eval_epoch23_*.log)
  if [[ ! -f "$log" ]]; then
    echo "状态: ⏳ 未启动"
    echo "启动命令: nohup bash EVAL_EPOCH23_COMPLETE.sh > eval_epoch23_\$(date +%Y%m%d_%H%M%S).log 2>&1 &"
    return 0
  fi

  # -d ' ' keeps several matching PIDs on one line.
  local pids
  pids=$(pgrep -d ' ' -f "$EVAL_PROCESS_PATTERN")

  if [[ -n "$pids" ]]; then
    echo "状态: ✅ 运行中 (PID: $pids)"

    local stage progress_info
    stage=$(tail -n 50 -- "$log" | grep -E "阶段[123]" | tail -n 1)
    if [[ -n "$stage" ]]; then
      echo "$stage"
    fi

    progress_info=$(tail -n 100 -- "$log" | grep -E "(Evaluating|Processing)" | tail -n 1)
    if [[ -n "$progress_info" ]]; then
      echo "$progress_info"
    fi

    # The log's mtime moves with every write, so it cannot serve as a start
    # time. Prefer the timestamp in the log name; otherwise use the age of the
    # oldest matching process.
    local now start oldest elapsed=""
    now=$(date +%s)
    if start=$(log_start_epoch "$log"); then
      elapsed=$(( now - start ))
    else
      oldest=$(pgrep -o -f "$EVAL_PROCESS_PATTERN")
      if [[ -n "$oldest" ]]; then
        elapsed=$(ps -o etimes= -p "$oldest" 2>/dev/null)
        elapsed=${elapsed//[[:space:]]/}
      fi
    fi
    # Skipped when no start time is known or the clock went backwards.
    if is_uint "$elapsed"; then
      echo "运行时长: $(( elapsed / 3600 ))小时$(( (elapsed % 3600) / 60 ))分钟"
    fi
    return 0
  fi

  if ! grep -q "全部评估完成" "$log" 2>/dev/null; then
    echo "状态: ❌ 未运行"
    echo "启动命令: bash EVAL_EPOCH23_COMPLETE.sh"
    return 0
  fi

  echo "状态: ✅ 已完成"

  # Take everything after the last "输出目录:" marker; if that is not a
  # directory as a whole, retry with just its first word.
  local dir_line out_dir
  dir_line=$(grep -F -- "输出目录:" "$log" | tail -n 1)
  out_dir=$(trim "${dir_line##*输出目录:}")
  if [[ ! -d "$out_dir" ]]; then
    out_dir=${out_dir%%[[:space:]]*}
  fi

  if [[ -n "$out_dir" && -d "$out_dir" ]]; then
    echo "结果: $out_dir"
    if [[ -f "$out_dir/SUMMARY.txt" ]]; then
      echo ""
      echo "--- 评估结果摘要 ---"
      head -n 30 -- "$out_dir/SUMMARY.txt"
    fi
  fi
}

# Prints "<total> <idle>" jiffies from the aggregate "cpu" line of /proc/stat.
# Idle counts idle + iowait. Returns non-zero when the file is unreadable.
read_cpu_sample() {
  local label user nice system idle iowait irq softirq steal
  { read -r label user nice system idle iowait irq softirq steal _ < /proc/stat; } \
    2>/dev/null || return 1
  [[ $label == cpu ]] || return 1
  # Columns missing from the line are read as empty; default them to 0.
  printf '%s %s\n' \
    "$(( ${user:-0} + ${nice:-0} + ${system:-0} + ${idle:-0} + ${iowait:-0} + ${irq:-0} + ${softirq:-0} + ${steal:-0} ))" \
    "$(( ${idle:-0} + ${iowait:-0} ))"
}

# Prints the busy CPU share (one decimal, no % sign) measured between two
# /proc/stat samples taken CPU_SAMPLE_SECONDS apart. Returns non-zero when no
# measurement is possible.
cpu_usage_percent() {
  local first second total1 idle1 total2 idle2 dt busy_tenths
  first=$(read_cpu_sample) || return 1
  sleep "$CPU_SAMPLE_SECONDS"
  second=$(read_cpu_sample) || return 1
  read -r total1 idle1 <<<"$first"
  read -r total2 idle2 <<<"$second"

  dt=$(( total2 - total1 ))
  (( dt > 0 )) || return 1
  busy_tenths=$(( (dt - (idle2 - idle1)) * 1000 / dt ))
  if (( busy_tenths < 0 )); then
    busy_tenths=0
  fi
  printf '%d.%d' "$(( busy_tenths / 10 ))" "$(( busy_tenths % 10 ))"
}

# Prints CPU, memory and disk usage of the host.
show_system_resources() {
  echo "--- 系统资源 ---"

  local cpu mem disk
  if cpu=$(cpu_usage_percent); then
    echo "CPU使用率: ${cpu}%"
  else
    echo "CPU使用率: N/A"
  fi

  # LC_ALL=C keeps the "Mem:" label stable across locales.
  mem=$(LC_ALL=C free -h 2>/dev/null | awk '/^Mem:/ {print $3 " / " $2}')
  echo "内存使用: ${mem:-N/A}"

  # -P keeps each filesystem on one line even with long device names.
  disk=$(LC_ALL=C df -hP -- "$DISK_PATH" 2>/dev/null \
    | awk 'NR == 2 {print $3 " / " $2 " (" $5 ")"}')
  echo "磁盘使用: ${disk:-N/A}"
}

# Prints one complete dashboard frame to stdout.
render_frame() {
  echo "$RULE"
  echo "BEVFusion 并行任务监控"
  echo "更新时间: $(date '+%Y-%m-%d %H:%M:%S')"
  echo "$RULE"
  echo ""

  show_gpu_status
  echo ""

  show_training_status
  echo ""

  show_eval_status
  echo ""

  show_system_resources
  echo ""

  echo "$RULE"
  echo "按 Ctrl+C 退出监控"
  echo "刷新间隔: ${SLEEP_INTERVAL}秒"
  echo "$RULE"
}

# Redraws the dashboard forever, once every SLEEP_INTERVAL seconds.
main() {
  local frame
  clear
  while true; do
    # Build the whole frame first and only then clear, so the previous frame
    # stays visible while the probes are running.
    frame=$(render_frame)
    clear
    printf '%s\n' "$frame"
    sleep "$SLEEP_INTERVAL"
  done
}

main "$@"