#!/bin/bash
#
# Live terminal dashboard for two concurrently running BEVFusion jobs:
# Stage 1 training and the Epoch 23 evaluation.
#
# Every SLEEP_INTERVAL seconds the screen is cleared and redrawn with:
#   * per-GPU utilisation, memory, temperature and power (via nvidia-smi)
#   * the latest progress line from the newest phase4a_stage1_*.log
#   * the state of the evaluation, from the newest eval_epoch23_*.log and pgrep
#   * CPU, memory and disk usage of the host
#
# Usage:
#   bash <this script>                    # refresh every 30 seconds
#   SLEEP_INTERVAL=10 bash <this script>  # custom refresh interval
#
# Log files are looked up relative to the current working directory.
# Stop with Ctrl+C.
#
# `set -e` is deliberately not enabled: most probes below (grep, pgrep, ...)
# return non-zero for "nothing found", which must not kill the monitor.

# Refresh interval in seconds; overridable from the environment.
SLEEP_INTERVAL=${SLEEP_INTERVAL:-30}
# An invalid override would make `sleep` fail and the loop spin, so fall back.
[[ $SLEEP_INTERVAL =~ ^[0-9]+([.][0-9]+)?$ ]] || SLEEP_INTERVAL=30
readonly SLEEP_INTERVAL

# GPUs with an index up to this value are labelled as training GPUs,
# all others as evaluation GPUs.
readonly TRAIN_GPU_MAX_INDEX=3

# Filesystem whose usage is reported in the system resources section.
readonly WORKSPACE_DIR=/workspace

readonly RULE="========================================================================"

#######################################
# Test whether a string is a non-negative decimal integer.
# Arguments: $1 - string to test
# Returns:   0 if $1 consists only of digits, 1 otherwise
#######################################
is_uint() {
  [[ $1 =~ ^[0-9]+$ ]]
}

#######################################
# Store an integer, right-aligned to a width, in a variable.
# Stores a right-aligned "N/A" instead when the value is not an integer.
# Arguments: $1 - name of the variable to assign
#            $2 - field width
#            $3 - value to format
#######################################
fmt_int() {
  if is_uint "$3"; then
    # 10# forces base 10 so a leading zero is not read as octal.
    printf -v "$1" '%*d' "$2" "$((10#$3))"
  else
    printf -v "$1" '%*s' "$2" 'N/A'
  fi
}

#######################################
# Store a number with one decimal place, right-aligned to a width, in a
# variable. Stores a right-aligned "N/A" when the value is not numeric.
# Arguments: $1 - name of the variable to assign
#            $2 - field width
#            $3 - value to format
#######################################
fmt_float() {
  if [[ $3 =~ ^[0-9]+([.][0-9]+)?$ ]]; then
    printf -v "$1" '%*.1f' "$2" "$3"
  else
    printf -v "$1" '%*s' "$2" 'N/A'
  fi
}

#######################################
# Print the most recently modified regular file among the arguments.
# Intended to be called with an unquoted glob; an unmatched glob arrives as
# the literal pattern and is filtered out by the -f test.
# Arguments: candidate paths
# Outputs:   the newest file, or nothing when no argument is a regular file
#######################################
newest_file() {
  local candidate newest=''
  for candidate in "$@"; do
    [[ -f $candidate ]] || continue
    if [[ -z $newest || $candidate -nt $newest ]]; then
      newest=$candidate
    fi
  done
  printf '%s' "$newest"
}

#######################################
# Print the largest elapsed running time, in seconds, among the given PIDs.
# Arguments: one or more process IDs
# Outputs:   elapsed seconds, or nothing if ps reports no usable value
#######################################
max_elapsed_seconds() {
  local pid_list secs longest=''
  pid_list=$(IFS=,; printf '%s' "$*")
  while read -r secs; do
    is_uint "$secs" || continue
    if [[ -z $longest ]] || ((10#$secs > 10#$longest)); then
      longest=$secs
    fi
  done < <(ps -o etimes= -p "$pid_list" 2>/dev/null)
  printf '%s' "$longest"
}

#######################################
# Print one status line per GPU as reported by nvidia-smi.
# Fields that nvidia-smi reports as non-numeric are shown as N/A.
#######################################
show_gpu_status() {
  echo "--- GPU状态 ---"
  if ! command -v nvidia-smi >/dev/null 2>&1; then
    echo "nvidia-smi 不可用"
    return 0
  fi

  local idx _name util mem_used mem_total temp power
  local task mem_pct util_f used_f total_f pct_f temp_f power_f
  nvidia-smi \
    --query-gpu=index,name,utilization.gpu,memory.used,memory.total,temperature.gpu,power.draw \
    --format=csv,noheader,nounits \
    | while IFS=',' read -r idx _name util mem_used mem_total temp power; do
      # Strip the padding nvidia-smi puts around each CSV field.
      idx=${idx//[[:space:]]/}
      util=${util//[[:space:]]/}
      mem_used=${mem_used//[[:space:]]/}
      mem_total=${mem_total//[[:space:]]/}
      temp=${temp//[[:space:]]/}
      power=${power//[[:space:]]/}

      # A row without a numeric index cannot be labelled; skip it.
      is_uint "$idx" || continue

      # Label the GPU by which job it is assigned to.
      if ((10#$idx <= TRAIN_GPU_MAX_INDEX)); then
        task="[训练]"
      else
        task="[评估]"
      fi

      # Memory percentage only when both values are usable and total > 0.
      mem_pct=''
      if is_uint "$mem_used" && is_uint "$mem_total" && ((10#$mem_total > 0)); then
        mem_pct=$((10#$mem_used * 100 / 10#$mem_total))
      fi

      fmt_int util_f 3 "$util"
      fmt_int used_f 5 "$mem_used"
      fmt_int total_f 5 "$mem_total"
      fmt_int pct_f 2 "$mem_pct"
      fmt_int temp_f 2 "$temp"
      fmt_float power_f 5 "$power"

      printf 'GPU %d %-12s: 利用率%s%% | 显存%s/%s MB (%s%%) | %s°C | %sW\n' \
        "$((10#$idx))" "$task" "$util_f" "$used_f" "$total_f" "$pct_f" \
        "$temp_f" "$power_f"
    done
}

#######################################
# Print the latest progress of Stage 1 training, taken from the last
# "Epoch [" line within the final 100 lines of the newest training log.
#######################################
show_training_status() {
  echo "--- Stage 1 训练 (GPU 0-3) ---"

  local train_log latest epoch iter total_iter loss
  train_log=$(newest_file phase4a_stage1_*.log)
  if [[ ! -f $train_log ]]; then
    echo "未找到训练日志"
    return 0
  fi

  latest=$(tail -n 100 -- "$train_log" | grep "Epoch \[" | tail -n 1)
  if [[ -z $latest ]]; then
    echo "等待训练数据..."
    return 0
  fi
  printf '%s\n' "最新进度: $latest"

  # Extract epoch and iteration counters; keep only the first match of each
  # so the arithmetic below always sees a single number.
  epoch=$(grep -oP 'Epoch \[\K\d+' <<<"$latest" | head -n 1)
  iter=$(grep -oP 'iter: \K\d+' <<<"$latest" | head -n 1)
  total_iter=$(grep -oP 'iter: \d+/\K\d+' <<<"$latest" | head -n 1)
  if is_uint "$epoch" && is_uint "$iter" && is_uint "$total_iter" \
    && ((10#$total_iter > 0)); then
    echo "Epoch $epoch 进度: $iter/$total_iter ($((10#$iter * 100 / 10#$total_iter))%)"
  fi

  loss=$(grep -oP 'loss: \K[\d\.]+' <<<"$latest")
  if [[ -n $loss ]]; then
    printf '%s\n' "当前Loss: $loss"
  fi
}

#######################################
# Print the state of the Epoch 23 evaluation: running (with PIDs, current
# stage, latest progress line and elapsed time), finished (with the result
# directory and the head of its SUMMARY.txt), stopped, or never started.
#######################################
show_eval_status() {
  echo "--- Epoch 23 评估 (GPU 4-7) ---"

  local eval_log
  eval_log=$(newest_file eval_epoch23_*.log)
  if [[ ! -f $eval_log ]]; then
    echo "状态: ⏳ 未启动"
    echo "启动命令: nohup bash EVAL_EPOCH23_COMPLETE.sh > eval_epoch23_\$(date +%Y%m%d_%H%M%S).log 2>&1 &"
    return 0
  fi

  # pgrep may report several processes; collect them all so the status
  # stays on one line.
  local -a eval_pids=()
  mapfile -t eval_pids < <(pgrep -f "test.py.*epoch_23")

  if ((${#eval_pids[@]} > 0)); then
    echo "状态: ✅ 运行中 (PID: ${eval_pids[*]})"

    local current_stage progress_info elapsed
    current_stage=$(tail -n 50 -- "$eval_log" | grep -E "阶段[123]" | tail -n 1)
    if [[ -n $current_stage ]]; then
      printf '%s\n' "$current_stage"
    fi

    progress_info=$(tail -n 100 -- "$eval_log" \
      | grep -E "(Evaluating|Processing)" | tail -n 1)
    if [[ -n $progress_info ]]; then
      printf '%s\n' "$progress_info"
    fi

    # Use the elapsed time of the longest-running matching process. The
    # log's modification time is not usable here: it moves forward with
    # every line the job writes.
    elapsed=$(max_elapsed_seconds "${eval_pids[@]}")
    if is_uint "$elapsed"; then
      echo "运行时长: $((10#$elapsed / 3600))小时$(((10#$elapsed % 3600) / 60))分钟"
    fi
    return 0
  fi

  # Not running: either it finished or it is simply stopped.
  if grep -q "全部评估完成" "$eval_log" 2>/dev/null; then
    echo "状态: ✅ 已完成"

    local eval_dir
    eval_dir=$(grep "输出目录:" "$eval_log" | tail -n 1 | awk '{print $2}')
    if [[ -d $eval_dir ]]; then
      printf '%s\n' "结果: $eval_dir"
      # Show the summary if one exists.
      if [[ -f $eval_dir/SUMMARY.txt ]]; then
        echo ""
        echo "--- 评估结果摘要 ---"
        head -n 30 -- "$eval_dir/SUMMARY.txt"
      fi
    fi
  else
    echo "状态: ❌ 未运行"
    echo "启动命令: bash EVAL_EPOCH23_COMPLETE.sh"
  fi
}

#######################################
# Print CPU, memory and disk usage of the host.
#######################################
show_system_resources() {
  echo "--- 系统资源 ---"

  # NOTE(review): this is the second field of top's "Cpu(s)" line, which on
  # procps top appears to be the user-space share only - confirm whether
  # total usage was intended.
  local cpu_usage
  cpu_usage=$(top -bn1 | grep "Cpu(s)" | awk '{print $2}' | cut -d'%' -f1)
  echo "CPU使用率: ${cpu_usage}%"

  local _label mem_total mem_used _rest
  read -r _label mem_total mem_used _rest < <(free -h | grep Mem)
  echo "内存使用: $mem_used / $mem_total"

  # -P keeps each filesystem on a single line even for long device names.
  # stderr is silenced on purpose: a missing mount is reported as N/A below.
  local _fs disk_total disk_used _avail disk_percent
  read -r _fs disk_total disk_used _avail disk_percent _rest \
    < <(df -hP "$WORKSPACE_DIR" 2>/dev/null | tail -n 1)
  if [[ -n $disk_total ]]; then
    echo "磁盘使用: $disk_used / $disk_total ($disk_percent)"
  else
    echo "磁盘使用: N/A"
  fi
}

#######################################
# Redraw the dashboard forever, sleeping SLEEP_INTERVAL seconds between
# refreshes.
#######################################
main() {
  while true; do
    clear
    echo "$RULE"
    echo "BEVFusion 并行任务监控"
    echo "更新时间: $(date '+%Y-%m-%d %H:%M:%S')"
    echo "$RULE"
    echo ""

    show_gpu_status
    echo ""

    show_training_status
    echo ""

    show_eval_status
    echo ""

    show_system_resources
    echo ""

    echo "$RULE"
    echo "按 Ctrl+C 退出监控"
    echo "刷新间隔: ${SLEEP_INTERVAL}秒"
    echo "$RULE"

    sleep "$SLEEP_INTERVAL"
  done
}

main "$@"