#!/bin/bash
# Phase 4A training monitor.
#
# Prints a one-shot status report: training process, GPU state, latest
# training progress, saved checkpoints, disk usage and errors in the log.
#
# Usage: bash monitor_phase4a.sh
#
# Optional environment overrides (defaults are the original fixed values):
#   PHASE4A_LOG_FILE      training log to inspect   (phase4a_bev2x.log)
#   PHASE4A_RUN_DIR       checkpoint directory      (runs/phase4a_bev2x)
#   PHASE4A_TOTAL_EPOCHS  epoch total shown in "N/M" (20)
#   PHASE4A_DISK_PATH     path passed to df         (/workspace)
#
# Requires bash 4+ (mapfile) and GNU coreutils (stat -c, numfmt).
#
# `set -e` is deliberately not used: every section is best-effort and a
# failure in one (e.g. no GPU tooling) must not hide the others.
set -u

readonly LOG_FILE="${PHASE4A_LOG_FILE:-phase4a_bev2x.log}"
readonly RUN_DIR="${PHASE4A_RUN_DIR:-runs/phase4a_bev2x}"
readonly TOTAL_EPOCHS="${PHASE4A_TOTAL_EPOCHS:-20}"
readonly DISK_PATH="${PHASE4A_DISK_PATH:-/workspace}"
readonly TRAIN_CMD="python tools/train.py"
readonly RULE="========================================================================"

#######################################
# Report whether the training command is running and list up to 3 matches.
# Globals:   TRAIN_CMD (read)
# Outputs:   status line plus matching `ps aux` rows on stdout
#######################################
report_training_process() {
  local snapshot
  local -a procs=()

  # Take one process snapshot *before* any grep is started, so the count and
  # the listing always agree and the filter never sees itself.
  snapshot=$(ps aux 2>/dev/null)
  mapfile -t procs < <(grep -F -- "$TRAIN_CMD" <<<"$snapshot")

  if (( ${#procs[@]} > 0 )); then
    echo "✅ 训练进程运行中 (${#procs[@]} 个进程)"
    printf '%s\n' "${procs[@]:0:3}"
  else
    echo "❌ 训练进程未运行"
  fi
}

#######################################
# Print utilisation, memory and temperature for each GPU via nvidia-smi.
# Outputs:   one line per GPU, or a warning when nvidia-smi is not installed
#######################################
report_gpus() {
  if ! command -v nvidia-smi >/dev/null 2>&1; then
    echo " ⚠️ nvidia-smi 不可用"
    return 0
  fi
  nvidia-smi \
    --query-gpu=index,utilization.gpu,memory.used,memory.total,temperature.gpu \
    --format=csv,noheader,nounits \
    | awk -F', ' '{printf " GPU %s: 利用率=%s%%, 显存=%s/%sMB, 温度=%s°C\n", $1, $2, $3, $4, $5}'
}

#######################################
# Parse one "Epoch [E][I/T] ... loss: X" log line and print a summary.
# Globals:   TOTAL_EPOCHS (read)
# Arguments: $1 - the log line
# Outputs:   epoch, progress, loss, learning rate and ETA; nothing when the
#            line carries no "Epoch [N]" marker
#######################################
print_progress_summary() {
  local line=$1
  local re_epoch='Epoch \[([0-9]+)\]'
  local re_iter='\]\[([0-9]+)/([0-9]+)\]'
  # Only the standalone "loss:" key, not keys that merely end in it
  # (e.g. "aux_loss:").
  local re_loss='(^|[^[:alnum:]_/.])loss: ([0-9.]+)'
  local re_lr='lr: ([0-9.eE+-]+)'
  # The ETA may itself contain a comma ("2 days, 3:04:05"); keep the day
  # part instead of cutting at the first comma.
  # NOTE(review): assumes a Python timedelta-style ETA string - confirm
  # against the trainer's logger.
  local re_eta='eta: (([0-9]+ days?, )?[^,]+)'
  local epoch iter='' total='' loss='' lr='' eta='' pct

  [[ $line =~ $re_epoch ]] || return 0
  epoch=${BASH_REMATCH[1]}
  if [[ $line =~ $re_iter ]]; then
    iter=${BASH_REMATCH[1]}
    total=${BASH_REMATCH[2]}
  fi
  if [[ $line =~ $re_loss ]]; then loss=${BASH_REMATCH[2]}; fi
  if [[ $line =~ $re_lr ]]; then lr=${BASH_REMATCH[1]}; fi
  if [[ $line =~ $re_eta ]]; then eta=${BASH_REMATCH[1]}; fi

  echo " 当前Epoch: $epoch/$TOTAL_EPOCHS"
  # Skip the percentage when the iteration counter is missing or zero; the
  # values go to awk via -v so they are never evaluated as awk source.
  if [[ -n $total ]] && (( 10#$total > 0 )); then
    pct=$(awk -v i="$iter" -v t="$total" 'BEGIN { printf "%.1f", (i / t) * 100 }')
    echo " 进度: $iter/$total ($pct%)"
  fi
  echo " Loss: $loss"
  echo " 学习率: $lr"
  echo " 预计剩余: $eta"
}

#######################################
# Show the most recent training-progress lines and a parsed summary.
# Arguments: $1 - path to the training log
# Outputs:   up to 5 recent progress lines, a blank line, then the summary
#######################################
report_progress() {
  local log=$1
  local recent

  if [[ ! -f $log ]]; then
    echo " ⚠️ 日志文件不存在"
    return 0
  fi

  # One window (last 100 lines) feeds both the listing and the summary so
  # they cannot disagree.
  recent=$(tail -n 100 -- "$log" | grep -E 'Epoch \[[0-9]+\].*loss:')
  if [[ -n $recent ]]; then
    tail -n 5 <<<"$recent"
  fi
  echo ""

  if [[ -n $recent ]]; then
    # Last line of the window is the latest status.
    print_progress_summary "${recent##*$'\n'}"
  fi
}

#######################################
# Count *.pth checkpoints and list the 5 most recently modified with sizes.
# Arguments: $1 - checkpoint directory
# Outputs:   count line and, when non-empty, "path size" rows (newest first)
#######################################
report_checkpoints() {
  local dir=$1
  local -a ckpts=()
  local f size path human

  if [[ ! -d $dir ]]; then
    echo " ⚠️ 输出目录不存在"
    return 0
  fi

  for f in "$dir"/*.pth; do
    # An unmatched glob stays literal; keep real entries (and symlinks) only.
    if [[ -e $f || -L $f ]]; then
      ckpts+=("$f")
    fi
  done

  echo " 已保存checkpoint: ${#ckpts[@]} 个"
  if (( ${#ckpts[@]} > 0 )); then
    echo " 最新5个:"
    # "mtime size name" rows instead of parsing `ls -l` columns, so names
    # containing spaces survive intact.
    stat -c '%Y %s %n' -- "${ckpts[@]}" 2>/dev/null \
      | sort -k1,1nr \
      | head -n 5 \
      | while read -r _ size path; do
          human=$(numfmt --to=iec "$size" 2>/dev/null) || human=$size
          printf ' %s %s\n' "$path" "$human"
        done
  fi
}

#######################################
# Print used / total / percentage for the filesystem holding a path.
# Arguments: $1 - path to inspect
#######################################
report_disk() {
  local path=$1

  if [[ ! -e $path ]]; then
    echo " ⚠️ 路径不存在: $path"
    return 0
  fi
  # -P keeps each filesystem on one line even with long device names.
  df -hP -- "$path" \
    | tail -n 1 \
    | awk '{printf " 使用: %s / %s (%s)\n", $3, $2, $5}'
}

#######################################
# Count error-like log lines and show the last three.
# Lines mentioning ErrorAnalysisHook are ignored.
# Arguments: $1 - path to the training log
#######################################
report_errors() {
  local log=$1
  local -a hits=()
  local start=0

  if [[ ! -f $log ]]; then
    echo " ⚠️ 日志文件不存在"
    return 0
  fi

  # One pattern drives both the count and the listing.
  mapfile -t hits < <(
    grep -iE 'error|exception|traceback' -- "$log" \
      | grep -v 'ErrorAnalysisHook'
  )

  if (( ${#hits[@]} > 0 )); then
    echo " ⚠️ 发现 ${#hits[@]} 个错误"
    echo " 最新错误:"
    if (( ${#hits[@]} > 3 )); then
      start=$(( ${#hits[@]} - 3 ))
    fi
    printf '%s\n' "${hits[@]:start}"
  else
    echo " ✅ 无错误"
  fi
}

#######################################
# Entry point: print the full report.
# Globals:   LOG_FILE, RUN_DIR, DISK_PATH, RULE (read)
#######################################
main() {
  echo "$RULE"
  echo "Phase 4A BEV 2x 训练监控"
  echo "更新时间: $(date '+%Y-%m-%d %H:%M:%S')"
  echo "$RULE"
  echo ""

  # 1. Training process
  echo "【1. 训练进程状态】"
  report_training_process
  echo ""

  # 2. GPU status
  echo "【2. GPU状态】"
  report_gpus
  echo ""

  # 3. Latest training log
  echo "【3. 最新训练进度】"
  report_progress "$LOG_FILE"
  echo ""

  # 4. Checkpoints
  echo "【4. Checkpoint状态】"
  report_checkpoints "$RUN_DIR"
  echo ""

  # 5. Disk space
  echo "【5. 磁盘空间】"
  report_disk "$DISK_PATH"
  echo ""

  # 6. Error scan
  echo "【6. 错误检查】"
  report_errors "$LOG_FILE"
  echo ""

  echo "$RULE"
  echo "更新: 每5分钟自动刷新"
  echo "手动刷新: bash monitor_phase4a.sh"
  echo "实时日志: tail -f $LOG_FILE"
  echo "$RULE"
}

main "$@"