#!/bin/bash
# 一键清理并启动FP16训练
#
# Purpose: kill any stale/zombie training processes, wait for GPU memory to be
# released, then relaunch FP16 training via RESTART_PHASE4A_STAGE1_FP16.sh.
#
# Requires: nvidia-smi on PATH; /workspace/bevfusion with the restart script.
set -euo pipefail

cd /workspace/bevfusion

echo "=========================================================================="
echo "清理僵尸进程并启动FP16训练"
echo "=========================================================================="
echo ""

# 步骤1: 清理僵尸训练进程
echo "【步骤1/3】清理僵尸训练进程..."
# pgrep replaces the fragile `ps aux | grep ... | grep -v grep | awk` pipeline;
# we only need to know whether any matching process exists.
if pgrep -f "train.py" >/dev/null 2>&1; then
  echo "发现训练进程,正在终止..."
  # Graceful shutdown first (SIGTERM) so the trainer can flush logs/checkpoints,
  # then escalate to SIGKILL for anything that survives.
  pkill -f "train.py" || true
  pkill -f "torchpack" || true
  sleep 2
  pkill -9 -f "train.py" || true
  pkill -9 -f "torchpack" || true
  echo "✓ 进程已终止"
else
  echo "✓ 没有运行中的训练进程"
fi

echo ""
echo "等待GPU释放..."
sleep 5

# 步骤2: 检查GPU状态
echo ""
echo "【步骤2/3】检查GPU状态..."
# -r prevents backslash interpretation; IFS= preserves leading whitespace.
nvidia-smi --query-gpu=index,memory.used,utilization.gpu --format=csv,noheader \
  | while IFS= read -r line; do
      echo " GPU $line"
    done

USED_MEM=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -1)
# Guard the numeric comparison: nvidia-smi can emit non-numeric values (e.g.
# "[N/A]"), which would make `-lt` error out and abort under `set -e`.
if [[ "$USED_MEM" =~ ^[0-9]+$ ]] && [ "$USED_MEM" -lt 1000 ]; then
  echo "✓ GPU显存已释放"
else
  echo "⚠️ GPU显存仍被占用: ${USED_MEM}MB"
  echo "再等待5秒..."
  sleep 5
fi

# 步骤3: 启动FP16训练
echo ""
echo "【步骤3/3】启动FP16训练..."
echo ""
bash RESTART_PHASE4A_STAGE1_FP16.sh

echo ""
echo "=========================================================================="
echo "✅ 训练已启动!"
echo "=========================================================================="
echo ""
echo "监控命令:"
echo " # 查看最新日志"
echo " tail -f \$(ls -t phase4a_stage1_fp16*.log | head -1)"
echo ""
echo " # 查看GPU状态"
echo " watch -n 2 nvidia-smi"
echo ""
echo " # 查看Loss"
echo " tail -100 \$(ls -t phase4a_stage1_fp16*.log | head -1) | grep 'loss:'"
echo ""