bev-project/CLEANUP_AND_START_FP16_BATC...

67 lines
1.8 KiB
Bash
Executable File

#!/bin/bash
# One-shot helper: clean up leftover training processes, then launch FP16
# training. Everything below runs from the BEVFusion checkout.
set -o errexit # same as `set -e`: abort on the first unhandled failure
cd /workspace/bevfusion

# Opening banner followed by one blank line.
printf '%s\n' \
  '==========================================================================' \
  '清理僵尸进程并启动FP16训练' \
  '==========================================================================' \
  ''
# Step 1: terminate leftover training processes.
echo "【步骤1/3】清理僵尸训练进程..."
# Look for the training entry point AND its torchpack launcher, so orphaned
# launchers are cleaned up even when no train.py process is left.
# pgrep/pkill never match themselves, which removes the `grep -v grep` dance;
# the dot is escaped so it only matches a literal ".".
PROC_PATTERN='train\.py|torchpack'
# `|| true`: pgrep exits 1 when nothing matches, which must not trip `set -e`.
TRAIN_PIDS=$(pgrep -f "$PROC_PATTERN" || true)
if [ -n "$TRAIN_PIDS" ]; then
  echo "发现训练进程,正在终止..."
  # Ask politely first so the processes get a chance to shut down cleanly.
  pkill -TERM -f "$PROC_PATTERN" || true
  # Grace period: up to 5 seconds, leaving early once everything is gone.
  for attempt in 1 2 3 4 5; do
    pgrep -f "$PROC_PATTERN" > /dev/null || break
    sleep 1
  done
  # Force-kill whatever ignored SIGTERM; "no match" is the expected good case.
  pkill -KILL -f "$PROC_PATTERN" || true
  echo "✓ 进程已终止"
else
  echo "✓ 没有运行中的训练进程"
fi
echo ""
echo "等待GPU释放..."
sleep 5
# Step 2: report per-GPU state, then wait (bounded) for GPU memory to be freed.
echo ""
echo "【步骤2/3】检查GPU状态..."
nvidia-smi --query-gpu=index,memory.used,utilization.gpu --format=csv,noheader | while read -r line; do
  echo " GPU $line"
done

# Print the highest memory.used value over ALL GPUs, as reported by nvidia-smi
# with `nounits`. Prints nothing when nvidia-smi yields no purely numeric line
# (tool missing, driver error, "[N/A]", ...), so the caller can tell "unknown"
# apart from "0". stderr is silenced on purpose: the listing above has already
# surfaced any nvidia-smi error once, and this function is called in a loop.
gpu_max_used_mem() {
  nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits 2> /dev/null \
    | awk '/^[[:space:]]*[0-9]+[[:space:]]*$/ {
             v = $1 + 0
             if (!seen || v > max) max = v
             seen = 1
           }
           END { if (seen) print max }'
}

# Both knobs can be overridden from the environment; the defaults keep the
# original 1000 MB threshold and 5-second wait step.
GPU_MEM_LIMIT_MB=${GPU_MEM_LIMIT_MB:-1000}
GPU_WAIT_RETRIES=${GPU_WAIT_RETRIES:-6}
attempt=0
while :; do
  USED_MEM=$(gpu_max_used_mem)
  if [[ -z "$USED_MEM" ]]; then
    # Unknown state: polling cannot help, so warn and carry on (best effort).
    echo "⚠️ 无法读取GPU显存占用,跳过等待"
    break
  fi
  if ((USED_MEM < GPU_MEM_LIMIT_MB)); then
    echo "✓ GPU显存已释放"
    break
  fi
  echo "⚠️ GPU显存仍被占用: ${USED_MEM}MB"
  # Out of retries: proceed anyway, exactly as the script always did.
  if ((attempt >= GPU_WAIT_RETRIES)); then
    break
  fi
  attempt=$((attempt + 1))
  echo "再等待5秒..."
  sleep 5
done
# Step 3: hand over to the FP16 restart script, then print monitoring hints.
printf '%s\n' '' '【步骤3/3】启动FP16训练...' ''

bash RESTART_PHASE4A_STAGE1_FP16.sh

# Success banner.
printf '%s\n' \
  '' \
  '==========================================================================' \
  '✅ 训练已启动!' \
  '==========================================================================' \
  ''

# Copy-paste helpers for the operator. The $(...) parts are printed literally
# (single quotes / escaped dollar) so they are evaluated only when pasted.
printf '%s\n' \
  '监控命令:' \
  ' # 查看最新日志' \
  ' tail -f $(ls -t phase4a_stage1_fp16*.log | head -1)' \
  '' \
  ' # 查看GPU状态' \
  ' watch -n 2 nvidia-smi' \
  '' \
  ' # 查看Loss' \
  " tail -100 \$(ls -t phase4a_stage1_fp16*.log | head -1) | grep 'loss:'" \
  ''