67 lines
1.8 KiB
Bash
Executable File
67 lines
1.8 KiB
Bash
Executable File
#!/bin/bash
# One-shot helper: clean up leftover training processes, then start FP16 training.
#
# Environment:
#   BEVFUSION_DIR  directory to run from (default: /workspace/bevfusion)

# Abort on the first unhandled command failure.
set -e

# The training launcher is invoked by a relative path further down, so a
# failed cd must stop the script rather than let it run from the wrong place.
BEVFUSION_DIR="${BEVFUSION_DIR:-/workspace/bevfusion}"
cd -- "$BEVFUSION_DIR" || {
  echo "无法进入工作目录: $BEVFUSION_DIR" >&2
  exit 1
}

echo "=========================================================================="
echo "清理僵尸进程并启动FP16训练"
echo "=========================================================================="
echo ""

# Step 1: terminate leftover training processes.
echo "【步骤1/3】清理僵尸训练进程..."

# pgrep/pkill -f match against the full command line, so every process whose
# command line mentions one of these patterns is affected. pgrep never reports
# itself, hence no "grep -v grep" style filtering is required.
# Both patterns are probed so that a stray torchpack launcher is cleaned up
# even when no train.py process is left.
if pgrep -f "train.py" >/dev/null || pgrep -f "torchpack" >/dev/null; then
  echo "发现训练进程,正在终止..."

  # Ask for a clean shutdown first. pkill exits non-zero when nothing matched,
  # which is not an error here, hence the "|| true".
  pkill -TERM -f "train.py" || true
  pkill -TERM -f "torchpack" || true

  # Grace period: poll for up to 5 seconds and stop early once all are gone.
  for _ in 1 2 3 4 5; do
    if ! pgrep -f "train.py" >/dev/null && ! pgrep -f "torchpack" >/dev/null; then
      break
    fi
    sleep 1
  done

  # Force-kill whatever ignored SIGTERM (no-op when nothing is left).
  pkill -KILL -f "train.py" || true
  pkill -KILL -f "torchpack" || true
  echo "✓ 进程已终止"
else
  echo "✓ 没有运行中的训练进程"
fi

echo ""
echo "等待GPU释放..."
# Fixed pause before the GPU state is inspected.
sleep 5

# Step 2: report GPU state and check whether GPU memory has been released.
echo ""
echo "【步骤2/3】检查GPU状态..."

# Used-memory reading (as reported by nvidia-smi with "nounits") below which
# the GPUs are treated as released.
GPU_FREE_THRESHOLD_MB=1000

# Succeeds when $1 is a non-negative integer strictly below $2.
# Empty or non-numeric readings (e.g. "[N/A]") are rejected instead of
# triggering an "integer expression expected" error. The 10# prefix keeps
# values with leading zeros from being parsed as octal.
gpu_mem_below() {
  [[ "$1" =~ ^[0-9]+$ ]] && (( 10#$1 < 10#$2 ))
}

# Reads lines from stdin and prints the largest purely numeric one (after
# stripping whitespace). Prints nothing when no line is numeric.
max_numeric_line() {
  local value max=""
  while IFS= read -r value || [[ -n "$value" ]]; do
    value="${value//[[:space:]]/}"
    [[ "$value" =~ ^[0-9]+$ ]] || continue
    if [[ -z "$max" ]] || (( 10#$value > 10#$max )); then
      max="$value"
    fi
  done
  printf '%s' "$max"
}

if ! command -v nvidia-smi >/dev/null 2>&1; then
  echo "⚠️ 未找到 nvidia-smi,跳过GPU检查" >&2
else
  # Informational listing only; a failure here must not abort the script.
  nvidia-smi --query-gpu=index,memory.used,utilization.gpu --format=csv,noheader \
    | while read -r line; do
        echo " GPU $line"
      done || true

  # Judge by the busiest GPU instead of only the first one listed.
  USED_MEM=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits 2>/dev/null \
    | max_numeric_line) || USED_MEM=""

  if [[ -z "$USED_MEM" ]]; then
    echo "⚠️ 无法读取GPU显存占用,跳过显存检查" >&2
  elif gpu_mem_below "$USED_MEM" "$GPU_FREE_THRESHOLD_MB"; then
    echo "✓ GPU显存已释放"
  else
    echo "⚠️ GPU显存仍被占用: ${USED_MEM}MB"
    echo "再等待5秒..."
    sleep 5
  fi
fi

# Step 3: start FP16 training through the dedicated launcher script.
echo ""
echo "【步骤3/3】启动FP16训练..."
echo ""

# Resolved relative to the current working directory.
LAUNCH_SCRIPT="RESTART_PHASE4A_STAGE1_FP16.sh"
if [[ ! -f "$LAUNCH_SCRIPT" ]]; then
  echo "❌ 找不到启动脚本: $LAUNCH_SCRIPT" >&2
  exit 1
fi

# Check the launcher's status explicitly so the success banner below can never
# follow a failed launch, and propagate the launcher's own exit code.
launch_status=0
bash "$LAUNCH_SCRIPT" || launch_status=$?
if (( launch_status != 0 )); then
  echo "❌ 训练启动失败 (exit ${launch_status})" >&2
  exit "$launch_status"
fi

# Quoted delimiter: the text is printed literally, so the $(...) snippets are
# shown to the user as copy-paste commands rather than being executed here.
cat <<'EOF'

==========================================================================
✅ 训练已启动!
==========================================================================

监控命令:
 # 查看最新日志
 tail -f $(ls -t phase4a_stage1_fp16*.log | head -1)

 # 查看GPU状态
 watch -n 2 nvidia-smi

 # 查看Loss
 tail -100 $(ls -t phase4a_stage1_fp16*.log | head -1) | grep 'loss:'

EOF