bev-project/archive_scripts/STOP_CURRENT_TRAINING.sh

58 lines
1.3 KiB
Bash
Raw Normal View History

#!/bin/bash
#
# Stop the currently running training job.
#
# Steps:
#   1. List the processes whose command line matches WORKER_PATTERN.
#   2. Send SIGTERM to every mpirun launcher matching LAUNCHER_PATTERN
#      (or, when no launcher is left, directly to the orphaned workers).
#   3. Poll for up to GRACE_SECONDS until launchers and workers are gone.
#   4. SIGKILL any launcher that survived, then SIGKILL any remaining worker.
#   5. Print whatever "train.py" processes are still around.
#
# Usage: ./STOP_CURRENT_TRAINING.sh   (takes no arguments)
#
# Strict mode (set -e / pipefail) is deliberately not enabled: pgrep and
# pkill return non-zero when nothing matches, which is a normal outcome here.

# Regex matched against the full command line of the mpirun launcher.
readonly LAUNCHER_PATTERN='mpirun.*train.py'
# Regex matched against the full command line of the training workers.
readonly WORKER_PATTERN='train.py.*multitask.yaml'
# Maximum number of seconds to wait after SIGTERM before escalating.
readonly GRACE_SECONDS=10

#######################################
# Poll once per second until none of the given PIDs is alive.
# Arguments: $1  - timeout in seconds (0 = check once, never sleep)
#            $2+ - PIDs to watch (none given counts as "all exited")
# Returns:   0 if every PID is gone, 1 if the timeout elapsed first
#######################################
wait_for_exit() {
  local timeout=$1
  shift
  local elapsed=0
  local pid
  local alive

  while :; do
    alive=0
    for pid in "$@"; do
      # kill -0 delivers no signal; it only tests whether the PID can
      # still be signalled.
      if kill -0 "$pid" 2>/dev/null; then
        alive=1
        break
      fi
    done
    (( alive )) || return 0
    (( elapsed >= timeout )) && return 1
    sleep 1
    elapsed=$(( elapsed + 1 ))
  done
}

main() {
  local -a launcher_pids=()
  local -a worker_pids=()
  local -a term_targets=()
  local pid

  echo "=========================================="
  echo "停止当前训练进程"
  echo "=========================================="
  echo ""

  # Show the training processes that are currently running.
  echo "当前运行的训练进程:"
  pgrep -af "$WORKER_PATTERN"
  echo ""
  echo "准备停止进程..."
  echo ""

  # Collect every launcher and worker PID. pgrep never reports itself, so
  # no "grep -v grep" filtering is needed, and all launchers are handled
  # rather than only the first one.
  mapfile -t launcher_pids < <(pgrep -f "$LAUNCHER_PATTERN")
  mapfile -t worker_pids < <(pgrep -f "$WORKER_PATTERN")

  if (( ${#launcher_pids[@]} == 0 && ${#worker_pids[@]} == 0 )); then
    echo "未找到运行中的训练进程"
    return 0
  fi

  if (( ${#launcher_pids[@]} > 0 )); then
    echo "找到主进程 PID: ${launcher_pids[*]}"
    # Only the launcher is signalled, as before; workers are left to shut
    # down in response to it (presumably mpirun relays the signal to its
    # ranks -- verify for the MPI implementation in use).
    term_targets=("${launcher_pids[@]}")
  else
    # The launcher is gone but workers were left behind: ask them to stop
    # directly instead of reporting that nothing is running.
    echo "未找到mpirun主进程，直接停止残留训练进程 PID: ${worker_pids[*]}"
    term_targets=("${worker_pids[@]}")
  fi
  echo ""
  echo "发送SIGTERM信号优雅停止..."
  # Graceful stop.
  kill -TERM "${term_targets[@]}"

  # Wait up to GRACE_SECONDS, returning as soon as everything has exited.
  # Workers are watched too, so they keep the full grace period before the
  # SIGKILL sweep below.
  echo "等待进程退出..."
  if ! wait_for_exit "$GRACE_SECONDS" "${launcher_pids[@]}" "${worker_pids[@]}"; then
    echo "进程仍在运行发送SIGKILL信号强制停止..."
    for pid in "${launcher_pids[@]}"; do
      if kill -0 "$pid" 2>/dev/null; then
        kill -9 "$pid"
      fi
    done
    sleep 2
  fi

  # Force-kill any worker that is still alive (no-op when none match).
  echo "清理所有Python训练进程..."
  pkill -9 -f "$WORKER_PATTERN"
  echo ""
  echo "训练进程已停止"
  echo ""
  echo "剩余进程检查:"
  pgrep -af "train.py" || echo "无训练进程"
  echo ""
  echo "=========================================="
  echo "完成!"
  echo "=========================================="
}

main "$@"