#!/bin/bash
#
# Stop the currently running training job.
#
# Steps:
#   1. List processes whose command line matches "train.py.*multitask.yaml".
#   2. Send SIGTERM to every launcher matching "mpirun.*train.py".
#   3. Poll for up to STOP_GRACE_SECONDS for the launchers to exit, then
#      send SIGKILL to any launcher that is still alive.
#   4. Force-kill remaining "train.py.*multitask.yaml" processes.
#   5. Print whatever still matches "train.py".
#
# Environment:
#   STOP_GRACE_SECONDS  Seconds to wait between SIGTERM and SIGKILL
#                       (non-negative integer, default: 10).
#
# Exit status:
#   0  normal completion, including "no launcher was running"
#   2  STOP_GRACE_SECONDS is not a non-negative integer

set -euo pipefail

# Extended regular expressions matched against full command lines (pgrep -f).
readonly LAUNCHER_PATTERN='mpirun.*train.py'
readonly WORKER_PATTERN='train.py.*multitask.yaml'
readonly LEFTOVER_PATTERN='train.py'
readonly GRACE_SECONDS="${STOP_GRACE_SECONDS:-10}"

#######################################
# Print "PID command-line" for every process whose full command line
# matches the given pattern.
# Arguments: $1 - extended regular expression
# Outputs:   one line per matching process on stdout
# Returns:   0 if at least one process matched, non-zero otherwise
#######################################
list_procs() {
  pgrep -af -- "$1"
}

#######################################
# Tell whether a process id is still present in the process table.
# Arguments: $1 - process id
# Returns:   0 if the process exists, non-zero otherwise
#######################################
is_alive() {
  ps -p "$1" >/dev/null 2>&1
}

#######################################
# Tell whether at least one of the given process ids still exists.
# Arguments: one or more process ids
# Returns:   0 if any is alive, 1 if none are
#######################################
any_alive() {
  local pid
  for pid in "$@"; do
    if is_alive "$pid"; then
      return 0
    fi
  done
  return 1
}

main() {
  if ! [[ "$GRACE_SECONDS" =~ ^[0-9]+$ ]]; then
    printf 'STOP_GRACE_SECONDS must be a non-negative integer, got: %s\n' \
      "$GRACE_SECONDS" >&2
    exit 2
  fi

  echo "=========================================="
  echo "停止当前训练进程"
  echo "=========================================="
  echo ""

  # Show what is currently running; an empty listing is not an error.
  echo "当前运行的训练进程:"
  list_procs "$WORKER_PATTERN" || true

  echo ""
  echo "准备停止进程..."
  echo ""

  # Collect every mpirun launcher, not only the first match, so that all of
  # them receive the graceful SIGTERM before anything is force-killed.
  local -a launcher_pids=()
  local pid
  while IFS= read -r pid; do
    launcher_pids+=("$pid")
  done < <(pgrep -f -- "$LAUNCHER_PATTERN")

  if (( ${#launcher_pids[@]} == 0 )); then
    echo "未找到运行中的训练进程"
    return 0
  fi

  echo "找到主进程 PID: ${launcher_pids[*]}"
  echo ""
  echo "发送SIGTERM信号(优雅停止)..."

  # Graceful stop. A launcher may exit between discovery and this call; kill
  # reports that on stderr and the script carries on.
  kill -TERM "${launcher_pids[@]}" || true

  # Poll once per second instead of sleeping for the whole grace period, so
  # the script continues as soon as every launcher has exited.
  echo "等待进程退出..."
  local waited=0
  while (( waited < GRACE_SECONDS )) && any_alive "${launcher_pids[@]}"; do
    sleep 1
    waited=$(( waited + 1 ))
  done

  # Escalate to SIGKILL only for launchers that ignored SIGTERM.
  local forced=0
  for pid in "${launcher_pids[@]}"; do
    if is_alive "$pid"; then
      if (( forced == 0 )); then
        echo "进程仍在运行,发送SIGKILL信号(强制停止)..."
        forced=1
      fi
      kill -9 "$pid" || true
    fi
  done
  if (( forced == 1 )); then
    sleep 2
  fi

  # Last-resort cleanup of anything still matching the worker pattern.
  # pkill exits non-zero when nothing matched, which is the expected case.
  echo "清理所有Python训练进程..."
  pkill -9 -f -- "$WORKER_PATTERN" || true

  echo ""
  echo "训练进程已停止"
  echo ""
  echo "剩余进程检查:"
  # Deliberately broader than WORKER_PATTERN: any train.py process is shown.
  list_procs "$LEFTOVER_PATTERN" || echo "无训练进程"

  echo ""
  echo "=========================================="
  echo "完成!"
  echo "=========================================="
}

main "$@"