bev-project/archive_scripts/STOP_CURRENT_TRAINING.sh

58 lines
1.3 KiB
Bash
Executable File
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/bin/bash
# Stop the training job that is currently running on this machine.
#
# Usage: ./STOP_CURRENT_TRAINING.sh   (takes no arguments)
#
# Steps:
#   1. List processes whose command line matches TRAIN_PATTERN.
#   2. If an mpirun launcher is found, send it SIGTERM and wait up to
#      GRACE_SECONDS for the launcher and its workers to exit, escalating to
#      SIGKILL on the launcher if it is still alive afterwards.
#   3. SIGKILL every leftover process matching TRAIN_PATTERN. This step also
#      runs when no mpirun launcher exists but worker processes do.
#   4. Print whatever train.py processes remain.
#
# The script exits 0 in all cases; "nothing to stop" is not an error.

set -u

# Extended regexes matched against full command lines (pgrep/pkill -f).
readonly TRAIN_PATTERN='train.py.*multitask.yaml'
readonly MPIRUN_PATTERN='mpirun.*train.py'
# Maximum time, in seconds, granted for a graceful shutdown after SIGTERM.
readonly GRACE_SECONDS=10

#######################################
# Poll once per second until the launcher and all workers are gone, or the
# time budget is used up. Returning early avoids a fixed-length sleep when
# the job shuts down quickly.
# Globals:   TRAIN_PATTERN (read)
# Arguments: $1 - PID of the mpirun launcher
#            $2 - maximum number of seconds to wait
# Returns:   0 if everything exited in time, 1 on timeout
#######################################
wait_for_shutdown() {
  local launcher_pid=$1
  local remaining=$2

  while (( remaining > 0 )); do
    if ! ps -p "$launcher_pid" > /dev/null 2>&1 \
        && ! pgrep -f "$TRAIN_PATTERN" > /dev/null; then
      return 0
    fi
    sleep 1
    remaining=$(( remaining - 1 ))
  done
  return 1
}

main() {
  local mpirun_pid

  echo "=========================================="
  echo "停止当前训练进程"
  echo "=========================================="
  echo ""

  # Show the training processes that are currently running.
  echo "当前运行的训练进程:"
  pgrep -af "$TRAIN_PATTERN"
  echo ""
  echo "准备停止进程..."
  echo ""

  # Locate the mpirun launcher; -o picks the oldest match when there are several.
  mpirun_pid=$(pgrep -o -f "$MPIRUN_PATTERN")

  # Bail out only when there is neither a launcher nor any worker left;
  # orphaned workers without a launcher still need the cleanup step below.
  if [[ -z "$mpirun_pid" ]] && ! pgrep -f "$TRAIN_PATTERN" > /dev/null; then
    echo "未找到运行中的训练进程"
    exit 0
  fi

  if [[ -n "$mpirun_pid" ]]; then
    echo "找到主进程 PID: $mpirun_pid"
    echo ""
    echo "发送SIGTERM信号优雅停止..."
    # Graceful stop first.
    kill -TERM "$mpirun_pid"

    # Wait up to GRACE_SECONDS for the job to exit.
    echo "等待进程退出..."
    wait_for_shutdown "$mpirun_pid" "$GRACE_SECONDS"

    # Escalate to SIGKILL if the launcher survived the grace period.
    if ps -p "$mpirun_pid" > /dev/null 2>&1; then
      echo "进程仍在运行发送SIGKILL信号强制停止..."
      kill -9 "$mpirun_pid"
      sleep 2
    fi
  fi

  # Force-kill any remaining worker processes.
  echo "清理所有Python训练进程..."
  pkill -9 -f "$TRAIN_PATTERN"
  echo ""
  echo "训练进程已停止"
  echo ""

  # Report anything that still looks like a training process.
  echo "剩余进程检查:"
  pgrep -af "train.py" || echo "无训练进程"
  echo ""
  echo "=========================================="
  echo "完成!"
  echo "=========================================="
}

main "$@"