#!/bin/bash
#
# Stop the currently running multitask training job.
#
# Steps:
#   1. List processes whose command line matches the training pattern.
#   2. Find the first mpirun launcher that runs train.py.
#   3. Send SIGTERM and poll up to GRACE_SECONDS for it to exit.
#   4. If it is still alive, send SIGKILL.
#   5. Force-kill any leftover training workers and print what remains.
#
# Environment:
#   GRACE_SECONDS  seconds to wait after SIGTERM before SIGKILL (default 10)
#
# Exit status: 0, including when no training process is found.

# Extended-regex patterns matched against full command lines (pgrep/pkill -f).
readonly LAUNCHER_PATTERN='mpirun.*train.py'
readonly WORKER_PATTERN='train.py.*multitask.yaml'
readonly GRACE_SECONDS="${GRACE_SECONDS:-10}"
readonly KILL_SETTLE_SECONDS=2

#######################################
# Print a title framed by two separator lines.
# Arguments: $1 - title text
# Outputs:   three lines on stdout
#######################################
print_banner() {
  echo "=========================================="
  echo "$1"
  echo "=========================================="
}

#######################################
# Check whether a process can still be signalled.
# Uses the `kill -0` builtin, which delivers no signal and only reports
# whether the PID exists and may be signalled by the current user.
# Arguments: $1 - PID
# Returns:   0 if the process is alive, non-zero otherwise
#######################################
is_running() {
  kill -0 "$1" 2>/dev/null
}

#######################################
# Poll once per second until a process exits or a timeout elapses.
# Returns immediately when the process is already gone, so a fast shutdown
# does not pay the full waiting period.
# Arguments: $1 - PID
#            $2 - maximum number of seconds to wait
# Returns:   0 if the process exited, 1 if it is still alive at the timeout
#######################################
wait_for_exit() {
  local pid=$1
  local timeout=$2
  local elapsed=0

  while is_running "$pid"; do
    if (( elapsed >= timeout )); then
      return 1
    fi
    sleep 1
    elapsed=$(( elapsed + 1 ))
  done
  return 0
}

#######################################
# Entry point: stop the launcher gracefully, then clean up the workers.
# Globals:   LAUNCHER_PATTERN, WORKER_PATTERN, GRACE_SECONDS,
#            KILL_SETTLE_SECONDS (all read)
# Outputs:   progress messages on stdout, warnings on stderr
# Returns:   0
#######################################
main() {
  local launcher_pid

  print_banner "停止当前训练进程"
  echo ""

  # Show what is currently running; prints nothing when there is no match.
  echo "当前运行的训练进程:"
  pgrep -a -f "$WORKER_PATTERN"
  echo ""

  echo "准备停止进程..."
  echo ""

  # Only the first matching mpirun launcher is signalled directly; the
  # worker cleanup further down handles everything else.
  launcher_pid=$(pgrep -f "$LAUNCHER_PATTERN" | head -n 1)
  if [[ -z "$launcher_pid" ]]; then
    echo "未找到运行中的训练进程"
    return 0
  fi

  echo "找到主进程 PID: $launcher_pid"
  echo ""
  echo "发送SIGTERM信号(优雅停止)..."

  # Graceful stop first; a failure here (e.g. the process already exited or
  # belongs to another user) is reported but does not abort the cleanup.
  if ! kill -TERM "$launcher_pid"; then
    echo "警告: 无法向进程 $launcher_pid 发送SIGTERM信号" >&2
  fi

  echo "等待进程退出..."
  if ! wait_for_exit "$launcher_pid" "$GRACE_SECONDS"; then
    echo "进程仍在运行,发送SIGKILL信号(强制停止)..."
    kill -KILL "$launcher_pid"
    # Give the kernel a moment to tear the process down before the cleanup.
    wait_for_exit "$launcher_pid" "$KILL_SETTLE_SECONDS"
  fi

  # Workers can outlive the launcher, so force-kill anything still matching.
  echo "清理所有Python训练进程..."
  pkill -9 -f "$WORKER_PATTERN"

  echo ""
  echo "训练进程已停止"
  echo ""
  echo "剩余进程检查:"
  pgrep -a -f "train.py" || echo "无训练进程"
  echo ""
  print_banner "完成!"
  return 0
}

main "$@"