#!/usr/bin/env python3
"""Monitor the Phase 4B training run.

Periodically checks that the training processes are still alive and reports
whether the segmentation quality (derived from the divider dice loss found in
the training log) is improving.
"""

import re
import subprocess
import time
from collections import deque
from typing import Optional

# Training log inspected on every poll (relative to the working directory).
LOG_PATH = 'phase4b_training_v2.log'

# Below this many "python" processes the run is reported as broken.
# NOTE(review): presumably the expected number of trainer + data-loader worker
# processes on the training host -- confirm before relying on it elsewhere.
MIN_PYTHON_PROCESSES = 10

# Only the last few log lines are examined on each poll.
TAIL_LINES = 10

# Captures a plain or scientific-notation number after the divider dice key.
# A bare "[0-9.]+" would also swallow a trailing sentence period ("0.85.") and
# would truncate "1.2e-05" to "1.2".
_DICE_PATTERN = re.compile(
    r'loss/map/divider/dice: ([0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?)'
)


def parse_dice_loss(line: str) -> Optional[float]:
    """Return the divider dice loss logged in *line*, or None if absent."""
    match = _DICE_PATTERN.search(line)
    if match is None:
        return None
    return float(match.group(1))


def describe_dice(dice_coeff: float) -> str:
    """Return the status message matching a dice coefficient."""
    if dice_coeff > 0.5:
        return "🎉 分割性能显著改善!"
    if dice_coeff > 0.1:
        return "✅ 分割性能有所改善"
    return "⚠️ 分割性能仍需改善"


def _count_python_processes() -> int:
    """Count processes whose `ps aux` line contains "python".

    The count includes this monitor itself, since it is a Python process too.
    """
    # Constant command string (no external input), so the shell pipeline is
    # kept for its simplicity.
    result = subprocess.run(
        "ps aux | grep python | grep -v grep | wc -l",
        shell=True,
        capture_output=True,
        text=True,
    )
    return int(result.stdout.strip())


def check_training_status(log_path=LOG_PATH,
                          min_processes=MIN_PYTHON_PROCESSES):
    """Check that training is alive and report recent dice values.

    Args:
        log_path: Training log whose last ``TAIL_LINES`` lines are scanned.
        min_processes: Minimum number of python processes for a healthy run.

    Returns:
        True when the process check passes (a missing log file is reported
        but still counts as healthy); False when too few processes are
        running or the check itself fails.
    """
    try:
        process_count = _count_python_processes()
        print(f"🖥️ Python进程数量: {process_count}")

        if process_count < min_processes:
            print("❌ 训练进程异常")
            return False

        try:
            # deque(maxlen=...) streams the file and keeps only the tail, so
            # a large log is never held in memory. errors="replace" keeps a
            # half-written multi-byte character from aborting the check.
            with open(log_path, 'r', encoding='utf-8',
                      errors='replace') as f:
                tail = deque(f, maxlen=TAIL_LINES)
        except FileNotFoundError:
            print("📄 日志文件不存在")
            return True

        for line in tail:
            dice_loss = parse_dice_loss(line)
            if dice_loss is None:
                continue
            dice_coeff = 1 - dice_loss
            print(f"Dice Loss: {dice_loss:.4f}")
            print(f"Dice系数: {dice_coeff:.4f}")
            print(describe_dice(dice_coeff))

        return True

    except Exception as e:
        # Top-level boundary of one poll: any unexpected failure is reported
        # and treated as an unhealthy run instead of crashing the monitor.
        print(f"❌ 检查失败: {e}")
        return False


def main(checks=60, interval=30):
    """Poll the training status *checks* times, *interval* seconds apart.

    Stops early as soon as a check reports a problem.
    """
    print("🔍 开始监控Phase 4B训练状态...")
    print("=" * 50)

    for i in range(checks):
        print(f"\n🔄 第 {i+1}/{checks} 次检查 (每{interval}秒)")
        print(f"时间: {time.strftime('%H:%M:%S')}")

        if not check_training_status():
            print("❌ 训练异常,停止监控")
            break

        # No point in waiting after the final check.
        if i + 1 < checks:
            time.sleep(interval)

    print("\n🏁 监控完成")


if __name__ == "__main__":
    main()