# bev-project/monitor_training.py
#
# 74 lines
# 2.3 KiB
# Python
# Raw Permalink Normal View History

#!/usr/bin/env python3
"""
监控Phase 4B训练状态
定期检查分割性能是否改善
"""
import re
import subprocess
import time
from collections import deque
# Captures the numeric value of the divider dice-loss entry in a log line.
_DICE_LOSS_PATTERN = re.compile(r'loss/map/divider/dice: ([0-9.]+)')


def _count_python_processes():
    """Return how many `ps aux` lines mention "python" (grep lines excluded)."""
    result = subprocess.run(
        "ps aux | grep python | grep -v grep | wc -l",
        shell=True, capture_output=True, text=True
    )
    return int(result.stdout.strip())


def _latest_dice_loss(log_path, tail_lines):
    """Return the newest divider dice loss in the log tail, or None if absent.

    Only the last ``tail_lines`` lines of ``log_path`` are inspected.
    Raises FileNotFoundError when the log does not exist.
    """
    # A bounded deque keeps just the tail while streaming the file, instead of
    # materialising every line of the log on each poll. Undecodable bytes are
    # replaced so that a damaged line cannot abort the check.
    with open(log_path, 'r', encoding='utf-8', errors='replace') as f:
        tail = deque(f, maxlen=tail_lines)
    # Walk backwards so the most recently logged value wins.
    for line in reversed(tail):
        match = _DICE_LOSS_PATTERN.search(line)
        if match:
            return float(match.group(1))
    return None


def _report_dice(dice_loss):
    """Print the dice loss, the derived dice coefficient and a verdict line."""
    dice_coeff = 1 - dice_loss
    print(f"Dice Loss: {dice_loss:.4f}")
    print(f"Dice系数: {dice_coeff:.4f}")
    if dice_coeff > 0.5:
        print("🎉 分割性能显著改善!")
    elif dice_coeff > 0.1:
        print("✅ 分割性能有所改善")
    else:
        print("⚠️ 分割性能仍需改善")


def check_training_status(log_path='phase4b_training_v2.log',
                          min_processes=10, tail_lines=10):
    """Check whether training looks alive and report the latest dice value.

    Args:
        log_path: Training log file to inspect.
        min_processes: Minimum number of python processes that must be
            running. A value of 0 or less skips the process check entirely.
        tail_lines: Number of trailing log lines searched for a dice entry.

    Returns:
        False when fewer than ``min_processes`` python processes are found or
        when any unexpected error occurs; True otherwise. A missing log file
        or a tail without a dice entry still returns True.
    """
    try:
        if min_processes > 0:
            process_count = _count_python_processes()
            print(f"🖥️ Python进程数量: {process_count}")
            if process_count < min_processes:
                print("❌ 训练进程异常")
                return False

        try:
            dice_loss = _latest_dice_loss(log_path, tail_lines)
        except FileNotFoundError:
            # The log may simply not have been created yet; not a failure.
            print("📄 日志文件不存在")
        else:
            if dice_loss is not None:
                _report_dice(dice_loss)
        return True
    except Exception as e:
        # Top-level boundary of one poll: report the problem and signal failure
        # to the caller instead of crashing the monitor.
        print(f"❌ 检查失败: {e}")
        return False
def main(checks=60, interval=30):
    """Poll the training status repeatedly and stop early on failure.

    Args:
        checks: Maximum number of status checks to perform.
        interval: Seconds to wait between two consecutive checks.
    """
    print("🔍 开始监控Phase 4B训练状态...")
    print("=" * 50)
    for attempt in range(1, checks + 1):
        print(f"\n🔄 第 {attempt}/{checks} 次检查 (每{interval}秒)")
        print(f"时间: {time.strftime('%H:%M:%S')}")
        if not check_training_status():
            print("❌ 训练异常,停止监控")
            break
        # No point waiting after the final check; finish immediately.
        if attempt < checks:
            time.sleep(interval)
    print("\n🏁 监控完成")
# Start monitoring only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()