bev-project/monitor_training.py

74 lines
2.3 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
Monitor the Phase 4B training status.
Periodically check whether segmentation performance is improving.
"""
import re
import subprocess
import time
from collections import deque
# Matches the divider dice-loss entry of a training log line. The numeric
# group accepts plain decimals as well as scientific notation (e.g. "1e-05"),
# and never captures text that float() would reject.
_DIVIDER_DICE_RE = re.compile(
    r'loss/map/divider/dice: ([0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?)'
)


def _count_python_processes():
    """Return how many `ps aux` lines mention "python" (grep lines excluded)."""
    result = subprocess.run(
        "ps aux | grep python | grep -v grep | wc -l",
        shell=True, capture_output=True, text=True
    )
    return int(result.stdout.strip())


def _latest_dice_loss(lines):
    """Return the dice loss from the last line in *lines* that logs one, else None."""
    # Walk newest-to-oldest so the most recent value wins.
    for line in reversed(lines):
        match = _DIVIDER_DICE_RE.search(line)
        if match:
            return float(match.group(1))
    return None


def _report_dice(dice_loss):
    """Print the dice loss, the derived dice coefficient and a verdict line."""
    dice_coeff = 1 - dice_loss
    print(f"Dice Loss: {dice_loss:.4f}")
    print(f"Dice系数: {dice_coeff:.4f}")
    if dice_coeff > 0.5:
        print("🎉 分割性能显著改善!")
    elif dice_coeff > 0.1:
        print("✅ 分割性能有所改善")
    else:
        print("⚠️ 分割性能仍需改善")


def check_training_status(log_path='phase4b_training_v2.log',
                          min_processes=10, tail_lines=10):
    """Check that training looks alive and report the latest divider dice loss.

    Args:
        log_path: Path of the training log to inspect.
        min_processes: Minimum number of python processes that must be
            visible in `ps aux`; fewer is reported as a failure.
        tail_lines: How many trailing log lines are searched for a
            dice-loss entry.

    Returns:
        False when fewer than *min_processes* python processes are found or
        when the check itself fails with an exception; True otherwise. A
        missing log file, or a log tail without a dice-loss entry, still
        returns True.
    """
    try:
        process_count = _count_python_processes()
        print(f"🖥️ Python进程数量: {process_count}")
        if process_count < min_processes:
            print("❌ 训练进程异常")
            return False

        try:
            # errors='replace' keeps a stray undecodable byte in the log from
            # aborting the check; deque(maxlen=...) keeps only the tail in
            # memory instead of materialising the whole file.
            with open(log_path, 'r', encoding='utf-8', errors='replace') as f:
                recent_lines = deque(f, maxlen=tail_lines)
        except FileNotFoundError:
            print("📄 日志文件不存在")
            return True

        dice_loss = _latest_dice_loss(recent_lines)
        if dice_loss is not None:
            _report_dice(dice_loss)
        return True
    except Exception as e:
        # Deliberate catch-all boundary: the caller only looks at the boolean
        # result, so any unexpected failure is reported and turned into False.
        print(f"❌ 检查失败: {e}")
        return False
def main(checks=60, interval=30):
    """Poll the training status repeatedly and print the outcome of each check.

    Args:
        checks: Maximum number of status checks to run.
        interval: Seconds to wait between two consecutive checks.

    Monitoring stops early, after printing a notice, as soon as
    check_training_status() returns a falsy value.
    """
    print("🔍 开始监控Phase 4B训练状态...")
    print("=" * 50)

    for attempt in range(1, checks + 1):
        print(f"\n🔄 第 {attempt}/{checks} 次检查 (每{interval}秒)")
        print(f"时间: {time.strftime('%H:%M:%S')}")

        if not check_training_status():
            print("❌ 训练异常,停止监控")
            break

        # Waiting after the final check would only delay the closing message.
        if attempt < checks:
            time.sleep(interval)

    print("\n🏁 监控完成")


if __name__ == "__main__":
    main()