74 lines
2.3 KiB
Python
74 lines
2.3 KiB
Python
|
|
#!/usr/bin/env python3
|
|||
|
|
"""
|
|||
|
|
监控Phase 4B训练状态
|
|||
|
|
定期检查分割性能是否改善
|
|||
|
|
"""
|
|||
|
|
|
|||
|
|
import time
|
|||
|
|
import subprocess
|
|||
|
|
|
|||
|
|
def _count_python_processes():
    """Return how many ``ps aux`` lines mention python (grep lines excluded)."""
    # The command is a fixed string with no external input, so running it
    # through the shell is safe here.
    result = subprocess.run(
        "ps aux | grep python | grep -v grep | wc -l",
        shell=True, capture_output=True, text=True,
    )
    return int(result.stdout.strip())


def _latest_dice_loss(log_path, tail=10):
    """Return the newest divider dice loss in the last *tail* log lines, or None."""
    import re
    from collections import deque

    # Accept plain decimals as well as scientific notation (e.g. 1.2e-05);
    # a trailing sentence period is never captured, so float() cannot fail.
    pattern = re.compile(
        r'loss/map/divider/dice: ([0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?)'
    )
    # errors='replace': one undecodable byte in the log must not abort the
    # whole monitoring run.  deque(maxlen=...) keeps only the tail in memory.
    with open(log_path, 'r', encoding='utf-8', errors='replace') as f:
        recent = deque(f, maxlen=tail)

    # Walk backwards so the most recent entry wins.
    for line in reversed(recent):
        match = pattern.search(line)
        if match:
            return float(match.group(1))
    return None


def _report_dice(dice_loss):
    """Print the dice loss, the derived dice coefficient and a verdict line."""
    dice_coeff = 1 - dice_loss
    print(f"Dice Loss: {dice_loss:.4f}")
    print(f"Dice系数: {dice_coeff:.4f}")
    if dice_coeff > 0.5:
        print("🎉 分割性能显著改善!")
    elif dice_coeff > 0.1:
        print("✅ 分割性能有所改善")
    else:
        print("⚠️ 分割性能仍需改善")


def check_training_status(log_path='phase4b_training_v2.log', min_processes=10):
    """Check that training is still running and report the latest dice score.

    Args:
        log_path: Training log scanned for ``loss/map/divider/dice`` entries.
        min_processes: Minimum number of python processes expected while
            training runs; a lower count is reported as a failure.

    Returns:
        False when fewer than ``min_processes`` python processes are found or
        when the check itself raised an exception; True otherwise, including
        when the log file is missing or its last lines hold no dice entry.
    """
    try:
        process_count = _count_python_processes()
        print(f"🖥️ Python进程数量: {process_count}")

        if process_count < min_processes:
            print("❌ 训练进程异常")
            return False

        try:
            dice_loss = _latest_dice_loss(log_path)
        except FileNotFoundError:
            # A missing log is reported but does not count as a failure.
            print("📄 日志文件不存在")
        else:
            if dice_loss is not None:
                _report_dice(dice_loss)

        return True

    except Exception as e:
        # Top-level boundary: callers only expect a bool, so any unexpected
        # error is printed and mapped to False instead of propagating.
        print(f"❌ 检查失败: {e}")
        return False
|
|||
|
|
|
|||
|
|
def main(max_checks=60, interval_seconds=30):
    """Poll the training status at a fixed interval and print progress.

    Args:
        max_checks: Maximum number of status checks to perform.
        interval_seconds: Pause between two consecutive checks, in seconds.

    The loop stops early as soon as ``check_training_status()`` returns a
    falsy value. The completion line is printed in either case.
    """
    print("🔍 开始监控Phase 4B训练状态...")
    print("=" * 50)

    for attempt in range(1, max_checks + 1):
        print(f"\n🔄 第 {attempt}/{max_checks} 次检查 (每{interval_seconds}秒)")
        print(f"时间: {time.strftime('%H:%M:%S')}")

        if not check_training_status():
            print("❌ 训练异常,停止监控")
            break

        # No point in waiting after the final check: nothing follows it.
        if attempt < max_checks:
            time.sleep(interval_seconds)

    print("\n🏁 监控完成")
|
|||
|
|
|
|||
|
|
if __name__ == "__main__":
    # Start monitoring only when run as a script, not when imported.
    main()
|