74 lines
2.3 KiB
Python
74 lines
2.3 KiB
Python
#!/usr/bin/env python3
|
||
"""
|
||
Monitor the Phase 4B training status.

Periodically check whether segmentation performance has improved.
|
||
"""
|
||
|
||
import time
|
||
import subprocess
|
||
|
||
def check_training_status(log_path='phase4b_training_v2.log', min_processes=10,
                          tail_lines=10):
    """Check that training still appears to run and report the latest divider Dice.

    Counts the lines of ``ps aux`` output that mention ``python`` and, when at
    least ``min_processes`` are found, looks through the last ``tail_lines``
    lines of the training log for the most recent ``loss/map/divider/dice``
    entry. The Dice loss, the derived coefficient (``1 - loss``) and a coarse
    verdict are printed.

    Args:
        log_path: Path of the training log to inspect.
        min_processes: Minimum number of matching process lines; fewer is
            reported as a failure.
        tail_lines: How many trailing log lines are searched for a Dice entry.

    Returns:
        ``False`` when fewer than ``min_processes`` matching processes are
        found or when the check itself raises; ``True`` otherwise, including
        when the log file is missing or its tail holds no Dice entry.
    """
    # Imported locally so the block needs no additional file-level imports.
    import re
    from collections import deque

    # Accept plain decimals and scientific notation, and stop before trailing
    # punctuation (e.g. a sentence-ending period) that float() would reject.
    dice_pattern = re.compile(
        r'loss/map/divider/dice: '
        r'((?:[0-9]+(?:\.[0-9]*)?|\.[0-9]+)(?:[eE][-+]?[0-9]+)?)'
    )

    try:
        # Count process-list lines mentioning "python" (the grep itself excluded).
        result = subprocess.run(
            "ps aux | grep python | grep -v grep | wc -l",
            shell=True, capture_output=True, text=True,
        )
        process_count = int(result.stdout.strip())

        print(f"🖥️ Python进程数量: {process_count}")

        if process_count < min_processes:
            print("❌ 训练进程异常")
            return False

        try:
            # Stream the file and keep only its tail instead of loading the
            # whole (growing) log; undecodable bytes must not abort the check.
            with open(log_path, 'r', encoding='utf-8', errors='replace') as log_file:
                recent_lines = deque(log_file, maxlen=tail_lines)
        except FileNotFoundError:
            print("📄 日志文件不存在")
            return True

        # Walk newest-first so the reported value is the latest one logged.
        for line in reversed(recent_lines):
            match = dice_pattern.search(line)
            if match is None:
                continue

            dice_loss = float(match.group(1))
            dice_coeff = 1 - dice_loss
            print(f"Dice Loss: {dice_loss:.4f}")
            print(f"Dice系数: {dice_coeff:.4f}")

            if dice_coeff > 0.5:
                print("🎉 分割性能显著改善!")
            elif dice_coeff > 0.1:
                print("✅ 分割性能有所改善")
            else:
                print("⚠️ 分割性能仍需改善")
            break

        return True

    except Exception as e:
        # Any unexpected failure of the check is reported as "not healthy".
        print(f"❌ 检查失败: {e}")
        return False
|
||
|
||
def main(max_checks=60, interval_seconds=30):
    """Poll the training status repeatedly and print a report for each check.

    Runs up to ``max_checks`` checks, waiting ``interval_seconds`` between
    consecutive checks, and stops early as soon as ``check_training_status``
    returns a falsy value.

    Args:
        max_checks: Maximum number of status checks to perform.
        interval_seconds: Seconds to wait between two consecutive checks.
    """
    print("🔍 开始监控Phase 4B训练状态...")
    print("=" * 50)

    for attempt in range(1, max_checks + 1):
        print(f"\n🔄 第 {attempt}/{max_checks} 次检查 (每{interval_seconds}秒)")
        print(f"时间: {time.strftime('%H:%M:%S')}")

        if not check_training_status():
            print("❌ 训练异常,停止监控")
            break

        # Nothing follows the final check, so sleeping again would only
        # delay the exit by one full interval.
        if attempt < max_checks:
            time.sleep(interval_seconds)

    print("\n🏁 监控完成")
|
||
|
||
# Script entry point: start monitoring only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
|