#!/bin/bash
# Epoch 23 evaluation, fixed version: runs inside the same Docker container as
# training, in parallel with it, on GPUs 4-7.
# Key fixes: workers=0, CUDA_VISIBLE_DEVICES=4,5,6,7

# Abort on the first failing command. Only -e is enabled, so a pipeline still
# reports the exit status of its last command.
set -e

# Put the conda toolchain first on PATH and expose the torch / CUDA shared
# libraries to the dynamic loader.
export PATH=/opt/conda/bin:$PATH
# ${VAR:+:$VAR} appends the previous value only when it is non-empty. A bare
# ":$VAR" would leave a trailing empty entry when the variable is unset, and
# the dynamic loader treats an empty LD_LIBRARY_PATH entry as the current
# working directory.
export LD_LIBRARY_PATH=/opt/conda/lib/python3.8/site-packages/torch/lib:/opt/conda/lib:/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}
export PYTHONPATH=/workspace/bevfusion${PYTHONPATH:+:$PYTHONPATH}

# Every relative path used below (configs/, runs/, tools/, eval_results/) is
# resolved against the project root.
cd /workspace/bevfusion

#######################################
# Print the start-up banner describing this evaluation run: checkpoint name,
# GPUs used and dataloader worker count, followed by one blank line.
# Arguments: none
# Outputs:   eight lines on stdout
#######################################
print_banner() {
  local rule="========================================================================"
  printf '%s\n' \
    "$rule" \
    "Phase 3 Epoch 23 评估(GPU 4-7并行)" \
    "$rule" \
    "Checkpoint: epoch_23.pth" \
    "使用GPU: 4-7 (不影响GPU 0-3的训练)" \
    "Workers: 0 (避免共享内存问题)" \
    "$rule" \
    ""
}

print_banner

# Create a fresh, timestamped output directory for this evaluation run so that
# repeated runs do not overwrite each other's results.
EVAL_DIR="eval_results/epoch23_$(date +%Y%m%d_%H%M%S)"
mkdir -p "$EVAL_DIR"

# Model config and checkpoint to evaluate (paths relative to the current
# working directory).
CONFIG="configs/nuscenes/det/transfusion/secfpn/camera+lidar/swint_v0p075/multitask_enhanced_phase1_HIGHRES.yaml"
CHECKPOINT="runs/enhanced_from_epoch19/epoch_23.pth"

echo "配置: $CONFIG"
echo "Checkpoint: $CHECKPOINT"
echo "输出目录: $EVAL_DIR"
echo ""

# Check that training is still running before the evaluation starts.
# TRAIN_PROCS holds the number of `ps aux` lines that mention both "train.py"
# and "python"; `grep -v grep` drops the grep processes of this very pipeline.
# The pipeline ends in `wc -l`, so it succeeds even when nothing matches and
# does not trip `set -e`.
echo "验证训练状态..."
TRAIN_PROCS=$(ps aux | grep "train.py" | grep python | grep -v grep | wc -l)
if (( TRAIN_PROCS > 0 )); then
  echo "✓ 训练正常运行中 ($TRAIN_PROCS 进程)"
else
  echo "⚠️ 警告: 未检测到训练进程"
fi
echo ""

echo "开始评估..."
echo "预计时间: 2-3小时"
echo "日志文件: $EVAL_DIR/eval.log"
echo ""

# Key settings:
# - CUDA_VISIBLE_DEVICES=4,5,6,7: use GPUs 4-7 only
# - workers_per_gpu=0: avoid shared-memory problems
# - -np 4: launch four processes (one per GPU)
#
# stdout and stderr are merged and duplicated into eval.log through tee.
CUDA_VISIBLE_DEVICES=4,5,6,7 \
LD_LIBRARY_PATH=/opt/conda/lib/python3.8/site-packages/torch/lib:/opt/conda/lib:/usr/local/cuda/lib64:$LD_LIBRARY_PATH \
PATH=/opt/conda/bin:$PATH \
/opt/conda/bin/torchpack dist-run -np 4 /opt/conda/bin/python tools/test.py \
  "$CONFIG" \
  "$CHECKPOINT" \
  --eval bbox \
  --out "$EVAL_DIR/results.pkl" \
  --cfg-options data.workers_per_gpu=0 data.samples_per_gpu=1 \
  2>&1 | tee "$EVAL_DIR/eval.log"
# The pipeline's own status is tee's, so `set -e` cannot see a failed
# evaluation. PIPESTATUS must be read immediately after the pipeline, before
# any other command overwrites it.
eval_status=${PIPESTATUS[0]}
if (( eval_status != 0 )); then
  echo "错误: 评估失败 (退出码 $eval_status),请查看日志: $EVAL_DIR/eval.log" >&2
  exit "$eval_status"
fi

#######################################
# Print the end-of-run report: where the results and log were written, then
# the key metric lines extracted from the evaluation log.
# Arguments: $1 - evaluation output directory containing eval.log
# Outputs:   report on stdout. Metric lines are the log lines matching
#            NDS / mAP / mIoU / IoU, minus any line containing "UserWarning",
#            limited to the last 50 matches.
#######################################
print_eval_summary() {
  local eval_dir=$1
  local rule="========================================================================"

  printf '%s\n' \
    "" \
    "$rule" \
    "评估完成!" \
    "$rule" \
    "结果文件: $eval_dir/results.pkl" \
    "日志文件: $eval_dir/eval.log" \
    "" \
    "$rule" \
    "性能指标摘要:" \
    "$rule"

  # The pipeline ends in tail, so a log without metric lines yields an empty
  # summary rather than a failing status.
  grep -E "(NDS|mAP|mIoU|IoU)" -- "$eval_dir/eval.log" \
    | grep -v "UserWarning" \
    | tail -n 50

  printf '%s\n' \
    "" \
    "完整结果请查看: $eval_dir/eval.log" \
    "$rule"
}

print_eval_summary "$EVAL_DIR"

# Re-check that training is still running now that the evaluation is over.
# The count uses the same `ps aux` filter as the pre-evaluation check
# ("train.py" and "python" on the line, grep's own processes excluded), so the
# two numbers are directly comparable.
TRAIN_PROCS_AFTER=$(ps aux | grep "train.py" | grep python | grep -v grep | wc -l)
echo ""
echo "训练状态确认: $TRAIN_PROCS_AFTER 进程运行中"
# TRAIN_PROCS is the count taken before the evaluation; treat it as 0 when it
# is unset so the comparison stays a valid integer test.
if (( TRAIN_PROCS_AFTER < ${TRAIN_PROCS:-0} )); then
  echo "⚠️ 警告: 训练进程数减少,请检查训练状态!"
else
  echo "✓ 训练未受影响"
fi
echo "========================================================================"