#!/bin/bash # ๐Ÿš€๐Ÿš€๐Ÿš€ BEVFusion Phase 4B: RMT-PPAD Segmentation Training ๐Ÿš€๐Ÿš€๐Ÿš€ # ๅขžๅผบๆ•ฐๅ€ผ็จณๅฎšๆ€ง็‰ˆๆœฌ - ๅŒ…ๅซNaN/infๆฃ€ๆต‹ๅ’Œ่‡ชๅŠจๆขๅคๆœบๅˆถ echo "๐Ÿ”ง๐Ÿ”ง๐Ÿ”ง ๆ•ฐๅ€ผ็จณๅฎšๆ€งๅขžๅผบ่ฎญ็ปƒๅฏๅŠจ่„šๆœฌ ๐Ÿ”ง๐Ÿ”ง๐Ÿ”ง" echo "Phase 4B: RMT-PPAD Transformerๅˆ†ๅ‰ฒ้›†ๆˆ" echo "ๆ—ถ้—ด: $(date)" echo "===========================================" # ้…็ฝฎๅ‚ๆ•ฐ CONFIG_FILE="configs/nuscenes/det/transfusion/secfpn/camera+lidar/swint_v0p075/multitask_BEV2X_phase4b_rmtppad_segmentation.yaml" WORK_DIR="/data/runs/phase4b_rmtppad_segmentation" GPUS=8 PORT=29500 # ๐Ÿ”„๐Ÿ”„๐Ÿ”„ ๆขๅค่ฎญ็ปƒ้…็ฝฎ ๐Ÿ”„๐Ÿ”„๐Ÿ”„ RESUME_FROM="/workspace/bevfusion/runs/run-326653dc-41917c58/latest.pth" AUTO_RESUME=false # ๆ˜ฏๅฆ่‡ชๅŠจไปŽๆœ€ๆ–ฐcheckpointๆขๅค echo "๐Ÿ“‹ ่ฎญ็ปƒ้…็ฝฎ:" echo " - ้…็ฝฎๆ–‡ไปถ: $CONFIG_FILE" echo " - ๅทฅไฝœ็›ฎๅฝ•: $WORK_DIR" echo " - GPUๆ•ฐ้‡: $GPUS" echo " - ไธป็ซฏๅฃ: $PORT" if [ -f "$RESUME_FROM" ]; then echo " - ๆขๅคๆจกๅผ: โœ… ไปŽcheckpointๆขๅค" echo " - Checkpoint: $RESUME_FROM" else echo " - ๆขๅคๆจกๅผ: โŒ ไปŽๅคดๅผ€ๅง‹่ฎญ็ปƒ" fi echo "" # ๅˆ›ๅปบๅทฅไฝœ็›ฎๅฝ• mkdir -p $WORK_DIR # ๐Ÿ”ง๐Ÿ”ง๐Ÿ”ง ๆ•ฐๅ€ผ็จณๅฎšๆ€ง็›‘ๆŽงๅ‡ฝๆ•ฐ ๐Ÿ”ง๐Ÿ”ง๐Ÿ”ง monitor_numerical_stability() { echo "๐Ÿ“Š ๅฏๅŠจๆ•ฐๅ€ผ็จณๅฎšๆ€ง็›‘ๆŽง..." while true; do # ๆฃ€ๆŸฅๆ˜ฏๅฆๆœ‰ๆ–ฐ็š„ๆ•ฐๅ€ผ้—ฎ้ข˜ if grep -q "Warning: Invalid values detected in cost matrix" $WORK_DIR/train.log 2>/dev/null; then echo "โš ๏ธ ๆฃ€ๆต‹ๅˆฐๆ•ฐๅ€ผ้—ฎ้ข˜๏ผŒ่ฎฐๅฝ•ๅทฒไฟๅญ˜ๅˆฐๆ—ฅๅฟ—" fi # ๆฃ€ๆŸฅ่ฎญ็ปƒๆ˜ฏๅฆไปๅœจ่ฟ่กŒ if ! pgrep -f "python.*train.py" > /dev/null; then echo "โ„น๏ธ ่ฎญ็ปƒ่ฟ›็จ‹ๅทฒ็ป“ๆŸ" break fi sleep 60 # ๆฏๅˆ†้’Ÿๆฃ€ๆŸฅไธ€ๆฌก done } # ๅฏๅŠจ็›‘ๆŽง๏ผˆๅŽๅฐ่ฟ่กŒ๏ผ‰ monitor_numerical_stability & # ๐Ÿš€ ๅฏๅŠจๅˆ†ๅธƒๅผ่ฎญ็ปƒ echo "๐Ÿš€ ๅฏๅŠจ8ๅกๅˆ†ๅธƒๅผ่ฎญ็ปƒ..." # ๆฃ€ๆŸฅๆ˜ฏๅฆไปŽcheckpointๆขๅค RESUME_ARGS="" if [ -f "$RESUME_FROM" ]; then echo "โœ… ๆ‰พๅˆฐcheckpointๆ–‡ไปถ: $RESUME_FROM" RESUME_ARGS="--resume-from $RESUME_FROM" echo "๐Ÿ”„ ๅฐ†ไปŽcheckpointๆขๅค่ฎญ็ปƒ" elif [ "$AUTO_RESUME" = true ]; then echo "๐Ÿ” ๅฏ็”จ่‡ชๅŠจๆขๅคๆจกๅผ๏ผŒๆŸฅๆ‰พๆœ€ๆ–ฐcheckpoint..." LATEST_CHECKPOINT=$(find /workspace/bevfusion/runs/ -name "latest.pth" -type l -exec ls -t {} + 2>/dev/null | head -1) if [ -n "$LATEST_CHECKPOINT" ]; then RESUME_ARGS="--resume-from $LATEST_CHECKPOINT" echo "๐Ÿ”„ ่‡ชๅŠจๆขๅคไปŽ: $LATEST_CHECKPOINT" fi fi # โœ… ่ฎพ็ฝฎ็Žฏๅขƒๅ˜้‡ (ๅ‚่€ƒๆˆๅŠŸ่„šๆœฌ) export PATH=/opt/conda/bin:$PATH export LD_LIBRARY_PATH=/opt/conda/lib/python3.8/site-packages/torch/lib:/opt/conda/lib:/usr/local/cuda/lib64:$LD_LIBRARY_PATH export PYTHONPATH=/workspace/bevfusion:$PYTHONPATH # ่ฎพ็ฝฎGPU export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 # ้ชŒ่ฏ็Žฏๅขƒ echo "" echo "=== ็Žฏๅขƒ้ชŒ่ฏ ===" /opt/conda/bin/python -c "import torch; print('โœ… PyTorch:', torch.__version__)" || { echo "โŒ PyTorchๅฏผๅ…ฅๅคฑ่ดฅ" exit 1 } /opt/conda/bin/python -c "import mmcv; print('โœ… mmcv:', mmcv.__version__)" || { echo "โŒ mmcvๅฏผๅ…ฅๅคฑ่ดฅ" exit 1 } which torchpack || { echo "โŒ torchpackๆœชๆ‰พๅˆฐ" exit 1 } echo "โœ… torchpack: $(which torchpack)" echo "ๅ‘ฝไปค: torchpack dist-run -np $GPUS /opt/conda/bin/python tools/train.py $CONFIG_FILE --load_from $RESUME_FROM --data.samples_per_gpu 1 --data.workers_per_gpu 0" torchpack dist-run \ -np $GPUS \ /opt/conda/bin/python tools/train.py \ $CONFIG_FILE \ --load_from "$RESUME_FROM" \ --data.samples_per_gpu 1 \ --data.workers_per_gpu 0 \ > "$WORK_DIR/train.log" 2>&1 & # ่Žทๅ–่ฟ›็จ‹ID TRAIN_PID=$! echo "" echo "โœ… Phase 4B training started!" echo "๐Ÿ“Š Check results in: $WORK_DIR" echo "๐Ÿ“ˆ Monitor progress: tail -f $WORK_DIR/train.log" echo "" echo "โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•" echo "โœ… ่ฎญ็ปƒๅทฒๅœจๅŽๅฐๅฏๅŠจ๏ผ" echo "โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•" echo "" echo "่ฟ›็จ‹ID: $TRAIN_PID" echo "ๆ—ฅๅฟ—ๆ–‡ไปถ: $WORK_DIR/train.log" echo "" echo "โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”" echo "็›‘ๆŽงๅ‘ฝไปค:" echo "โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”" echo "" echo "1. ๆŸฅ็œ‹ๅฎžๆ—ถๆ—ฅๅฟ—:" echo " tail -f $WORK_DIR/train.log" echo "" echo "2. ๆŸฅ็œ‹ๅ…ณ้”ฎๆŒ‡ๆ ‡:" echo " tail -f $WORK_DIR/train.log | grep -E 'Epoch|loss/map/divider|loss/object'" echo "" echo "3. ๆฃ€ๆŸฅ่ฟ›็จ‹็Šถๆ€:" echo " ps aux | grep $TRAIN_PID" echo "" echo "4. GPU็›‘ๆŽง:" echo " nvidia-smi -l 5" echo "" echo "5. ๅœๆญข่ฎญ็ปƒ:" echo " kill $TRAIN_PID" echo "" echo "โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”"