"""
Global Context Aggregation (GCA) Module

Reference: RMT-PPAD (2025) - Real-time Multi-task Learning for Panoptic Perception
论文: arXiv:2508.06529

核心思想:
    通过全局平均池化捕获全局上下文信息，然后通过通道注意力机制
    重标定特征，增强全局一致性和语义理解。

实现细节:
    1. 全局平均池化: 将空间维度压缩为1×1
    2. 两层MLP: 降维→ReLU→升维→Sigmoid
    3. 特征重标定: 原特征 × 注意力权重

适用场景:
    - BEV分割（特别是细长结构如Divider、Lane）
    - 3D目标检测（Heatmap生成）
    - 任何需要全局一致性的任务
"""

import torch
import torch.nn as nn


class GCA(nn.Module):
    """
    Global Context Aggregation Module
    
    通过全局池化捕获全局上下文，然后通过通道注意力重标定特征。
    本质上是Squeeze-and-Excitation (SE) Network的变体。
    
    Args:
        in_channels (int): 输入特征通道数
        reduction (int): 降维比例，默认4（中间层通道数=in_channels/4）
            - reduction越大，参数越少，但表达能力越弱
            - 推荐: 4-8之间
        use_max_pool (bool): 是否同时使用最大池化（CBAM风格），默认False
            - True: 同时使用AvgPool和MaxPool，参数量翻倍
            - False: 只使用AvgPool（标准SE-Net）
        min_channels (int): 中间层最小通道数，防止过度降维，默认8
    
    Shape:
        - Input: (B, C, H, W) - 任意空间分辨率的特征图
        - Output: (B, C, H, W) - 与输入相同shape
    
    Examples:
        >>> # 基础用法
        >>> gca = GCA(in_channels=512, reduction=4)
        >>> x = torch.randn(2, 512, 180, 180)
        >>> out = gca(x)
        >>> print(out.shape)  # torch.Size([2, 512, 180, 180])
        
        >>> # CBAM风格（同时使用avg和max pool）
        >>> gca_cbam = GCA(in_channels=256, reduction=4, use_max_pool=True)
        >>> x = torch.randn(2, 256, 360, 360)
        >>> out = gca_cbam(x)
        
        >>> # 参数量统计
        >>> params = sum(p.numel() for p in gca.parameters())
        >>> print(f"Parameters: {params:,}")  # ~131,072 for C=512, r=4
    
    References:
        [1] RMT-PPAD: arXiv:2508.06529
        [2] SE-Net: "Squeeze-and-Excitation Networks", CVPR 2018
        [3] CBAM: "Convolutional Block Attention Module", ECCV 2018
    """
    
    def __init__(
        self, 
        in_channels: int, 
        reduction: int = 4,
        use_max_pool: bool = False,
        min_channels: int = 8
    ):
        super().__init__()
        
        # 参数校验
        assert in_channels > 0, f"in_channels must be positive, got {in_channels}"
        assert reduction > 0, f"reduction must be positive, got {reduction}"
        assert min_channels > 0, f"min_channels must be positive, got {min_channels}"
        
        self.in_channels = in_channels
        self.reduction = reduction
        self.use_max_pool = use_max_pool
        
        # 全局池化层
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        if use_max_pool:
            self.max_pool = nn.AdaptiveMaxPool2d(1)
        
        # 通道注意力网络（两层MLP，使用1x1卷积实现）
        # 计算中间层通道数，确保不小于min_channels
        hidden_channels = max(in_channels // reduction, min_channels)
        
        self.fc = nn.Sequential(
            # 降维层：减少参数量，引入瓶颈
            nn.Conv2d(in_channels, hidden_channels, 1, bias=False),
            nn.ReLU(inplace=True),
            
            # 升维层：恢复到原通道数
            nn.Conv2d(hidden_channels, in_channels, 1, bias=False),
            
            # Sigmoid: 输出归一化到[0,1]，作为注意力权重
            nn.Sigmoid()
        )
        
        # 初始化权重
        self._init_weights()
    
    def _init_weights(self):
        """
        初始化网络权重
        使用Kaiming初始化，适合ReLU激活函数
        """
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
    
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        前向传播
        
        流程:
            1. 全局池化: (B, C, H, W) → (B, C, 1, 1)
            2. MLP: (B, C, 1, 1) → (B, C, 1, 1)
            3. 特征重标定: (B, C, H, W) × (B, C, 1, 1) → (B, C, H, W)
        
        Args:
            x (Tensor): 输入特征，shape=(B, C, H, W)
                - B: Batch size
                - C: Channels（必须等于in_channels）
                - H: Height（任意）
                - W: Width（任意）
        
        Returns:
            Tensor: 增强后的特征，shape=(B, C, H, W)
                - 与输入shape完全相同
                - 每个通道根据全局重要性被重新标定
        
        Note:
            - 该模块保持空间维度不变
            - 只调整通道维度的重要性
            - 可以插入任何需要全局上下文的位置
        """
        b, c, h, w = x.size()
        
        # 校验输入通道数
        assert c == self.in_channels, (
            f"Input channels {c} doesn't match module channels {self.in_channels}"
        )
        
        # ========== 第一步: 全局信息聚合 ==========
        if self.use_max_pool:
            # CBAM风格: 同时使用平均池化和最大池化
            # 平均池化: 捕获全局平均响应
            avg_out = self.avg_pool(x)  # (B, C, 1, 1)
            # 最大池化: 捕获全局最强响应
            max_out = self.max_pool(x)  # (B, C, 1, 1)
            
            # 分别通过MLP后相加（element-wise addition）
            # 这样可以综合两种池化的优势
            attention = self.fc(avg_out) + self.fc(max_out)
        else:
            # 标准GCA/SE-Net: 只使用平均池化
            # 平均池化: 将每个通道的空间信息压缩为一个标量
            # 数学表示: z_c = (1/HW) * Σ_{i,j} x_c(i,j)
            avg_out = self.avg_pool(x)  # (B, C, 1, 1)
            
            # 通过两层MLP生成通道注意力权重
            # FC1: C → C/r (降维，减少参数)
            # ReLU: 非线性激活
            # FC2: C/r → C (升维，恢复通道数)
            # Sigmoid: 归一化到[0,1]
            attention = self.fc(avg_out)  # (B, C, 1, 1)
        
        # ========== 第二步: 特征重标定 ==========
        # 逐通道相乘（channel-wise multiplication）
        # Broadcasting: (B, C, H, W) * (B, C, 1, 1) = (B, C, H, W)
        # 
        # 效果:
        # - 如果attention[c] ≈ 1: 该通道被保留（重要通道）
        # - 如果attention[c] ≈ 0: 该通道被抑制（不重要通道）
        out = x * attention
        
        return out
    
    def extra_repr(self) -> str:
        """
        额外信息，用于print(model)时显示
        
        Returns:
            str: 模块的关键参数信息
        """
        return (
            f"in_channels={self.in_channels}, "
            f"reduction={self.reduction}, "
            f"use_max_pool={self.use_max_pool}, "
            f"params≈{self._get_param_count()/1e6:.2f}M"
        )
    
    def _get_param_count(self) -> int:
        """计算参数量"""
        return sum(p.numel() for p in self.parameters())


# ========== 单元测试 ==========
if __name__ == "__main__":
    print("=" * 80)
    print("Testing GCA Module")
    print("=" * 80)
    
    # 测试1: 基础功能
    print("\n[Test 1] Basic functionality...")
    gca = GCA(in_channels=512, reduction=4)
    print(f"Module: {gca}")
    
    # 计算参数量
    params = sum(p.numel() for p in gca.parameters())
    print(f"Parameters: {params:,} ({params/1e6:.3f}M)")
    
    # 前向传播测试
    x = torch.randn(2, 512, 180, 180)
    out = gca(x)
    print(f"Input shape: {x.shape}")
    print(f"Output shape: {out.shape}")
    assert out.shape == x.shape, "❌ Shape mismatch!"
    print("✅ Basic test passed")
    
    # 测试2: CBAM风格
    print("\n[Test 2] CBAM-style (avg+max pool)...")
    gca_cbam = GCA(in_channels=256, reduction=4, use_max_pool=True)
    params_cbam = sum(p.numel() for p in gca_cbam.parameters())
    print(f"Parameters (CBAM): {params_cbam:,} ({params_cbam/1e6:.3f}M)")
    
    x2 = torch.randn(2, 256, 360, 360)
    out2 = gca_cbam(x2)
    print(f"Input shape: {x2.shape}")
    print(f"Output shape: {out2.shape}")
    assert out2.shape == x2.shape, "❌ Shape mismatch!"
    print("✅ CBAM test passed")
    
    # 测试3: 不同reduction ratio
    print("\n[Test 3] Different reduction ratios...")
    for r in [2, 4, 8, 16]:
        gca_r = GCA(in_channels=512, reduction=r)
        params_r = sum(p.numel() for p in gca_r.parameters())
        print(f"  reduction={r:2d}: {params_r:>8,} params ({params_r/1e6:.3f}M)")
    print("✅ Reduction ratio test passed")
    
    # 测试4: CUDA测试
    if torch.cuda.is_available():
        print("\n[Test 4] CUDA compatibility...")
        gca_cuda = gca.cuda()
        x_cuda = torch.randn(4, 512, 180, 180).cuda()
        
        # Warmup
        for _ in range(10):
            _ = gca_cuda(x_cuda)
        
        # 测速
        import time
        torch.cuda.synchronize()
        start = time.time()
        for _ in range(100):
            out_cuda = gca_cuda(x_cuda)
            torch.cuda.synchronize()
        elapsed = time.time() - start
        
        print(f"CUDA output shape: {out_cuda.shape}")
        print(f"Average latency: {elapsed/100*1000:.2f}ms (100 iterations)")
        print("✅ CUDA test passed")
    else:
        print("\n[Test 4] CUDA not available, skipped")
    
    # 测试5: 梯度测试
    print("\n[Test 5] Gradient flow...")
    gca_grad = GCA(in_channels=128, reduction=4)
    x_grad = torch.randn(2, 128, 100, 100, requires_grad=True)
    out_grad = gca_grad(x_grad)
    loss = out_grad.sum()
    loss.backward()
    
    assert x_grad.grad is not None, "❌ No gradient!"
    print(f"Input gradient shape: {x_grad.grad.shape}")
    print(f"Gradient mean: {x_grad.grad.mean():.6f}")
    print(f"Gradient std: {x_grad.grad.std():.6f}")
    print("✅ Gradient test passed")
    
    # 测试6: 边界情况
    print("\n[Test 6] Edge cases...")
    
    # 小通道数
    gca_small = GCA(in_channels=16, reduction=4, min_channels=4)
    x_small = torch.randn(1, 16, 50, 50)
    out_small = gca_small(x_small)
    assert out_small.shape == x_small.shape, "❌ Small channels failed!"
    print("  ✅ Small channels (16) passed")
    
    # 大特征图
    gca_large = GCA(in_channels=64, reduction=4)
    x_large = torch.randn(1, 64, 600, 600)
    out_large = gca_large(x_large)
    assert out_large.shape == x_large.shape, "❌ Large feature map failed!"
    print("  ✅ Large feature map (600×600) passed")
    
    # 小特征图
    x_tiny = torch.randn(1, 512, 1, 1)
    # 如果在GPU上测试，需要移到CPU
    gca_cpu = GCA(in_channels=512, reduction=4)
    out_tiny = gca_cpu(x_tiny)
    assert out_tiny.shape == x_tiny.shape, "❌ Tiny feature map failed!"
    print("  ✅ Tiny feature map (1×1) passed")
    
    print("\n" + "=" * 80)
    print("✅ All tests passed! GCA module is ready to use.")
    print("=" * 80)
    
    # 打印使用建议
    print("\nUsage recommendations:")
    print("  - For segmentation heads (512 channels): reduction=4")
    print("  - For detection heads (128 channels): reduction=4")
    print("  - Use use_max_pool=False for efficiency (standard SE-Net)")
    print("  - Expected latency: 2-3ms on V100")
    print("  - Expected improvement: 3-5% for fine structures (divider, lane)")