Taken from https://github.com/ModelTC/LightLLM/blob/8ed97c74c18f11505b048b1ba00ba5c0cef8bff6/lightllm/common/fused_moe/deepep_scatter_gather.py and adapted to fit vLLM's needs and terminology.
 
 _fwd_kernel_ep_gather(
    total_token_num,
    input_tensor,
    input_tensor_stride0,
    input_tensor_stride1,
    recv_topk_ids,
    recv_topk_ids_stride0,
    recv_topk_ids_stride1,
    recv_topk_weight,
    recv_topk_weight_stride0,
    recv_topk_weight_stride1,
    input_index,
    input_index_stride0,
    input_index_stride1,
    output_tensor,
    output_tensor_stride0,
    output_tensor_stride1,
    topk_num: constexpr,
    expert_map,
    HAS_EXPERT_MAP: constexpr,
    BLOCK_D: constexpr,
)
Source code in vllm/model_executor/layers/fused_moe/deep_gemm_utils.py
  
 _fwd_kernel_ep_scatter_1(
    num_recv_tokens_per_expert,
    expert_start_loc,
    m_indices,
    num_experts: constexpr,
    BLOCK_E: constexpr,
    BLOCK_EXPERT_NUM: constexpr,
)
Source code in vllm/model_executor/layers/fused_moe/deep_gemm_utils.py
  
 _fwd_kernel_ep_scatter_2(
    total_token_num,
    expert_start_loc,
    recv_x,
    recv_x_stride0,
    recv_x_stride1,
    recv_x_scale,
    recv_x_scale_stride0,
    recv_x_scale_stride1,
    recv_topk,
    recv_topk_stride0,
    recv_topk_stride1,
    output_tensor,
    output_tensor_stride0,
    output_tensor_stride1,
    output_tensor_scale,
    output_tensor_scale_stride0,
    output_tensor_scale_stride1,
    output_index,
    output_index_stride0,
    output_index_stride1,
    topk_num: constexpr,
    expert_map,
    HAS_EXPERT_MAP: constexpr,
    HIDDEN_SIZE: constexpr,
    HIDDEN_SIZE_PAD: constexpr,
    SCALE_HIDDEN_SIZE: constexpr,
    SCALE_HIDDEN_SIZE_PAD: constexpr,
)
Source code in vllm/model_executor/layers/fused_moe/deep_gemm_utils.py
  
    
 compute_aligned_M(
    M: int,
    num_topk: int,
    local_num_experts: int,
    alignment: int,
    expert_tokens_meta: ExpertTokensMetadata | None,
)
Source code in vllm/model_executor/layers/fused_moe/deep_gemm_utils.py
  
 deepgemm_moe_permute(
    aq: Tensor,
    aq_scale: Tensor,
    topk_ids: Tensor,
    local_num_experts: int,
    expert_map: Tensor | None,
    expert_tokens_meta: ExpertTokensMetadata | None,
    aq_out: Tensor | None = None,
)
Source code in vllm/model_executor/layers/fused_moe/deep_gemm_utils.py
  
 deepgemm_unpermute_and_reduce(
    a: Tensor,
    topk_ids: Tensor,
    topk_weights: Tensor,
    inv_perm: Tensor,
    expert_map: Tensor | None,
    output: Tensor,
)
Source code in vllm/model_executor/layers/fused_moe/deep_gemm_utils.py
  
 ep_gather(
    input_tensor: Tensor,
    recv_topk_ids: Tensor,
    recv_topk_weight: Tensor,
    input_index: Tensor,
    expert_map: Tensor | None,
    output_tensor: Tensor,
)
Source code in vllm/model_executor/layers/fused_moe/deep_gemm_utils.py
  
 ep_scatter(
    recv_x: Tensor,
    recv_x_scale: Tensor,
    recv_topk: Tensor,
    num_recv_tokens_per_expert: Tensor,
    expert_map: Tensor | None,
    expert_start_loc: Tensor,
    output_tensor: Tensor,
    output_tensor_scale: Tensor,
    m_indices: Tensor,
    output_index: Tensor,
)