_count_expert_num_tokens(
    topk_ids_ptr,
    expert_num_tokens_ptr,
    num_experts,
    topk_numel,
    expert_map,
    HAS_EXPERT_MAP: constexpr,
    BLOCK_SIZE: constexpr,
)
Source code in vllm/model_executor/layers/fused_moe/utils.py
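The kernel above carries no docstring; as a reading aid, here is a hypothetical eager-mode sketch of the computation it performs (a per-expert token histogram, optionally remapped through expert_map). The helper name and the "-1 means not local" convention are assumptions for illustration, not the kernel's contract.

import torch

def count_expert_num_tokens_reference(
    topk_ids: torch.Tensor,           # (num_tokens, topk) global expert ids
    num_experts: int,                 # number of local experts
    expert_map: torch.Tensor | None,  # global -> local id map, -1 if not local
) -> torch.Tensor:
    ids = topk_ids.flatten().to(torch.long)
    if expert_map is not None:
        ids = expert_map[ids]   # remap into the local expert space
    ids = ids[ids >= 0]         # drop assignments to experts on other ranks
    return torch.bincount(ids, minlength=num_experts)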
  
_fp8_perm(m: Tensor, idx: Tensor) -> Tensor
A permutation routine that works on fp8 types.
Source code in vllm/model_executor/layers/fused_moe/utils.py
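A sketch of why a dedicated helper exists, assuming the common view-as-uint8 workaround for indexing fp8 tensors (illustrative, not necessarily the exact source):

import torch

def fp8_perm_sketch(m: torch.Tensor, idx: torch.Tensor) -> torch.Tensor:
    # Some torch builds cannot index fp8 tensors directly, so reinterpret
    # the 1-byte storage as uint8, gather rows, and view back as fp8.
    if m.is_floating_point() and m.dtype.itemsize == 1:
        return m.view(dtype=torch.uint8)[idx].view(dtype=m.dtype)
    return m[idx]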
   
 _fp8_quantize(
    A: Tensor,
    A_scale: Tensor | None,
    per_act_token: bool,
    block_shape: list[int] | None = None,
) -> tuple[Tensor, Tensor]
Perform fp8 quantization on the inputs. If a block_shape is provided, the output will be blocked.
Source code in vllm/model_executor/layers/fused_moe/utils.py
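A minimal usage sketch, assuming a CUDA device, the torch.float8_e4m3fn dtype, and the scale-shape conventions sketched under _validate_scale_shape below; _int8_quantize follows the same contract with int8 output. The tensor sizes and block shape are illustrative.

import torch
from vllm.model_executor.layers.fused_moe.utils import _fp8_quantize

A = torch.randn(256, 512, device="cuda", dtype=torch.bfloat16)

# Per-tensor: a single scale for the whole activation tensor.
A_q, A_scale = _fp8_quantize(A, None, per_act_token=False)

# Per-token: one scale per row.
A_q, A_scale = _fp8_quantize(A, None, per_act_token=True)

# Blocked: scales are shared over [block_m, block_k] tiles.
A_q, A_scale = _fp8_quantize(A, None, per_act_token=False, block_shape=[128, 128])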
  
 _int8_quantize(
    A: Tensor,
    A_scale: Tensor | None,
    per_act_token: bool,
    block_shape: list[int] | None = None,
) -> tuple[Tensor, Tensor]
Perform int8 quantization on the inputs. If a block_shape is provided, the output will be blocked.
Source code in vllm/model_executor/layers/fused_moe/utils.py
  
 _mxfp4_quantize(
    A: Tensor,
    A_scale: Tensor | None,
    per_act_token_quant: bool,
    block_shape: list[int] | None = None,
) -> tuple[Tensor, None]
Source code in vllm/model_executor/layers/fused_moe/utils.py
  
 _mxfp6_e2m3_quantize(
    A: Tensor,
    A_scale: Tensor | None,
    per_act_token_quant: bool,
    block_shape: list[int] | None = None,
) -> tuple[Tensor, None]
Source code in vllm/model_executor/layers/fused_moe/utils.py
  
 _mxfp6_e3m2_quantize(
    A: Tensor,
    A_scale: Tensor | None,
    per_act_token_quant: bool,
    block_shape: list[int] | None = None,
) -> tuple[Tensor, None]
Source code in vllm/model_executor/layers/fused_moe/utils.py
  
 _mxfp8_e4m3_quantize(
    A: Tensor,
    A_scale: Tensor | None,
    per_act_token_quant: bool,
    block_shape: list[int] | None = None,
) -> tuple[Tensor, Tensor]
Source code in vllm/model_executor/layers/fused_moe/utils.py
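The five _mx*_quantize helpers above target OCP Microscaling (MX) formats, which share one power-of-two (E8M0) scale across each 32-element block. As background only, and not a description of these exact kernels, a rough sketch of that scaling scheme (the helper name and rounding details are assumptions):

import torch

MX_BLOCK = 32  # MX spec: 32 elements share one E8M0 (power-of-two) scale

def mx_block_scales(A: torch.Tensor, max_fmt: float) -> torch.Tensor:
    # max_fmt is the format's largest magnitude, e.g. 6.0 for fp4 (e2m1),
    # 7.5 for fp6 (e2m3), 28.0 for fp6 (e3m2), 448.0 for fp8 (e4m3).
    # Assumes A.numel() is divisible by 32.
    amax = A.reshape(-1, MX_BLOCK).abs().amax(dim=-1)
    amax = amax.clamp_min(torch.finfo(A.dtype).tiny)  # avoid log2(0)
    exp = torch.floor(torch.log2(amax / max_fmt))     # power-of-two scale
    return torch.exp2(exp)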
  
 _nvfp4_quantize(
    A: Tensor,
    A_scale: Tensor | None,
    is_sf_swizzled_layout: bool,
) -> tuple[Tensor, Tensor]
Source code in vllm/model_executor/layers/fused_moe/utils.py
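For orientation only (the swizzled scale-factor layout selected by is_sf_swizzled_layout is hardware-specific and not reproduced here): NVFP4 stores 4-bit e2m1 values with one fp8 (e4m3) scale per 16-element block plus a global fp32 scale. A schematic of the block scaling, with the helper name and exact rounding assumed:

import torch

NVFP4_BLOCK = 16  # NVFP4: 16 fp4 (e2m1) values share one fp8 (e4m3) scale
E2M1_MAX = 6.0    # largest magnitude representable in fp4 e2m1

def nvfp4_block_scales(A: torch.Tensor, global_scale: torch.Tensor) -> torch.Tensor:
    # One scale per 16-element block, normalized by the global scale and
    # stored in fp8; a schematic, not the kernel's exact arithmetic.
    amax = A.reshape(-1, NVFP4_BLOCK).abs().amax(dim=-1)
    return (amax / (E2M1_MAX * global_scale)).to(torch.float8_e4m3fn)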
   
_resize_cache(x: Tensor, v: tuple[int, ...]) -> Tensor
Shrink the given tensor and apply the given view to it. This is used to resize the intermediate fused_moe caches.
Source code in vllm/model_executor/layers/fused_moe/utils.py
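A minimal sketch of the trick, assuming the flatten-then-view implementation commonly used for this kind of helper (illustrative, not necessarily the exact source):

import math
import torch

def resize_cache_sketch(x: torch.Tensor, v: tuple[int, ...]) -> torch.Tensor:
    # Reuse the front of an oversized workspace as a smaller view, so each
    # fused_moe intermediate avoids a fresh allocation.
    assert math.prod(v) <= x.numel()
    return x.flatten()[: math.prod(v)].view(*v)

workspace = torch.empty(2**20)
tile = resize_cache_sketch(workspace, (64, 128))  # zero-copy (64, 128) view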
  
 _validate_scale_shape(
    a: Tensor,
    a_scale: Tensor | None,
    per_act_token_quant: bool,
    block_shape: list[int] | None,
) -> None
Source code in vllm/model_executor/layers/fused_moe/utils.py
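No docstring is attached to the validator; based on the conventions used by the quantize helpers above, the scale shapes it should accept are roughly these (a hedged sketch, with the helper name assumed; the exact checks live in the source):

from math import ceil
import torch

def expected_scale_shape(
    a: torch.Tensor,
    per_act_token_quant: bool,
    block_shape: list[int] | None,
) -> tuple[int, ...]:
    M, K = a.shape[-2], a.shape[-1]
    if block_shape is not None:
        return (M, ceil(K / block_shape[1]))  # one scale per K-group
    if per_act_token_quant:
        return (M, 1)                         # one scale per token row
    return (1,)                               # single per-tensor scale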
  
    
 count_expert_num_tokens(
    topk_ids: Tensor,
    num_local_experts: int,
    expert_map: Tensor | None,
) -> Tensor
Count the number of tokens assigned to each expert.
Parameters:
- topk_ids (torch.Tensor): Tensor mapping each token to its list of experts.
- num_local_experts (int): Number of experts on this rank.
- expert_map (Optional[torch.Tensor]): A tensor mapping expert indices from the global expert space to the local expert space of the expert parallel shard.
Returns:
A tensor of size num_local_experts, where tensor[i] holds the number of tokens assigned to the ith expert.
Source code in vllm/model_executor/layers/fused_moe/utils.py
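A small usage sketch on made-up routing data, assuming a CUDA device (tensor values are illustrative):

import torch
from vllm.model_executor.layers.fused_moe.utils import count_expert_num_tokens

# 4 tokens, top-2 routing over 4 experts, no expert parallelism.
topk_ids = torch.tensor(
    [[0, 2], [1, 2], [0, 3], [2, 3]], device="cuda", dtype=torch.int32
)
counts = count_expert_num_tokens(topk_ids, num_local_experts=4, expert_map=None)
# counts -> tensor([2, 1, 3, 2]): e.g. expert 2 is picked by tokens 0, 1, 3.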
  
 moe_kernel_quantize_input(
    A: Tensor,
    A_scale: Tensor | None,
    quant_dtype: None | dtype | str,
    per_act_token_quant: bool,
    block_shape: list[int] | None = None,
    is_fp4_scale_swizzled: bool = True,
) -> tuple[Tensor, Tensor | None]
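This is the dispatcher the fused-MoE layers call; it routes to one of the _*_quantize helpers above based on quant_dtype. A minimal usage sketch, assuming a CUDA device and that a quant_dtype of None passes the input through unchanged:

import torch
from vllm.model_executor.layers.fused_moe.utils import moe_kernel_quantize_input

A = torch.randn(256, 512, device="cuda", dtype=torch.bfloat16)

# No quantization requested: A comes back as-is with a None scale.
A_q, A_scale = moe_kernel_quantize_input(A, None, None, per_act_token_quant=False)

# fp8 quantization with one scale per token row.
A_q, A_scale = moe_kernel_quantize_input(
    A, None, torch.float8_e4m3fn, per_act_token_quant=True
)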