From 6f55f90ade39a90d51e6950c8f548b5b4b156d6c Mon Sep 17 00:00:00 2001 From: facebook-github-bot Date: Fri, 13 Dec 2024 04:56:24 +0000 Subject: [PATCH] =?UTF-8?q?Deploying=20to=20gh-pages=20from=20@=20pytorch/?= =?UTF-8?q?FBGEMM@5d361fc5f6f591c8274276a98eb4f34686a55729=20=F0=9F=9A=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ...able_batched_embeddings_ops_inference.html | 117 +++++++++++++++++- output.json | 58 ++++----- output.txt | 2 +- searchindex.js | 2 +- 4 files changed, 143 insertions(+), 36 deletions(-) diff --git a/_modules/fbgemm_gpu/split_table_batched_embeddings_ops_inference.html b/_modules/fbgemm_gpu/split_table_batched_embeddings_ops_inference.html index 36962c1371..48c14388b0 100644 --- a/_modules/fbgemm_gpu/split_table_batched_embeddings_ops_inference.html +++ b/_modules/fbgemm_gpu/split_table_batched_embeddings_ops_inference.html @@ -452,11 +452,14 @@

Source code for fbgemm_gpu.split_table_batched_embeddings_ops_inference

construct_cache_state, DEFAULT_SCALE_BIAS_SIZE_IN_BYTES, EmbeddingLocation, + EmbeddingSpecInfo, + get_new_embedding_location, MAX_PREFETCH_DEPTH, PoolingMode, RecordCacheMetrics, round_up, SplitState, + tensor_to_device, ) from fbgemm_gpu.utils.loader import load_torch_module, load_torch_module_bc @@ -819,6 +822,9 @@

Source code for fbgemm_gpu.split_table_batched_embeddings_ops_inference

# (feature_names, rows, dims, weights_tys, locations) = zip(*embedding_specs) # Pyre workaround self.feature_names: List[str] = [e[0] for e in embedding_specs] + self.cache_load_factor: float = cache_load_factor + self.cache_sets: int = cache_sets + self.cache_reserved_memory: float = cache_reserved_memory rows: List[int] = [e[1] for e in embedding_specs] dims: List[int] = [e[2] for e in embedding_specs] weights_tys: List[SparseType] = [e[3] for e in embedding_specs] @@ -1695,11 +1701,7 @@

Source code for fbgemm_gpu.split_table_batched_embeddings_ops_inference

torch.zeros(1, dtype=torch.int64, device=self.current_device), persistent=False, ) - self.register_buffer( - "total_cache_hash_size", - torch.zeros(1, dtype=torch.int64, device=self.current_device), - persistent=False, - ) + self.total_cache_hash_size = 0 self.register_buffer( "cache_index_table_map", torch.zeros(1, dtype=torch.int64, device=self.current_device), @@ -1888,6 +1890,111 @@

Source code for fbgemm_gpu.split_table_batched_embeddings_ops_inference

self.lxu_state.fill_(0) self.timestep_counter.reset() + def move_to_device_with_cache( + self, device: torch.device, cache_load_factor: float + ) -> None: + """ + Moves the TBE to the specified device, and updates the cache state accordingly. + """ + if ( + self.current_device == device + and self.cache_load_factor == cache_load_factor + ): + return + + location = get_new_embedding_location(device, cache_load_factor) + if device.type != "cpu": + self.use_cpu = False + + weights = self.split_embedding_weights() + is_meta = self.current_device.type == "meta" + index_remapping_array: torch.Tensor + index_remappings_array_offsets: torch.Tensor + original_rows_per_table: torch.Tensor + if not is_meta: + # Record weights and pruning tensors for setting + # weights and pruning tensors for TBE on new device + if device.type == "cpu": + for i, weight in enumerate(weights): + weights[i] = ( + weight[0].to(device), + weight[1].to(device) if weight[1] is not None else None, + ) + ( + index_remapping_array, + index_remappings_array_offsets, + original_rows_per_table, + ) = ( + self.index_remappings_array.to(device), + self.index_remappings_array_offsets.to(device), + self.original_rows_per_table.to(device), + ) + + self.reset_weights_placements_and_offsets(device, location.value) + self.recompute_module_buffers() + self.weight_initialized = False + self.initialize_weights() + + # Ensure all weights are on the same device + if device.type != "cpu": + self.weights_host = torch.zeros(0, device=device, dtype=torch.uint8) + + if location != EmbeddingLocation.DEVICE: + self.weights_dev = torch.zeros(0, device=device, dtype=torch.uint8) + + for name, buf in self.named_buffers(): + if buf.is_meta: + self.register_buffer(name, tensor_to_device(buf, device)) + + self.current_device = device + + if not is_meta: + self.assign_embedding_weights(weights) + self.index_remappings_array = index_remapping_array + self.index_remappings_array_offsets = index_remappings_array_offsets + self.original_rows_per_table = original_rows_per_table + + if cache_load_factor is not None: + self.update_cache_load_factor(cache_load_factor) + + def update_cache_load_factor(self, cache_load_factor: float = 0.2) -> None: + """ + Updates cache_load_factor and embedding location for weights after TBE has already been initialized + Assumes that the location of the weights is already set correctly + """ + rows = [ + embedding_spec[EmbeddingSpecInfo.rows] + for embedding_spec in self.embedding_specs + ] + locations = [ + embedding_spec[EmbeddingSpecInfo.embedding_location] + for embedding_spec in self.embedding_specs + ] + # pyre-ignore[6] + cache_state = construct_cache_state(rows, locations, self.feature_table_map) + + cached_dims = [ + rounded_row_size_in_bytes( + embedding_spec[EmbeddingSpecInfo.dims], # pyre-ignore[6] + embedding_spec[EmbeddingSpecInfo.sparse_type], # pyre-ignore[6] + 16, + self.scale_bias_size_in_bytes, + ) + for embedding_spec in self.embedding_specs + if embedding_spec[EmbeddingSpecInfo.embedding_location] + == EmbeddingLocation.MANAGED_CACHING + ] + + self.max_D_cache: int = max(cached_dims) if len(cached_dims) > 0 else 0 + + self._apply_cache_state( + cache_state, + self.cache_algorithm, + cache_load_factor, + self.cache_sets, + self.cache_reserved_memory, + ) +
[docs] @torch.jit.export def split_embedding_weights_with_scale_bias( self, split_scale_bias_mode: int = 1 diff --git a/output.json b/output.json index 7631fc865b..195c04ca04 100644 --- a/output.json +++ b/output.json @@ -2,6 +2,7 @@ {"filename": "fbgemm-development/BuildInstructions.rst", "lineno": 66, "status": "unchecked", "code": 0, "uri": "#fbgemm-gpu-build-setup-env", "info": ""} {"filename": "fbgemm-development/BuildInstructions.rst", "lineno": 76, "status": "unchecked", "code": 0, "uri": "#fbgemm-gpu-build-setup-tools-install-compiler-clang", "info": ""} {"filename": "fbgemm-development/BuildInstructions.rst", "lineno": 76, "status": "unchecked", "code": 0, "uri": "#fbgemm-gpu-build-setup-tools-install-compiler-gcc", "info": ""} +{"filename": "fbgemm-development/BuildInstructions.rst", "lineno": 82, "status": "working", "code": 0, "uri": "https://visualstudio.microsoft.com/vs/older-downloads/", "info": ""} {"filename": "general/documentation/Overview.rst", "lineno": 71, "status": "unchecked", "code": 0, "uri": "#fbgemm-gpu-build-process-cpu", "info": ""} {"filename": "fbgemm_gpu-development/InstallationInstructions.rst", "lineno": 115, "status": "unchecked", "code": 0, "uri": "#fbgemm-gpu-build-setup-cuda-image", "info": ""} {"filename": "fbgemm_gpu-development/InstallationInstructions.rst", "lineno": 127, "status": "unchecked", "code": 0, "uri": "#fbgemm-gpu-build-setup-cuda-install", "info": ""} @@ -19,68 +20,67 @@ {"filename": "fbgemm_gpu-stable-api/python_api.rst", "lineno": 27, "status": "unchecked", "code": 0, "uri": "#tbe-ops-inference-stable-api", "info": ""} {"filename": "fbgemm_gpu-stable-api/python_api.rst", "lineno": 27, "status": "unchecked", "code": 0, "uri": "#tbe-ops-training-stable-api", "info": ""} {"filename": "fbgemm-development/BuildInstructions.rst", "lineno": 4, "status": "working", "code": 0, "uri": "https://github.com/pytorch/FBGEMM/blob/main/.github/scripts/setup_env.bash", "info": ""} -{"filename": "fbgemm-development/BuildInstructions.rst", "lineno": 41, "status": "working", "code": 0, "uri": "https://github.com/asmjit/asmjit", "info": ""} +{"filename": "fbgemm-development/BuildInstructions.rst", "lineno": 48, "status": "working", "code": 0, "uri": "https://github.com/pytorch/cpuinfo", "info": ""} {"filename": "fbgemm-development/BuildInstructions.rst", "lineno": 55, "status": "working", "code": 0, "uri": "https://github.com/google/googletest", "info": ""} +{"filename": "general/documentation/Cpp.rst", "lineno": 6, "status": "redirected", "code": 302, "uri": "https://breathe.readthedocs.io", "info": "https://breathe.readthedocs.io/en/latest/"} +{"filename": "fbgemm-development/BuildInstructions.rst", "lineno": 41, "status": "working", "code": 0, "uri": "https://github.com/asmjit/asmjit", "info": ""} {"filename": "general/ContactUs.rst", "lineno": 17, "status": "redirected", "code": 301, "uri": "https://bit.ly/ptslack", "info": "https://docs.google.com/forms/d/e/1FAIpQLSeADnUNW36fjKjYzyHDOzEB_abKQE9b6gqqW9NXse6O0MWh0A/viewform"} -{"filename": "fbgemm-development/BuildInstructions.rst", "lineno": 48, "status": "working", "code": 0, "uri": "https://github.com/pytorch/cpuinfo", "info": ""} -{"filename": "fbgemm_gpu-development/BuildInstructions.rst", "lineno": 139, "status": "working", "code": 0, "uri": "https://developer.nvidia.com/cudnn", "info": ""} -{"filename": "fbgemm_gpu-development/BuildInstructions.rst", "lineno": 114, "status": "redirected", "code": 301, "uri": "https://developer.nvidia.com/nvidia-management-library-nvml", "info": "https://developer.nvidia.com/management-library-nvml"} -{"filename": "fbgemm_gpu-development/InstallationInstructions.rst", "lineno": 142, "status": "working", "code": 0, "uri": "https://docs.amd.com/bundle/ROCm-Installation-Guide-v5.5/page/How_to_Install_ROCm.html", "info": ""} {"filename": "general/Contributing.rst", "lineno": 27, "status": "working", "code": 0, "uri": "https://code.facebook.com/cla", "info": ""} +{"filename": "fbgemm_gpu-development/BuildInstructions.rst", "lineno": 139, "status": "working", "code": 0, "uri": "https://developer.nvidia.com/cudnn", "info": ""} +{"filename": "fbgemm_gpu-cpp-api/memory_utils.rst", "lineno": 4, "status": "working", "code": 0, "uri": "https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html", "info": ""} {"filename": "fbgemm_gpu-python-api/pooled_embedding_modules.rst", "lineno": 1, "status": "working", "code": 0, "uri": "https://docs.python.org/3/library/constants.html#None", "info": ""} +{"filename": "fbgemm_gpu-development/BuildInstructions.rst", "lineno": 114, "status": "redirected", "code": 301, "uri": "https://developer.nvidia.com/nvidia-management-library-nvml", "info": "https://developer.nvidia.com/management-library-nvml"} {"filename": "fbgemm_gpu-development/InstallationInstructions.rst", "lineno": 72, "status": "redirected", "code": 301, "uri": "https://docs.nvidia.com/datacenter/tesla/tesla-installation-notes/index.html", "info": "https://docs.nvidia.com/cuda/cuda-installation-guide-linux/"} -{"filename": "fbgemm_gpu-cpp-api/memory_utils.rst", "lineno": 4, "status": "working", "code": 0, "uri": "https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html", "info": ""} {"filename": "general/documentation/Python.rst", "lineno": 1, "status": "working", "code": 0, "uri": "https://docs.python.org/3/library/exceptions.html#AttributeError", "info": ""} +{"filename": "general/documentation/Python.rst", "lineno": 1, "status": "working", "code": 0, "uri": "https://docs.python.org/3/library/ctypes.html#ctypes.c_ulong", "info": ""} {"filename": "general/documentation/Python.rst", "lineno": 1, "status": "working", "code": 0, "uri": "https://docs.python.org/3/library/exceptions.html#ValueError", "info": ""} {"filename": "fbgemm_gpu-python-api/tbe_ops_inference.rst", "lineno": 1, "status": "working", "code": 0, "uri": "https://docs.python.org/3/library/functions.html#bool", "info": ""} -{"filename": "fbgemm_gpu-development/BuildInstructions.rst", "lineno": 31, "status": "working", "code": 0, "uri": "https://docs.conda.io/en/latest/miniconda.html", "info": ""} {"filename": "fbgemm_gpu-python-api/tbe_ops_inference.rst", "lineno": 1, "status": "working", "code": 0, "uri": "https://docs.python.org/3/library/functions.html#float", "info": ""} -{"filename": "general/documentation/Python.rst", "lineno": 1, "status": "working", "code": 0, "uri": "https://docs.python.org/3/library/ctypes.html#ctypes.c_ulong", "info": ""} -{"filename": "fbgemm_gpu-python-api/tbe_ops_inference.rst", "lineno": 1, "status": "working", "code": 0, "uri": "https://docs.python.org/3/library/stdtypes.html#str", "info": ""} +{"filename": "fbgemm_gpu-development/InstallationInstructions.rst", "lineno": 142, "status": "working", "code": 0, "uri": "https://docs.amd.com/bundle/ROCm-Installation-Guide-v5.5/page/How_to_Install_ROCm.html", "info": ""} {"filename": "fbgemm_gpu-python-api/pooled_embedding_modules.rst", "lineno": 1, "status": "working", "code": 0, "uri": "https://docs.python.org/3/library/functions.html#int", "info": ""} +{"filename": "fbgemm_gpu-python-api/tbe_ops_inference.rst", "lineno": 1, "status": "working", "code": 0, "uri": "https://docs.python.org/3/library/stdtypes.html#str", "info": ""} {"filename": "fbgemm_gpu-overview/jagged-tensor-ops/JaggedTensorOps.rst", "lineno": 172, "status": "working", "code": 0, "uri": "https://en.wikipedia.org/wiki/Hadamard_product_(matrices)", "info": ""} +{"filename": "fbgemm_gpu-development/BuildInstructions.rst", "lineno": 31, "status": "working", "code": 0, "uri": "https://docs.conda.io/en/latest/miniconda.html", "info": ""} {"filename": "fbgemm_gpu-python-api/tbe_ops_training.rst", "lineno": 1, "status": "working", "code": 0, "uri": "https://docs.python.org/3/library/typing.html#typing.Dict", "info": ""} {"filename": "fbgemm_gpu-python-api/pooled_embedding_modules.rst", "lineno": 1, "status": "working", "code": 0, "uri": "https://docs.python.org/3/library/typing.html#typing.List", "info": ""} {"filename": "fbgemm_gpu-python-api/tbe_ops_inference.rst", "lineno": 1, "status": "working", "code": 0, "uri": "https://docs.python.org/3/library/typing.html#typing.Tuple", "info": ""} -{"filename": "fbgemm_gpu-development/BuildInstructions.rst", "lineno": 221, "status": "redirected", "code": 301, "uri": "https://github.com/ROCmSoftwarePlatform/MIOpen", "info": "https://github.com/ROCm/MIOpen"} -{"filename": "general/documentation/Cpp.rst", "lineno": 6, "status": "redirected", "code": 302, "uri": "https://breathe.readthedocs.io", "info": "https://breathe.readthedocs.io/en/latest/"} +{"filename": "general/Contributing.rst", "lineno": 8, "status": "working", "code": 0, "uri": "https://github.com/pytorch/FBGEMM/blob/main/CODE_OF_CONDUCT.md", "info": ""} {"filename": "general/ContactUs.rst", "lineno": 11, "status": "working", "code": 0, "uri": "https://github.com/pytorch/FBGEMM/discussions", "info": ""} +{"filename": "fbgemm_gpu-development/BuildInstructions.rst", "lineno": 221, "status": "redirected", "code": 301, "uri": "https://github.com/ROCmSoftwarePlatform/MIOpen", "info": "https://github.com/ROCm/MIOpen"} {"filename": "general/ContactUs.rst", "lineno": 7, "status": "working", "code": 0, "uri": "https://github.com/pytorch/FBGEMM/issues", "info": ""} -{"filename": "general/Contributing.rst", "lineno": 8, "status": "working", "code": 0, "uri": "https://github.com/pytorch/FBGEMM/blob/main/CODE_OF_CONDUCT.md", "info": ""} -{"filename": "fbgemm_gpu-development/InstallationInstructions.rst", "lineno": 326, "status": "working", "code": 0, "uri": "https://github.com/pytorch/FBGEMM/issues/1618", "info": ""} -{"filename": "fbgemm-development/BuildInstructions.rst", "lineno": 171, "status": "working", "code": 0, "uri": "https://github.com/pytorch/FBGEMM/issues/1666", "info": ""} {"filename": "fbgemm-development/BuildInstructions.rst", "lineno": 171, "status": "working", "code": 0, "uri": "https://github.com/pytorch/FBGEMM/issues/1094", "info": ""} {"filename": "fbgemm_gpu-development/InstallationInstructions.rst", "lineno": 220, "status": "working", "code": 0, "uri": "https://github.com/pytorch/pytorch/blob/main/RELEASE.md", "info": ""} -{"filename": "fbgemm-development/BuildInstructions.rst", "lineno": 171, "status": "working", "code": 0, "uri": "https://github.com/pytorch/pytorch/issues/77939", "info": ""} +{"filename": "fbgemm_gpu-development/InstallationInstructions.rst", "lineno": 326, "status": "working", "code": 0, "uri": "https://github.com/pytorch/FBGEMM/issues/1618", "info": ""} +{"filename": "fbgemm-development/BuildInstructions.rst", "lineno": 171, "status": "working", "code": 0, "uri": "https://github.com/pytorch/FBGEMM/issues/1666", "info": ""} +{"filename": "fbgemm_gpu-development/InstallationInstructions.rst", "lineno": 72, "status": "working", "code": 0, "uri": "https://github.com/pytorch/test-infra/blob/main/.github/actions/setup-nvidia/action.yml", "info": ""} {"filename": "index.rst", "lineno": 7, "status": "redirected", "code": 302, "uri": "https://github.com/pytorch/pytorch/tree/master/aten/src/ATen/native/quantized/cpu", "info": "https://github.com/pytorch/pytorch/tree/main/aten/src/ATen/native/quantized/cpu"} {"filename": "fbgemm_gpu-development/BuildInstructions.rst", "lineno": 96, "status": "working", "code": 0, "uri": "https://hub.docker.com/r/nvidia/cuda", "info": ""} -{"filename": "fbgemm_gpu-overview/jagged-tensor-ops/JaggedTensorOps.rst", "lineno": 7, "status": "working", "code": 0, "uri": "https://github.com/pytorch/pytorch/issues/25032", "info": ""} -{"filename": "fbgemm_gpu-development/InstallationInstructions.rst", "lineno": 72, "status": "working", "code": 0, "uri": "https://github.com/pytorch/test-infra/blob/main/.github/actions/setup-nvidia/action.yml", "info": ""} -{"filename": "fbgemm_gpu-development/InstallationInstructions.rst", "lineno": 46, "status": "working", "code": 0, "uri": "https://github.com/pytorch/FBGEMM/releases", "info": ""} -{"filename": "general/documentation/Python.rst", "lineno": 55, "status": "working", "code": 0, "uri": "https://peps.python.org/pep-0287/", "info": ""} {"filename": "fbgemm_gpu-development/BuildInstructions.rst", "lineno": 183, "status": "working", "code": 0, "uri": "https://hub.docker.com/r/rocm/dev-ubuntu-22.04", "info": ""} -{"filename": "fbgemm_gpu-development/BuildInstructions.rst", "lineno": 174, "status": "working", "code": 0, "uri": "https://hub.docker.com/r/rocm/rocm-terminal", "info": ""} {"filename": "general/documentation/Sphinx.rst", "lineno": 149, "status": "working", "code": 0, "uri": "https://graphviz.org/documentation/", "info": ""} -{"filename": "fbgemm_gpu-cpp-api/experimental_ops.rst", "lineno": 6, "status": "working", "code": 0, "uri": "https://pytorch.org/cppdocs/api/classat_1_1_tensor.html#_CPPv4N2at6TensorE", "info": ""} +{"filename": "fbgemm_gpu-development/BuildInstructions.rst", "lineno": 174, "status": "working", "code": 0, "uri": "https://hub.docker.com/r/rocm/rocm-terminal", "info": ""} +{"filename": "fbgemm-development/BuildInstructions.rst", "lineno": 171, "status": "working", "code": 0, "uri": "https://github.com/pytorch/pytorch/issues/77939", "info": ""} +{"filename": "general/documentation/Python.rst", "lineno": 55, "status": "working", "code": 0, "uri": "https://peps.python.org/pep-0287/", "info": ""} {"filename": "fbgemm_gpu-cpp-api/quantize_ops.rst", "lineno": 11, "status": "working", "code": 0, "uri": "https://pytorch.org/cppdocs/api/classc10_1_1_error.html#_CPPv4N3c105ErrorE", "info": ""} -{"filename": "fbgemm_gpu-python-api/tbe_ops_inference.rst", "lineno": 1, "status": "working", "code": 0, "uri": "https://pytorch.org/docs/main/tensor_attributes.html#torch.dtype", "info": ""} {"filename": "fbgemm_gpu-python-api/pooled_embedding_modules.rst", "lineno": 1, "status": "working", "code": 0, "uri": "https://pytorch.org/docs/main/tensor_attributes.html#torch.device", "info": ""} -{"filename": "fbgemm_gpu-development/BuildInstructions.rst", "lineno": 352, "status": "working", "code": 0, "uri": "https://pytorch.org/get-started/locally/", "info": ""} +{"filename": "fbgemm_gpu-python-api/tbe_ops_inference.rst", "lineno": 1, "status": "working", "code": 0, "uri": "https://pytorch.org/docs/main/tensor_attributes.html#torch.dtype", "info": ""} {"filename": "fbgemm_gpu-python-api/pooled_embedding_modules.rst", "lineno": 1, "status": "working", "code": 0, "uri": "https://pytorch.org/docs/main/tensors.html#torch.Tensor", "info": ""} -{"filename": "fbgemm-development/BuildInstructions.rst", "lineno": 82, "status": "working", "code": 0, "uri": "https://visualstudio.microsoft.com/vs/older-downloads/", "info": ""} +{"filename": "fbgemm_gpu-development/BuildInstructions.rst", "lineno": 352, "status": "working", "code": 0, "uri": "https://pytorch.org/get-started/locally/", "info": ""} +{"filename": "fbgemm_gpu-cpp-api/experimental_ops.rst", "lineno": 6, "status": "working", "code": 0, "uri": "https://pytorch.org/cppdocs/api/classat_1_1_tensor.html#_CPPv4N2at6TensorE", "info": ""} +{"filename": "fbgemm_gpu-overview/jagged-tensor-ops/JaggedTensorOps.rst", "lineno": 7, "status": "working", "code": 0, "uri": "https://github.com/pytorch/pytorch/issues/25032", "info": ""} {"filename": "fbgemm_gpu-development/BuildInstructions.rst", "lineno": 197, "status": "working", "code": 0, "uri": "https://rocm.docs.amd.com/en/latest/", "info": ""} {"filename": "fbgemm_gpu-cpp-api/memory_utils.rst", "lineno": 4, "status": "working", "code": 0, "uri": "https://man7.org/linux/man-pages/man2/madvise.2.html", "info": ""} -{"filename": "general/documentation/Overview.rst", "lineno": 142, "status": "working", "code": 0, "uri": "https://www.netlify.com/", "info": ""} {"filename": "general/documentation/Cpp.rst", "lineno": 6, "status": "working", "code": 0, "uri": "https://www.doxygen.nl/", "info": ""} +{"filename": "general/documentation/Overview.rst", "lineno": 142, "status": "working", "code": 0, "uri": "https://www.netlify.com/", "info": ""} {"filename": "general/documentation/Cpp.rst", "lineno": 6, "status": "working", "code": 0, "uri": "https://www.oracle.com/java/technologies/javase/javadoc-tool.html", "info": ""} -{"filename": "general/documentation/Python.rst", "lineno": 6, "status": "working", "code": 0, "uri": "https://www.sphinx-doc.org/en/master/", "info": ""} {"filename": "general/documentation/Cpp.rst", "lineno": 75, "status": "working", "code": 0, "uri": "https://www.doxygen.nl/manual/commands.html#cmdlink", "info": ""} {"filename": "general/documentation/Python.rst", "lineno": 6, "status": "working", "code": 0, "uri": "https://www.sphinx-doc.org/en/master/usage/extensions/example_google.html", "info": ""} -{"filename": "general/documentation/Python.rst", "lineno": 55, "status": "working", "code": 0, "uri": "https://www.sphinx-doc.org/en/master/usage/restructuredtext/basics.html", "info": ""} +{"filename": "general/documentation/Python.rst", "lineno": 6, "status": "working", "code": 0, "uri": "https://www.sphinx-doc.org/en/master/", "info": ""} {"filename": "general/documentation/Sphinx.rst", "lineno": 149, "status": "working", "code": 0, "uri": "https://www.sphinx-doc.org/en/master/usage/extensions/graphviz.html", "info": ""} -{"filename": "general/Contributing.rst", "lineno": 34, "status": "redirected", "code": 301, "uri": "https://www.facebook.com/whitehat/", "info": "https://bugbounty.meta.com/?utm_source=facebook.com&utm_medium=redirect"} {"filename": "general/documentation/Sphinx.rst", "lineno": 115, "status": "working", "code": 0, "uri": "https://www.sphinx-doc.org/en/master/usage/extensions/math.html#module-sphinx.ext.mathjax", "info": ""} -{"filename": "general/documentation/Sphinx.rst", "lineno": 115, "status": "working", "code": 0, "uri": "https://www.sphinx-doc.org/en/master/usage/restructuredtext/directives.html#math", "info": ""} +{"filename": "general/documentation/Python.rst", "lineno": 55, "status": "working", "code": 0, "uri": "https://www.sphinx-doc.org/en/master/usage/restructuredtext/basics.html", "info": ""} {"filename": "general/documentation/Sphinx.rst", "lineno": 82, "status": "working", "code": 0, "uri": "https://www.sphinx-doc.org/en/master/usage/restructuredtext/directives.html#directive-literalinclude", "info": ""} +{"filename": "general/documentation/Sphinx.rst", "lineno": 115, "status": "working", "code": 0, "uri": "https://www.sphinx-doc.org/en/master/usage/restructuredtext/directives.html#math", "info": ""} +{"filename": "general/Contributing.rst", "lineno": 34, "status": "redirected", "code": 301, "uri": "https://www.facebook.com/whitehat/", "info": "https://bugbounty.meta.com/?utm_source=facebook.com&utm_medium=redirect"} +{"filename": "fbgemm_gpu-development/InstallationInstructions.rst", "lineno": 46, "status": "working", "code": 0, "uri": "https://github.com/pytorch/FBGEMM/releases", "info": ""} {"filename": "fbgemm_gpu-overview/jagged-tensor-ops/JaggedTensorOps.rst", "lineno": 7, "status": "working", "code": 0, "uri": "https://www.tensorflow.org/guide/ragged_tensor", "info": ""} {"filename": "fbgemm-cpp-api/QuantUtils.rst", "lineno": 13, "status": "redirected", "code": 302, "uri": "https://www.jstatsoft.org/v08/i14/paper", "info": "https://www.jstatsoft.org/index.php/jss/article/download/v008i14/916"} diff --git a/output.txt b/output.txt index 5eb4ac0ca3..149b874a0d 100644 --- a/output.txt +++ b/output.txt @@ -1,8 +1,8 @@ +general/documentation/Cpp.rst:6: [redirected with Found] https://breathe.readthedocs.io to https://breathe.readthedocs.io/en/latest/ general/ContactUs.rst:17: [redirected permanently] https://bit.ly/ptslack to https://docs.google.com/forms/d/e/1FAIpQLSeADnUNW36fjKjYzyHDOzEB_abKQE9b6gqqW9NXse6O0MWh0A/viewform fbgemm_gpu-development/BuildInstructions.rst:114: [redirected permanently] https://developer.nvidia.com/nvidia-management-library-nvml to https://developer.nvidia.com/management-library-nvml fbgemm_gpu-development/InstallationInstructions.rst:72: [redirected permanently] https://docs.nvidia.com/datacenter/tesla/tesla-installation-notes/index.html to https://docs.nvidia.com/cuda/cuda-installation-guide-linux/ fbgemm_gpu-development/BuildInstructions.rst:221: [redirected permanently] https://github.com/ROCmSoftwarePlatform/MIOpen to https://github.com/ROCm/MIOpen -general/documentation/Cpp.rst:6: [redirected with Found] https://breathe.readthedocs.io to https://breathe.readthedocs.io/en/latest/ index.rst:7: [redirected with Found] https://github.com/pytorch/pytorch/tree/master/aten/src/ATen/native/quantized/cpu to https://github.com/pytorch/pytorch/tree/main/aten/src/ATen/native/quantized/cpu general/Contributing.rst:34: [redirected permanently] https://www.facebook.com/whitehat/ to https://bugbounty.meta.com/?utm_source=facebook.com&utm_medium=redirect fbgemm-cpp-api/QuantUtils.rst:13: [redirected with Found] https://www.jstatsoft.org/v08/i14/paper to https://www.jstatsoft.org/index.php/jss/article/download/v008i14/916 diff --git a/searchindex.js b/searchindex.js index 25ced4d8a7..4d3da1c436 100644 --- a/searchindex.js +++ b/searchindex.js @@ -1 +1 @@ -Search.setIndex({"docnames": ["fbgemm-cpp-api/QuantUtils", "fbgemm-cpp-api/tbe_cpu_autovec", "fbgemm-development/BuildInstructions", "fbgemm_gpu-cpp-api/embedding_ops", "fbgemm_gpu-cpp-api/experimental_ops", "fbgemm_gpu-cpp-api/input_combine", "fbgemm_gpu-cpp-api/jagged_tensor_ops", "fbgemm_gpu-cpp-api/layout_transform_ops", "fbgemm_gpu-cpp-api/memory_utils", "fbgemm_gpu-cpp-api/merge_pooled_embeddings", "fbgemm_gpu-cpp-api/quantize_ops", "fbgemm_gpu-cpp-api/sparse_ops", "fbgemm_gpu-cpp-api/split_table_batched_embeddings", "fbgemm_gpu-cpp-api/ssd_embedding_ops", "fbgemm_gpu-development/BuildInstructions", "fbgemm_gpu-development/InstallationInstructions", "fbgemm_gpu-development/TestInstructions", "fbgemm_gpu-overview/jagged-tensor-ops/JaggedTensorOps", "fbgemm_gpu-python-api/jagged_tensor_ops", "fbgemm_gpu-python-api/pooled_embedding_modules", "fbgemm_gpu-python-api/pooled_embedding_ops", "fbgemm_gpu-python-api/quantize_ops", "fbgemm_gpu-python-api/sparse_ops", "fbgemm_gpu-python-api/tbe_ops_inference", "fbgemm_gpu-python-api/tbe_ops_training", "fbgemm_gpu-stable-api/python_api", "general/ContactUs", "general/Contributing", "general/License", "general/documentation/Cpp", "general/documentation/Overview", "general/documentation/Python", "general/documentation/Sphinx", "index"], "filenames": ["fbgemm-cpp-api/QuantUtils.rst", "fbgemm-cpp-api/tbe_cpu_autovec.rst", "fbgemm-development/BuildInstructions.rst", "fbgemm_gpu-cpp-api/embedding_ops.rst", "fbgemm_gpu-cpp-api/experimental_ops.rst", "fbgemm_gpu-cpp-api/input_combine.rst", "fbgemm_gpu-cpp-api/jagged_tensor_ops.rst", "fbgemm_gpu-cpp-api/layout_transform_ops.rst", "fbgemm_gpu-cpp-api/memory_utils.rst", "fbgemm_gpu-cpp-api/merge_pooled_embeddings.rst", "fbgemm_gpu-cpp-api/quantize_ops.rst", "fbgemm_gpu-cpp-api/sparse_ops.rst", "fbgemm_gpu-cpp-api/split_table_batched_embeddings.rst", "fbgemm_gpu-cpp-api/ssd_embedding_ops.rst", "fbgemm_gpu-development/BuildInstructions.rst", "fbgemm_gpu-development/InstallationInstructions.rst", "fbgemm_gpu-development/TestInstructions.rst", "fbgemm_gpu-overview/jagged-tensor-ops/JaggedTensorOps.rst", "fbgemm_gpu-python-api/jagged_tensor_ops.rst", "fbgemm_gpu-python-api/pooled_embedding_modules.rst", "fbgemm_gpu-python-api/pooled_embedding_ops.rst", "fbgemm_gpu-python-api/quantize_ops.rst", "fbgemm_gpu-python-api/sparse_ops.rst", "fbgemm_gpu-python-api/tbe_ops_inference.rst", "fbgemm_gpu-python-api/tbe_ops_training.rst", "fbgemm_gpu-stable-api/python_api.rst", "general/ContactUs.rst", "general/Contributing.rst", "general/License.rst", "general/documentation/Cpp.rst", "general/documentation/Overview.rst", "general/documentation/Python.rst", "general/documentation/Sphinx.rst", "index.rst"], "titles": ["Quantization Utilities", "TBE CPU Autovectorization", "Build Instructions", "Embedding Operators", "Experimental Operators", "Combine Input Operators", "Jagged Tensor Operators", "Layout Transformation Operators", "CUDA Memory Operators", "Pooled Embeddings Operators", "Quantization Operators", "Sparse Data Operators", "Table Batched Embedding Operators", "SSD Embedding Operators", "Build Instructions", "Installation Instructions", "Test Instructions", "Jagged Tensor Operators", "Jagged Tensor Operators", "Pooled Embedding Modules", "Pooled Embedding Operators", "Quantization Operators", "Sparse Operators", "Table Batched Embedding (TBE) Inference Module", "Table Batched Embedding (TBE) Training Module", "FBGEMM_GPU Stable Python API", "Contact Us", "Contributing", "License", "Adding Documentation to C++ Code", "Documentation", "Adding Documentation to Python Code", "Sphinx Documentation Pointers", "FBGEMM and FBGEMM_GPU Documentation Homepage"], "terms": {"templat": [0, 1, 14, 29], "typenam": [0, 1, 29], "t": [0, 2, 4, 8, 11, 14, 23, 24, 27, 29, 30], "layout_t": 0, "layout": [0, 33], "kcx": 0, "void": [0, 3, 8, 10, 12, 13], "quantizegroupwis": 0, "const": [0, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 31], "float": [0, 1, 10, 23, 24, 29, 31], "src": 0, "int": [0, 1, 10, 19, 23, 24, 29, 31], "k": [0, 4], "c": [0, 12, 15, 17, 28, 30, 31, 32], "x": [0, 6, 13, 15, 17, 29, 31], "g": [0, 2, 11, 13, 14, 29, 31], "scale": [0, 1, 4, 10, 23], "std": [0, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14, 29, 31], "int32_t": [0, 1, 29, 31], "zero_point": 0, "dst": 0, "point": [0, 10, 29, 31], "data": [0, 1, 4, 8, 13, 17, 23, 24, 28, 33], "type": [0, 1, 2, 4, 10, 15, 17, 23, 24, 29], "paramet": [0, 1, 4, 8, 10, 11, 13, 19, 23, 24, 29, 30, 31], "output": [0, 1, 4, 6, 10, 11, 13, 19, 23, 24, 29, 31], "int8_t": [0, 3], "uint8_t": [0, 1, 10, 12], "ar": [0, 2, 6, 12, 13, 14, 15, 17, 19, 23, 24, 25, 28, 29, 30, 31], "support": [0, 2, 4, 13, 14, 15, 17, 23, 24, 25, 31, 33], "input": [0, 1, 4, 6, 8, 10, 11, 13, 17, 19, 23, 24, 29, 33], "tensor": [0, 3, 4, 5, 7, 8, 9, 10, 11, 12, 13, 19, 23, 24, 25, 30, 31, 33], "kxc": 0, "correspond": [0, 11, 12, 13, 15, 17, 24, 29, 31], "kcr": 0, "kctr": 0, "weight": [0, 1, 3, 11, 12, 13, 23, 24], "time": [0, 2, 14, 15, 17], "dimens": [0, 4, 6, 8, 11, 17, 19, 23, 24, 31], "krsc": 0, "ktrsc": 0, "channel": [0, 14, 15, 26], "number": [0, 1, 2, 4, 10, 11, 13, 14, 17, 19, 23, 24, 30], "r": [0, 14, 16, 24, 30], "": [0, 2, 8, 14, 16, 17, 27, 29, 30, 31], "group": [0, 4, 17, 29], "function": [0, 2, 13, 14, 23, 24, 25, 29, 31], "perform": [0, 2, 10, 11, 13, 17, 19, 23, 24, 25, 33], "channelwis": 0, "1": [0, 1, 2, 4, 11, 12, 13, 14, 15, 16, 17, 19, 23, 24, 30, 31, 32], "groupwis": 0, "per": [0, 17, 23, 24], "size": [0, 2, 4, 8, 10, 11, 17, 19, 23, 24], "should": [0, 10, 11, 12, 14, 15, 17, 23, 27, 29, 30, 31], "equal": [0, 17, 24, 31], "zero": [0, 23, 24, 31], "reprsent": 0, "fusedquantizedequant": 0, "int64_t": [0, 1, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13], "len": [0, 17, 24], "tensorquantizationparam": 0, "qparam": [0, 23], "thread_id": 0, "0": [0, 2, 4, 10, 11, 12, 13, 14, 15, 17, 19, 23, 24, 25, 31], "num_thread": 0, "noise_ratio": 0, "0f": 0, "fuse": [0, 10, 24], "integ": [0, 8, 10, 17, 24], "dequant": [0, 10], "kernel": [0, 2, 8, 10, 13, 16, 33], "acceler": 0, "awar": 0, "train": [0, 13, 25, 33], "fp32": [0, 1, 10, 23, 24], "valu": [0, 6, 8, 10, 11, 12, 13, 23, 24, 29, 30, 31], "u": [0, 14, 32, 33], "int8": [0, 23], "us": [0, 1, 2, 4, 8, 11, 13, 14, 15, 16, 17, 23, 24, 26, 27, 28, 29, 30, 31, 32, 33], "provid": [0, 2, 13, 14, 15, 16, 25, 28, 29, 30, 31, 33], "back": [0, 8, 12, 14, 15], "inputtyp": 0, "floatorhalftofusednbitrowwisequantizedsbhalf": [0, 21], "bit_rat": [0, 10], "size_t": [0, 10, 13, 29], "input_row": 0, "input_column": 0, "convert": [0, 8, 10, 13, 17, 31], "fp16": [0, 10, 23, 24], "rowwis": [0, 10, 24], "bitrat": 0, "specifi": [0, 2, 4, 10, 11, 13, 14, 23, 24], "bit": [0, 1, 10, 23], "bia": [0, 1, 4, 10, 23], "each": [0, 1, 4, 10, 11, 13, 14, 15, 17, 19, 23, 24, 31], "row": [0, 1, 6, 10, 12, 13, 17, 19, 23, 24, 31], "store": [0, 10, 11, 12, 13], "itself": [0, 17, 30], "end": [0, 1, 15, 17, 23, 32], "can": [0, 1, 2, 10, 11, 13, 14, 15, 17, 24, 25, 29, 30, 31, 32], "4": [0, 10, 14, 15, 17, 19, 23, 24, 31], "8": [0, 10, 14, 15, 17, 19, 24], "uint32_t": 0, "xor128": 0, "random": [0, 23], "gener": [0, 2, 11, 13, 14, 15, 19, 24, 29, 32], "9": [0, 13, 14, 15, 17, 19, 24], "base": [0, 2, 11, 12, 13, 14, 17, 24], "thi": [0, 2, 6, 8, 9, 10, 11, 13, 14, 15, 17, 19, 23, 24, 26, 27, 28, 29, 31, 32, 33], "paper": 0, "findminmax": 0, "m": [0, 14, 15, 16], "min": 0, "max": [0, 4, 24], "find": [0, 12, 14], "matrix": [0, 2, 33], "bool": [0, 1, 4, 8, 9, 10, 12, 13, 23, 24], "a_symmetr": 0, "b_symmetr": 0, "quantizationgranular": 0, "q_gran": 0, "has_bia": 0, "fuse_relu": 0, "bias_typ": 0, "direct": [0, 12, 15, 28, 29, 31, 32], "fals": [0, 1, 8, 13, 23, 24, 30], "requantizeoutputprocessingavx2": 0, "out": [0, 1, 14, 26, 28, 30], "inp": 0, "block_type_t": 0, "block": [0, 1, 29, 31, 32], "ld_out": 0, "ld_in": 0, "requantizationparams_t": 0, "requant": 0, "avx2": [0, 2], "i": [0, 1, 2, 4, 6, 8, 10, 11, 12, 13, 14, 15, 16, 17, 19, 23, 24, 27, 28, 29, 30, 31, 32, 33], "c_per_g": 0, "requantizeoutputprocessinggconvavx512": 0, "avx512": 0, "intyp": 1, "indextyp": 1, "offsettyp": 1, "outtyp": 1, "static": 1, "always_inlin": 1, "embeddingspmdm_autovec": 1, "block_siz": 1, "output_s": [1, 11], "index_s": 1, "data_s": 1, "indic": [1, 3, 12, 13, 17, 23, 24], "offsets_or_length": 1, "normalize_by_length": 1, "is_weight_posit": 1, "use_offset": 1, "output_strid": 1, "input_strid": 1, "no_bag": 1, "is_bf16_out": 1, "is_bf16_in": 1, "version": [1, 2, 15, 23, 25], "embeddingspmdm_ref": 1, "index": [1, 11, 12, 13, 14, 15, 17, 23, 24, 29, 31], "offset": [1, 3, 6, 11, 12, 23, 24], "element": [1, 10, 12, 13, 17, 24], "address": [1, 2, 13, 14], "sum": [1, 4, 11, 13, 19, 23, 24], "option": [1, 2, 3, 6, 8, 12, 14, 19, 23, 24], "null": 1, "non": [1, 4, 8, 23, 24], "whether": [1, 4, 8, 13, 14, 28], "normal": [1, 17], "length": [1, 4, 6, 11, 13, 19, 24, 31], "If": [1, 2, 13, 14, 15, 23, 24, 27, 29, 30, 31], "true": [1, 8, 13, 23, 24], "posit": [1, 4, 11, 13, 19], "set": [1, 8, 12, 13, 16, 17, 23, 24], "instead": [1, 14, 23, 30], "same": [1, 2, 4, 8, 11, 14, 17, 19, 24, 29, 30, 31], "scale_bias_last": [1, 10], "appear": [1, 15], "embed": [1, 2, 14, 15, 25, 30, 33], "bag": [1, 11, 24, 33], "bfloat16": [1, 10], "embeddingspmdmfp8_autovec": 1, "exponent_bit": 1, "exponent_bia": [1, 10], "expon": [1, 23], "note": [2, 12, 14, 15, 24, 29, 30, 31, 32], "The": [2, 4, 8, 10, 11, 13, 15, 16, 17, 19, 23, 24, 25, 27, 29, 30, 31, 32], "most": [2, 14, 15, 17, 30], "date": [2, 14, 15, 25, 30], "script": [2, 14, 15, 30], "bundl": [2, 14, 15, 30], "repo": [2, 14, 15, 30, 31], "under": [2, 14, 15, 27, 28, 30, 31], "setup_env": [2, 14, 15, 30], "bash": [2, 14, 15, 30], "step": [2, 13, 14, 15, 17, 24, 30, 31], "fbgemm_gpu": [2, 8, 14, 17, 19, 23, 24, 26, 27, 28, 29, 31], "follow": [2, 11, 14, 15, 17, 24, 25, 28, 29, 30, 31], "toolchain": [2, 14, 15], "run": [2, 14, 15, 19, 23, 30], "cpu": [2, 8, 9, 16, 23, 24, 30, 33], "higher": 2, "In": [2, 11, 13, 14, 15, 17, 27, 29, 31], "doe": [2, 3, 15, 29, 30, 31], "have": [2, 10, 11, 12, 14, 17, 24, 30], "ani": [2, 11, 14, 25, 27, 28, 30, 31], "intel": 2, "mkl": 2, "howev": [2, 14, 17, 25, 28], "comparison": 2, "some": [2, 14, 17, 30], "benchmark": 2, "found": [2, 14, 15, 25, 30], "path": [2, 13, 14, 16, 29, 32], "through": [2, 25, 27, 29, 31], "intel_mkl_dir": 2, "variabl": [2, 24], "built": [2, 14, 15, 30, 33], "report": [2, 15, 24], "otherwis": [2, 8, 13, 15, 23, 24, 28], "subset": 2, "all": [2, 11, 12, 13, 14, 15, 17, 19, 23, 24, 28, 30], "three": [2, 17], "git": [2, 14], "submodul": [2, 14], "custom": [2, 32], "desir": [2, 14, 17, 29], "thei": [2, 14, 30, 32], "asmjit_src_dir": 2, "cpuinfo_src_dir": 2, "googletest_source_dir": 2, "With": 2, "inner": [2, 17], "take": [2, 14, 23], "one": [2, 4, 10, 11, 12, 14, 15, 23, 24, 29, 31], "doesn": 2, "fit": [2, 28], "approach": 2, "so": [2, 11, 14, 15, 16, 17, 19], "implement": [2, 4, 10, 13, 14, 17, 24], "dynam": 2, "effici": [2, 33], "shape": [2, 4, 17, 19, 24], "specif": [2, 11, 13, 14, 23, 24, 28], "vector": [2, 5, 6, 7, 8, 9, 13, 31], "code": [2, 13, 14, 28, 30], "third": 2, "parti": 2, "call": [2, 8, 13, 15, 23], "detect": [2, 16], "runtim": [2, 14], "pytorch": [2, 13, 17, 26, 30, 31, 33], "project": [2, 27], "dispatch": [2, 8], "optim": [2, 10, 13, 24], "test": [2, 10, 14, 15, 25, 27, 33], "you": [2, 27, 29, 31], "don": [2, 11, 14, 30], "want": [2, 27], "togeth": [2, 29, 30], "default": [2, 11, 14, 15, 24], "turn": [2, 30], "off": [2, 15, 26], "simpli": [2, 14], "fbgemm_build_test": 2, "conda": [2, 16, 30], "For": [2, 15, 16, 17, 26, 28, 29, 30, 31, 32], "platform": [2, 14, 28], "machin": [2, 14, 15, 16, 33], "microsoft": [2, 10], "visual": 2, "studio": 2, "2019": 2, "newer": [2, 14], "recommend": [2, 6, 10, 14, 15, 17], "here": [2, 8, 14, 15, 27, 29, 30, 31, 32], "necessari": [2, 14, 24], "ninja": [2, 14], "etc": [2, 14, 23, 24], "n": [2, 10, 14, 15, 32], "env_nam": [2, 14, 15], "y": [2, 6, 14, 15, 30], "doxygen": [2, 29, 30], "make": [2, 12, 14, 27, 29, 30, 31], "openbla": 2, "packag": [2, 14, 16, 30], "onli": [2, 4, 10, 11, 12, 13, 16, 17, 24, 25, 27, 29, 30, 32], "clone": [2, 14], "along": [2, 14, 15, 19], "its": [2, 8, 10, 11, 14, 28, 30, 32], "insid": [2, 13, 14, 15, 16, 30, 32], "recurs": [2, 14], "http": [2, 14, 15, 27, 29, 30, 31], "github": [2, 14, 27], "com": [2, 14, 27], "cd": [2, 14, 16, 30], "assum": [2, 11, 24], "process": [2, 6, 13, 15, 17, 27, 31], "straightforward": 2, "creat": [2, 8, 14, 17, 27, 29, 31, 32], "directori": [2, 14, 16, 27, 29, 30], "mkdir": 2, "argument": [2, 11, 29, 30, 31], "build_arg": 2, "duse_sanit": 2, "dfbgemm_library_typ": 2, "share": [2, 8], "dpython_execut": 2, "which": [2, 11, 13, 14, 15, 17, 30], "python3": [2, 15], "document": [2, 8, 25, 27, 28], "dfbgemm_build_doc": 2, "ON": [2, 28], "j": [2, 17], "verbos": [2, 14], "As": [2, 11, 14, 15, 17], "write": [2, 13, 14, 15, 30, 31], "fail": [2, 15, 16, 29], "due": [2, 14], "known": [2, 14, 24], "regress": 2, "To": [2, 13, 14, 16, 32], "work": [2, 14, 15, 17, 27], "around": 2, "append": [2, 14, 29, 31], "export": [2, 14, 16], "prior": [2, 14, 15, 28], "cflag": 2, "wno": 2, "error": [2, 10, 15, 23, 24, 29, 30, 31], "mayb": 2, "uniniti": 2, "restrict": 2, "cxxflag": 2, "pleas": [2, 15, 27, 29, 31], "see": [2, 8, 14, 15, 17, 29, 31, 32], "77939": 2, "1094": 2, "1666": 2, "more": [2, 8, 14, 15, 24, 29, 31, 32], "detail": [2, 13, 15], "exactli": 2, "extra": 2, "need": [2, 13, 14, 15, 16, 17, 23, 27, 29, 31, 32], "ad": [2, 14, 27, 30], "invoc": [2, 14, 30], "llvm": [2, 14], "standard": [2, 14], "libc": [2, 14], "openmp": [2, 14], "libomp": 2, "locat": [2, 8, 12, 13, 14, 17], "cc_path": 2, "cxx_path": 2, "dcmake_c_compil": 2, "dcmake_cxx_compil": 2, "dcmake_c_flag": [2, 14], "fopenmp": 2, "stdlib": [2, 14], "conda_prefix": [2, 14], "includ": [2, 9, 13, 14, 28, 29, 31], "dcmake_cxx_flag": [2, 14], "likewis": 2, "also": [2, 13, 14, 24, 32], "veri": [2, 14, 29, 30, 31], "target": [2, 8, 10, 11, 14, 17, 29, 30, 31, 32], "architectur": [2, 14, 15], "bc": [2, 14], "x64": 2, "program": [2, 27], "file": [2, 14, 15, 26, 27, 29, 30, 31, 32], "x86": [2, 33], "enterpris": 2, "vc": 2, "auxiliari": 2, "vcvarsal": 2, "bat": 2, "build_dir": 2, "dfbgemm_build_benchmark": 2, "dcmake_build_typ": 2, "releas": [2, 25], "cl": 2, "ex": 2, "v": [2, 4, 6, 16], "int_nbit_split_embedding_codegen_lookup_funct": 3, "dev_weight": [3, 12], "uvm_weight": [3, 12], "weights_plac": [3, 12], "weights_offset": [3, 12], "weights_ti": [3, 12, 23], "d_offset": [3, 10, 12, 23], "total_d": [3, 12, 24], "max_int2_d": 3, "max_int4_d": 3, "max_int8_d": 3, "max_float16_d": 3, "max_float32_d": 3, "pooling_mod": [3, 23, 24], "indice_weight": 3, "output_dtyp": [3, 10, 23, 24], "lxu_cache_weight": [3, 12, 13], "lxu_cache_loc": [3, 12, 13], "row_align": [3, 12, 23], "max_float8_d": 3, "fp8_exponent_bit": [3, 23], "fp8_exponent_bia": [3, 23], "int_nbit_split_embedding_uvm_caching_codegen_lookup_funct": 3, "cache_hash_size_cumsum": [3, 12], "total_cache_hash_s": [3, 12], "cache_index_table_map": [3, 12], "lxu_cache_st": [3, 12], "lxu_stat": 3, "simlar": 3, "uvm_cach": 3, "lookup": [3, 12, 13, 23, 24], "pruned_hashmap_lookup_cuda": 3, "hash_tabl": 3, "hash_table_offset": 3, "pruned_array_lookup_cuda": 3, "index_remap": [3, 23], "index_remappings_offset": 3, "bounds_check_indices_cuda": 3, "rows_per_t": [3, 23], "bounds_check_mod": [3, 23, 24], "warn": [3, 23, 24, 29], "b_offset": [3, 12], "max_b": [3, 12], "b_t_map": 3, "info_b_num_bit": 3, "info_b_mask": 3, "bounds_check_vers": 3, "int_nbit_split_embedding_codegen_lookup_function_cpu": 3, "int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu": 3, "pruned_hashmap_insert_unweighted_cpu": 3, "dense_indic": 3, "pruned_hashmap_lookup_unweighted_cpu": 3, "pruned_array_lookup_cpu": 3, "tupl": [4, 5, 6, 11, 12, 13, 23, 24], "gqa_attn_splitk": 4, "xq": 4, "cache_k": 4, "cache_v": 4, "seq_posit": 4, "doubl": [4, 6, 10, 11], "qk_scale": 4, "num_split_k": 4, "kv_cache_quant_num_group": 4, "use_tensor_cor": 4, "cache_logical_dtype_int": 4, "decod": 4, "queri": 4, "split": [4, 23, 24], "w": [4, 16], "bf16": [4, 10], "int4": [4, 10, 23], "kv": 4, "cuda": [4, 9, 19, 23, 24, 33], "gqa": 4, "cach": [4, 12, 13, 14, 23, 24], "It": [4, 13, 14, 15, 17, 19], "current": [4, 13, 14, 15, 17, 23, 24], "context": 4, "16384": 4, "fix": [4, 11, 23, 24], "head": 4, "128": 4, "an": [4, 8, 11, 13, 15, 16, 17, 19, 23, 24, 29, 30, 31, 32], "arbitrari": [4, 13], "b": [4, 11, 14, 17, 24, 29, 30, 31, 32], "h_q": 4, "d": [4, 17, 32], "where": [4, 6, 8, 11, 13, 17, 19, 24], "batch": [4, 6, 11, 17, 19, 25, 33], "num": 4, "max_t": 4, "h_kv": 4, "sequenc": [4, 23, 24], "contain": [4, 8, 13, 14, 17, 19, 24, 31], "actual": [4, 14], "token": [4, 17], "appli": [4, 11, 14, 17, 24], "after": [4, 11, 13, 14, 15, 16, 17, 30, 31, 32], "qk": 4, "control": [4, 24], "amount": [4, 23, 24], "parallel": [4, 13], "wise": [4, 17, 23, 24], "fp8": [4, 10, 23], "quantiz": [4, 25, 33], "singl": [4, 8, 10, 13], "now": [4, 23], "core": 4, "wmma": 4, "instruct": [4, 27, 29, 30, 31, 33], "fast": 4, "kv_cach": 4, "2": [4, 10, 13, 14, 15, 16, 17, 19, 23, 24, 29, 31, 32], "return": [4, 8, 10, 11, 13, 19, 23, 24, 29, 30, 31], "A": [4, 8, 10, 13, 14, 15, 17, 19, 23, 24, 28, 29, 30, 31], "combin": [4, 33], "metadata": [4, 13, 24], "softmax": 4, "tbe_input_combine_cpu": 5, "indices_list": 5, "offsets_list": 5, "per_sample_weight": [5, 23, 24], "include_last_offset": 5, "padding_fused_tbe_input_combine_cpu": 5, "batch_siz": [5, 19], "solv": 6, "issu": [6, 8, 14, 15, 26], "when": [6, 11, 13, 14, 16, 17, 23, 24, 29, 30, 32], "differ": [6, 11, 13, 17, 24], "often": 6, "occur": [6, 13, 29], "spars": [6, 17, 23, 24, 25, 33], "featur": [6, 11, 14, 17, 19, 23, 24, 26], "system": [6, 14, 15, 17], "well": [6, 11, 14, 29], "natur": [6, 17], "languag": [6, 17, 32], "jagged_to_padded_dense_forward": 6, "c10": [6, 10], "symintarrayref": 6, "max_length": 6, "padding_valu": 6, "jagged_dense_elementwise_add_jagged_output_cuda": 6, "x_valu": 6, "x_offset": [6, 31], "dens": [6, 31], "jagged_to_padded_dens": [6, 18], "jagged_dense_elementwise_add": [6, 18], "jagged_dense_elementwise_mul": [6, 18], "batched_dense_vec_jagged_2d_mul": [6, 18], "a_valu": 6, "a_offset": 6, "dense_to_jag": [6, 18], "symint": 6, "total_l": 6, "jagged_dense_elementwise_add_jagged_output": [6, 18], "jagged_1d_to_dens": [6, 18], "max_l": 6, "jagged_2d_to_dens": [6, 14, 15, 18, 30, 31], "max_sequence_length": [6, 31], "recat_embedding_grad_output_cuda": 7, "grad_output": 7, "num_features_per_rank": 7, "recat_embedding_grad_output_mixed_d_cuda": 7, "dim_sum_per_rank": 7, "recat_embedding_grad_output_mixed_d_batch_cuda": 7, "cumsum_dim_sum_per_rank": 7, "recat_embedding_grad_output_mixed_d_cpu": 7, "new_managed_tensor": 8, "self": [8, 13, 23], "alloc": [8, 23, 24, 29], "unifi": [8, 23, 24], "manag": [8, 14, 15, 23, 24], "uvm": [8, 16, 23, 24], "Then": 8, "prefer": [8, 13, 15], "storag": [8, 10, 12, 13], "host": [8, 14, 23, 24], "establish": 8, "map": [8, 11, 12, 13, 17, 23, 24], "devic": [8, 9, 14, 19, 23, 24], "new": [8, 10, 12, 29, 30, 31], "new_managed_tensor_meta": 8, "placehold": 8, "meta": [8, 23, 28], "kei": [8, 13, 24], "empti": [8, 17, 32], "new_host_mapped_tensor": 8, "new_unified_tensor": 8, "is_host_map": 8, "either": [8, 10, 11, 13, 14, 15], "depend": [8, 10, 14, 15, 17], "new_unified_tensor_meta": 8, "new_vanilla_managed_tensor": 8, "allow": [8, 14], "automat": [8, 11, 16, 30], "uvm_storag": 8, "check": [8, 23, 24], "gpu": [8, 13, 14, 15, 16, 23, 24, 31, 33], "is_uvm_tensor": 8, "BUT": [8, 28], "uvm_to_cpu": 8, "effect": [8, 17], "move": [8, 13, 19], "from": [8, 10, 11, 12, 13, 14, 15, 16, 17, 19, 23, 24, 25, 27, 28, 29, 30, 31, 32], "uvm_to_devic": 8, "prototyp": 8, "whose": 8, "uvm_cuda_mem_advis": 8, "cuda_memory_advis": 8, "cudamemadvis": 8, "cudamemoryadvis": 8, "enum": [8, 10, 13], "avail": [8, 14, 15, 16, 23, 24, 30], "python": [8, 13, 14, 16, 29, 30, 32], "side": [8, 13, 14, 29, 31, 33], "namespac": 8, "over": [8, 14], "valid": 8, "inform": [8, 15, 17, 23, 24, 31, 32], "uvm_cuda_mem_prefetch_async": 8, "device_t": 8, "cudamemprefetchasync": 8, "prefetch": [8, 13, 24], "destin": 8, "uvm_mem_advice_dont_fork": 8, "madvis": 8, "madv_dontfork": 8, "workaround": 8, "driver": [8, 14], "un": 8, "page": [8, 15, 27, 32, 33], "tabl": [8, 11, 17, 25, 33], "fork": [8, 27], "caus": [8, 14, 15, 28, 30], "slowdown": 8, "next": [8, 13, 17, 29, 31], "access": [8, 13, 23, 24], "uvm_to_cpu_clon": 8, "copi": 8, "contigu": [8, 11], "thread": [8, 13], "memcpi": 8, "section": [9, 14, 15, 31], "variou": 9, "all_to_one_devic": 9, "inputtensor": 9, "target_devic": 9, "permute_pooled_embs_split_gpu": 9, "pooled_emb": [9, 19], "offset_dim_list": 9, "permute_list": 9, "inv_offset_dim_list": 9, "inv_permute_list": 9, "permute_pooled_embs_auto_grad_split_gpu": 9, "permute_pooled_embs_auto_grad_gpu": 9, "permute_pooled_embs_cpu_impl": 9, "allow_dupl": 9, "permute_pooled_embs_split_cpu": 9, "permute_pooled_embs_auto_grad_split_cpu": 9, "permute_pooled_embs_auto_grad": 9, "permute_pooled_embs_auto_grad_cpu": 9, "model": [10, 11], "techniqu": 10, "reduc": [10, 13], "larg": [10, 14], "order": [10, 17, 24, 27], "achiev": [10, 15], "better": [10, 13, 29], "small": 10, "loss": [10, 28], "accuraci": 10, "_float_to_bfloat16_gpu": 10, "brain": 10, "_bfloat16_to_float_gpu": 10, "_float_to_fp8rowwise_gpu": 10, "forward": [10, 23, 24], "dtype": [10, 19, 23, 24], "sparsetyp": [10, 23, 24], "throw": [10, 23, 24, 29], "_fp8rowwise_to_float_gpu": 10, "represent": [10, 17], "_float_to_fused8bitrowwise_gpu": 10, "_half_to_fused8bitrowwise_gpu": 10, "half": 10, "_single_or_half_precision_to_fused8bitrowwise_gpu": 10, "_fused8bitrowwise_to_float_gpu": 10, "_fused8bitrowwise_to_half_gpu": 10, "_fused8bitrowwise_to_single_or_half_precision_gpu": 10, "quant_padding_float_typ": 10, "_fused8bitrowwise_to_float_mixed_dim_gpu": 10, "kfloat": 10, "khalf": 10, "_float_to_fusednbitrowwise_gpu": 10, "_half_to_fusednbitrowwise_gpu": 10, "_single_or_half_precision_to_fusednbitrowwise_gpu": 10, "_fusednbitrowwise_to_float_gpu": 10, "_fusednbitrowwise_to_half_gpu": 10, "_fusednbitrowwise_to_single_or_half_precision_gpu": 10, "_float_to_hfp8_gpu": 10, "ebit": 10, "max_po": 10, "hybrid": 10, "hfp8": 10, "_hfp8_to_float_gpu": 10, "_float_to_msfp_gpu": 10, "bounding_box_s": 10, "mbit": 10, "min_po": 10, "msfp": 10, "_msfp_to_float_gpu": 10, "_float_to_paddedfp8rowwise_gpu": 10, "row_dim": 10, "pad": [10, 13, 17, 31], "_paddedfp8rowwise_to_float_gpu": 10, "output_last_dim": 10, "param": [10, 13, 29, 31], "_fused8bitrowwise_to_float_cpu_out": 10, "_float_to_fused8bitrowwise_cpu_out": 10, "float_to_fused8bitrowwise_cpu": 10, "half_to_fused8bitrowwise_cpu": 10, "float_or_half_to_fused8bitrowwise_cpu": 10, "fused8bitrowwise_to_float_cpu": 10, "fused8bitrowwise_to_half_cpu": 10, "fused8bitrowwise_to_float_or_half_cpu": 10, "float_to_fp8rowwise_cpu": 10, "fp8rowwise_to_float_cpu": 10, "fusednbitrowwise_to_float_cpu": 10, "fusednbitrowwise_sbfront_to_float_cpu": 10, "int2": [10, 23], "front": 10, "float32": [10, 19], "torch": [10, 13, 14, 15, 18, 19, 20, 21, 22, 23, 24, 30, 31], "quint4x2": 10, "quint2x4": 10, "quantizedcpu": 10, "backend": [10, 33], "purpos": [10, 17, 23, 24, 28], "becaus": [10, 14, 17, 24], "refer": [10, 14, 17, 30, 31], "rate": [10, 24], "hold": [10, 13, 17], "fusednbitrowwise_to_half_cpu": 10, "fusednbitrowwise_to_float_or_half_cpu": 10, "floattofp8quantized_ref": 10, "nrow": 10, "ncol": 10, "fp8quantizedtofloat_ref": 10, "expand_into_jagged_permute_cuda": 11, "permut": [11, 19], "input_offset": 11, "output_offset": 11, "expand_into_jagged_permut": [11, 22], "expand": 11, "case": [11, 14, 15, 17, 27], "ha": [11, 13, 15, 17, 24, 25, 27, 29, 30], "across": [11, 14, 19], "rank": [11, 17, 24], "level": 11, "exclus": [11, 13], "op": [11, 15, 18, 20, 21, 22, 31], "sit": 11, "we": [11, 13, 14, 17, 25, 27], "deriv": [11, 17, 28], "arrai": [11, 23, 31], "comput": [11, 14, 15, 23, 24], "formula": 11, "output_permut": 11, "table_offset": 11, "bag_offset": 11, "generic_histogram_binning_calibration_by_feature_cpu": 11, "logit": 11, "segment_valu": 11, "segment_length": 11, "num_seg": 11, "bin_num_exampl": 11, "bin_num_posit": 11, "bin_boundari": 11, "positive_weight": 11, "bin_ctr_in_use_aft": 11, "bin_ctr_weight_valu": 11, "divid": [11, 17], "predict": 11, "rang": [11, 13, 17], "e": [11, 13, 14, 17, 19, 29, 31, 32], "bin": [11, 14], "two": [11, 17, 24, 30], "exampl": [11, 13, 14, 15, 16, 19, 24, 29, 30, 31, 32], "fall": [11, 14, 15], "bucket": [11, 14], "basic": [11, 13, 31], "histogram": 11, "result": [11, 13, 14], "statist": [11, 23, 24], "real": 11, "ctr": 11, "num_po": 11, "num_exampl": 11, "final": 11, "calibr": 11, "pre": [11, 15], "cali": 11, "wai": [11, 23, 28], "within": [11, 23, 24, 25], "suffici": [11, 27, 30], "That": 11, "fine": 11, "grain": 11, "modul": [11, 14, 15, 25, 31], "theoret": 11, "layer": [11, 13], "uncalibr": 11, "extens": [11, 29, 30], "ectr": 11, "abov": [11, 13, 15, 17, 28, 29, 31, 32], "accept": [11, 27], "sort": [11, 12, 13, 14], "befor": [11, 13, 14, 32], "sigmoid": 11, "calibart": 11, "pass": [11, 23, 24, 27, 30], "lower": 11, "upper_bound": 11, "bound": [11, 17, 23, 24], "calibration_target": 11, "observ": 11, "statisct": 11, "final_calibrated_predict": 11, "bin_ctr_weight": 11, "bin_ctr": 11, "histogram_binning_calibration_cpu": 11, "lower_bound": 11, "keyjaggedtensor": 11, "num_bin": 11, "longer": [11, 26, 29], "still": [11, 14], "parambin_ctr_weight_valu": 11, "calibrated_predict": 11, "bin_id": 11, "get_unique_indices_cuda": 12, "linear_indic": 12, "max_indic": 12, "compute_count": 12, "dedupl": 12, "get_unique_indices_with_inverse_cuda": 12, "compute_inverse_indic": 12, "lru_cache_find_uncached_cuda": 12, "unique_indic": 12, "unique_indices_length": [12, 13], "time_stamp": 12, "lru_stat": 12, "gather_cache_stat": 12, "uvm_cache_stat": 12, "lock_cache_lin": 12, "lxu_cache_locking_count": 12, "lru": [12, 13, 23, 24], "uncach": [12, 13], "them": [12, 23], "host_lxu_cache_slot": 12, "h_in": 12, "cache_set": [12, 23, 24], "linearize_cache_indices_cuda": 12, "indices_base_offset": 12, "linear": [12, 13], "uniqu": [12, 13, 24, 32], "linearize_cache_indices_from_row_idx_cuda": 12, "update_table_indic": 12, "update_row_indic": 12, "format": [12, 19, 30, 31], "inplac": [12, 24], "updat": [12, 13, 14, 15, 16, 24, 27], "direct_mapped_lxu_cache_lookup_cuda": 12, "linear_cache_indic": 12, "invalid_index": 12, "fetch": [12, 13], "insert": [12, 13, 32], "timestep": 12, "lru_cache_populate_cuda": 12, "hash_size_cumsum": 12, "stochastic_round": [12, 24], "byte": [12, 13, 23], "lru_cache_populate_byte_cuda": 12, "assoc": 12, "variant": [12, 14, 15, 30], "direct_mapped_lru_cache_populate_byte_cuda": 12, "lxu_cache_miss_timestamp": 12, "lfu": [12, 23, 24], "lfu_cache_populate_cuda": 12, "lfu_stat": 12, "lfu_cache_populate_byte_cuda": 12, "look": [12, 24], "up": [12, 13, 16, 24, 25], "slot": [12, 13], "sentinel": [12, 13], "miss": [12, 13, 14], "lxu_cache_lookup_cuda": 12, "num_uniq_cache_indic": 12, "lxu_cache_locations_output": 12, "emulate_cache_miss": 12, "enforced_misses_per_256": 12, "lxu_cache_flush_cuda": 12, "flush": [12, 13], "reset_weight_momentum_cuda": 12, "momentum1_dev": 12, "momentum1_uvm": 12, "momentum1_plac": 12, "momentum1_offset": 12, "pruned_indic": 12, "pruned_indices_offset": 12, "logical_table_id": 12, "buffer_id": 12, "lxu_cache_locking_counter_decrement_cuda": 12, "decrement": 12, "counter": [12, 24], "lxu_cache_locations_update_cuda": 12, "lxu_cache_locations_new": 12, "rocksdbwritemod": 13, "rocksdb": 13, "mode": [13, 16, 23, 24], "offload": 13, "3": [13, 14, 15, 17, 19, 24, 28, 31], "iter": 13, "fwd_rocksdb_read": 13, "l2": [13, 24], "fwd": 13, "fwd_l1_evict": 13, "l1": 13, "eviciton": 13, "evict": 13, "bwd_l1_cnflct_miss_write_back": 13, "conflict": 13, "bwd": 13, "fill": [13, 23], "potenti": 13, "trigger": 13, "onc": [13, 15, 27], "full": [13, 14, 15, 32], "addition": 13, "do": [13, 14, 15, 24, 25, 27], "io": 13, "enumer": 13, "inlin": [13, 32], "hash_shard": 13, "id": [13, 15], "num_shard": 13, "hash": [13, 23], "shard": 13, "algorithm": [13, 23, 24], "cuda_callback_func": 13, "cudastream_t": 13, "stream": [13, 14, 24], "cudaerror_t": 13, "statu": 13, "functor": 13, "callback": 13, "cudastreamaddcallback": 13, "common": [13, 14, 15, 17, 31], "cudastreamcallback_t": 13, "cast": 13, "invok": [13, 14, 19, 24, 30], "delet": 13, "anoth": [13, 32], "none": [13, 19, 23, 24], "masked_index_put_cuda": 13, "count": 13, "use_pipelin": 13, "preferred_sm": 13, "similar": [13, 14, 17, 23, 24], "index_put": 13, "ignor": [13, 16, 23, 24, 30], "2d": [13, 17, 19, 24, 31], "put": [13, 30], "equival": [13, 17], "filter_": 13, "indices_": 13, "nonzero": 13, "flatten": 13, "1d": [13, 24, 31], "flag": [13, 14, 30], "overlap": 13, "other": [13, 15, 17, 22, 28, 29, 30, 31], "fraction": 13, "sm": 13, "resourc": 13, "competit": 13, "masked_index_select_cuda": 13, "index_select": 13, "ssd_generate_row_addrs_cuda": 13, "assigned_cache_slot": 13, "linear_index_inverse_indic": 13, "unique_indices_count_cumsum": 13, "cache_set_inverse_indic": 13, "inserted_ssd_weight": 13, "cache_set_sorted_unique_indic": 13, "memori": [13, 15, 23, 24, 33], "tbe": [13, 25, 33], "retriev": 13, "scratch": [13, 15], "hbm": [13, 23, 24], "lxu": 13, "associ": 13, "enabl": [13, 14, 16, 24], "conveni": 13, "first": [13, 14, 29, 31, 32], "pointer": [13, 30], "moreov": 13, "list": [13, 14, 17, 19, 23, 24, 28, 29, 31], "post": 13, "backward": [13, 24, 25], "origin": 13, "being": [13, 14, 30], "prefix": [13, 14, 32], "ssd_update_row_addrs_cuda": 13, "ssd_row_addrs_curr": 13, "inserted_ssd_weights_curr_next_map": 13, "lxu_cache_locations_curr": 13, "linear_index_inverse_indices_curr": 13, "unique_indices_count_cumsum_curr": 13, "cache_set_inverse_indices_curr": 13, "inserted_ssd_weights_next": 13, "unique_indices_length_curr": 13, "pipelin": [13, 24], "dure": [13, 14, 17, 24, 31], "reloc": 13, "correct": [13, 14], "between": [13, 17, 29, 30, 32], "been": [13, 14, 29], "compact_indices_cuda": 13, "compact_indic": 13, "compact_count": 13, "mask": 13, "compact": 13, "given": [13, 14, 17], "operat": 13, "remov": 13, "7": [13, 14, 15, 17, 19, 24], "5": [13, 14, 15, 17, 19, 23, 24], "repres": [13, 17, 19, 24], "keep": [13, 14], "class": [13, 19, 23, 24, 30, 31], "cachelibcach": 13, "cachelib_cach": 13, "h": [13, 14, 29], "cachelib": 13, "wrapper": 13, "cachlib": 13, "interact": 13, "maintain": 13, "relat": [13, 17, 23], "initi": 13, "state": [13, 14, 24], "logic": [13, 17, 29], "caller": 13, "reset": 13, "captur": 13, "delai": 13, "markus": 13, "boost": 13, "get": 13, "handl": [13, 17], "read": [13, 17], "done": [13, 14, 15], "embeddingparameterserv": 13, "public": [13, 27, 30], "embeddingkvdb": 13, "ps_table_batched_embed": 13, "servic": [13, 28], "tp": 13, "client": 13, "cachecontext": 13, "kv_db_table_batched_embed": 13, "l2cach": 13, "num_miss": 13, "cached_addr_list": 13, "prealloc": 13, "invalid": [13, 23, 24], "spot": 13, "stai": 13, "struct": 13, "queueitem": 13, "queue": 13, "item": [13, 19, 31], "background": 13, "read_handl": 13, "abstract": 13, "pair": [13, 32], "later": [13, 14], "separ": [13, 24, 30], "get_cach": 13, "monitor": 13, "checkout": 13, "explan": 13, "enable_shared_from_thi": 13, "execut": [13, 15, 16], "dram": [13, 23, 24], "remot": 13, "scalabl": 13, "without": [13, 14, 28], "blow": 13, "subclass": [13, 23], "embeddingrocksdb": 13, "ssd_table_batched_embed": 13, "fbgemm": [14, 18, 20, 21, 22, 26, 27, 28, 30, 31], "experiment": [14, 15, 33], "reproduc": [14, 15, 27, 28], "platform_nam": 14, "unam": 14, "miniconda_prefix": 14, "home": 14, "download": [14, 15], "wget": 14, "q": 14, "anaconda": 14, "miniconda3": 14, "latest": 14, "sh": 14, "o": [14, 15], "p": 14, "load": [14, 17, 23, 31], "shortcut": 14, "bashrc": 14, "command": [14, 15, 29, 30], "against": [14, 16], "env": [14, 15], "name": [14, 15, 23, 24, 28, 29, 31], "python_vers": 14, "12": [14, 15, 17, 19, 24], "upgrad": 14, "pyopenssl": 14, "22": [14, 17, 19], "requir": [14, 15, 16, 17, 24, 30, 31], "recent": [14, 15, 23, 24], "nvcc": 14, "capabl": [14, 16], "bare": 14, "metal": 14, "neither": [14, 28], "nor": [14, 28], "nvidia": [14, 24], "present": [14, 31], "sinc": [14, 17, 23], "pull": [14, 15, 30], "linux": [14, 15], "distribut": [14, 28], "ubuntu": 14, "04": 14, "11": [14, 15, 17, 19], "entrypoint": 14, "devel": 14, "ubuntu22": 14, "rest": [14, 15], "mai": [14, 15, 17, 28], "construct": [14, 15, 17, 23], "mechan": 14, "nvml": 14, "org": [14, 15, 31], "cuda_vers": 14, "label": 14, "verifi": [14, 15, 29, 31], "cuda_runtim": 14, "libnvidia": [14, 15], "ml": [14, 15], "libnccl": [14, 16], "printenv": 14, "extract": 14, "url": [14, 15], "builder": 14, "blob": 14, "main": [14, 27], "install_cuda": 14, "cudnn_url": 14, "redist": 14, "x86_64": 14, "26_cuda12": 14, "archiv": 14, "tar": 14, "xz": 14, "unpack": 14, "xvf": 14, "applic": [14, 15, 24, 29, 31], "alreadi": [14, 15, 27, 29, 31], "repositori": [14, 27], "cmake": 14, "configur": [14, 29], "amd": [14, 15], "minim": 14, "6": [14, 15, 17, 19], "termin": 14, "while": [14, 23, 30], "come": [14, 15], "reason": [14, 15, 30], "oper": [14, 15, 16, 24, 25], "guid": [14, 31], "disabl": 14, "apt": 14, "prompt": 14, "debian_frontend": 14, "noninteract": 14, "db": 14, "radeon": 14, "amdgpu": 14, "focal": 14, "install_5": 14, "50601": 14, "1_all": 14, "deb": 14, "usecas": 14, "hiplibsdk": 14, "dkm": 14, "hipifi": 14, "hip": 14, "dev": 14, "20": [14, 19], "sysroot": 14, "avoid": 14, "glibcxx": 14, "fbgemm_cpu": 14, "10": [14, 15, 17, 19, 24], "older": [14, 15], "accompani": [14, 30], "appropri": 14, "sysroot_linux": 14, "gcc_version": 14, "forg": [14, 30], "gxx_linux": 14, "64": [14, 17], "17": [14, 19], "binari": [14, 28], "cento": 14, "librari": [14, 30, 33], "libstdc": 14, "what": [14, 30], "libcxx_path": 14, "print": [14, 15, 19, 23, 24, 31], "objdump": 14, "tc": 14, "grep": 14, "glibc_": 14, "sed": 14, "vu": 14, "cat": 14, "glibcxx_": 14, "possibl": [14, 17, 27, 28], "just": 14, "minimum": [14, 29, 30, 31], "llvm_version": 14, "16": [14, 17, 19], "libcxx": 14, "outdat": 14, "aarch64": [14, 15], "cannot": 14, "explicitli": [14, 24, 25], "clangxx": 14, "rt": 14, "lib": [14, 15, 16], "ld_library_path": [14, 15, 16], "config": [14, 24], "var": 14, "nvcc_prepend_flag": 14, "correctli": [14, 15, 16, 29, 30], "xcompil": 14, "ccbin": 14, "clangxx_path": 14, "unsupport": 14, "even": [14, 28], "though": [14, 15], "libstd": 14, "mean": [14, 17, 23, 24, 25], "regardless": 14, "scenario": 14, "binpath": 14, "overrid": 14, "exist": [14, 29, 31], "ln": 14, "sf": 14, "path_to_either_gcc_or_clang": 14, "cc": 14, "These": 14, "stage": [14, 17], "click": 14, "hypothesi": [14, 15], "jinja2": 14, "ncurs": 14, "numpi": [14, 15], "scikit": [14, 15], "offici": 14, "homepag": 14, "authorit": [14, 15, 30], "how": [14, 15, 16, 19, 31], "nightli": [14, 15], "rc": 14, "alwai": 14, "reliabl": 14, "arriv": 14, "hour": 14, "than": [14, 15, 17], "window": 14, "silent": [14, 23, 24], "both": [14, 23, 24, 26, 28, 30], "place": [14, 23, 24], "artifact": 14, "select": 14, "thu": [14, 24], "import": [14, 15, 19, 24, 31, 32], "much": [14, 29], "determinist": 14, "whl": [14, 15], "cu121": [14, 15], "rocm5": [14, 15], "ensur": [14, 15, 25, 27], "properli": 14, "__version__": 14, "cuda_cmake_macro": 14, "gemm": 14, "via": [14, 24, 25], "manual": [14, 15, 29], "sha": 14, "pin": 14, "ci": [14, 15], "ci_commit_pin": 14, "txt": [14, 16, 30, 32], "dedb7bdf33": 14, "tag": [14, 29, 32], "fbgemm_vers": 14, "v1": [14, 25], "fbgemm_": 14, "addit": [14, 16, 17], "flow": [14, 24], "becom": 14, "stale": 14, "problem": 14, "re": [14, 15, 23], "attempt": 14, "failur": [14, 15], "clear": [14, 27], "py": [14, 15, 16, 30, 31], "clean": [14, 30], "must": [14, 15, 16, 17, 23, 24, 28, 32], "package_nam": 14, "fbgemm_gpu_": 14, "convent": 14, "major": [14, 25], "minor": 14, "py312": 14, "python_tag": 14, "determin": [14, 17, 23, 24], "processor": 14, "arch": 14, "python_plat_nam": 14, "manylinux2014_": 14, "maco": 14, "macosx_10_9_": 14, "arm64": 14, "macosx_11_0_": 14, "win_": 14, "cpu_onli": 14, "bdist_wheel": 14, "package_vari": 14, "plat": 14, "cxxprefix": 14, "presum": 14, "made": [14, 30], "debug": [14, 16], "assert": 14, "presenc": 14, "unabl": 14, "cudacxx": 14, "cuda_bin_path": 14, "cub": 14, "cub_dir": 14, "log": [14, 15], "nvcc_verbos": 14, "header": [14, 29, 32], "cudnn_include_dir": 14, "cudnn_librari": 14, "filepath": 14, "nvml_lib_path": 14, "nccl": [14, 16], "nccl_lib_path": 14, "sm70": [14, 15], "80": 14, "v100": [14, 15], "a100": [14, 15], "cuda_arch_list": 14, "unset": 14, "torch_cuda_arch_list": 14, "preced": 14, "dtorch_cuda_arch_list": 14, "By": [14, 27], "those": [14, 17, 23, 27, 31], "rocm_path": 14, "pytorch_rocm_arch": 14, "hipcc": 14, "hipcc_verbos": 14, "gfx906": 14, "gfx908": 14, "gfx90a": 14, "wiki": 14, "gentoo": 14, "rocminfo": 14, "gfx": 14, "dhip_root_dir": 14, "dtorch_use_hip_dsa": 14, "complet": [14, 27, 30], "lot": 14, "jinja": 14, "instanti": [14, 19], "sure": [14, 27, 29, 31], "accident": 14, "cours": 14, "fbgemm_gpu_lib_path": 14, "fbgemm_gpu_pi": [14, 15], "defin": [14, 17, 23, 29], "nm": 14, "gdcu": 14, "referenc": 14, "certain": 14, "gdc": 14, "merge_pooled_embed": [14, 15, 20], "isol": [15, 30], "build": [15, 16, 29, 31, 33], "accord": 15, "schedul": 15, "guarante": [15, 25], "conjunct": 15, "visit": 15, "sm80": 15, "respect": 15, "especi": 15, "displai": [15, 32], "setup": 15, "smi": 15, "515": 15, "76": 15, "persist": 15, "bu": [15, 32], "disp": 15, "volatil": 15, "uncorr": 15, "ecc": 15, "fan": 15, "temp": 15, "perf": 15, "pwr": 15, "usag": [15, 30, 31], "cap": 15, "util": [15, 33], "mig": 15, "a10g": 15, "00000000": 15, "00": 15, "1e": [15, 24], "31c": 15, "p0": 15, "59w": 15, "300w": 15, "0mib": 15, "23028mib": 15, "gi": 15, "pid": 15, "No": [15, 23, 24, 25], "expos": 15, "imag": 15, "launch": 15, "toolkit": 15, "interfac": 15, "concis": 15, "info": [15, 29, 31], "dieedg": 15, "avgpwr": 15, "sclk": 15, "mclk": 15, "pwrcap": 15, "vram": 15, "33": [15, 19], "0c": 15, "37": [15, 19], "0w": 15, "300mhz": 15, "1200mhz": 15, "auto": [15, 30], "290": 15, "32": [15, 19, 23], "39": [15, 19], "difficult": 15, "relev": [15, 29], "genai": 15, "triton_vers": 15, "45fff310c8": 15, "about": [15, 31], "link": [15, 25, 30], "encount": [15, 23, 24], "signatur": [15, 30], "traceback": 15, "last": 15, "root": [15, 27], "miniconda": 15, "mycondaenv": 15, "site": 15, "_op": [15, 30], "line": [15, 23, 31, 32], "565": 15, "__getattr__": 15, "overload_nam": 15, "_c": 15, "_jit_get_oper": 15, "qualified_op_nam": 15, "runtimeerror": 15, "except": [15, 29, 31], "wa": 15, "string": [15, 32], "post47": 15, "py3": 15, "egg": 15, "__init__": [15, 31], "21": [15, 19], "_fbgemm_gpu_doc": 15, "noqa": 15, "f401": 15, "e402": 15, "18": [15, 19], "569": 15, "rais": [15, 31], "attributeerror": [15, 31], "_opnamespac": 15, "object": [15, 17], "attribut": [15, 31], "cli": 15, "main_run": 15, "47": [15, 19], "_zn6fbgemm48floatorhalftofusednbitrowwisequantizedsbhalfavx2itli2eeevpkt_miph": 15, "libtorch": 15, "visibl": 15, "incorrectli": [15, 30], "declar": [15, 29], "were": 15, "pr": [15, 29, 30, 31], "1618": 15, "former": [15, 23], "resolv": 15, "latter": [15, 23], "seriou": 15, "tha": 15, "develop": [15, 30], "bench": 16, "good": [16, 28], "instal": [16, 30, 33], "pip": [16, 30], "pytest": 16, "rsx": 16, "pytestcollectionwarn": 16, "split_table_batched_embeddings_test": 16, "quantize_ops_test": 16, "sparse_ops_test": 16, "split_embedding_inference_converter_test": 16, "cuda_visible_devic": 16, "cuda_launch_block": 16, "involv": [16, 17], "rpath": 16, "fbgemm_test_with_rocm": 16, "hip_launch_block": 16, "split_table_batched_embeddings_benchmark": 16, "consecut": 17, "nestedtensor": 17, "raggedtensor": 17, "tensorflow": 17, "notabl": 17, "sentenc": 17, "maxlength": 17, "numel": 17, "greatest": 17, "divisor": 17, "smallest": 17, "sub": 17, "exclud": 17, "partit": 17, "impli": [17, 28], "denot": [17, 29, 31], "offest": 17, "outer": 17, "would": 17, "begin": 17, "maximum": [17, 31], "densor": 17, "form": [17, 28], "figur": 17, "below": [17, 25], "show": [17, 24, 30], "accomod": 17, "At": [17, 29, 30, 31], "multipl": [17, 23, 24, 31, 33], "hadamard": 17, "product": [17, 28], "bmatrix": 17, "rightarrow": 17, "25": [17, 19], "36": [17, 19], "49": 17, "81": 17, "50": 17, "operand": 17, "word": 17, "ax": 17, "properti": 17, "elementwis": 17, "start": [17, 25, 31, 32], "dim": [17, 19], "onto": 17, "part": 17, "everi": [17, 23, 24, 25], "converson": 17, "could": 17, "lead": 17, "smaller": 17, "expect": [17, 23], "happen": 17, "give": 17, "situat": 17, "like": 17, "dense_tensor": 17, "jagged_tensor": 17, "break": 17, "exact": 17, "usual": 17, "arg": [18, 20, 21, 22, 24, 31], "kwarg": [18, 20, 21, 22], "jagged_dense_dense_elementwise_add_jagged_output": 18, "stacked_jagged_1d_to_dens": 18, "stacked_jagged_2d_to_dens": 18, "permute_pooled_embedding_modul": 19, "permutepooledembed": 19, "embs_dim": 19, "sourc": [19, 23, 24, 27, 28, 29, 30, 31], "column": 19, "essenti": 19, "second": [19, 29, 31], "suppos": 19, "int64": [19, 23], "perm": 19, "arang": 19, "reshap": 19, "13": [19, 24], "14": [19, 31], "15": 19, "19": 19, "23": 19, "24": 19, "26": 19, "27": 19, "28": 19, "29": 19, "30": 19, "31": 19, "34": 19, "35": 19, "38": 19, "40": 19, "41": 19, "42": [19, 31], "43": 19, "44": 19, "45": 19, "46": 19, "describ": [19, 23, 24, 27], "__call__": 19, "b_local": 19, "total_global_d": 19, "local": [19, 29, 31], "total": [19, 23, 24], "global": [19, 23, 24], "permute_pooled_emb": 20, "permute_2d_sparse_data": 22, "permute_1d_sparse_data": 22, "asynchronous_complete_cumsum": 22, "offsets_rang": 22, "segment_sum_csr": 22, "keyed_jagged_index_select_dim1": 22, "block_bucketize_sparse_featur": 22, "split_table_batched_embeddings_ops_infer": 23, "intnbittablebatchedembeddingbagscodegen": 23, "embedding_spec": [23, 24], "str": [23, 24], "embeddingloc": [23, 24], "feature_table_map": [23, 24], "poolingmod": [23, 24], "boundscheckmod": [23, 24], "weight_list": 23, "pruning_hash_load_factor": 23, "use_array_for_index_remap": 23, "cache_algorithm": [23, 24], "cachealgorithm": [23, 24], "cache_load_factor": [23, 24], "cache_reserved_memori": [23, 24], "enforce_hbm": [23, 24], "record_cache_metr": [23, 24], "recordcachemetr": [23, 24], "gather_uvm_cache_stat": [23, 24], "cache_assoc": 23, "scale_bias_size_in_byt": 23, "cacheline_align": 23, "uvm_host_map": [23, 24], "reverse_qparam": 23, "feature_names_per_t": 23, "indices_dtyp": 23, "int32": 23, "nn": [23, 24], "embeddingbag": 23, "computedevic": [23, 24], "spec": [23, 24], "physic": [23, 24], "placement": [23, 24], "virtual": [23, 24], "managed_cach": [23, 24], "mtia": [23, 24], "remap": 23, "prune": 23, "pool": [23, 24, 25, 33], "union": [23, 24], "skip": [23, 24], "fatal": [23, 24], "messag": [23, 24], "adjust": [23, 24], "factor": [23, 24], "least": [23, 24], "frequent": [23, 24], "capac": [23, 24], "reserv": [23, 24, 28], "momentum": [23, 24], "record": [23, 24], "hit": [23, 24], "request": [23, 24, 26, 30], "record_cache_miss_count": [23, 24], "metric": [23, 24], "record_tablewise_cache_miss": [23, 24], "collect": [23, 24, 33], "align": [23, 29, 31], "default_scale_bias_size_in_byt": 23, "128b": 23, "boundari": 23, "malloc": [23, 24], "cudahostregist": [23, 24], "cudamallocmanag": [23, 24], "begn": 23, "remap_indic": 23, "assign_embedding_weight": 23, "q_weight_list": 23, "assign": 23, "split_embedding_weight": [23, 24], "scale_shift": 23, "fill_random_weight": 23, "buffer": 23, "overridden": 23, "although": 23, "recip": 23, "instanc": 23, "afterward": 23, "care": [23, 29], "regist": 23, "hook": 23, "recompute_module_buff": 23, "materi": [23, 28], "reset_weights_placements_and_offset": 23, "bounds_check_warn": 23, "right": [23, 28, 32], "split_scale_shift": 23, "split_embedding_weights_with_scale_bia": 23, "split_scale_bias_mod": 23, "scale_bia": 23, "split_table_batched_embeddings_ops_train": 24, "splittablebatchedembeddingbagscodegen": 24, "cache_precis": 24, "weights_precis": 24, "emboptimtyp": 24, "exact_sgd": 24, "gradient_clip": 24, "max_gradi": 24, "max_norm": 24, "learning_r": 24, "01": 24, "ep": 24, "08": 24, "weight_decai": 24, "weight_decay_mod": 24, "weightdecaymod": 24, "eta": 24, "001": 24, "beta1": 24, "beta2": 24, "999": 24, "ensemble_mod": 24, "ensemblemodedefinit": 24, "emainplace_mod": 24, "emainplacemodedefinit": 24, "counter_based_regular": 24, "counterbasedregularizationdefinit": 24, "cowclip_regular": 24, "cowclipdefinit": 24, "uvm_non_rowwise_momentum": 24, "use_experimental_tb": 24, "prefetch_pipelin": 24, "stats_reporter_config": 24, "tbestatsreporterconfig": 24, "table_nam": 24, "optimizer_state_dtyp": 24, "dict": 24, "multipass_prefetch_config": 24, "multipassprefetchconfig": 24, "global_weight_decai": 24, "globalweightdecaydefinit": 24, "optimtyp": 24, "adam": 24, "exact_adagrad": 24, "adagrad": 24, "exact_rowwise_adagrad": 24, "aadagrad": 24, "sgd": 24, "lamb": 24, "lars_sgd": 24, "lar": 24, "partial_rowwise_adam": 24, "partial": 24, "partial_rowwise_lamb": 24, "ensemble_rowwise_adagrad": 24, "ensembl": 24, "emainplace_rowwise_adagrad": 24, "ema": 24, "Not": 24, "gradient": 24, "stochast": 24, "round": 24, "clip": 24, "norm": 24, "learn": 24, "0e": 24, "epsilon": 24, "decai": 24, "decoupl": 24, "v2": 24, "polici": 24, "forward_stream": 24, "stat": 24, "multipass": 24, "feature_requires_grad": 24, "batch_size_per_feature_per_rank": 24, "total_unique_indic": 24, "vbe": 24, "user": 24, "autograd": 24, "chosen": 24, "conatin": 24, "sampl": 24, "unweight": 24, "multipli": 24, "f": 24, "split_table_batched_embeddings_ops_common": 24, "init_embedding_weights_uniform": 24, "9426": 24, "7046": 24, "4214": 24, "0419": 24, "1331": 24, "7856": 24, "8124": 24, "2021": 24, "5771": 24, "5911": 24, "7792": 24, "1068": 24, "6203": 24, "4813": 24, "1677": 24, "4790": 24, "5587": 24, "0941": 24, "5754": 24, "3475": 24, "8952": 24, "1964": 24, "0810": 24, "4174": 24, "2513": 24, "4039": 24, "3775": 24, "3273": 24, "5399": 24, "0229": 24, "1455": 24, "8770": 24, "9520": 24, "4593": 24, "7169": 24, "6307": 24, "1765": 24, "8757": 24, "8614": 24, "2051": 24, "0603": 24, "9980": 24, "7958": 24, "5826": 24, "long": 24, "5197": 24, "2957": 24, "3578": 24, "1487": 24, "4873": 24, "3044": 24, "9801": 24, "2769": 24, "7164": 24, "8528": 24, "7159": 24, "6719": 24, "0784": 24, "2016": 24, "2176": 24, "1988": 24, "3825": 24, "5008": 24, "8991": 24, "1405": 24, "2637": 24, "9427": 24, "8902": 24, "3754": 24, "5013": 24, "6105": 24, "9968": 24, "3057": 24, "7621": 24, "9821": 24, "7314": 24, "6195": 24, "grad_fn": 24, "cppnode": 24, "splitlookupfunction_sgd_op": 24, "set_learning_r": 24, "lr": 24, "set_optimizer_step": 24, "view": [24, 30], "split_optimizer_st": 24, "momentum1": 24, "momentum2": 24, "prev_it": 24, "cowclip": 24, "row_count": 24, "update_hyper_paramet": 24, "params_dict": 24, "hyper": 24, "extern": [24, 32], "outlin": [25, 27], "our": 25, "compat": 25, "thorough": 25, "futur": 25, "unless": 25, "announc": 25, "advanc": 25, "enhanc": 25, "comprehens": 25, "unit": 25, "framework": 25, "NOT": [25, 28], "commit": 25, "best": 25, "effort": 25, "basi": 25, "infer": [25, 33], "jag": [25, 31, 33], "question": 26, "concern": 26, "discuss": 26, "kick": 26, "regard": 26, "feel": 26, "free": 26, "reach": 26, "easi": 27, "transpar": 27, "activ": 27, "welcom": [27, 33], "your": [27, 30, 31], "branch": 27, "ve": 27, "add": [27, 29, 30, 31], "chang": [27, 29, 31], "api": [27, 29, 30, 31], "suit": 27, "lint": 27, "haven": 27, "submit": [27, 29, 31], "facebook": [27, 28, 33], "open": 27, "track": 27, "bug": 27, "descript": [27, 29, 30, 31, 32], "abl": 27, "bounti": 27, "safe": 27, "disclosur": 27, "secur": 27, "go": 27, "agre": 27, "tree": 27, "claus": 28, "bsd": 28, "softwar": 28, "copyright": 28, "inc": 28, "affili": 28, "redistribut": 28, "modif": 28, "permit": 28, "condit": 28, "met": 28, "retain": 28, "notic": 28, "disclaim": 28, "contributor": 28, "endors": 28, "promot": 28, "written": 28, "permiss": 28, "BY": 28, "THE": 28, "holder": 28, "AND": 28, "AS": 28, "express": [28, 32], "OR": 28, "warranti": 28, "limit": [28, 30], "TO": 28, "OF": 28, "merchant": 28, "FOR": 28, "particular": 28, "IN": 28, "NO": 28, "event": 28, "shall": 28, "BE": 28, "liabl": 28, "indirect": 28, "incident": 28, "special": 28, "exemplari": 28, "consequenti": 28, "damag": 28, "procur": 28, "substitut": 28, "profit": 28, "busi": 28, "interrupt": 28, "theori": 28, "liabil": 28, "contract": 28, "strict": 28, "tort": 28, "neglig": 28, "aris": 28, "IF": 28, "advis": 28, "SUCH": 28, "javadoc": 29, "style": [29, 31], "comment": [29, 30, 32], "sphinx": [29, 30, 31], "breath": 29, "kept": 29, "cpp": [29, 31, 32], "cu": 29, "cuh": 29, "everyth": 29, "ifndef": 29, "doxygen_this_will_be_skip": 29, "endif": 29, "hidden": 29, "html": [29, 30, 31], "descriptionss": 29, "publish": [29, 31], "docstr": [29, 30, 31], "method": [29, 30, 31], "organ": 29, "yet": 29, "top": [29, 33], "defgroup": 29, "directli": [29, 31], "behavior": [29, 31], "tparam": 29, "thrown": [29, 31], "ingroup": 29, "brief": 29, "short": 29, "example_method": [29, 31], "def": [29, 31], "foo": [29, 31], "lst": [29, 31], "And": [29, 31], "verbatim": [29, 31], "text": [29, 31, 32], "diagram": [29, 31], "unpars": 29, "prev": [29, 31], "usabl": [29, 31], "space": [29, 30, 31], "endcod": 29, "param1": [29, 31], "param2": 29, "bad_alloc": 29, "logic_error": 29, "href": 29, "www": [29, 31], "nl": 29, "cmdlink": 29, "On": [29, 31], "doxygengroup": 29, "rst": [29, 31, 32], "content": [29, 32, 33], "toctre": [29, 31], "ini": 29, "taken": 29, "doc": [29, 30, 31, 32], "netlifi": [29, 30, 31], "preview": [29, 31], "serv": 30, "yourself": 30, "shoe": 30, "who": 30, "understand": 30, "live": 30, "easier": 30, "leav": 30, "task": 30, "tool": 30, "graphviz": [30, 32], "assembl": 30, "prepend": 30, "sphinx_lint": 30, "technic": 30, "why": 30, "occasion": 30, "unresolv": 30, "might": 30, "opt": 30, "pycapsul": 30, "neg": 30, "silenc": 30, "nitpick": 30, "conf": 30, "domain": 30, "deploi": 30, "app": 30, "googl": 31, "c_size_t": 31, "ret": 31, "emplace_back": 31, "valueerror": 31, "restructuredtext": 31, "en": 31, "master": 31, "__": 31, "pep": 31, "0287": 31, "autofunct": 31, "toc": 31, "c_ulong": 31, "mani": 31, "attach": 31, "fact": 31, "helper": 31, "codebas": 31, "add_doc": 31, "forc": 31, "hoc": 31, "the_new_doc_modul": 31, "remain": 31, "render": [31, 32], "anchor": 32, "_doc": 32, "underscor": 32, "_": 32, "There": 32, "elsewher": 32, "ref": 32, "literalinclud": 32, "rel": 32, "enclos": 32, "bracket": 32, "skiplin": 32, "suppli": 32, "math": 32, "k_": 32, "k_n": 32, "expressino": 32, "int_a": 32, "frac": 32, "2v": 32, "dx": 32, "left": 32, "dv": 32, "_a": 32, "du": 32, "digraph": 32, "altern": 32, "dot": 32, "examplegraph": 32, "low": 33, "precis": 33, "high": 33, "convolut": 33, "server": 33, "transform": 33, "contribut": 33, "contact": 33, "licens": 33, "autovector": 33, "ssd": 33}, "objects": {"": [[13, 0, 1, "_CPPv4N16RocksdbWriteMode29BWD_L1_CNFLCT_MISS_WRITE_BACKE", "BWD_L1_CNFLCT_MISS_WRITE_BACK"], [13, 0, 1, "_CPPv4N16RocksdbWriteMode5FLUSHE", "FLUSH"], [10, 1, 1, "_CPPv423FP8QuantizedToFloat_refPCK7uint8_tK6size_tK6size_tPCfKiKi", "FP8QuantizedToFloat_ref"], [10, 2, 1, "_CPPv423FP8QuantizedToFloat_refPCK7uint8_tK6size_tK6size_tPCfKiKi", "FP8QuantizedToFloat_ref::ebits"], [10, 2, 1, "_CPPv423FP8QuantizedToFloat_refPCK7uint8_tK6size_tK6size_tPCfKiKi", "FP8QuantizedToFloat_ref::exponent_bias"], [10, 2, 1, "_CPPv423FP8QuantizedToFloat_refPCK7uint8_tK6size_tK6size_tPCfKiKi", "FP8QuantizedToFloat_ref::input"], [10, 2, 1, "_CPPv423FP8QuantizedToFloat_refPCK7uint8_tK6size_tK6size_tPCfKiKi", "FP8QuantizedToFloat_ref::ncols"], [10, 2, 1, "_CPPv423FP8QuantizedToFloat_refPCK7uint8_tK6size_tK6size_tPCfKiKi", "FP8QuantizedToFloat_ref::nrows"], [10, 2, 1, "_CPPv423FP8QuantizedToFloat_refPCK7uint8_tK6size_tK6size_tPCfKiKi", "FP8QuantizedToFloat_ref::output"], [10, 1, 1, "_CPPv423FP8rowwise_to_float_cpuRK6TensorbK7int64_t", "FP8rowwise_to_float_cpu"], [10, 2, 1, "_CPPv423FP8rowwise_to_float_cpuRK6TensorbK7int64_t", "FP8rowwise_to_float_cpu::forward"], [10, 2, 1, "_CPPv423FP8rowwise_to_float_cpuRK6TensorbK7int64_t", "FP8rowwise_to_float_cpu::input"], [10, 2, 1, "_CPPv423FP8rowwise_to_float_cpuRK6TensorbK7int64_t", "FP8rowwise_to_float_cpu::output_dtype"], [13, 0, 1, "_CPPv4N16RocksdbWriteMode15FWD_L1_EVICTIONE", "FWD_L1_EVICTION"], [13, 0, 1, "_CPPv4N16RocksdbWriteMode16FWD_ROCKSDB_READE", "FWD_ROCKSDB_READ"], [0, 1, 1, "_CPPv410FindMinMaxPKfPfPf7int64_t", "FindMinMax"], [0, 2, 1, "_CPPv410FindMinMaxPKfPfPf7int64_t", "FindMinMax::len"], [0, 2, 1, "_CPPv410FindMinMaxPKfPfPf7int64_t", "FindMinMax::m"], [0, 2, 1, "_CPPv410FindMinMaxPKfPfPf7int64_t", "FindMinMax::max"], [0, 2, 1, "_CPPv410FindMinMaxPKfPfPf7int64_t", "FindMinMax::min"], [0, 1, 1, "_CPPv4I0E44FloatOrHalfToFusedNBitRowwiseQuantizedSBHalfviPK9InputType6size_tiPNSt7uint8_tE", "FloatOrHalfToFusedNBitRowwiseQuantizedSBHalf"], [0, 3, 1, "_CPPv4I0E44FloatOrHalfToFusedNBitRowwiseQuantizedSBHalfviPK9InputType6size_tiPNSt7uint8_tE", "FloatOrHalfToFusedNBitRowwiseQuantizedSBHalf::InputType"], [0, 2, 1, "_CPPv4I0E44FloatOrHalfToFusedNBitRowwiseQuantizedSBHalfviPK9InputType6size_tiPNSt7uint8_tE", "FloatOrHalfToFusedNBitRowwiseQuantizedSBHalf::bit_rate"], [0, 2, 1, "_CPPv4I0E44FloatOrHalfToFusedNBitRowwiseQuantizedSBHalfviPK9InputType6size_tiPNSt7uint8_tE", "FloatOrHalfToFusedNBitRowwiseQuantizedSBHalf::input"], [0, 2, 1, "_CPPv4I0E44FloatOrHalfToFusedNBitRowwiseQuantizedSBHalfviPK9InputType6size_tiPNSt7uint8_tE", "FloatOrHalfToFusedNBitRowwiseQuantizedSBHalf::input_columns"], [0, 2, 1, "_CPPv4I0E44FloatOrHalfToFusedNBitRowwiseQuantizedSBHalfviPK9InputType6size_tiPNSt7uint8_tE", "FloatOrHalfToFusedNBitRowwiseQuantizedSBHalf::input_rows"], [0, 2, 1, "_CPPv4I0E44FloatOrHalfToFusedNBitRowwiseQuantizedSBHalfviPK9InputType6size_tiPNSt7uint8_tE", "FloatOrHalfToFusedNBitRowwiseQuantizedSBHalf::output"], [10, 1, 1, "_CPPv423FloatToFP8Quantized_refPCKfK6size_tK6size_tPC7uint8_tKiKiKd", "FloatToFP8Quantized_ref"], [10, 2, 1, "_CPPv423FloatToFP8Quantized_refPCKfK6size_tK6size_tPC7uint8_tKiKiKd", "FloatToFP8Quantized_ref::ebits"], [10, 2, 1, "_CPPv423FloatToFP8Quantized_refPCKfK6size_tK6size_tPC7uint8_tKiKiKd", "FloatToFP8Quantized_ref::exponent_bias"], [10, 2, 1, "_CPPv423FloatToFP8Quantized_refPCKfK6size_tK6size_tPC7uint8_tKiKiKd", "FloatToFP8Quantized_ref::input"], [10, 2, 1, "_CPPv423FloatToFP8Quantized_refPCKfK6size_tK6size_tPC7uint8_tKiKiKd", "FloatToFP8Quantized_ref::max_pos"], [10, 2, 1, "_CPPv423FloatToFP8Quantized_refPCKfK6size_tK6size_tPC7uint8_tKiKiKd", "FloatToFP8Quantized_ref::ncols"], [10, 2, 1, "_CPPv423FloatToFP8Quantized_refPCKfK6size_tK6size_tPC7uint8_tKiKiKd", "FloatToFP8Quantized_ref::nrows"], [10, 2, 1, "_CPPv423FloatToFP8Quantized_refPCKfK6size_tK6size_tPC7uint8_tKiKiKd", "FloatToFP8Quantized_ref::output"], [0, 1, 1, "_CPPv4I0E23FusedQuantizeDequantizevPKfPfNSt7int64_tERK24TensorQuantizationParamsiif", "FusedQuantizeDequantize"], [0, 3, 1, "_CPPv4I0E23FusedQuantizeDequantizevPKfPfNSt7int64_tERK24TensorQuantizationParamsiif", "FusedQuantizeDequantize::T"], [0, 2, 1, "_CPPv4I0E23FusedQuantizeDequantizevPKfPfNSt7int64_tERK24TensorQuantizationParamsiif", "FusedQuantizeDequantize::dst"], [0, 2, 1, "_CPPv4I0E23FusedQuantizeDequantizevPKfPfNSt7int64_tERK24TensorQuantizationParamsiif", "FusedQuantizeDequantize::len"], [0, 2, 1, "_CPPv4I0E23FusedQuantizeDequantizevPKfPfNSt7int64_tERK24TensorQuantizationParamsiif", "FusedQuantizeDequantize::noise_ratio"], [0, 2, 1, "_CPPv4I0E23FusedQuantizeDequantizevPKfPfNSt7int64_tERK24TensorQuantizationParamsiif", "FusedQuantizeDequantize::num_threads"], [0, 2, 1, "_CPPv4I0E23FusedQuantizeDequantizevPKfPfNSt7int64_tERK24TensorQuantizationParamsiif", "FusedQuantizeDequantize::qparams"], [0, 2, 1, "_CPPv4I0E23FusedQuantizeDequantizevPKfPfNSt7int64_tERK24TensorQuantizationParamsiif", "FusedQuantizeDequantize::src"], [0, 2, 1, "_CPPv4I0E23FusedQuantizeDequantizevPKfPfNSt7int64_tERK24TensorQuantizationParamsiif", "FusedQuantizeDequantize::thread_id"], [0, 1, 1, "_CPPv4I0_8layout_tE17QuantizeGroupwisevPKfiiiiPKfPKNSt7int32_tEP1T", "QuantizeGroupwise"], [0, 2, 1, "_CPPv4I0_8layout_tE17QuantizeGroupwisevPKfiiiiPKfPKNSt7int32_tEP1T", "QuantizeGroupwise::C"], [0, 2, 1, "_CPPv4I0_8layout_tE17QuantizeGroupwisevPKfiiiiPKfPKNSt7int32_tEP1T", "QuantizeGroupwise::G"], [0, 2, 1, "_CPPv4I0_8layout_tE17QuantizeGroupwisevPKfiiiiPKfPKNSt7int32_tEP1T", "QuantizeGroupwise::K"], [0, 3, 1, "_CPPv4I0_8layout_tE17QuantizeGroupwisevPKfiiiiPKfPKNSt7int32_tEP1T", "QuantizeGroupwise::LAYOUT"], [0, 3, 1, "_CPPv4I0_8layout_tE17QuantizeGroupwisevPKfiiiiPKfPKNSt7int32_tEP1T", "QuantizeGroupwise::T"], [0, 2, 1, "_CPPv4I0_8layout_tE17QuantizeGroupwisevPKfiiiiPKfPKNSt7int32_tEP1T", "QuantizeGroupwise::X"], [0, 2, 1, "_CPPv4I0_8layout_tE17QuantizeGroupwisevPKfiiiiPKfPKNSt7int32_tEP1T", "QuantizeGroupwise::dst"], [0, 2, 1, "_CPPv4I0_8layout_tE17QuantizeGroupwisevPKfiiiiPKfPKNSt7int32_tEP1T", "QuantizeGroupwise::scales"], [0, 2, 1, "_CPPv4I0_8layout_tE17QuantizeGroupwisevPKfiiiiPKfPKNSt7int32_tEP1T", "QuantizeGroupwise::src"], [0, 2, 1, "_CPPv4I0_8layout_tE17QuantizeGroupwisevPKfiiiiPKfPKNSt7int32_tEP1T", "QuantizeGroupwise::zero_points"], [13, 4, 1, "_CPPv416RocksdbWriteMode", "RocksdbWriteMode"], [13, 0, 1, "_CPPv4N16RocksdbWriteMode29BWD_L1_CNFLCT_MISS_WRITE_BACKE", "RocksdbWriteMode::BWD_L1_CNFLCT_MISS_WRITE_BACK"], [13, 0, 1, "_CPPv4N16RocksdbWriteMode5FLUSHE", "RocksdbWriteMode::FLUSH"], [13, 0, 1, "_CPPv4N16RocksdbWriteMode15FWD_L1_EVICTIONE", "RocksdbWriteMode::FWD_L1_EVICTION"], [13, 0, 1, "_CPPv4N16RocksdbWriteMode16FWD_ROCKSDB_READE", "RocksdbWriteMode::FWD_ROCKSDB_READ"], [0, 1, 1, "_CPPv46Xor128v", "Xor128"], [10, 1, 1, "_CPPv424_FP8rowwise_to_float_gpuRKN2at6TensorEbK7int64_t", "_FP8rowwise_to_float_gpu"], [10, 2, 1, "_CPPv424_FP8rowwise_to_float_gpuRKN2at6TensorEbK7int64_t", "_FP8rowwise_to_float_gpu::forward"], [10, 2, 1, "_CPPv424_FP8rowwise_to_float_gpuRKN2at6TensorEbK7int64_t", "_FP8rowwise_to_float_gpu::input"], [10, 2, 1, "_CPPv424_FP8rowwise_to_float_gpuRKN2at6TensorEbK7int64_t", "_FP8rowwise_to_float_gpu::output_dtype"], [10, 1, 1, "_CPPv422_bfloat16_to_float_gpuRKN2at6TensorE", "_bfloat16_to_float_gpu"], [10, 2, 1, "_CPPv422_bfloat16_to_float_gpuRKN2at6TensorE", "_bfloat16_to_float_gpu::input"], [10, 1, 1, "_CPPv424_float_to_FP8rowwise_gpuRK6TensorKb", "_float_to_FP8rowwise_gpu"], [10, 2, 1, "_CPPv424_float_to_FP8rowwise_gpuRK6TensorKb", "_float_to_FP8rowwise_gpu::forward"], [10, 2, 1, "_CPPv424_float_to_FP8rowwise_gpuRK6TensorKb", "_float_to_FP8rowwise_gpu::input"], [10, 1, 1, "_CPPv422_float_to_bfloat16_gpuRKN2at6TensorE", "_float_to_bfloat16_gpu"], [10, 2, 1, "_CPPv422_float_to_bfloat16_gpuRKN2at6TensorE", "_float_to_bfloat16_gpu::input"], [10, 1, 1, "_CPPv434_float_to_fused8bitrowwise_cpu_outR6TensorRK6Tensor", "_float_to_fused8bitrowwise_cpu_out"], [10, 2, 1, "_CPPv434_float_to_fused8bitrowwise_cpu_outR6TensorRK6Tensor", "_float_to_fused8bitrowwise_cpu_out::input"], [10, 2, 1, "_CPPv434_float_to_fused8bitrowwise_cpu_outR6TensorRK6Tensor", "_float_to_fused8bitrowwise_cpu_out::output"], [10, 1, 1, "_CPPv430_float_to_fused8bitrowwise_gpuRK6Tensor", "_float_to_fused8bitrowwise_gpu"], [10, 2, 1, "_CPPv430_float_to_fused8bitrowwise_gpuRK6Tensor", "_float_to_fused8bitrowwise_gpu::input"], [10, 1, 1, "_CPPv430_float_to_fusednbitrowwise_gpuRK6TensorK7int64_t", "_float_to_fusednbitrowwise_gpu"], [10, 2, 1, "_CPPv430_float_to_fusednbitrowwise_gpuRK6TensorK7int64_t", "_float_to_fusednbitrowwise_gpu::bit_rate"], [10, 2, 1, "_CPPv430_float_to_fusednbitrowwise_gpuRK6TensorK7int64_t", "_float_to_fusednbitrowwise_gpu::input"], [10, 1, 1, "_CPPv418_float_to_hfp8_gpuRKN2at6TensorEK7int64_tK7int64_tKd", "_float_to_hfp8_gpu"], [10, 2, 1, "_CPPv418_float_to_hfp8_gpuRKN2at6TensorEK7int64_tK7int64_tKd", "_float_to_hfp8_gpu::ebits"], [10, 2, 1, "_CPPv418_float_to_hfp8_gpuRKN2at6TensorEK7int64_tK7int64_tKd", "_float_to_hfp8_gpu::exponent_bias"], [10, 2, 1, "_CPPv418_float_to_hfp8_gpuRKN2at6TensorEK7int64_tK7int64_tKd", "_float_to_hfp8_gpu::input"], [10, 2, 1, "_CPPv418_float_to_hfp8_gpuRKN2at6TensorEK7int64_tK7int64_tKd", "_float_to_hfp8_gpu::max_pos"], [10, 1, 1, "_CPPv418_float_to_msfp_gpuRKN2at6TensorEK7int64_tK7int64_tK7int64_tK7int64_tKdKd", "_float_to_msfp_gpu"], [10, 2, 1, "_CPPv418_float_to_msfp_gpuRKN2at6TensorEK7int64_tK7int64_tK7int64_tK7int64_tKdKd", "_float_to_msfp_gpu::bias"], [10, 2, 1, "_CPPv418_float_to_msfp_gpuRKN2at6TensorEK7int64_tK7int64_tK7int64_tK7int64_tKdKd", "_float_to_msfp_gpu::bounding_box_size"], [10, 2, 1, "_CPPv418_float_to_msfp_gpuRKN2at6TensorEK7int64_tK7int64_tK7int64_tK7int64_tKdKd", "_float_to_msfp_gpu::ebits"], [10, 2, 1, "_CPPv418_float_to_msfp_gpuRKN2at6TensorEK7int64_tK7int64_tK7int64_tK7int64_tKdKd", "_float_to_msfp_gpu::input"], [10, 2, 1, "_CPPv418_float_to_msfp_gpuRKN2at6TensorEK7int64_tK7int64_tK7int64_tK7int64_tKdKd", "_float_to_msfp_gpu::max_pos"], [10, 2, 1, "_CPPv418_float_to_msfp_gpuRKN2at6TensorEK7int64_tK7int64_tK7int64_tK7int64_tKdKd", "_float_to_msfp_gpu::mbits"], [10, 2, 1, "_CPPv418_float_to_msfp_gpuRKN2at6TensorEK7int64_tK7int64_tK7int64_tK7int64_tKdKd", "_float_to_msfp_gpu::min_pos"], [10, 1, 1, "_CPPv430_float_to_paddedFP8rowwise_gpuRK6TensorKbK7int64_t", "_float_to_paddedFP8rowwise_gpu"], [10, 2, 1, "_CPPv430_float_to_paddedFP8rowwise_gpuRK6TensorKbK7int64_t", "_float_to_paddedFP8rowwise_gpu::forward"], [10, 2, 1, "_CPPv430_float_to_paddedFP8rowwise_gpuRK6TensorKbK7int64_t", "_float_to_paddedFP8rowwise_gpu::input"], [10, 2, 1, "_CPPv430_float_to_paddedFP8rowwise_gpuRK6TensorKbK7int64_t", "_float_to_paddedFP8rowwise_gpu::row_dim"], [10, 1, 1, "_CPPv434_fused8bitrowwise_to_float_cpu_outR6TensorRK6Tensor", "_fused8bitrowwise_to_float_cpu_out"], [10, 2, 1, "_CPPv434_fused8bitrowwise_to_float_cpu_outR6TensorRK6Tensor", "_fused8bitrowwise_to_float_cpu_out::input"], [10, 2, 1, "_CPPv434_fused8bitrowwise_to_float_cpu_outR6TensorRK6Tensor", "_fused8bitrowwise_to_float_cpu_out::output"], [10, 1, 1, "_CPPv430_fused8bitrowwise_to_float_gpuRKN2at6TensorE", "_fused8bitrowwise_to_float_gpu"], [10, 2, 1, "_CPPv430_fused8bitrowwise_to_float_gpuRKN2at6TensorE", "_fused8bitrowwise_to_float_gpu::input"], [10, 1, 1, "_CPPv440_fused8bitrowwise_to_float_mixed_dim_gpuRKN2at6TensorERKN2at6TensorEK7int64_t", "_fused8bitrowwise_to_float_mixed_dim_gpu"], [10, 2, 1, "_CPPv440_fused8bitrowwise_to_float_mixed_dim_gpuRKN2at6TensorERKN2at6TensorEK7int64_t", "_fused8bitrowwise_to_float_mixed_dim_gpu::D_offsets"], [10, 2, 1, "_CPPv440_fused8bitrowwise_to_float_mixed_dim_gpuRKN2at6TensorERKN2at6TensorEK7int64_t", "_fused8bitrowwise_to_float_mixed_dim_gpu::input"], [10, 2, 1, "_CPPv440_fused8bitrowwise_to_float_mixed_dim_gpuRKN2at6TensorERKN2at6TensorEK7int64_t", "_fused8bitrowwise_to_float_mixed_dim_gpu::output_dtype"], [10, 1, 1, "_CPPv429_fused8bitrowwise_to_half_gpuRKN2at6TensorE", "_fused8bitrowwise_to_half_gpu"], [10, 2, 1, "_CPPv429_fused8bitrowwise_to_half_gpuRKN2at6TensorE", "_fused8bitrowwise_to_half_gpu::input"], [10, 1, 1, "_CPPv449_fused8bitrowwise_to_single_or_half_precision_gpuRKN2at6TensorEK7int64_tKbKb", "_fused8bitrowwise_to_single_or_half_precision_gpu"], [10, 2, 1, "_CPPv449_fused8bitrowwise_to_single_or_half_precision_gpuRKN2at6TensorEK7int64_tKbKb", "_fused8bitrowwise_to_single_or_half_precision_gpu::input"], [10, 2, 1, "_CPPv449_fused8bitrowwise_to_single_or_half_precision_gpuRKN2at6TensorEK7int64_tKbKb", "_fused8bitrowwise_to_single_or_half_precision_gpu::output_dtype"], [10, 2, 1, "_CPPv449_fused8bitrowwise_to_single_or_half_precision_gpuRKN2at6TensorEK7int64_tKbKb", "_fused8bitrowwise_to_single_or_half_precision_gpu::quant_padding_float_type"], [10, 2, 1, "_CPPv449_fused8bitrowwise_to_single_or_half_precision_gpuRKN2at6TensorEK7int64_tKbKb", "_fused8bitrowwise_to_single_or_half_precision_gpu::scale_bias_last"], [10, 1, 1, "_CPPv430_fusednbitrowwise_to_float_gpuRKN2at6TensorEK7int64_t", "_fusednbitrowwise_to_float_gpu"], [10, 2, 1, "_CPPv430_fusednbitrowwise_to_float_gpuRKN2at6TensorEK7int64_t", "_fusednbitrowwise_to_float_gpu::bit_rate"], [10, 2, 1, "_CPPv430_fusednbitrowwise_to_float_gpuRKN2at6TensorEK7int64_t", "_fusednbitrowwise_to_float_gpu::input"], [10, 1, 1, "_CPPv429_fusednbitrowwise_to_half_gpuRKN2at6TensorEK7int64_t", "_fusednbitrowwise_to_half_gpu"], [10, 2, 1, "_CPPv429_fusednbitrowwise_to_half_gpuRKN2at6TensorEK7int64_t", "_fusednbitrowwise_to_half_gpu::bit_rate"], [10, 2, 1, "_CPPv429_fusednbitrowwise_to_half_gpuRKN2at6TensorEK7int64_t", "_fusednbitrowwise_to_half_gpu::input"], [10, 1, 1, "_CPPv449_fusednbitrowwise_to_single_or_half_precision_gpuRKN2at6TensorEK7int64_tK7int64_t", "_fusednbitrowwise_to_single_or_half_precision_gpu"], [10, 2, 1, "_CPPv449_fusednbitrowwise_to_single_or_half_precision_gpuRKN2at6TensorEK7int64_tK7int64_t", "_fusednbitrowwise_to_single_or_half_precision_gpu::bit_rate"], [10, 2, 1, "_CPPv449_fusednbitrowwise_to_single_or_half_precision_gpuRKN2at6TensorEK7int64_tK7int64_t", "_fusednbitrowwise_to_single_or_half_precision_gpu::input"], [10, 2, 1, "_CPPv449_fusednbitrowwise_to_single_or_half_precision_gpuRKN2at6TensorEK7int64_tK7int64_t", "_fusednbitrowwise_to_single_or_half_precision_gpu::output_dtype"], [10, 1, 1, "_CPPv429_half_to_fused8bitrowwise_gpuRK6Tensor", "_half_to_fused8bitrowwise_gpu"], [10, 2, 1, "_CPPv429_half_to_fused8bitrowwise_gpuRK6Tensor", "_half_to_fused8bitrowwise_gpu::input"], [10, 1, 1, "_CPPv429_half_to_fusednbitrowwise_gpuRKN2at6TensorEK7int64_t", "_half_to_fusednbitrowwise_gpu"], [10, 2, 1, "_CPPv429_half_to_fusednbitrowwise_gpuRKN2at6TensorEK7int64_t", "_half_to_fusednbitrowwise_gpu::bit_rate"], [10, 2, 1, "_CPPv429_half_to_fusednbitrowwise_gpuRKN2at6TensorEK7int64_t", "_half_to_fusednbitrowwise_gpu::input"], [10, 1, 1, "_CPPv418_hfp8_to_float_gpuRKN2at6TensorEK7int64_tK7int64_t", "_hfp8_to_float_gpu"], [10, 2, 1, "_CPPv418_hfp8_to_float_gpuRKN2at6TensorEK7int64_tK7int64_t", "_hfp8_to_float_gpu::ebits"], [10, 2, 1, "_CPPv418_hfp8_to_float_gpuRKN2at6TensorEK7int64_tK7int64_t", "_hfp8_to_float_gpu::exponent_bias"], [10, 2, 1, "_CPPv418_hfp8_to_float_gpuRKN2at6TensorEK7int64_tK7int64_t", "_hfp8_to_float_gpu::input"], [10, 1, 1, "_CPPv418_msfp_to_float_gpuRKN2at6TensorEK7int64_tK7int64_tK7int64_t", "_msfp_to_float_gpu"], [10, 2, 1, "_CPPv418_msfp_to_float_gpuRKN2at6TensorEK7int64_tK7int64_tK7int64_t", "_msfp_to_float_gpu::bias"], [10, 2, 1, "_CPPv418_msfp_to_float_gpuRKN2at6TensorEK7int64_tK7int64_tK7int64_t", "_msfp_to_float_gpu::ebits"], [10, 2, 1, "_CPPv418_msfp_to_float_gpuRKN2at6TensorEK7int64_tK7int64_tK7int64_t", "_msfp_to_float_gpu::input"], [10, 2, 1, "_CPPv418_msfp_to_float_gpuRKN2at6TensorEK7int64_tK7int64_tK7int64_t", "_msfp_to_float_gpu::mbits"], [10, 1, 1, "_CPPv430_paddedFP8rowwise_to_float_gpuRKN2at6TensorEKbK7int64_tK7int64_tK7int64_t", "_paddedFP8rowwise_to_float_gpu"], [10, 2, 1, "_CPPv430_paddedFP8rowwise_to_float_gpuRKN2at6TensorEKbK7int64_tK7int64_tK7int64_t", "_paddedFP8rowwise_to_float_gpu::forward"], [10, 2, 1, "_CPPv430_paddedFP8rowwise_to_float_gpuRKN2at6TensorEKbK7int64_tK7int64_tK7int64_t", "_paddedFP8rowwise_to_float_gpu::input"], [10, 2, 1, "_CPPv430_paddedFP8rowwise_to_float_gpuRKN2at6TensorEKbK7int64_tK7int64_tK7int64_t", "_paddedFP8rowwise_to_float_gpu::output_dtype"], [10, 2, 1, "_CPPv430_paddedFP8rowwise_to_float_gpuRKN2at6TensorEKbK7int64_tK7int64_tK7int64_t", "_paddedFP8rowwise_to_float_gpu::output_last_dim"], [10, 2, 1, "_CPPv430_paddedFP8rowwise_to_float_gpuRKN2at6TensorEKbK7int64_tK7int64_tK7int64_t", "_paddedFP8rowwise_to_float_gpu::row_dim"], [10, 1, 1, "_CPPv449_single_or_half_precision_to_fused8bitrowwise_gpuRK6Tensor", "_single_or_half_precision_to_fused8bitrowwise_gpu"], [10, 2, 1, "_CPPv449_single_or_half_precision_to_fused8bitrowwise_gpuRK6Tensor", "_single_or_half_precision_to_fused8bitrowwise_gpu::input"], [10, 1, 1, "_CPPv449_single_or_half_precision_to_fusednbitrowwise_gpuRK6TensorK7int64_t", "_single_or_half_precision_to_fusednbitrowwise_gpu"], [10, 2, 1, "_CPPv449_single_or_half_precision_to_fusednbitrowwise_gpuRK6TensorK7int64_t", "_single_or_half_precision_to_fusednbitrowwise_gpu::bit_rate"], [10, 2, 1, "_CPPv449_single_or_half_precision_to_fusednbitrowwise_gpuRK6TensorK7int64_t", "_single_or_half_precision_to_fusednbitrowwise_gpu::input"], [9, 1, 1, "_CPPv417all_to_one_deviceNSt6vectorIN2at6TensorEEEN2at6DeviceE", "all_to_one_device"], [9, 2, 1, "_CPPv417all_to_one_deviceNSt6vectorIN2at6TensorEEEN2at6DeviceE", "all_to_one_device::inputTensors"], [9, 2, 1, "_CPPv417all_to_one_deviceNSt6vectorIN2at6TensorEEEN2at6DeviceE", "all_to_one_device::target_device"], [6, 1, 1, "_CPPv431batched_dense_vec_jagged_2d_mulRK6TensorRK6TensorRK6Tensor", "batched_dense_vec_jagged_2d_mul"], [6, 2, 1, "_CPPv431batched_dense_vec_jagged_2d_mulRK6TensorRK6TensorRK6Tensor", "batched_dense_vec_jagged_2d_mul::a_offsets"], [6, 2, 1, "_CPPv431batched_dense_vec_jagged_2d_mulRK6TensorRK6TensorRK6Tensor", "batched_dense_vec_jagged_2d_mul::a_values"], [6, 2, 1, "_CPPv431batched_dense_vec_jagged_2d_mulRK6TensorRK6TensorRK6Tensor", "batched_dense_vec_jagged_2d_mul::v"], [3, 1, 1, "_CPPv425bounds_check_indices_cudaR6TensorR6TensorR6Tensor7int64_tR6TensorRKNSt8optionalI6TensorEERKNSt8optionalI6TensorEEK7int64_tRKNSt8optionalI6TensorEEK7int64_tK7int64_tK6int8_t", "bounds_check_indices_cuda"], [3, 2, 1, "_CPPv425bounds_check_indices_cudaR6TensorR6TensorR6Tensor7int64_tR6TensorRKNSt8optionalI6TensorEERKNSt8optionalI6TensorEEK7int64_tRKNSt8optionalI6TensorEEK7int64_tK7int64_tK6int8_t", "bounds_check_indices_cuda::B_offsets"], [3, 2, 1, "_CPPv425bounds_check_indices_cudaR6TensorR6TensorR6Tensor7int64_tR6TensorRKNSt8optionalI6TensorEERKNSt8optionalI6TensorEEK7int64_tRKNSt8optionalI6TensorEEK7int64_tK7int64_tK6int8_t", "bounds_check_indices_cuda::b_t_map"], [3, 2, 1, "_CPPv425bounds_check_indices_cudaR6TensorR6TensorR6Tensor7int64_tR6TensorRKNSt8optionalI6TensorEERKNSt8optionalI6TensorEEK7int64_tRKNSt8optionalI6TensorEEK7int64_tK7int64_tK6int8_t", "bounds_check_indices_cuda::bounds_check_mode"], [3, 2, 1, "_CPPv425bounds_check_indices_cudaR6TensorR6TensorR6Tensor7int64_tR6TensorRKNSt8optionalI6TensorEERKNSt8optionalI6TensorEEK7int64_tRKNSt8optionalI6TensorEEK7int64_tK7int64_tK6int8_t", "bounds_check_indices_cuda::bounds_check_version"], [3, 2, 1, "_CPPv425bounds_check_indices_cudaR6TensorR6TensorR6Tensor7int64_tR6TensorRKNSt8optionalI6TensorEERKNSt8optionalI6TensorEEK7int64_tRKNSt8optionalI6TensorEEK7int64_tK7int64_tK6int8_t", "bounds_check_indices_cuda::indices"], [3, 2, 1, "_CPPv425bounds_check_indices_cudaR6TensorR6TensorR6Tensor7int64_tR6TensorRKNSt8optionalI6TensorEERKNSt8optionalI6TensorEEK7int64_tRKNSt8optionalI6TensorEEK7int64_tK7int64_tK6int8_t", "bounds_check_indices_cuda::info_B_mask"], [3, 2, 1, "_CPPv425bounds_check_indices_cudaR6TensorR6TensorR6Tensor7int64_tR6TensorRKNSt8optionalI6TensorEERKNSt8optionalI6TensorEEK7int64_tRKNSt8optionalI6TensorEEK7int64_tK7int64_tK6int8_t", "bounds_check_indices_cuda::info_B_num_bits"], [3, 2, 1, "_CPPv425bounds_check_indices_cudaR6TensorR6TensorR6Tensor7int64_tR6TensorRKNSt8optionalI6TensorEERKNSt8optionalI6TensorEEK7int64_tRKNSt8optionalI6TensorEEK7int64_tK7int64_tK6int8_t", "bounds_check_indices_cuda::max_B"], [3, 2, 1, "_CPPv425bounds_check_indices_cudaR6TensorR6TensorR6Tensor7int64_tR6TensorRKNSt8optionalI6TensorEERKNSt8optionalI6TensorEEK7int64_tRKNSt8optionalI6TensorEEK7int64_tK7int64_tK6int8_t", "bounds_check_indices_cuda::offsets"], [3, 2, 1, "_CPPv425bounds_check_indices_cudaR6TensorR6TensorR6Tensor7int64_tR6TensorRKNSt8optionalI6TensorEERKNSt8optionalI6TensorEEK7int64_tRKNSt8optionalI6TensorEEK7int64_tK7int64_tK6int8_t", "bounds_check_indices_cuda::rows_per_table"], [3, 2, 1, "_CPPv425bounds_check_indices_cudaR6TensorR6TensorR6Tensor7int64_tR6TensorRKNSt8optionalI6TensorEERKNSt8optionalI6TensorEEK7int64_tRKNSt8optionalI6TensorEEK7int64_tK7int64_tK6int8_t", "bounds_check_indices_cuda::warning"], [3, 2, 1, "_CPPv425bounds_check_indices_cudaR6TensorR6TensorR6Tensor7int64_tR6TensorRKNSt8optionalI6TensorEERKNSt8optionalI6TensorEEK7int64_tRKNSt8optionalI6TensorEEK7int64_tK7int64_tK6int8_t", "bounds_check_indices_cuda::weights"], [13, 1, 1, "_CPPv420compact_indices_cudaNSt6vectorI6TensorEE6TensorNSt6vectorI6TensorEE6Tensor6Tensor", "compact_indices_cuda"], [13, 2, 1, "_CPPv420compact_indices_cudaNSt6vectorI6TensorEE6TensorNSt6vectorI6TensorEE6Tensor6Tensor", "compact_indices_cuda::compact_count"], [13, 2, 1, "_CPPv420compact_indices_cudaNSt6vectorI6TensorEE6TensorNSt6vectorI6TensorEE6Tensor6Tensor", "compact_indices_cuda::compact_indices"], [13, 2, 1, "_CPPv420compact_indices_cudaNSt6vectorI6TensorEE6TensorNSt6vectorI6TensorEE6Tensor6Tensor", "compact_indices_cuda::count"], [13, 2, 1, "_CPPv420compact_indices_cudaNSt6vectorI6TensorEE6TensorNSt6vectorI6TensorEE6Tensor6Tensor", "compact_indices_cuda::indices"], [13, 2, 1, "_CPPv420compact_indices_cudaNSt6vectorI6TensorEE6TensorNSt6vectorI6TensorEE6Tensor6Tensor", "compact_indices_cuda::masks"], [13, 1, 1, "_CPPv418cuda_callback_func12cudaStream_t11cudaError_tPv", "cuda_callback_func"], [13, 2, 1, "_CPPv418cuda_callback_func12cudaStream_t11cudaError_tPv", "cuda_callback_func::functor"], [13, 2, 1, "_CPPv418cuda_callback_func12cudaStream_t11cudaError_tPv", "cuda_callback_func::status"], [13, 2, 1, "_CPPv418cuda_callback_func12cudaStream_t11cudaError_tPv", "cuda_callback_func::stream"], [6, 1, 1, "_CPPv415dense_to_jaggedRK6TensorRKNSt6vectorI6TensorEENSt8optionalIN2at6SymIntEEE", "dense_to_jagged"], [6, 2, 1, "_CPPv415dense_to_jaggedRK6TensorRKNSt6vectorI6TensorEENSt8optionalIN2at6SymIntEEE", "dense_to_jagged::dense"], [6, 2, 1, "_CPPv415dense_to_jaggedRK6TensorRKNSt6vectorI6TensorEENSt8optionalIN2at6SymIntEEE", "dense_to_jagged::offsets"], [6, 2, 1, "_CPPv415dense_to_jaggedRK6TensorRKNSt6vectorI6TensorEENSt8optionalIN2at6SymIntEEE", "dense_to_jagged::total_L"], [12, 1, 1, "_CPPv435direct_mapped_lxu_cache_lookup_cudaN2at6TensorEN2at6TensorE7int64_tbNSt8optionalIN2at6TensorEEE", "direct_mapped_lxu_cache_lookup_cuda"], [12, 2, 1, "_CPPv435direct_mapped_lxu_cache_lookup_cudaN2at6TensorEN2at6TensorE7int64_tbNSt8optionalIN2at6TensorEEE", "direct_mapped_lxu_cache_lookup_cuda::gather_cache_stats"], [12, 2, 1, "_CPPv435direct_mapped_lxu_cache_lookup_cudaN2at6TensorEN2at6TensorE7int64_tbNSt8optionalIN2at6TensorEEE", "direct_mapped_lxu_cache_lookup_cuda::invalid_index"], [12, 2, 1, "_CPPv435direct_mapped_lxu_cache_lookup_cudaN2at6TensorEN2at6TensorE7int64_tbNSt8optionalIN2at6TensorEEE", "direct_mapped_lxu_cache_lookup_cuda::linear_cache_indices"], [12, 2, 1, "_CPPv435direct_mapped_lxu_cache_lookup_cudaN2at6TensorEN2at6TensorE7int64_tbNSt8optionalIN2at6TensorEEE", "direct_mapped_lxu_cache_lookup_cuda::lxu_cache_state"], [12, 2, 1, "_CPPv435direct_mapped_lxu_cache_lookup_cudaN2at6TensorEN2at6TensorE7int64_tbNSt8optionalIN2at6TensorEEE", "direct_mapped_lxu_cache_lookup_cuda::uvm_cache_stats"], [29, 1, 1, "_CPPv4I0_NSt6size_tEE14example_method7int32_t1Tf", "example_method"], [29, 3, 1, "_CPPv4I0_NSt6size_tEE14example_method7int32_t1Tf", "example_method::Alignment"], [29, 3, 1, "_CPPv4I0_NSt6size_tEE14example_method7int32_t1Tf", "example_method::T"], [29, 2, 1, "_CPPv4I0_NSt6size_tEE14example_method7int32_t1Tf", "example_method::param1"], [29, 2, 1, "_CPPv4I0_NSt6size_tEE14example_method7int32_t1Tf", "example_method::param2"], [11, 1, 1, "_CPPv431expand_into_jagged_permute_cudaRKN2at6TensorERKN2at6TensorERKN2at6TensorE7int64_t", "expand_into_jagged_permute_cuda"], [11, 2, 1, "_CPPv431expand_into_jagged_permute_cudaRKN2at6TensorERKN2at6TensorERKN2at6TensorE7int64_t", "expand_into_jagged_permute_cuda::input_offsets"], [11, 2, 1, "_CPPv431expand_into_jagged_permute_cudaRKN2at6TensorERKN2at6TensorERKN2at6TensorE7int64_t", "expand_into_jagged_permute_cuda::output_offsets"], [11, 2, 1, "_CPPv431expand_into_jagged_permute_cudaRKN2at6TensorERKN2at6TensorERKN2at6TensorE7int64_t", "expand_into_jagged_permute_cuda::output_size"], [11, 2, 1, "_CPPv431expand_into_jagged_permute_cudaRKN2at6TensorERKN2at6TensorERKN2at6TensorE7int64_t", "expand_into_jagged_permute_cuda::permute"], [10, 1, 1, "_CPPv437float_or_half_to_fused8bitrowwise_cpuRK6Tensor", "float_or_half_to_fused8bitrowwise_cpu"], [10, 2, 1, "_CPPv437float_or_half_to_fused8bitrowwise_cpuRK6Tensor", "float_or_half_to_fused8bitrowwise_cpu::input"], [10, 1, 1, "_CPPv423float_to_FP8rowwise_cpuRK6Tensorb", "float_to_FP8rowwise_cpu"], [10, 2, 1, "_CPPv423float_to_FP8rowwise_cpuRK6Tensorb", "float_to_FP8rowwise_cpu::forward"], [10, 2, 1, "_CPPv423float_to_FP8rowwise_cpuRK6Tensorb", "float_to_FP8rowwise_cpu::input"], [10, 1, 1, "_CPPv429float_to_fused8bitrowwise_cpuRK6Tensor", "float_to_fused8bitrowwise_cpu"], [10, 2, 1, "_CPPv429float_to_fused8bitrowwise_cpuRK6Tensor", "float_to_fused8bitrowwise_cpu::input"], [10, 1, 1, "_CPPv429fused8bitrowwise_to_float_cpuRK6Tensor", "fused8bitrowwise_to_float_cpu"], [10, 2, 1, "_CPPv429fused8bitrowwise_to_float_cpuRK6Tensor", "fused8bitrowwise_to_float_cpu::input"], [10, 1, 1, "_CPPv437fused8bitrowwise_to_float_or_half_cpuRK6TensorK7int64_tKbKb", "fused8bitrowwise_to_float_or_half_cpu"], [10, 2, 1, "_CPPv437fused8bitrowwise_to_float_or_half_cpuRK6TensorK7int64_tKbKb", "fused8bitrowwise_to_float_or_half_cpu::input"], [10, 2, 1, "_CPPv437fused8bitrowwise_to_float_or_half_cpuRK6TensorK7int64_tKbKb", "fused8bitrowwise_to_float_or_half_cpu::output_dtype"], [10, 2, 1, "_CPPv437fused8bitrowwise_to_float_or_half_cpuRK6TensorK7int64_tKbKb", "fused8bitrowwise_to_float_or_half_cpu::quant_padding_float_type"], [10, 2, 1, "_CPPv437fused8bitrowwise_to_float_or_half_cpuRK6TensorK7int64_tKbKb", "fused8bitrowwise_to_float_or_half_cpu::scale_bias_last"], [10, 1, 1, "_CPPv428fused8bitrowwise_to_half_cpuRK6Tensor", "fused8bitrowwise_to_half_cpu"], [10, 2, 1, "_CPPv428fused8bitrowwise_to_half_cpuRK6Tensor", "fused8bitrowwise_to_half_cpu::input"], [10, 1, 1, "_CPPv437fusednbitrowwise_sbfront_to_float_cpuRK6TensorK7int64_t", "fusednbitrowwise_sbfront_to_float_cpu"], [10, 2, 1, "_CPPv437fusednbitrowwise_sbfront_to_float_cpuRK6TensorK7int64_t", "fusednbitrowwise_sbfront_to_float_cpu::bit_rate"], [10, 2, 1, "_CPPv437fusednbitrowwise_sbfront_to_float_cpuRK6TensorK7int64_t", "fusednbitrowwise_sbfront_to_float_cpu::input"], [10, 1, 1, "_CPPv429fusednbitrowwise_to_float_cpuRK6TensorK7int64_t", "fusednbitrowwise_to_float_cpu"], [10, 2, 1, "_CPPv429fusednbitrowwise_to_float_cpuRK6TensorK7int64_t", "fusednbitrowwise_to_float_cpu::bit_rate"], [10, 2, 1, "_CPPv429fusednbitrowwise_to_float_cpuRK6TensorK7int64_t", "fusednbitrowwise_to_float_cpu::input"], [10, 1, 1, "_CPPv437fusednbitrowwise_to_float_or_half_cpuRK6TensorK7int64_tK7int64_t", "fusednbitrowwise_to_float_or_half_cpu"], [10, 2, 1, "_CPPv437fusednbitrowwise_to_float_or_half_cpuRK6TensorK7int64_tK7int64_t", "fusednbitrowwise_to_float_or_half_cpu::bit_rate"], [10, 2, 1, "_CPPv437fusednbitrowwise_to_float_or_half_cpuRK6TensorK7int64_tK7int64_t", "fusednbitrowwise_to_float_or_half_cpu::input"], [10, 2, 1, "_CPPv437fusednbitrowwise_to_float_or_half_cpuRK6TensorK7int64_tK7int64_t", "fusednbitrowwise_to_float_or_half_cpu::output_dtype"], [10, 1, 1, "_CPPv428fusednbitrowwise_to_half_cpuRK6TensorK7int64_t", "fusednbitrowwise_to_half_cpu"], [10, 2, 1, "_CPPv428fusednbitrowwise_to_half_cpuRK6TensorK7int64_t", "fusednbitrowwise_to_half_cpu::bit_rate"], [10, 2, 1, "_CPPv428fusednbitrowwise_to_half_cpuRK6TensorK7int64_t", "fusednbitrowwise_to_half_cpu::input"], [11, 1, 1, "_CPPv452generic_histogram_binning_calibration_by_feature_cpuRKN2at6TensorERKN2at6TensorERKN2at6TensorE7int64_tRKN2at6TensorERKN2at6TensorERKN2at6TensorEd7int64_td", "generic_histogram_binning_calibration_by_feature_cpu"], [11, 2, 1, "_CPPv452generic_histogram_binning_calibration_by_feature_cpuRKN2at6TensorERKN2at6TensorERKN2at6TensorE7int64_tRKN2at6TensorERKN2at6TensorERKN2at6TensorEd7int64_td", "generic_histogram_binning_calibration_by_feature_cpu::bin_boundaries"], [11, 2, 1, "_CPPv452generic_histogram_binning_calibration_by_feature_cpuRKN2at6TensorERKN2at6TensorERKN2at6TensorE7int64_tRKN2at6TensorERKN2at6TensorERKN2at6TensorEd7int64_td", "generic_histogram_binning_calibration_by_feature_cpu::bin_ctr_in_use_after"], [11, 2, 1, "_CPPv452generic_histogram_binning_calibration_by_feature_cpuRKN2at6TensorERKN2at6TensorERKN2at6TensorE7int64_tRKN2at6TensorERKN2at6TensorERKN2at6TensorEd7int64_td", "generic_histogram_binning_calibration_by_feature_cpu::bin_ctr_weight_value"], [11, 2, 1, "_CPPv452generic_histogram_binning_calibration_by_feature_cpuRKN2at6TensorERKN2at6TensorERKN2at6TensorE7int64_tRKN2at6TensorERKN2at6TensorERKN2at6TensorEd7int64_td", "generic_histogram_binning_calibration_by_feature_cpu::bin_num_examples"], [11, 2, 1, "_CPPv452generic_histogram_binning_calibration_by_feature_cpuRKN2at6TensorERKN2at6TensorERKN2at6TensorE7int64_tRKN2at6TensorERKN2at6TensorERKN2at6TensorEd7int64_td", "generic_histogram_binning_calibration_by_feature_cpu::bin_num_positives"], [11, 2, 1, "_CPPv452generic_histogram_binning_calibration_by_feature_cpuRKN2at6TensorERKN2at6TensorERKN2at6TensorE7int64_tRKN2at6TensorERKN2at6TensorERKN2at6TensorEd7int64_td", "generic_histogram_binning_calibration_by_feature_cpu::logit"], [11, 2, 1, "_CPPv452generic_histogram_binning_calibration_by_feature_cpuRKN2at6TensorERKN2at6TensorERKN2at6TensorE7int64_tRKN2at6TensorERKN2at6TensorERKN2at6TensorEd7int64_td", "generic_histogram_binning_calibration_by_feature_cpu::num_segments"], [11, 2, 1, "_CPPv452generic_histogram_binning_calibration_by_feature_cpuRKN2at6TensorERKN2at6TensorERKN2at6TensorE7int64_tRKN2at6TensorERKN2at6TensorERKN2at6TensorEd7int64_td", "generic_histogram_binning_calibration_by_feature_cpu::positive_weight"], [11, 2, 1, "_CPPv452generic_histogram_binning_calibration_by_feature_cpuRKN2at6TensorERKN2at6TensorERKN2at6TensorE7int64_tRKN2at6TensorERKN2at6TensorERKN2at6TensorEd7int64_td", "generic_histogram_binning_calibration_by_feature_cpu::segment_lengths"], [11, 2, 1, "_CPPv452generic_histogram_binning_calibration_by_feature_cpuRKN2at6TensorERKN2at6TensorERKN2at6TensorE7int64_tRKN2at6TensorERKN2at6TensorERKN2at6TensorEd7int64_td", "generic_histogram_binning_calibration_by_feature_cpu::segment_value"], [12, 1, 1, "_CPPv423get_unique_indices_cudaRKN2at6TensorEK7int64_tKb", "get_unique_indices_cuda"], [12, 2, 1, "_CPPv423get_unique_indices_cudaRKN2at6TensorEK7int64_tKb", "get_unique_indices_cuda::compute_count"], [12, 2, 1, "_CPPv423get_unique_indices_cudaRKN2at6TensorEK7int64_tKb", "get_unique_indices_cuda::linear_indices"], [12, 2, 1, "_CPPv423get_unique_indices_cudaRKN2at6TensorEK7int64_tKb", "get_unique_indices_cuda::max_indices"], [12, 1, 1, "_CPPv436get_unique_indices_with_inverse_cudaRKN2at6TensorEK7int64_tKbKb", "get_unique_indices_with_inverse_cuda"], [12, 2, 1, "_CPPv436get_unique_indices_with_inverse_cudaRKN2at6TensorEK7int64_tKbKb", "get_unique_indices_with_inverse_cuda::compute_count"], [12, 2, 1, "_CPPv436get_unique_indices_with_inverse_cudaRKN2at6TensorEK7int64_tKbKb", "get_unique_indices_with_inverse_cuda::compute_inverse_indices"], [12, 2, 1, "_CPPv436get_unique_indices_with_inverse_cudaRKN2at6TensorEK7int64_tKbKb", "get_unique_indices_with_inverse_cuda::linear_indices"], [12, 2, 1, "_CPPv436get_unique_indices_with_inverse_cudaRKN2at6TensorEK7int64_tKbKb", "get_unique_indices_with_inverse_cuda::max_indices"], [4, 1, 1, "_CPPv415gqa_attn_splitkRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorEKdK7int64_tK7int64_tKbK7int64_t", "gqa_attn_splitk"], [4, 2, 1, "_CPPv415gqa_attn_splitkRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorEKdK7int64_tK7int64_tKbK7int64_t", "gqa_attn_splitk::XQ"], [4, 2, 1, "_CPPv415gqa_attn_splitkRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorEKdK7int64_tK7int64_tKbK7int64_t", "gqa_attn_splitk::cache_K"], [4, 2, 1, "_CPPv415gqa_attn_splitkRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorEKdK7int64_tK7int64_tKbK7int64_t", "gqa_attn_splitk::cache_V"], [4, 2, 1, "_CPPv415gqa_attn_splitkRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorEKdK7int64_tK7int64_tKbK7int64_t", "gqa_attn_splitk::cache_logical_dtype_int"], [4, 2, 1, "_CPPv415gqa_attn_splitkRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorEKdK7int64_tK7int64_tKbK7int64_t", "gqa_attn_splitk::kv_cache_quant_num_groups"], [4, 2, 1, "_CPPv415gqa_attn_splitkRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorEKdK7int64_tK7int64_tKbK7int64_t", "gqa_attn_splitk::num_split_ks"], [4, 2, 1, "_CPPv415gqa_attn_splitkRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorEKdK7int64_tK7int64_tKbK7int64_t", "gqa_attn_splitk::qk_scale"], [4, 2, 1, "_CPPv415gqa_attn_splitkRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorEKdK7int64_tK7int64_tKbK7int64_t", "gqa_attn_splitk::seq_positions"], [4, 2, 1, "_CPPv415gqa_attn_splitkRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorEKdK7int64_tK7int64_tKbK7int64_t", "gqa_attn_splitk::use_tensor_cores"], [10, 1, 1, "_CPPv428half_to_fused8bitrowwise_cpuRK6Tensor", "half_to_fused8bitrowwise_cpu"], [10, 2, 1, "_CPPv428half_to_fused8bitrowwise_cpuRK6Tensor", "half_to_fused8bitrowwise_cpu::input"], [13, 1, 1, "_CPPv410hash_shard7int64_t6size_t", "hash_shard"], [13, 2, 1, "_CPPv410hash_shard7int64_t6size_t", "hash_shard::id"], [13, 2, 1, "_CPPv410hash_shard7int64_t6size_t", "hash_shard::num_shards"], [12, 1, 1, "_CPPv419host_lxu_cache_slot7int64_t7int64_t", "host_lxu_cache_slot"], [12, 2, 1, "_CPPv419host_lxu_cache_slot7int64_t7int64_t", "host_lxu_cache_slot::C"], [12, 2, 1, "_CPPv419host_lxu_cache_slot7int64_t7int64_t", "host_lxu_cache_slot::h_in"], [3, 1, 1, "_CPPv448int_nbit_split_embedding_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function"], [3, 2, 1, "_CPPv448int_nbit_split_embedding_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function::D_offsets"], [3, 2, 1, "_CPPv448int_nbit_split_embedding_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function::dev_weights"], [3, 2, 1, "_CPPv448int_nbit_split_embedding_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function::fp8_exponent_bias"], [3, 2, 1, "_CPPv448int_nbit_split_embedding_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function::fp8_exponent_bits"], [3, 2, 1, "_CPPv448int_nbit_split_embedding_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function::indice_weights"], [3, 2, 1, "_CPPv448int_nbit_split_embedding_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function::indices"], [3, 2, 1, "_CPPv448int_nbit_split_embedding_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function::lxu_cache_locations"], [3, 2, 1, "_CPPv448int_nbit_split_embedding_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function::lxu_cache_weights"], [3, 2, 1, "_CPPv448int_nbit_split_embedding_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function::max_float16_D"], [3, 2, 1, "_CPPv448int_nbit_split_embedding_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function::max_float32_D"], [3, 2, 1, "_CPPv448int_nbit_split_embedding_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function::max_float8_D"], [3, 2, 1, "_CPPv448int_nbit_split_embedding_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function::max_int2_D"], [3, 2, 1, "_CPPv448int_nbit_split_embedding_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function::max_int4_D"], [3, 2, 1, "_CPPv448int_nbit_split_embedding_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function::max_int8_D"], [3, 2, 1, "_CPPv448int_nbit_split_embedding_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function::offsets"], [3, 2, 1, "_CPPv448int_nbit_split_embedding_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function::output_dtype"], [3, 2, 1, "_CPPv448int_nbit_split_embedding_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function::pooling_mode"], [3, 2, 1, "_CPPv448int_nbit_split_embedding_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function::row_alignment"], [3, 2, 1, "_CPPv448int_nbit_split_embedding_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function::total_D"], [3, 2, 1, "_CPPv448int_nbit_split_embedding_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function::uvm_weights"], [3, 2, 1, "_CPPv448int_nbit_split_embedding_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function::weights_offsets"], [3, 2, 1, "_CPPv448int_nbit_split_embedding_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function::weights_placements"], [3, 2, 1, "_CPPv448int_nbit_split_embedding_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function::weights_tys"], [3, 1, 1, "_CPPv452int_nbit_split_embedding_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function_cpu"], [3, 2, 1, "_CPPv452int_nbit_split_embedding_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function_cpu::D_offsets"], [3, 2, 1, "_CPPv452int_nbit_split_embedding_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function_cpu::dev_weights"], [3, 2, 1, "_CPPv452int_nbit_split_embedding_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function_cpu::fp8_exponent_bias"], [3, 2, 1, "_CPPv452int_nbit_split_embedding_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function_cpu::fp8_exponent_bits"], [3, 2, 1, "_CPPv452int_nbit_split_embedding_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function_cpu::indice_weights"], [3, 2, 1, "_CPPv452int_nbit_split_embedding_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function_cpu::indices"], [3, 2, 1, "_CPPv452int_nbit_split_embedding_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function_cpu::lxu_cache_locations"], [3, 2, 1, "_CPPv452int_nbit_split_embedding_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function_cpu::lxu_cache_weights"], [3, 2, 1, "_CPPv452int_nbit_split_embedding_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function_cpu::max_float16_D"], [3, 2, 1, "_CPPv452int_nbit_split_embedding_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function_cpu::max_float32_D"], [3, 2, 1, "_CPPv452int_nbit_split_embedding_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function_cpu::max_float8_D"], [3, 2, 1, "_CPPv452int_nbit_split_embedding_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function_cpu::max_int2_D"], [3, 2, 1, "_CPPv452int_nbit_split_embedding_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function_cpu::max_int4_D"], [3, 2, 1, "_CPPv452int_nbit_split_embedding_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function_cpu::max_int8_D"], [3, 2, 1, "_CPPv452int_nbit_split_embedding_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function_cpu::offsets"], [3, 2, 1, "_CPPv452int_nbit_split_embedding_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function_cpu::output_dtype"], [3, 2, 1, "_CPPv452int_nbit_split_embedding_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function_cpu::pooling_mode"], [3, 2, 1, "_CPPv452int_nbit_split_embedding_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function_cpu::row_alignment"], [3, 2, 1, "_CPPv452int_nbit_split_embedding_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function_cpu::total_D"], [3, 2, 1, "_CPPv452int_nbit_split_embedding_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function_cpu::uvm_weights"], [3, 2, 1, "_CPPv452int_nbit_split_embedding_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function_cpu::weights_offsets"], [3, 2, 1, "_CPPv452int_nbit_split_embedding_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function_cpu::weights_placements"], [3, 2, 1, "_CPPv452int_nbit_split_embedding_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function_cpu::weights_tys"], [3, 1, 1, "_CPPv460int_nbit_split_embedding_uvm_caching_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function"], [3, 2, 1, "_CPPv460int_nbit_split_embedding_uvm_caching_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function::D_offsets"], [3, 2, 1, "_CPPv460int_nbit_split_embedding_uvm_caching_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function::cache_hash_size_cumsum"], [3, 2, 1, "_CPPv460int_nbit_split_embedding_uvm_caching_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function::cache_index_table_map"], [3, 2, 1, "_CPPv460int_nbit_split_embedding_uvm_caching_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function::dev_weights"], [3, 2, 1, "_CPPv460int_nbit_split_embedding_uvm_caching_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function::fp8_exponent_bias"], [3, 2, 1, "_CPPv460int_nbit_split_embedding_uvm_caching_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function::fp8_exponent_bits"], [3, 2, 1, "_CPPv460int_nbit_split_embedding_uvm_caching_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function::indice_weights"], [3, 2, 1, "_CPPv460int_nbit_split_embedding_uvm_caching_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function::indices"], [3, 2, 1, "_CPPv460int_nbit_split_embedding_uvm_caching_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function::lxu_cache_locations"], [3, 2, 1, "_CPPv460int_nbit_split_embedding_uvm_caching_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function::lxu_cache_state"], [3, 2, 1, "_CPPv460int_nbit_split_embedding_uvm_caching_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function::lxu_cache_weights"], [3, 2, 1, "_CPPv460int_nbit_split_embedding_uvm_caching_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function::lxu_state"], [3, 2, 1, "_CPPv460int_nbit_split_embedding_uvm_caching_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function::max_float16_D"], [3, 2, 1, "_CPPv460int_nbit_split_embedding_uvm_caching_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function::max_float32_D"], [3, 2, 1, "_CPPv460int_nbit_split_embedding_uvm_caching_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function::max_float8_D"], [3, 2, 1, "_CPPv460int_nbit_split_embedding_uvm_caching_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function::max_int2_D"], [3, 2, 1, "_CPPv460int_nbit_split_embedding_uvm_caching_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function::max_int4_D"], [3, 2, 1, "_CPPv460int_nbit_split_embedding_uvm_caching_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function::max_int8_D"], [3, 2, 1, "_CPPv460int_nbit_split_embedding_uvm_caching_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function::offsets"], [3, 2, 1, "_CPPv460int_nbit_split_embedding_uvm_caching_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function::output_dtype"], [3, 2, 1, "_CPPv460int_nbit_split_embedding_uvm_caching_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function::pooling_mode"], [3, 2, 1, "_CPPv460int_nbit_split_embedding_uvm_caching_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function::row_alignment"], [3, 2, 1, "_CPPv460int_nbit_split_embedding_uvm_caching_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function::total_D"], [3, 2, 1, "_CPPv460int_nbit_split_embedding_uvm_caching_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function::total_cache_hash_size"], [3, 2, 1, "_CPPv460int_nbit_split_embedding_uvm_caching_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function::uvm_weights"], [3, 2, 1, "_CPPv460int_nbit_split_embedding_uvm_caching_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function::weights_offsets"], [3, 2, 1, "_CPPv460int_nbit_split_embedding_uvm_caching_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function::weights_placements"], [3, 2, 1, "_CPPv460int_nbit_split_embedding_uvm_caching_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function::weights_tys"], [3, 1, 1, "_CPPv464int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu"], [3, 2, 1, "_CPPv464int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu::D_offsets"], [3, 2, 1, "_CPPv464int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu::cache_hash_size_cumsum"], [3, 2, 1, "_CPPv464int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu::cache_index_table_map"], [3, 2, 1, "_CPPv464int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu::dev_weights"], [3, 2, 1, "_CPPv464int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu::fp8_exponent_bias"], [3, 2, 1, "_CPPv464int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu::fp8_exponent_bits"], [3, 2, 1, "_CPPv464int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu::indice_weights"], [3, 2, 1, "_CPPv464int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu::indices"], [3, 2, 1, "_CPPv464int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu::lxu_cache_locations"], [3, 2, 1, "_CPPv464int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu::lxu_cache_state"], [3, 2, 1, "_CPPv464int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu::lxu_cache_weights"], [3, 2, 1, "_CPPv464int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu::lxu_state"], [3, 2, 1, "_CPPv464int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu::max_float16_D"], [3, 2, 1, "_CPPv464int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu::max_float32_D"], [3, 2, 1, "_CPPv464int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu::max_float8_D"], [3, 2, 1, "_CPPv464int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu::max_int2_D"], [3, 2, 1, "_CPPv464int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu::max_int4_D"], [3, 2, 1, "_CPPv464int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu::max_int8_D"], [3, 2, 1, "_CPPv464int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu::offsets"], [3, 2, 1, "_CPPv464int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu::output_dtype"], [3, 2, 1, "_CPPv464int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu::pooling_mode"], [3, 2, 1, "_CPPv464int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu::row_alignment"], [3, 2, 1, "_CPPv464int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu::total_D"], [3, 2, 1, "_CPPv464int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu::total_cache_hash_size"], [3, 2, 1, "_CPPv464int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu::uvm_weights"], [3, 2, 1, "_CPPv464int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu::weights_offsets"], [3, 2, 1, "_CPPv464int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu::weights_placements"], [3, 2, 1, "_CPPv464int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu::weights_tys"], [8, 1, 1, "_CPPv413is_uvm_tensorRK6Tensor", "is_uvm_tensor"], [8, 2, 1, "_CPPv413is_uvm_tensorRK6Tensor", "is_uvm_tensor::self"], [6, 1, 1, "_CPPv418jagged_1d_to_dense6Tensor6TensorN3c106SymIntE7int64_t", "jagged_1d_to_dense"], [6, 2, 1, "_CPPv418jagged_1d_to_dense6Tensor6TensorN3c106SymIntE7int64_t", "jagged_1d_to_dense::max_L"], [6, 2, 1, "_CPPv418jagged_1d_to_dense6Tensor6TensorN3c106SymIntE7int64_t", "jagged_1d_to_dense::offsets"], [6, 2, 1, "_CPPv418jagged_1d_to_dense6Tensor6TensorN3c106SymIntE7int64_t", "jagged_1d_to_dense::padding_value"], [6, 2, 1, "_CPPv418jagged_1d_to_dense6Tensor6TensorN3c106SymIntE7int64_t", "jagged_1d_to_dense::values"], [6, 1, 1, "_CPPv418jagged_2d_to_dense6Tensor6TensorN3c106SymIntE", "jagged_2d_to_dense"], [6, 2, 1, "_CPPv418jagged_2d_to_dense6Tensor6TensorN3c106SymIntE", "jagged_2d_to_dense::max_sequence_length"], [6, 2, 1, "_CPPv418jagged_2d_to_dense6Tensor6TensorN3c106SymIntE", "jagged_2d_to_dense::offsets"], [6, 2, 1, "_CPPv418jagged_2d_to_dense6Tensor6TensorN3c106SymIntE", "jagged_2d_to_dense::values"], [6, 1, 1, "_CPPv428jagged_dense_elementwise_addRK6TensorRKNSt6vectorI6TensorEERK6Tensor", "jagged_dense_elementwise_add"], [6, 2, 1, "_CPPv428jagged_dense_elementwise_addRK6TensorRKNSt6vectorI6TensorEERK6Tensor", "jagged_dense_elementwise_add::x_offsets"], [6, 2, 1, "_CPPv428jagged_dense_elementwise_addRK6TensorRKNSt6vectorI6TensorEERK6Tensor", "jagged_dense_elementwise_add::x_values"], [6, 2, 1, "_CPPv428jagged_dense_elementwise_addRK6TensorRKNSt6vectorI6TensorEERK6Tensor", "jagged_dense_elementwise_add::y"], [6, 1, 1, "_CPPv442jagged_dense_elementwise_add_jagged_outputRK6TensorRKNSt6vectorI6TensorEERK6Tensor", "jagged_dense_elementwise_add_jagged_output"], [6, 2, 1, "_CPPv442jagged_dense_elementwise_add_jagged_outputRK6TensorRKNSt6vectorI6TensorEERK6Tensor", "jagged_dense_elementwise_add_jagged_output::x_offsets"], [6, 2, 1, "_CPPv442jagged_dense_elementwise_add_jagged_outputRK6TensorRKNSt6vectorI6TensorEERK6Tensor", "jagged_dense_elementwise_add_jagged_output::x_values"], [6, 2, 1, "_CPPv442jagged_dense_elementwise_add_jagged_outputRK6TensorRKNSt6vectorI6TensorEERK6Tensor", "jagged_dense_elementwise_add_jagged_output::y"], [6, 1, 1, "_CPPv447jagged_dense_elementwise_add_jagged_output_cudaRK6TensorRKNSt6vectorI6TensorEERK6Tensor", "jagged_dense_elementwise_add_jagged_output_cuda"], [6, 2, 1, "_CPPv447jagged_dense_elementwise_add_jagged_output_cudaRK6TensorRKNSt6vectorI6TensorEERK6Tensor", "jagged_dense_elementwise_add_jagged_output_cuda::x_offsets"], [6, 2, 1, "_CPPv447jagged_dense_elementwise_add_jagged_output_cudaRK6TensorRKNSt6vectorI6TensorEERK6Tensor", "jagged_dense_elementwise_add_jagged_output_cuda::x_values"], [6, 2, 1, "_CPPv447jagged_dense_elementwise_add_jagged_output_cudaRK6TensorRKNSt6vectorI6TensorEERK6Tensor", "jagged_dense_elementwise_add_jagged_output_cuda::y"], [6, 1, 1, "_CPPv428jagged_dense_elementwise_mulRK6TensorRKNSt6vectorI6TensorEERK6Tensor", "jagged_dense_elementwise_mul"], [6, 2, 1, "_CPPv428jagged_dense_elementwise_mulRK6TensorRKNSt6vectorI6TensorEERK6Tensor", "jagged_dense_elementwise_mul::x_offsets"], [6, 2, 1, "_CPPv428jagged_dense_elementwise_mulRK6TensorRKNSt6vectorI6TensorEERK6Tensor", "jagged_dense_elementwise_mul::x_values"], [6, 2, 1, "_CPPv428jagged_dense_elementwise_mulRK6TensorRKNSt6vectorI6TensorEERK6Tensor", "jagged_dense_elementwise_mul::y"], [6, 1, 1, "_CPPv422jagged_to_padded_denseRK6TensorRKNSt6vectorI6TensorEEKN3c1014SymIntArrayRefEKd", "jagged_to_padded_dense"], [6, 2, 1, "_CPPv422jagged_to_padded_denseRK6TensorRKNSt6vectorI6TensorEEKN3c1014SymIntArrayRefEKd", "jagged_to_padded_dense::max_lengths"], [6, 2, 1, "_CPPv422jagged_to_padded_denseRK6TensorRKNSt6vectorI6TensorEEKN3c1014SymIntArrayRefEKd", "jagged_to_padded_dense::offsets"], [6, 2, 1, "_CPPv422jagged_to_padded_denseRK6TensorRKNSt6vectorI6TensorEEKN3c1014SymIntArrayRefEKd", "jagged_to_padded_dense::padding_value"], [6, 2, 1, "_CPPv422jagged_to_padded_denseRK6TensorRKNSt6vectorI6TensorEEKN3c1014SymIntArrayRefEKd", "jagged_to_padded_dense::values"], [6, 1, 1, "_CPPv430jagged_to_padded_dense_forwardRK6TensorRKNSt6vectorI6TensorEEN3c1014SymIntArrayRefEKd", "jagged_to_padded_dense_forward"], [6, 2, 1, "_CPPv430jagged_to_padded_dense_forwardRK6TensorRKNSt6vectorI6TensorEEN3c1014SymIntArrayRefEKd", "jagged_to_padded_dense_forward::max_lengths"], [6, 2, 1, "_CPPv430jagged_to_padded_dense_forwardRK6TensorRKNSt6vectorI6TensorEEN3c1014SymIntArrayRefEKd", "jagged_to_padded_dense_forward::offsets"], [6, 2, 1, "_CPPv430jagged_to_padded_dense_forwardRK6TensorRKNSt6vectorI6TensorEEN3c1014SymIntArrayRefEKd", "jagged_to_padded_dense_forward::padding_value"], [6, 2, 1, "_CPPv430jagged_to_padded_dense_forwardRK6TensorRKNSt6vectorI6TensorEEN3c1014SymIntArrayRefEKd", "jagged_to_padded_dense_forward::values"], [13, 5, 1, "_CPPv4N5kv_db12CacheContextE", "kv_db::CacheContext"], [13, 5, 1, "_CPPv4N5kv_db13EmbeddingKVDBE", "kv_db::EmbeddingKVDB"], [13, 5, 1, "_CPPv4N5kv_db9QueueItemE", "kv_db::QueueItem"], [13, 5, 1, "_CPPv4N8l2_cache13CacheLibCacheE", "l2_cache::CacheLibCache"], [12, 1, 1, "_CPPv428linearize_cache_indices_cudaRKN2at6TensorERKN2at6TensorERKN2at6TensorERKNSt8optionalIN2at6TensorEEEK7int64_tK7int64_t", "linearize_cache_indices_cuda"], [12, 2, 1, "_CPPv428linearize_cache_indices_cudaRKN2at6TensorERKN2at6TensorERKN2at6TensorERKNSt8optionalIN2at6TensorEEEK7int64_tK7int64_t", "linearize_cache_indices_cuda::B_offsets"], [12, 2, 1, "_CPPv428linearize_cache_indices_cudaRKN2at6TensorERKN2at6TensorERKN2at6TensorERKNSt8optionalIN2at6TensorEEEK7int64_tK7int64_t", "linearize_cache_indices_cuda::cache_hash_size_cumsum"], [12, 2, 1, "_CPPv428linearize_cache_indices_cudaRKN2at6TensorERKN2at6TensorERKN2at6TensorERKNSt8optionalIN2at6TensorEEEK7int64_tK7int64_t", "linearize_cache_indices_cuda::indices"], [12, 2, 1, "_CPPv428linearize_cache_indices_cudaRKN2at6TensorERKN2at6TensorERKN2at6TensorERKNSt8optionalIN2at6TensorEEEK7int64_tK7int64_t", "linearize_cache_indices_cuda::indices_base_offset"], [12, 2, 1, "_CPPv428linearize_cache_indices_cudaRKN2at6TensorERKN2at6TensorERKN2at6TensorERKNSt8optionalIN2at6TensorEEEK7int64_tK7int64_t", "linearize_cache_indices_cuda::max_B"], [12, 2, 1, "_CPPv428linearize_cache_indices_cudaRKN2at6TensorERKN2at6TensorERKN2at6TensorERKNSt8optionalIN2at6TensorEEEK7int64_tK7int64_t", "linearize_cache_indices_cuda::offsets"], [12, 1, 1, "_CPPv441linearize_cache_indices_from_row_idx_cudaN2at6TensorEN2at6TensorEN2at6TensorE", "linearize_cache_indices_from_row_idx_cuda"], [12, 2, 1, "_CPPv441linearize_cache_indices_from_row_idx_cudaN2at6TensorEN2at6TensorEN2at6TensorE", "linearize_cache_indices_from_row_idx_cuda::cache_hash_size_cumsum"], [12, 2, 1, "_CPPv441linearize_cache_indices_from_row_idx_cudaN2at6TensorEN2at6TensorEN2at6TensorE", "linearize_cache_indices_from_row_idx_cuda::update_row_indices"], [12, 2, 1, "_CPPv441linearize_cache_indices_from_row_idx_cudaN2at6TensorEN2at6TensorEN2at6TensorE", "linearize_cache_indices_from_row_idx_cuda::update_table_indices"], [12, 1, 1, "_CPPv428lru_cache_find_uncached_cudaN2at6TensorEN2at6TensorE7int64_tN2at6TensorE7int64_tN2at6TensorEbN2at6TensorEbN2at6TensorEKb", "lru_cache_find_uncached_cuda"], [12, 2, 1, "_CPPv428lru_cache_find_uncached_cudaN2at6TensorEN2at6TensorE7int64_tN2at6TensorE7int64_tN2at6TensorEbN2at6TensorEbN2at6TensorEKb", "lru_cache_find_uncached_cuda::compute_inverse_indices"], [12, 2, 1, "_CPPv428lru_cache_find_uncached_cudaN2at6TensorEN2at6TensorE7int64_tN2at6TensorE7int64_tN2at6TensorEbN2at6TensorEbN2at6TensorEKb", "lru_cache_find_uncached_cuda::gather_cache_stats"], [12, 2, 1, "_CPPv428lru_cache_find_uncached_cudaN2at6TensorEN2at6TensorE7int64_tN2at6TensorE7int64_tN2at6TensorEbN2at6TensorEbN2at6TensorEKb", "lru_cache_find_uncached_cuda::lock_cache_line"], [12, 2, 1, "_CPPv428lru_cache_find_uncached_cudaN2at6TensorEN2at6TensorE7int64_tN2at6TensorE7int64_tN2at6TensorEbN2at6TensorEbN2at6TensorEKb", "lru_cache_find_uncached_cuda::lru_state"], [12, 2, 1, "_CPPv428lru_cache_find_uncached_cudaN2at6TensorEN2at6TensorE7int64_tN2at6TensorE7int64_tN2at6TensorEbN2at6TensorEbN2at6TensorEKb", "lru_cache_find_uncached_cuda::lxu_cache_locking_counter"], [12, 2, 1, "_CPPv428lru_cache_find_uncached_cudaN2at6TensorEN2at6TensorE7int64_tN2at6TensorE7int64_tN2at6TensorEbN2at6TensorEbN2at6TensorEKb", "lru_cache_find_uncached_cuda::lxu_cache_state"], [12, 2, 1, "_CPPv428lru_cache_find_uncached_cudaN2at6TensorEN2at6TensorE7int64_tN2at6TensorE7int64_tN2at6TensorEbN2at6TensorEbN2at6TensorEKb", "lru_cache_find_uncached_cuda::max_indices"], [12, 2, 1, "_CPPv428lru_cache_find_uncached_cudaN2at6TensorEN2at6TensorE7int64_tN2at6TensorE7int64_tN2at6TensorEbN2at6TensorEbN2at6TensorEKb", "lru_cache_find_uncached_cuda::time_stamp"], [12, 2, 1, "_CPPv428lru_cache_find_uncached_cudaN2at6TensorEN2at6TensorE7int64_tN2at6TensorE7int64_tN2at6TensorEbN2at6TensorEbN2at6TensorEKb", "lru_cache_find_uncached_cuda::unique_indices"], [12, 2, 1, "_CPPv428lru_cache_find_uncached_cudaN2at6TensorEN2at6TensorE7int64_tN2at6TensorE7int64_tN2at6TensorEbN2at6TensorEbN2at6TensorEKb", "lru_cache_find_uncached_cuda::unique_indices_length"], [12, 2, 1, "_CPPv428lru_cache_find_uncached_cudaN2at6TensorEN2at6TensorE7int64_tN2at6TensorE7int64_tN2at6TensorEbN2at6TensorEbN2at6TensorEKb", "lru_cache_find_uncached_cuda::uvm_cache_stats"], [12, 1, 1, "_CPPv420lxu_cache_flush_cudaN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorEb", "lxu_cache_flush_cuda"], [12, 2, 1, "_CPPv420lxu_cache_flush_cudaN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorEb", "lxu_cache_flush_cuda::D_offsets"], [12, 2, 1, "_CPPv420lxu_cache_flush_cudaN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorEb", "lxu_cache_flush_cuda::cache_hash_size_cumsum"], [12, 2, 1, "_CPPv420lxu_cache_flush_cudaN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorEb", "lxu_cache_flush_cuda::cache_index_table_map"], [12, 2, 1, "_CPPv420lxu_cache_flush_cudaN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorEb", "lxu_cache_flush_cuda::lxu_cache_state"], [12, 2, 1, "_CPPv420lxu_cache_flush_cudaN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorEb", "lxu_cache_flush_cuda::lxu_cache_weights"], [12, 2, 1, "_CPPv420lxu_cache_flush_cudaN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorEb", "lxu_cache_flush_cuda::stochastic_rounding"], [12, 2, 1, "_CPPv420lxu_cache_flush_cudaN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorEb", "lxu_cache_flush_cuda::total_D"], [12, 2, 1, "_CPPv420lxu_cache_flush_cudaN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorEb", "lxu_cache_flush_cuda::uvm_weights"], [12, 2, 1, "_CPPv420lxu_cache_flush_cudaN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorEb", "lxu_cache_flush_cuda::weights_offsets"], [12, 1, 1, "_CPPv431lxu_cache_locations_update_cudaN2at6TensorEN2at6TensorENSt8optionalIN2at6TensorEEE", "lxu_cache_locations_update_cuda"], [12, 2, 1, "_CPPv431lxu_cache_locations_update_cudaN2at6TensorEN2at6TensorENSt8optionalIN2at6TensorEEE", "lxu_cache_locations_update_cuda::lxu_cache_locations"], [12, 2, 1, "_CPPv431lxu_cache_locations_update_cudaN2at6TensorEN2at6TensorENSt8optionalIN2at6TensorEEE", "lxu_cache_locations_update_cuda::lxu_cache_locations_new"], [12, 2, 1, "_CPPv431lxu_cache_locations_update_cudaN2at6TensorEN2at6TensorENSt8optionalIN2at6TensorEEE", "lxu_cache_locations_update_cuda::num_uniq_cache_indices"], [12, 1, 1, "_CPPv440lxu_cache_locking_counter_decrement_cudaN2at6TensorEN2at6TensorE", "lxu_cache_locking_counter_decrement_cuda"], [12, 2, 1, "_CPPv440lxu_cache_locking_counter_decrement_cudaN2at6TensorEN2at6TensorE", "lxu_cache_locking_counter_decrement_cuda::lxu_cache_locations"], [12, 2, 1, "_CPPv440lxu_cache_locking_counter_decrement_cudaN2at6TensorEN2at6TensorE", "lxu_cache_locking_counter_decrement_cuda::lxu_cache_locking_counter"], [13, 1, 1, "_CPPv421masked_index_put_cuda6Tensor6Tensor6Tensor6TensorKbK7int64_t", "masked_index_put_cuda"], [13, 2, 1, "_CPPv421masked_index_put_cuda6Tensor6Tensor6Tensor6TensorKbK7int64_t", "masked_index_put_cuda::count"], [13, 2, 1, "_CPPv421masked_index_put_cuda6Tensor6Tensor6Tensor6TensorKbK7int64_t", "masked_index_put_cuda::indices"], [13, 2, 1, "_CPPv421masked_index_put_cuda6Tensor6Tensor6Tensor6TensorKbK7int64_t", "masked_index_put_cuda::preferred_sms"], [13, 2, 1, "_CPPv421masked_index_put_cuda6Tensor6Tensor6Tensor6TensorKbK7int64_t", "masked_index_put_cuda::self"], [13, 2, 1, "_CPPv421masked_index_put_cuda6Tensor6Tensor6Tensor6TensorKbK7int64_t", "masked_index_put_cuda::use_pipeline"], [13, 2, 1, "_CPPv421masked_index_put_cuda6Tensor6Tensor6Tensor6TensorKbK7int64_t", "masked_index_put_cuda::values"], [13, 1, 1, "_CPPv424masked_index_select_cuda6Tensor6Tensor6Tensor6TensorKbK7int64_t", "masked_index_select_cuda"], [13, 2, 1, "_CPPv424masked_index_select_cuda6Tensor6Tensor6Tensor6TensorKbK7int64_t", "masked_index_select_cuda::count"], [13, 2, 1, "_CPPv424masked_index_select_cuda6Tensor6Tensor6Tensor6TensorKbK7int64_t", "masked_index_select_cuda::indices"], [13, 2, 1, "_CPPv424masked_index_select_cuda6Tensor6Tensor6Tensor6TensorKbK7int64_t", "masked_index_select_cuda::preferred_sms"], [13, 2, 1, "_CPPv424masked_index_select_cuda6Tensor6Tensor6Tensor6TensorKbK7int64_t", "masked_index_select_cuda::self"], [13, 2, 1, "_CPPv424masked_index_select_cuda6Tensor6Tensor6Tensor6TensorKbK7int64_t", "masked_index_select_cuda::use_pipeline"], [13, 2, 1, "_CPPv424masked_index_select_cuda6Tensor6Tensor6Tensor6TensorKbK7int64_t", "masked_index_select_cuda::values"], [8, 1, 1, "_CPPv422new_host_mapped_tensorRK6TensorRKNSt6vectorINSt7int64_tEEE", "new_host_mapped_tensor"], [8, 2, 1, "_CPPv422new_host_mapped_tensorRK6TensorRKNSt6vectorINSt7int64_tEEE", "new_host_mapped_tensor::self"], [8, 2, 1, "_CPPv422new_host_mapped_tensorRK6TensorRKNSt6vectorINSt7int64_tEEE", "new_host_mapped_tensor::sizes"], [8, 1, 1, "_CPPv418new_managed_tensorRK6TensorRKNSt6vectorINSt7int64_tEEE", "new_managed_tensor"], [8, 2, 1, "_CPPv418new_managed_tensorRK6TensorRKNSt6vectorINSt7int64_tEEE", "new_managed_tensor::self"], [8, 2, 1, "_CPPv418new_managed_tensorRK6TensorRKNSt6vectorINSt7int64_tEEE", "new_managed_tensor::sizes"], [8, 1, 1, "_CPPv423new_managed_tensor_metaRK6TensorRKNSt6vectorINSt7int64_tEEE", "new_managed_tensor_meta"], [8, 2, 1, "_CPPv423new_managed_tensor_metaRK6TensorRKNSt6vectorINSt7int64_tEEE", "new_managed_tensor_meta::self"], [8, 2, 1, "_CPPv423new_managed_tensor_metaRK6TensorRKNSt6vectorINSt7int64_tEEE", "new_managed_tensor_meta::sizes"], [8, 1, 1, "_CPPv418new_unified_tensorRK6TensorRKNSt6vectorINSt7int64_tEEEb", "new_unified_tensor"], [8, 2, 1, "_CPPv418new_unified_tensorRK6TensorRKNSt6vectorINSt7int64_tEEEb", "new_unified_tensor::is_host_mapped"], [8, 2, 1, "_CPPv418new_unified_tensorRK6TensorRKNSt6vectorINSt7int64_tEEEb", "new_unified_tensor::self"], [8, 2, 1, "_CPPv418new_unified_tensorRK6TensorRKNSt6vectorINSt7int64_tEEEb", "new_unified_tensor::sizes"], [8, 1, 1, "_CPPv423new_unified_tensor_metaRK6TensorRKNSt6vectorINSt7int64_tEEEb", "new_unified_tensor_meta"], [8, 2, 1, "_CPPv423new_unified_tensor_metaRK6TensorRKNSt6vectorINSt7int64_tEEEb", "new_unified_tensor_meta::is_host_mapped"], [8, 2, 1, "_CPPv423new_unified_tensor_metaRK6TensorRKNSt6vectorINSt7int64_tEEEb", "new_unified_tensor_meta::self"], [8, 2, 1, "_CPPv423new_unified_tensor_metaRK6TensorRKNSt6vectorINSt7int64_tEEEb", "new_unified_tensor_meta::sizes"], [8, 1, 1, "_CPPv426new_vanilla_managed_tensorRK6TensorRKNSt6vectorINSt7int64_tEEE", "new_vanilla_managed_tensor"], [8, 2, 1, "_CPPv426new_vanilla_managed_tensorRK6TensorRKNSt6vectorINSt7int64_tEEE", "new_vanilla_managed_tensor::self"], [8, 2, 1, "_CPPv426new_vanilla_managed_tensorRK6TensorRKNSt6vectorINSt7int64_tEEE", "new_vanilla_managed_tensor::sizes"], [5, 1, 1, "_CPPv435padding_fused_tbe_input_combine_cpuRKNSt6vectorIN2at6TensorEEERKNSt6vectorIN2at6TensorEEERKNSt6vectorIN2at6TensorEEERKN2at6TensorE7int64_t", "padding_fused_tbe_input_combine_cpu"], [5, 2, 1, "_CPPv435padding_fused_tbe_input_combine_cpuRKNSt6vectorIN2at6TensorEEERKNSt6vectorIN2at6TensorEEERKNSt6vectorIN2at6TensorEEERKN2at6TensorE7int64_t", "padding_fused_tbe_input_combine_cpu::batch_size"], [5, 2, 1, "_CPPv435padding_fused_tbe_input_combine_cpuRKNSt6vectorIN2at6TensorEEERKNSt6vectorIN2at6TensorEEERKNSt6vectorIN2at6TensorEEERKN2at6TensorE7int64_t", "padding_fused_tbe_input_combine_cpu::include_last_offsets"], [5, 2, 1, "_CPPv435padding_fused_tbe_input_combine_cpuRKNSt6vectorIN2at6TensorEEERKNSt6vectorIN2at6TensorEEERKNSt6vectorIN2at6TensorEEERKN2at6TensorE7int64_t", "padding_fused_tbe_input_combine_cpu::indices_list"], [5, 2, 1, "_CPPv435padding_fused_tbe_input_combine_cpuRKNSt6vectorIN2at6TensorEEERKNSt6vectorIN2at6TensorEEERKNSt6vectorIN2at6TensorEEERKN2at6TensorE7int64_t", "padding_fused_tbe_input_combine_cpu::offsets_list"], [5, 2, 1, "_CPPv435padding_fused_tbe_input_combine_cpuRKNSt6vectorIN2at6TensorEEERKNSt6vectorIN2at6TensorEEERKNSt6vectorIN2at6TensorEEERKN2at6TensorE7int64_t", "padding_fused_tbe_input_combine_cpu::per_sample_weights"], [9, 1, 1, "_CPPv429permute_pooled_embs_auto_gradRK6TensorRK6TensorRK6TensorRK6TensorRK6Tensor", "permute_pooled_embs_auto_grad"], [9, 2, 1, "_CPPv429permute_pooled_embs_auto_gradRK6TensorRK6TensorRK6TensorRK6TensorRK6Tensor", "permute_pooled_embs_auto_grad::inv_offset_dim_list"], [9, 2, 1, "_CPPv429permute_pooled_embs_auto_gradRK6TensorRK6TensorRK6TensorRK6TensorRK6Tensor", "permute_pooled_embs_auto_grad::inv_permute_list"], [9, 2, 1, "_CPPv429permute_pooled_embs_auto_gradRK6TensorRK6TensorRK6TensorRK6TensorRK6Tensor", "permute_pooled_embs_auto_grad::offset_dim_list"], [9, 2, 1, "_CPPv429permute_pooled_embs_auto_gradRK6TensorRK6TensorRK6TensorRK6TensorRK6Tensor", "permute_pooled_embs_auto_grad::permute_list"], [9, 2, 1, "_CPPv429permute_pooled_embs_auto_gradRK6TensorRK6TensorRK6TensorRK6TensorRK6Tensor", "permute_pooled_embs_auto_grad::pooled_embs"], [9, 1, 1, "_CPPv433permute_pooled_embs_auto_grad_cpuRK6TensorRK6TensorRK6TensorRK6TensorRK6Tensor", "permute_pooled_embs_auto_grad_cpu"], [9, 2, 1, "_CPPv433permute_pooled_embs_auto_grad_cpuRK6TensorRK6TensorRK6TensorRK6TensorRK6Tensor", "permute_pooled_embs_auto_grad_cpu::inv_offset_dim_list"], [9, 2, 1, "_CPPv433permute_pooled_embs_auto_grad_cpuRK6TensorRK6TensorRK6TensorRK6TensorRK6Tensor", "permute_pooled_embs_auto_grad_cpu::inv_permute_list"], [9, 2, 1, "_CPPv433permute_pooled_embs_auto_grad_cpuRK6TensorRK6TensorRK6TensorRK6TensorRK6Tensor", "permute_pooled_embs_auto_grad_cpu::offset_dim_list"], [9, 2, 1, "_CPPv433permute_pooled_embs_auto_grad_cpuRK6TensorRK6TensorRK6TensorRK6TensorRK6Tensor", "permute_pooled_embs_auto_grad_cpu::permute_list"], [9, 2, 1, "_CPPv433permute_pooled_embs_auto_grad_cpuRK6TensorRK6TensorRK6TensorRK6TensorRK6Tensor", "permute_pooled_embs_auto_grad_cpu::pooled_embs"], [9, 1, 1, "_CPPv433permute_pooled_embs_auto_grad_gpuRK6TensorRK6TensorRK6TensorRK6TensorRK6Tensor", "permute_pooled_embs_auto_grad_gpu"], [9, 2, 1, "_CPPv433permute_pooled_embs_auto_grad_gpuRK6TensorRK6TensorRK6TensorRK6TensorRK6Tensor", "permute_pooled_embs_auto_grad_gpu::inv_offset_dim_list"], [9, 2, 1, "_CPPv433permute_pooled_embs_auto_grad_gpuRK6TensorRK6TensorRK6TensorRK6TensorRK6Tensor", "permute_pooled_embs_auto_grad_gpu::inv_permute_list"], [9, 2, 1, "_CPPv433permute_pooled_embs_auto_grad_gpuRK6TensorRK6TensorRK6TensorRK6TensorRK6Tensor", "permute_pooled_embs_auto_grad_gpu::offset_dim_list"], [9, 2, 1, "_CPPv433permute_pooled_embs_auto_grad_gpuRK6TensorRK6TensorRK6TensorRK6TensorRK6Tensor", "permute_pooled_embs_auto_grad_gpu::permute_list"], [9, 2, 1, "_CPPv433permute_pooled_embs_auto_grad_gpuRK6TensorRK6TensorRK6TensorRK6TensorRK6Tensor", "permute_pooled_embs_auto_grad_gpu::pooled_embs"], [9, 1, 1, "_CPPv439permute_pooled_embs_auto_grad_split_cpuRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorE", "permute_pooled_embs_auto_grad_split_cpu"], [9, 2, 1, "_CPPv439permute_pooled_embs_auto_grad_split_cpuRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorE", "permute_pooled_embs_auto_grad_split_cpu::inv_offset_dim_list"], [9, 2, 1, "_CPPv439permute_pooled_embs_auto_grad_split_cpuRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorE", "permute_pooled_embs_auto_grad_split_cpu::inv_permute_list"], [9, 2, 1, "_CPPv439permute_pooled_embs_auto_grad_split_cpuRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorE", "permute_pooled_embs_auto_grad_split_cpu::offset_dim_list"], [9, 2, 1, "_CPPv439permute_pooled_embs_auto_grad_split_cpuRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorE", "permute_pooled_embs_auto_grad_split_cpu::permute_list"], [9, 2, 1, "_CPPv439permute_pooled_embs_auto_grad_split_cpuRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorE", "permute_pooled_embs_auto_grad_split_cpu::pooled_embs"], [9, 1, 1, "_CPPv439permute_pooled_embs_auto_grad_split_gpuRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorE", "permute_pooled_embs_auto_grad_split_gpu"], [9, 2, 1, "_CPPv439permute_pooled_embs_auto_grad_split_gpuRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorE", "permute_pooled_embs_auto_grad_split_gpu::inv_offset_dim_list"], [9, 2, 1, "_CPPv439permute_pooled_embs_auto_grad_split_gpuRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorE", "permute_pooled_embs_auto_grad_split_gpu::inv_permute_list"], [9, 2, 1, "_CPPv439permute_pooled_embs_auto_grad_split_gpuRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorE", "permute_pooled_embs_auto_grad_split_gpu::offset_dim_list"], [9, 2, 1, "_CPPv439permute_pooled_embs_auto_grad_split_gpuRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorE", "permute_pooled_embs_auto_grad_split_gpu::permute_list"], [9, 2, 1, "_CPPv439permute_pooled_embs_auto_grad_split_gpuRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorE", "permute_pooled_embs_auto_grad_split_gpu::pooled_embs"], [9, 1, 1, "_CPPv428permute_pooled_embs_cpu_implRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKb", "permute_pooled_embs_cpu_impl"], [9, 2, 1, "_CPPv428permute_pooled_embs_cpu_implRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKb", "permute_pooled_embs_cpu_impl::allow_duplicates"], [9, 2, 1, "_CPPv428permute_pooled_embs_cpu_implRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKb", "permute_pooled_embs_cpu_impl::inv_offset_dim_list"], [9, 2, 1, "_CPPv428permute_pooled_embs_cpu_implRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKb", "permute_pooled_embs_cpu_impl::inv_permute_list"], [9, 2, 1, "_CPPv428permute_pooled_embs_cpu_implRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKb", "permute_pooled_embs_cpu_impl::offset_dim_list"], [9, 2, 1, "_CPPv428permute_pooled_embs_cpu_implRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKb", "permute_pooled_embs_cpu_impl::permute_list"], [9, 2, 1, "_CPPv428permute_pooled_embs_cpu_implRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKb", "permute_pooled_embs_cpu_impl::pooled_embs"], [9, 1, 1, "_CPPv429permute_pooled_embs_split_cpuRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorE", "permute_pooled_embs_split_cpu"], [9, 2, 1, "_CPPv429permute_pooled_embs_split_cpuRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorE", "permute_pooled_embs_split_cpu::inv_offset_dim_list"], [9, 2, 1, "_CPPv429permute_pooled_embs_split_cpuRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorE", "permute_pooled_embs_split_cpu::inv_permute_list"], [9, 2, 1, "_CPPv429permute_pooled_embs_split_cpuRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorE", "permute_pooled_embs_split_cpu::offset_dim_list"], [9, 2, 1, "_CPPv429permute_pooled_embs_split_cpuRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorE", "permute_pooled_embs_split_cpu::permute_list"], [9, 2, 1, "_CPPv429permute_pooled_embs_split_cpuRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorE", "permute_pooled_embs_split_cpu::pooled_embs"], [9, 1, 1, "_CPPv429permute_pooled_embs_split_gpuRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorE", "permute_pooled_embs_split_gpu"], [9, 2, 1, "_CPPv429permute_pooled_embs_split_gpuRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorE", "permute_pooled_embs_split_gpu::inv_offset_dim_list"], [9, 2, 1, "_CPPv429permute_pooled_embs_split_gpuRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorE", "permute_pooled_embs_split_gpu::inv_permute_list"], [9, 2, 1, "_CPPv429permute_pooled_embs_split_gpuRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorE", "permute_pooled_embs_split_gpu::offset_dim_list"], [9, 2, 1, "_CPPv429permute_pooled_embs_split_gpuRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorE", "permute_pooled_embs_split_gpu::permute_list"], [9, 2, 1, "_CPPv429permute_pooled_embs_split_gpuRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorE", "permute_pooled_embs_split_gpu::pooled_embs"], [3, 1, 1, "_CPPv423pruned_array_lookup_cpu6Tensor6Tensor6Tensor6Tensor", "pruned_array_lookup_cpu"], [3, 2, 1, "_CPPv423pruned_array_lookup_cpu6Tensor6Tensor6Tensor6Tensor", "pruned_array_lookup_cpu::index_remappings"], [3, 2, 1, "_CPPv423pruned_array_lookup_cpu6Tensor6Tensor6Tensor6Tensor", "pruned_array_lookup_cpu::index_remappings_offsets"], [3, 2, 1, "_CPPv423pruned_array_lookup_cpu6Tensor6Tensor6Tensor6Tensor", "pruned_array_lookup_cpu::indices"], [3, 2, 1, "_CPPv423pruned_array_lookup_cpu6Tensor6Tensor6Tensor6Tensor", "pruned_array_lookup_cpu::offsets"], [3, 1, 1, "_CPPv424pruned_array_lookup_cuda6Tensor6Tensor6Tensor6Tensor", "pruned_array_lookup_cuda"], [3, 2, 1, "_CPPv424pruned_array_lookup_cuda6Tensor6Tensor6Tensor6Tensor", "pruned_array_lookup_cuda::index_remappings"], [3, 2, 1, "_CPPv424pruned_array_lookup_cuda6Tensor6Tensor6Tensor6Tensor", "pruned_array_lookup_cuda::index_remappings_offsets"], [3, 2, 1, "_CPPv424pruned_array_lookup_cuda6Tensor6Tensor6Tensor6Tensor", "pruned_array_lookup_cuda::indices"], [3, 2, 1, "_CPPv424pruned_array_lookup_cuda6Tensor6Tensor6Tensor6Tensor", "pruned_array_lookup_cuda::offsets"], [3, 1, 1, "_CPPv436pruned_hashmap_insert_unweighted_cpu6Tensor6Tensor6Tensor6Tensor6Tensor", "pruned_hashmap_insert_unweighted_cpu"], [3, 2, 1, "_CPPv436pruned_hashmap_insert_unweighted_cpu6Tensor6Tensor6Tensor6Tensor6Tensor", "pruned_hashmap_insert_unweighted_cpu::dense_indices"], [3, 2, 1, "_CPPv436pruned_hashmap_insert_unweighted_cpu6Tensor6Tensor6Tensor6Tensor6Tensor", "pruned_hashmap_insert_unweighted_cpu::hash_table"], [3, 2, 1, "_CPPv436pruned_hashmap_insert_unweighted_cpu6Tensor6Tensor6Tensor6Tensor6Tensor", "pruned_hashmap_insert_unweighted_cpu::hash_table_offsets"], [3, 2, 1, "_CPPv436pruned_hashmap_insert_unweighted_cpu6Tensor6Tensor6Tensor6Tensor6Tensor", "pruned_hashmap_insert_unweighted_cpu::indices"], [3, 2, 1, "_CPPv436pruned_hashmap_insert_unweighted_cpu6Tensor6Tensor6Tensor6Tensor6Tensor", "pruned_hashmap_insert_unweighted_cpu::offsets"], [3, 1, 1, "_CPPv426pruned_hashmap_lookup_cuda6Tensor6Tensor6Tensor6Tensor", "pruned_hashmap_lookup_cuda"], [3, 2, 1, "_CPPv426pruned_hashmap_lookup_cuda6Tensor6Tensor6Tensor6Tensor", "pruned_hashmap_lookup_cuda::hash_table"], [3, 2, 1, "_CPPv426pruned_hashmap_lookup_cuda6Tensor6Tensor6Tensor6Tensor", "pruned_hashmap_lookup_cuda::hash_table_offsets"], [3, 2, 1, "_CPPv426pruned_hashmap_lookup_cuda6Tensor6Tensor6Tensor6Tensor", "pruned_hashmap_lookup_cuda::indices"], [3, 2, 1, "_CPPv426pruned_hashmap_lookup_cuda6Tensor6Tensor6Tensor6Tensor", "pruned_hashmap_lookup_cuda::offsets"], [3, 1, 1, "_CPPv436pruned_hashmap_lookup_unweighted_cpu6Tensor6Tensor6Tensor6Tensor", "pruned_hashmap_lookup_unweighted_cpu"], [3, 2, 1, "_CPPv436pruned_hashmap_lookup_unweighted_cpu6Tensor6Tensor6Tensor6Tensor", "pruned_hashmap_lookup_unweighted_cpu::hash_table"], [3, 2, 1, "_CPPv436pruned_hashmap_lookup_unweighted_cpu6Tensor6Tensor6Tensor6Tensor", "pruned_hashmap_lookup_unweighted_cpu::hash_table_offsets"], [3, 2, 1, "_CPPv436pruned_hashmap_lookup_unweighted_cpu6Tensor6Tensor6Tensor6Tensor", "pruned_hashmap_lookup_unweighted_cpu::indices"], [3, 2, 1, "_CPPv436pruned_hashmap_lookup_unweighted_cpu6Tensor6Tensor6Tensor6Tensor", "pruned_hashmap_lookup_unweighted_cpu::offsets"], [13, 5, 1, "_CPPv4N2ps24EmbeddingParameterServerE", "ps::EmbeddingParameterServer"], [7, 1, 1, "_CPPv432recat_embedding_grad_output_cuda6TensorRKNSt6vectorI7int64_tEE", "recat_embedding_grad_output_cuda"], [7, 2, 1, "_CPPv432recat_embedding_grad_output_cuda6TensorRKNSt6vectorI7int64_tEE", "recat_embedding_grad_output_cuda::grad_output"], [7, 2, 1, "_CPPv432recat_embedding_grad_output_cuda6TensorRKNSt6vectorI7int64_tEE", "recat_embedding_grad_output_cuda::num_features_per_rank"], [7, 1, 1, "_CPPv446recat_embedding_grad_output_mixed_D_batch_cudaRK6TensorRK6TensorRK6Tensor", "recat_embedding_grad_output_mixed_D_batch_cuda"], [7, 2, 1, "_CPPv446recat_embedding_grad_output_mixed_D_batch_cudaRK6TensorRK6TensorRK6Tensor", "recat_embedding_grad_output_mixed_D_batch_cuda::cumsum_dim_sum_per_rank"], [7, 2, 1, "_CPPv446recat_embedding_grad_output_mixed_D_batch_cudaRK6TensorRK6TensorRK6Tensor", "recat_embedding_grad_output_mixed_D_batch_cuda::dim_sum_per_rank"], [7, 2, 1, "_CPPv446recat_embedding_grad_output_mixed_D_batch_cudaRK6TensorRK6TensorRK6Tensor", "recat_embedding_grad_output_mixed_D_batch_cuda::grad_output"], [7, 1, 1, "_CPPv439recat_embedding_grad_output_mixed_D_cpuRK6TensorRKNSt6vectorI7int64_tEE", "recat_embedding_grad_output_mixed_D_cpu"], [7, 2, 1, "_CPPv439recat_embedding_grad_output_mixed_D_cpuRK6TensorRKNSt6vectorI7int64_tEE", "recat_embedding_grad_output_mixed_D_cpu::dim_sum_per_rank"], [7, 2, 1, "_CPPv439recat_embedding_grad_output_mixed_D_cpuRK6TensorRKNSt6vectorI7int64_tEE", "recat_embedding_grad_output_mixed_D_cpu::grad_output"], [7, 1, 1, "_CPPv440recat_embedding_grad_output_mixed_D_cudaRK6TensorRKNSt6vectorI7int64_tEE", "recat_embedding_grad_output_mixed_D_cuda"], [7, 2, 1, "_CPPv440recat_embedding_grad_output_mixed_D_cudaRK6TensorRKNSt6vectorI7int64_tEE", "recat_embedding_grad_output_mixed_D_cuda::dim_sum_per_rank"], [7, 2, 1, "_CPPv440recat_embedding_grad_output_mixed_D_cudaRK6TensorRKNSt6vectorI7int64_tEE", "recat_embedding_grad_output_mixed_D_cuda::grad_output"], [0, 1, 1, "_CPPv4I_b_b_23QuantizationGranularity_b_b0_bE30requantizeOutputProcessingAvx2vPNSt7uint8_tEPKNSt7int32_tERK12block_type_tiiRK22requantizationParams_tI9BIAS_TYPEE", "requantizeOutputProcessingAvx2"], [0, 3, 1, "_CPPv4I_b_b_23QuantizationGranularity_b_b0_bE30requantizeOutputProcessingAvx2vPNSt7uint8_tEPKNSt7int32_tERK12block_type_tiiRK22requantizationParams_tI9BIAS_TYPEE", "requantizeOutputProcessingAvx2::A_SYMMETRIC"], [0, 3, 1, "_CPPv4I_b_b_23QuantizationGranularity_b_b0_bE30requantizeOutputProcessingAvx2vPNSt7uint8_tEPKNSt7int32_tERK12block_type_tiiRK22requantizationParams_tI9BIAS_TYPEE", "requantizeOutputProcessingAvx2::BIAS_TYPE"], [0, 3, 1, "_CPPv4I_b_b_23QuantizationGranularity_b_b0_bE30requantizeOutputProcessingAvx2vPNSt7uint8_tEPKNSt7int32_tERK12block_type_tiiRK22requantizationParams_tI9BIAS_TYPEE", "requantizeOutputProcessingAvx2::B_SYMMETRIC"], [0, 3, 1, "_CPPv4I_b_b_23QuantizationGranularity_b_b0_bE30requantizeOutputProcessingAvx2vPNSt7uint8_tEPKNSt7int32_tERK12block_type_tiiRK22requantizationParams_tI9BIAS_TYPEE", "requantizeOutputProcessingAvx2::DIRECT"], [0, 3, 1, "_CPPv4I_b_b_23QuantizationGranularity_b_b0_bE30requantizeOutputProcessingAvx2vPNSt7uint8_tEPKNSt7int32_tERK12block_type_tiiRK22requantizationParams_tI9BIAS_TYPEE", "requantizeOutputProcessingAvx2::FUSE_RELU"], [0, 3, 1, "_CPPv4I_b_b_23QuantizationGranularity_b_b0_bE30requantizeOutputProcessingAvx2vPNSt7uint8_tEPKNSt7int32_tERK12block_type_tiiRK22requantizationParams_tI9BIAS_TYPEE", "requantizeOutputProcessingAvx2::HAS_BIAS"], [0, 3, 1, "_CPPv4I_b_b_23QuantizationGranularity_b_b0_bE30requantizeOutputProcessingAvx2vPNSt7uint8_tEPKNSt7int32_tERK12block_type_tiiRK22requantizationParams_tI9BIAS_TYPEE", "requantizeOutputProcessingAvx2::Q_GRAN"], [0, 2, 1, "_CPPv4I_b_b_23QuantizationGranularity_b_b0_bE30requantizeOutputProcessingAvx2vPNSt7uint8_tEPKNSt7int32_tERK12block_type_tiiRK22requantizationParams_tI9BIAS_TYPEE", "requantizeOutputProcessingAvx2::block"], [0, 2, 1, "_CPPv4I_b_b_23QuantizationGranularity_b_b0_bE30requantizeOutputProcessingAvx2vPNSt7uint8_tEPKNSt7int32_tERK12block_type_tiiRK22requantizationParams_tI9BIAS_TYPEE", "requantizeOutputProcessingAvx2::inp"], [0, 2, 1, "_CPPv4I_b_b_23QuantizationGranularity_b_b0_bE30requantizeOutputProcessingAvx2vPNSt7uint8_tEPKNSt7int32_tERK12block_type_tiiRK22requantizationParams_tI9BIAS_TYPEE", "requantizeOutputProcessingAvx2::ld_in"], [0, 2, 1, "_CPPv4I_b_b_23QuantizationGranularity_b_b0_bE30requantizeOutputProcessingAvx2vPNSt7uint8_tEPKNSt7int32_tERK12block_type_tiiRK22requantizationParams_tI9BIAS_TYPEE", "requantizeOutputProcessingAvx2::ld_out"], [0, 2, 1, "_CPPv4I_b_b_23QuantizationGranularity_b_b0_bE30requantizeOutputProcessingAvx2vPNSt7uint8_tEPKNSt7int32_tERK12block_type_tiiRK22requantizationParams_tI9BIAS_TYPEE", "requantizeOutputProcessingAvx2::out"], [0, 2, 1, "_CPPv4I_b_b_23QuantizationGranularity_b_b0_bE30requantizeOutputProcessingAvx2vPNSt7uint8_tEPKNSt7int32_tERK12block_type_tiiRK22requantizationParams_tI9BIAS_TYPEE", "requantizeOutputProcessingAvx2::r"], [0, 1, 1, "_CPPv4I_b_b_23QuantizationGranularity_b_b_i0E37requantizeOutputProcessingGConvAvx512vPNSt7uint8_tEPKNSt7int32_tERK12block_type_tiiRK22requantizationParams_tI9BIAS_TYPEE", "requantizeOutputProcessingGConvAvx512"], [0, 3, 1, "_CPPv4I_b_b_23QuantizationGranularity_b_b_i0E37requantizeOutputProcessingGConvAvx512vPNSt7uint8_tEPKNSt7int32_tERK12block_type_tiiRK22requantizationParams_tI9BIAS_TYPEE", "requantizeOutputProcessingGConvAvx512::A_SYMMETRIC"], [0, 3, 1, "_CPPv4I_b_b_23QuantizationGranularity_b_b_i0E37requantizeOutputProcessingGConvAvx512vPNSt7uint8_tEPKNSt7int32_tERK12block_type_tiiRK22requantizationParams_tI9BIAS_TYPEE", "requantizeOutputProcessingGConvAvx512::BIAS_TYPE"], [0, 3, 1, "_CPPv4I_b_b_23QuantizationGranularity_b_b_i0E37requantizeOutputProcessingGConvAvx512vPNSt7uint8_tEPKNSt7int32_tERK12block_type_tiiRK22requantizationParams_tI9BIAS_TYPEE", "requantizeOutputProcessingGConvAvx512::B_SYMMETRIC"], [0, 3, 1, "_CPPv4I_b_b_23QuantizationGranularity_b_b_i0E37requantizeOutputProcessingGConvAvx512vPNSt7uint8_tEPKNSt7int32_tERK12block_type_tiiRK22requantizationParams_tI9BIAS_TYPEE", "requantizeOutputProcessingGConvAvx512::C_PER_G"], [0, 3, 1, "_CPPv4I_b_b_23QuantizationGranularity_b_b_i0E37requantizeOutputProcessingGConvAvx512vPNSt7uint8_tEPKNSt7int32_tERK12block_type_tiiRK22requantizationParams_tI9BIAS_TYPEE", "requantizeOutputProcessingGConvAvx512::FUSE_RELU"], [0, 3, 1, "_CPPv4I_b_b_23QuantizationGranularity_b_b_i0E37requantizeOutputProcessingGConvAvx512vPNSt7uint8_tEPKNSt7int32_tERK12block_type_tiiRK22requantizationParams_tI9BIAS_TYPEE", "requantizeOutputProcessingGConvAvx512::HAS_BIAS"], [0, 3, 1, "_CPPv4I_b_b_23QuantizationGranularity_b_b_i0E37requantizeOutputProcessingGConvAvx512vPNSt7uint8_tEPKNSt7int32_tERK12block_type_tiiRK22requantizationParams_tI9BIAS_TYPEE", "requantizeOutputProcessingGConvAvx512::Q_GRAN"], [0, 2, 1, "_CPPv4I_b_b_23QuantizationGranularity_b_b_i0E37requantizeOutputProcessingGConvAvx512vPNSt7uint8_tEPKNSt7int32_tERK12block_type_tiiRK22requantizationParams_tI9BIAS_TYPEE", "requantizeOutputProcessingGConvAvx512::block"], [0, 2, 1, "_CPPv4I_b_b_23QuantizationGranularity_b_b_i0E37requantizeOutputProcessingGConvAvx512vPNSt7uint8_tEPKNSt7int32_tERK12block_type_tiiRK22requantizationParams_tI9BIAS_TYPEE", "requantizeOutputProcessingGConvAvx512::inp"], [0, 2, 1, "_CPPv4I_b_b_23QuantizationGranularity_b_b_i0E37requantizeOutputProcessingGConvAvx512vPNSt7uint8_tEPKNSt7int32_tERK12block_type_tiiRK22requantizationParams_tI9BIAS_TYPEE", "requantizeOutputProcessingGConvAvx512::ld_in"], [0, 2, 1, "_CPPv4I_b_b_23QuantizationGranularity_b_b_i0E37requantizeOutputProcessingGConvAvx512vPNSt7uint8_tEPKNSt7int32_tERK12block_type_tiiRK22requantizationParams_tI9BIAS_TYPEE", "requantizeOutputProcessingGConvAvx512::ld_out"], [0, 2, 1, "_CPPv4I_b_b_23QuantizationGranularity_b_b_i0E37requantizeOutputProcessingGConvAvx512vPNSt7uint8_tEPKNSt7int32_tERK12block_type_tiiRK22requantizationParams_tI9BIAS_TYPEE", "requantizeOutputProcessingGConvAvx512::out"], [0, 2, 1, "_CPPv4I_b_b_23QuantizationGranularity_b_b_i0E37requantizeOutputProcessingGConvAvx512vPNSt7uint8_tEPKNSt7int32_tERK12block_type_tiiRK22requantizationParams_tI9BIAS_TYPEE", "requantizeOutputProcessingGConvAvx512::r"], [12, 1, 1, "_CPPv426reset_weight_momentum_cudaN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_t", "reset_weight_momentum_cuda"], [12, 2, 1, "_CPPv426reset_weight_momentum_cudaN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_t", "reset_weight_momentum_cuda::D_offsets"], [12, 2, 1, "_CPPv426reset_weight_momentum_cudaN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_t", "reset_weight_momentum_cuda::buffer_ids"], [12, 2, 1, "_CPPv426reset_weight_momentum_cudaN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_t", "reset_weight_momentum_cuda::cache_hash_size_cumsum"], [12, 2, 1, "_CPPv426reset_weight_momentum_cudaN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_t", "reset_weight_momentum_cuda::dev_weights"], [12, 2, 1, "_CPPv426reset_weight_momentum_cudaN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_t", "reset_weight_momentum_cuda::logical_table_ids"], [12, 2, 1, "_CPPv426reset_weight_momentum_cudaN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_t", "reset_weight_momentum_cuda::lxu_cache_state"], [12, 2, 1, "_CPPv426reset_weight_momentum_cudaN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_t", "reset_weight_momentum_cuda::lxu_cache_weights"], [12, 2, 1, "_CPPv426reset_weight_momentum_cudaN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_t", "reset_weight_momentum_cuda::momentum1_dev"], [12, 2, 1, "_CPPv426reset_weight_momentum_cudaN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_t", "reset_weight_momentum_cuda::momentum1_offsets"], [12, 2, 1, "_CPPv426reset_weight_momentum_cudaN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_t", "reset_weight_momentum_cuda::momentum1_placements"], [12, 2, 1, "_CPPv426reset_weight_momentum_cudaN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_t", "reset_weight_momentum_cuda::momentum1_uvm"], [12, 2, 1, "_CPPv426reset_weight_momentum_cudaN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_t", "reset_weight_momentum_cuda::pruned_indices"], [12, 2, 1, "_CPPv426reset_weight_momentum_cudaN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_t", "reset_weight_momentum_cuda::pruned_indices_offsets"], [12, 2, 1, "_CPPv426reset_weight_momentum_cudaN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_t", "reset_weight_momentum_cuda::total_cache_hash_size"], [12, 2, 1, "_CPPv426reset_weight_momentum_cudaN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_t", "reset_weight_momentum_cuda::uvm_weights"], [12, 2, 1, "_CPPv426reset_weight_momentum_cudaN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_t", "reset_weight_momentum_cuda::weights_offsets"], [12, 2, 1, "_CPPv426reset_weight_momentum_cudaN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_t", "reset_weight_momentum_cuda::weights_placements"], [13, 5, 1, "_CPPv4N3ssd16EmbeddingRocksDBE", "ssd::EmbeddingRocksDB"], [13, 1, 1, "_CPPv427ssd_generate_row_addrs_cudaRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6Tensor", "ssd_generate_row_addrs_cuda"], [13, 2, 1, "_CPPv427ssd_generate_row_addrs_cudaRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6Tensor", "ssd_generate_row_addrs_cuda::assigned_cache_slots"], [13, 2, 1, "_CPPv427ssd_generate_row_addrs_cudaRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6Tensor", "ssd_generate_row_addrs_cuda::cache_set_inverse_indices"], [13, 2, 1, "_CPPv427ssd_generate_row_addrs_cudaRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6Tensor", "ssd_generate_row_addrs_cuda::cache_set_sorted_unique_indices"], [13, 2, 1, "_CPPv427ssd_generate_row_addrs_cudaRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6Tensor", "ssd_generate_row_addrs_cuda::inserted_ssd_weights"], [13, 2, 1, "_CPPv427ssd_generate_row_addrs_cudaRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6Tensor", "ssd_generate_row_addrs_cuda::linear_index_inverse_indices"], [13, 2, 1, "_CPPv427ssd_generate_row_addrs_cudaRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6Tensor", "ssd_generate_row_addrs_cuda::lxu_cache_locations"], [13, 2, 1, "_CPPv427ssd_generate_row_addrs_cudaRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6Tensor", "ssd_generate_row_addrs_cuda::lxu_cache_weights"], [13, 2, 1, "_CPPv427ssd_generate_row_addrs_cudaRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6Tensor", "ssd_generate_row_addrs_cuda::unique_indices_count_cumsum"], [13, 2, 1, "_CPPv427ssd_generate_row_addrs_cudaRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6Tensor", "ssd_generate_row_addrs_cuda::unique_indices_length"], [13, 1, 1, "_CPPv425ssd_update_row_addrs_cudaRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6Tensor", "ssd_update_row_addrs_cuda"], [13, 2, 1, "_CPPv425ssd_update_row_addrs_cudaRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6Tensor", "ssd_update_row_addrs_cuda::cache_set_inverse_indices_curr"], [13, 2, 1, "_CPPv425ssd_update_row_addrs_cudaRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6Tensor", "ssd_update_row_addrs_cuda::inserted_ssd_weights_curr_next_map"], [13, 2, 1, "_CPPv425ssd_update_row_addrs_cudaRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6Tensor", "ssd_update_row_addrs_cuda::inserted_ssd_weights_next"], [13, 2, 1, "_CPPv425ssd_update_row_addrs_cudaRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6Tensor", "ssd_update_row_addrs_cuda::linear_index_inverse_indices_curr"], [13, 2, 1, "_CPPv425ssd_update_row_addrs_cudaRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6Tensor", "ssd_update_row_addrs_cuda::lxu_cache_locations_curr"], [13, 2, 1, "_CPPv425ssd_update_row_addrs_cudaRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6Tensor", "ssd_update_row_addrs_cuda::lxu_cache_weights"], [13, 2, 1, "_CPPv425ssd_update_row_addrs_cudaRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6Tensor", "ssd_update_row_addrs_cuda::ssd_row_addrs_curr"], [13, 2, 1, "_CPPv425ssd_update_row_addrs_cudaRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6Tensor", "ssd_update_row_addrs_cuda::unique_indices_count_cumsum_curr"], [13, 2, 1, "_CPPv425ssd_update_row_addrs_cudaRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6Tensor", "ssd_update_row_addrs_cuda::unique_indices_length_curr"], [5, 1, 1, "_CPPv421tbe_input_combine_cpuRKNSt6vectorIN2at6TensorEEERKNSt6vectorIN2at6TensorEEERKNSt6vectorIN2at6TensorEEERKN2at6TensorE", "tbe_input_combine_cpu"], [5, 2, 1, "_CPPv421tbe_input_combine_cpuRKNSt6vectorIN2at6TensorEEERKNSt6vectorIN2at6TensorEEERKNSt6vectorIN2at6TensorEEERKN2at6TensorE", "tbe_input_combine_cpu::include_last_offsets"], [5, 2, 1, "_CPPv421tbe_input_combine_cpuRKNSt6vectorIN2at6TensorEEERKNSt6vectorIN2at6TensorEEERKNSt6vectorIN2at6TensorEEERKN2at6TensorE", "tbe_input_combine_cpu::indices_list"], [5, 2, 1, "_CPPv421tbe_input_combine_cpuRKNSt6vectorIN2at6TensorEEERKNSt6vectorIN2at6TensorEEERKNSt6vectorIN2at6TensorEEERKN2at6TensorE", "tbe_input_combine_cpu::offsets_list"], [5, 2, 1, "_CPPv421tbe_input_combine_cpuRKNSt6vectorIN2at6TensorEEERKNSt6vectorIN2at6TensorEEERKNSt6vectorIN2at6TensorEEERKN2at6TensorE", "tbe_input_combine_cpu::per_sample_weights"], [8, 1, 1, "_CPPv419uvm_cuda_mem_adviseRK6Tensor7int64_t", "uvm_cuda_mem_advise"], [8, 2, 1, "_CPPv419uvm_cuda_mem_adviseRK6Tensor7int64_t", "uvm_cuda_mem_advise::cuda_memory_advise"], [8, 2, 1, "_CPPv419uvm_cuda_mem_adviseRK6Tensor7int64_t", "uvm_cuda_mem_advise::self"], [8, 1, 1, "_CPPv427uvm_cuda_mem_prefetch_asyncRK6TensorNSt8optionalI6TensorEE", "uvm_cuda_mem_prefetch_async"], [8, 2, 1, "_CPPv427uvm_cuda_mem_prefetch_asyncRK6TensorNSt8optionalI6TensorEE", "uvm_cuda_mem_prefetch_async::device_t"], [8, 2, 1, "_CPPv427uvm_cuda_mem_prefetch_asyncRK6TensorNSt8optionalI6TensorEE", "uvm_cuda_mem_prefetch_async::self"], [8, 1, 1, "_CPPv424uvm_mem_advice_dont_forkRK6Tensor", "uvm_mem_advice_dont_fork"], [8, 2, 1, "_CPPv424uvm_mem_advice_dont_forkRK6Tensor", "uvm_mem_advice_dont_fork::self"], [8, 1, 1, "_CPPv411uvm_storageRK6Tensor", "uvm_storage"], [8, 2, 1, "_CPPv411uvm_storageRK6Tensor", "uvm_storage::self"], [8, 1, 1, "_CPPv410uvm_to_cpuRK6Tensor", "uvm_to_cpu"], [8, 2, 1, "_CPPv410uvm_to_cpuRK6Tensor", "uvm_to_cpu::self"], [8, 1, 1, "_CPPv416uvm_to_cpu_cloneRK6Tensor", "uvm_to_cpu_clone"], [8, 2, 1, "_CPPv416uvm_to_cpu_cloneRK6Tensor", "uvm_to_cpu_clone::self"], [8, 1, 1, "_CPPv413uvm_to_deviceRK6TensorRK6Tensor", "uvm_to_device"], [8, 2, 1, "_CPPv413uvm_to_deviceRK6TensorRK6Tensor", "uvm_to_device::prototype"], [8, 2, 1, "_CPPv413uvm_to_deviceRK6TensorRK6Tensor", "uvm_to_device::self"], [22, 6, 0, "-", "fbgemm_gpu"]], "fbgemm_gpu.docs.examples": [[31, 7, 1, "", "example_method"]], "fbgemm_gpu.permute_pooled_embedding_modules": [[19, 8, 1, "", "PermutePooledEmbeddings"]], "fbgemm_gpu.permute_pooled_embedding_modules.PermutePooledEmbeddings": [[19, 9, 1, "", "__call__"]], "fbgemm_gpu.split_table_batched_embeddings_ops_inference": [[23, 8, 1, "", "IntNBitTableBatchedEmbeddingBagsCodegen"]], "fbgemm_gpu.split_table_batched_embeddings_ops_inference.IntNBitTableBatchedEmbeddingBagsCodegen": [[23, 9, 1, "", "assign_embedding_weights"], [23, 9, 1, "", "fill_random_weights"], [23, 9, 1, "", "forward"], [23, 9, 1, "", "recompute_module_buffers"], [23, 9, 1, "", "split_embedding_weights"], [23, 9, 1, "", "split_embedding_weights_with_scale_bias"]], "fbgemm_gpu.split_table_batched_embeddings_ops_training": [[24, 8, 1, "", "SplitTableBatchedEmbeddingBagsCodegen"]], "fbgemm_gpu.split_table_batched_embeddings_ops_training.SplitTableBatchedEmbeddingBagsCodegen": [[24, 9, 1, "", "forward"], [24, 9, 1, "", "set_learning_rate"], [24, 9, 1, "", "set_optimizer_step"], [24, 9, 1, "", "split_embedding_weights"], [24, 9, 1, "", "split_optimizer_states"], [24, 9, 1, "", "update_hyper_parameters"]], "torch.ops.fbgemm": [[21, 7, 1, "", "FloatOrHalfToFusedNBitRowwiseQuantizedSBHalf"], [22, 7, 1, "", "asynchronous_complete_cumsum"], [18, 7, 1, "", "batched_dense_vec_jagged_2d_mul"], [22, 7, 1, "", "block_bucketize_sparse_features"], [18, 7, 1, "", "dense_to_jagged"], [22, 7, 1, "", "expand_into_jagged_permute"], [18, 7, 1, "", "jagged_1d_to_dense"], [18, 7, 1, "", "jagged_2d_to_dense"], [18, 7, 1, "", "jagged_dense_dense_elementwise_add_jagged_output"], [18, 7, 1, "", "jagged_dense_elementwise_add"], [18, 7, 1, "", "jagged_dense_elementwise_add_jagged_output"], [18, 7, 1, "", "jagged_dense_elementwise_mul"], [18, 7, 1, "", "jagged_to_padded_dense"], [22, 7, 1, "", "keyed_jagged_index_select_dim1"], [20, 7, 1, "", "merge_pooled_embeddings"], [22, 7, 1, "", "offsets_range"], [22, 7, 1, "", "permute_1D_sparse_data"], [22, 7, 1, "", "permute_2D_sparse_data"], [20, 7, 1, "", "permute_pooled_embs"], [22, 7, 1, "", "segment_sum_csr"], [18, 7, 1, "", "stacked_jagged_1d_to_dense"], [18, 7, 1, "", "stacked_jagged_2d_to_dense"]]}, "objtypes": {"0": "cpp:enumerator", "1": "cpp:function", "2": "cpp:functionParam", "3": "cpp:templateParam", "4": "cpp:enum", "5": "cpp:class", "6": "py:module", "7": "py:function", "8": "py:class", "9": "py:method"}, "objnames": {"0": ["cpp", "enumerator", "C++ enumerator"], "1": ["cpp", "function", "C++ function"], "2": ["cpp", "functionParam", "C++ function parameter"], "3": ["cpp", "templateParam", "C++ template parameter"], "4": ["cpp", "enum", "C++ enum"], "5": ["cpp", "class", "C++ class"], "6": ["py", "module", "Python module"], "7": ["py", "function", "Python function"], "8": ["py", "class", "Python class"], "9": ["py", "method", "Python method"]}, "titleterms": {"quantiz": [0, 10, 21], "util": 0, "refer": [0, 32], "implement": [0, 1], "method": [0, 1], "avx": 0, "2": 0, "512": 0, "tbe": [1, 23, 24], "cpu": [1, 3, 6, 7, 10, 11, 14, 15], "autovector": 1, "fp8": 1, "16": 1, "32": 1, "autovec": 1, "build": [2, 14, 30], "instruct": [2, 14, 15, 16], "fbgemm": [2, 15, 33], "requir": 2, "hardwar": 2, "softwar": 2, "depend": 2, "asmjit": 2, "cpuinfo": 2, "googletest": 2, "set": [2, 14, 15, 30], "up": [2, 14, 15, 30], "an": [2, 14], "isol": [2, 14], "environ": [2, 14, 15, 16, 30], "instal": [2, 14, 15], "tool": [2, 14], "c": [2, 14, 29, 33], "compil": [2, 14], "other": [2, 14, 18, 19, 20, 21, 23, 24, 32], "librari": [2, 15], "prepar": [2, 14], "linux": 2, "maco": 2, "cmake": 2, "gcc": [2, 14], "issu": [2, 27], "12": 2, "clang": [2, 14], "bazel": 2, "window": 2, "embed": [3, 9, 12, 13, 19, 20, 23, 24], "oper": [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 17, 18, 20, 21, 22, 33], "cuda": [3, 6, 7, 8, 10, 11, 13, 14, 15, 16], "experiment": 4, "attent": 4, "combin": [5, 17], "input": 5, "jag": [6, 17, 18], "tensor": [6, 17, 18], "layout": 7, "transform": 7, "memori": 8, "pool": [9, 19, 20], "merg": 9, "permut": 9, "spars": [11, 22], "data": 11, "tabl": [12, 15, 23, 24], "batch": [12, 23, 24], "ssd": 13, "miniconda": 14, "conda": [14, 15], "onli": [14, 15], "genai": 14, "docker": [14, 15], "imag": 14, "cudnn": 14, "cutlass": 14, "rocm": [14, 15, 16], "miopen": 14, "symlink": 14, "pytorch": [14, 15], "through": [14, 15], "pip": [14, 15], "post": [14, 15], "check": [14, 15], "triton": [14, 15], "pre": 14, "setup": [14, 16], "The": 14, "process": 14, "wheel": 14, "variabl": 14, "For": 14, "develop": [14, 33], "undefin": [14, 15], "symbol": [14, 15], "glibc": 14, "version": 14, "compat": [14, 15], "releas": 15, "nvidia": 15, "driver": 15, "contain": 15, "runtim": 15, "amdgpu": 15, "python": [15, 25, 31, 33], "fbgemm_gpu": [15, 16, 25, 30, 33], "packag": 15, "public": 15, "pypi": 15, "test": 16, "run": 16, "variant": 16, "benchmark": 16, "high": 17, "level": 17, "overview": [17, 33], "format": 17, "valu": 17, "offset": 17, "max": 17, "length": 17, "exampl": 17, "arithmet": 17, "convers": 17, "dens": 17, "stabl": [18, 19, 20, 21, 22, 23, 24, 25, 33], "api": [18, 19, 20, 21, 22, 23, 24, 25, 33], "modul": [19, 23, 24, 33], "infer": 23, "train": 24, "contact": 26, "u": 26, "github": 26, "slack": 26, "contribut": 27, "code": [27, 29, 31, 32], "conduct": 27, "pull": 27, "request": 27, "contributor": 27, "licens": [27, 28], "agreement": 27, "cla": 27, "ad": [29, 31, 32], "document": [29, 30, 31, 32, 33], "gener": [30, 31, 33], "guidelin": 30, "specif": 30, "guid": 30, "toolchain": 30, "lint": 30, "deploy": 30, "preview": 30, "todo": 31, "auto": 31, "sphinx": 32, "pointer": 32, "section": 32, "referenc": 32, "sourc": 32, "latex": 32, "graph": 32, "homepag": 33, "info": 33}, "envversion": {"sphinx.domains.c": 2, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 8, "sphinx.domains.index": 1, "sphinx.domains.javascript": 2, "sphinx.domains.math": 2, "sphinx.domains.python": 3, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.intersphinx": 1, "sphinx.ext.todo": 2, "sphinx.ext.viewcode": 1, "sphinx": 57}, "alltitles": {"Test Instructions": [[16, "test-instructions"]], "Setup the FBGEMM_GPU Test Environment": [[16, "setup-the-fbgemm-gpu-test-environment"]], "Running FBGEMM_GPU Tests": [[16, "running-fbgemm-gpu-tests"]], "Testing with the CUDA Variant": [[16, "testing-with-the-cuda-variant"]], "Testing with the ROCm Variant": [[16, "testing-with-the-rocm-variant"]], "Running FBGEMM_GPU Benchmarks": [[16, "running-fbgemm-gpu-benchmarks"]], "Jagged Tensor Operators": [[17, "jagged-tensor-operators"], [6, "jagged-tensor-operators"], [18, "module-fbgemm_gpu"]], "High Level Overview": [[17, "high-level-overview"]], "Jagged Tensor Format": [[17, "jagged-tensor-format"]], "Values": [[17, "values"]], "Offsets": [[17, "offsets"]], "Max Lengths": [[17, "max-lengths"]], "Jagged Tensor Example": [[17, "jagged-tensor-example"]], "Jagged Tensor Operations": [[17, "jagged-tensor-operations"]], "Arithmetic Operations": [[17, "arithmetic-operations"]], "Conversion Operations": [[17, "conversion-operations"]], "Jagged to Dense": [[17, "jagged-to-dense"]], "Dense to Jagged": [[17, "dense-to-jagged"]], "Combined Arithmetic + Conversion Operations": [[17, "combined-arithmetic-conversion-operations"]], "Pooled Embedding Modules": [[19, "module-fbgemm_gpu"]], "Stable API": [[19, "stable-api"], [23, "stable-api"], [24, "stable-api"], [18, "stable-api"], [20, "stable-api"], [21, "stable-api"], [22, "stable-api"]], "Other API": [[19, "other-api"], [23, "other-api"], [24, "other-api"], [18, "other-api"], [20, "other-api"], [21, "other-api"]], "Table Batched Embedding (TBE) Inference Module": [[23, "table-batched-embedding-tbe-inference-module"]], "FBGEMM_GPU Stable Python API": [[25, "fbgemm-gpu-stable-python-api"]], "Stable APIs": [[25, "stable-apis"]], "Contact Us": [[26, "contact-us"]], "GitHub": [[26, "github"]], "Slack": [[26, "slack"]], "Table Batched Embedding (TBE) Training Module": [[24, "table-batched-embedding-tbe-training-module"]], "Contributing": [[27, "contributing"]], "Code of Conduct": [[27, "code-of-conduct"]], "Pull Requests": [[27, "pull-requests"]], "Contributor License Agreement (\u201cCLA\u201d)": [[27, "contributor-license-agreement-cla"]], "Issues": [[27, "issues"]], "License": [[27, "license"], [28, "license"]], "Sphinx Documentation Pointers": [[32, "sphinx-documentation-pointers"]], "References Other Sections of the Documentation": [[32, "references-other-sections-of-the-documentation"]], "Referencing the Source Code": [[32, "referencing-the-source-code"]], "Adding LaTeX": [[32, "adding-latex"]], "Adding Graphs": [[32, "adding-graphs"]], "FBGEMM and FBGEMM_GPU Documentation Homepage": [[33, "fbgemm-and-fbgemm-gpu-documentation-homepage"]], "General Info": [[33, null]], "FBGEMM Development": [[33, null]], "FBGEMM_GPU Development": [[33, null]], "FBGEMM_GPU Overview": [[33, null]], "FBGEMM Stable API": [[33, null]], "FBGEMM C++ API": [[33, null]], "FBGEMM_GPU C++ API": [[33, null]], "FBGEMM_GPU Python Operators API": [[33, null]], "FBGEMM_GPU Python Modules API": [[33, null]], "Experimental Operators": [[4, "experimental-operators"]], "Attention Operators": [[4, "attention-operators"]], "Layout Transformation Operators": [[7, "layout-transformation-operators"]], "CUDA Operators": [[7, "cuda-operators"], [6, "cuda-operators"], [3, "cuda-operators"], [13, "cuda-operators"], [11, "cuda-operators"], [10, "cuda-operators"]], "CPU Operators": [[7, "cpu-operators"], [6, "cpu-operators"], [3, "cpu-operators"], [11, "cpu-operators"], [10, "cpu-operators"]], "Combine Input Operators": [[5, "combine-input-operators"]], "TBE CPU Autovectorization": [[1, "tbe-cpu-autovectorization"]], "FP8/16/32 Autovec Implementation Methods": [[1, "fp8-16-32-autovec-implementation-methods"]], "Embedding Operators": [[3, "embedding-operators"]], "Quantization Utilities": [[0, "quantization-utilities"]], "Reference Implementation Methods": [[0, "reference-implementation-methods"]], "AVX-2 Implementation Methods": [[0, "avx-2-implementation-methods"]], "AVX-512 Implementation Methods": [[0, "avx-512-implementation-methods"]], "Build Instructions": [[2, "build-instructions"], [14, "build-instructions"]], "FBGEMM Requirements": [[2, "fbgemm-requirements"]], "Hardware Requirements": [[2, "hardware-requirements"]], "Software Dependencies": [[2, "software-dependencies"]], "asmjit": [[2, "asmjit"]], "cpuinfo": [[2, "cpuinfo"]], "GoogleTest": [[2, "googletest"]], "Set Up an Isolated Build Environment": [[2, "set-up-an-isolated-build-environment"], [14, "set-up-an-isolated-build-environment"]], "Install the Build Tools": [[2, "install-the-build-tools"], [14, "install-the-build-tools"]], "C/C++ Compiler": [[2, "c-c-compiler"]], "Other Build Tools": [[2, "other-build-tools"], [14, "other-build-tools"]], "Build the FBGEMM Library": [[2, "build-the-fbgemm-library"]], "Preparing the Build": [[2, "preparing-the-build"], [14, "preparing-the-build"]], "Building on Linux and macOS (CMake + GCC)": [[2, "building-on-linux-and-macos-cmake-gcc"]], "Build Issues with GCC 12+": [[2, "build-issues-with-gcc-12"]], "Building on Linux and macOS (CMake + Clang)": [[2, "building-on-linux-and-macos-cmake-clang"]], "Building on Linux (Bazel)": [[2, "building-on-linux-bazel"]], "Building on Windows": [[2, "building-on-windows"]], "Installation Instructions": [[15, "installation-instructions"]], "FBGEMM Releases Compatibility Table": [[15, "fbgemm-releases-compatibility-table"]], "Set Up CPU-Only Environment": [[15, "set-up-cpu-only-environment"]], "Set Up CUDA Environment": [[15, "set-up-cuda-environment"]], "Install NVIDIA Drivers": [[15, "install-nvidia-drivers"]], "Set Up the CUDA Docker Container and Conda Environment": [[15, "set-up-the-cuda-docker-container-and-conda-environment"]], "Install the CUDA Runtime": [[15, "install-the-cuda-runtime"]], "Set Up ROCm Environment": [[15, "set-up-rocm-environment"]], "Install AMDGPU Drivers": [[15, "install-amdgpu-drivers"]], "Set Up the ROCm Docker Container and Conda Environment": [[15, "set-up-the-rocm-docker-container-and-conda-environment"]], "Install Python Libraries": [[15, "install-python-libraries"]], "Install PyTorch": [[15, "install-pytorch"], [14, "install-pytorch"]], "Install Triton": [[15, "install-triton"]], "Install the FBGEMM_GPU Package": [[15, "install-the-fbgemm-gpu-package"]], "Install through PyTorch PIP": [[15, "install-through-pytorch-pip"]], "Install through Public PyPI": [[15, "install-through-public-pypi"]], "Post-Installation Checks": [[15, "post-installation-checks"]], "Undefined Symbols": [[15, "undefined-symbols"]], "Install Miniconda": [[14, "install-miniconda"]], "Set Up the Conda Environment": [[14, "set-up-the-conda-environment"]], "Set Up for CPU-Only Build": [[14, "set-up-for-cpu-only-build"]], "Set Up for CUDA / GenAI-Only Build": [[14, "set-up-for-cuda-genai-only-build"]], "CUDA Docker Image": [[14, "cuda-docker-image"]], "Install CUDA": [[14, "install-cuda"]], "Install cuDNN": [[14, "install-cudnn"]], "Install CUTLASS": [[14, "install-cutlass"]], "Set Up for ROCm Build": [[14, "set-up-for-rocm-build"]], "ROCm Docker Image": [[14, "rocm-docker-image"]], "Install ROCm": [[14, "install-rocm"]], "Install MIOpen": [[14, "install-miopen"]], "C/C++ Compiler (GCC)": [[14, "c-c-compiler-gcc"]], "C/C++ Compiler (Clang)": [[14, "c-c-compiler-clang"]], "Compiler Symlinks": [[14, "compiler-symlinks"]], "Installation Through Conda": [[14, "installation-through-conda"]], "Installation Through PyTorch PIP": [[14, "installation-through-pytorch-pip"]], "Post-Install Checks": [[14, "post-install-checks"]], "Install PyTorch-Triton": [[14, "install-pytorch-triton"]], "Other Pre-Build Setup": [[14, "other-pre-build-setup"]], "The Build Process": [[14, "the-build-process"]], "Set Wheel Build Variables": [[14, "set-wheel-build-variables"]], "CPU-Only Build": [[14, "cpu-only-build"]], "CUDA Build": [[14, "cuda-build"]], "GenAI-Only Build": [[14, "genai-only-build"]], "ROCm Build": [[14, "rocm-build"]], "Post-Build Checks (For Developers)": [[14, "post-build-checks-for-developers"]], "Undefined Symbols Check": [[14, "undefined-symbols-check"]], "GLIBC Version Compatibility Check": [[14, "glibc-version-compatibility-check"]], "SSD Embedding Operators": [[13, "ssd-embedding-operators"]], "Table Batched Embedding Operators": [[12, "table-batched-embedding-operators"]], "Adding Documentation to Python Code": [[31, "adding-documentation-to-python-code"]], "Todo": [[31, "id1"]], "Adding Documentation to Auto-Generated Python Code": [[31, "adding-documentation-to-auto-generated-python-code"]], "Documentation": [[30, "documentation"]], "General Documentation Guidelines": [[30, "general-documentation-guidelines"]], "Specific Documentation Guides": [[30, "specific-documentation-guides"]], "Building the Documentation": [[30, "building-the-documentation"]], "Set Up Build Environment": [[30, "set-up-build-environment"]], "Build FBGEMM_GPU": [[30, "build-fbgemm-gpu"]], "Set Up the Documentation Toolchain": [[30, "set-up-the-documentation-toolchain"]], "Build the Documentation": [[30, "build-the-documentation"]], "Linting the Documentation": [[30, "linting-the-documentation"]], "Deployment Preview": [[30, "deployment-preview"]], "Adding Documentation to C++ Code": [[29, "adding-documentation-to-c-code"]], "Sparse Data Operators": [[11, "sparse-data-operators"]], "Quantization Operators": [[10, "quantization-operators"], [21, "module-fbgemm_gpu"]], "CUDA Memory Operators": [[8, "cuda-memory-operators"]], "Pooled Embeddings Operators": [[9, "pooled-embeddings-operators"]], "Merge Operators": [[9, "merge-operators"]], "Permutation Operators": [[9, "permutation-operators"]], "Pooled Embedding Operators": [[20, "module-fbgemm_gpu"]], "Sparse Operators": [[22, "module-fbgemm_gpu"]]}, "indexentries": {"findminmax (c++ function)": [[0, "_CPPv410FindMinMaxPKfPfPf7int64_t"]], "floatorhalftofusednbitrowwisequantizedsbhalf (c++ function)": [[0, "_CPPv4I0E44FloatOrHalfToFusedNBitRowwiseQuantizedSBHalfviPK9InputType6size_tiPNSt7uint8_tE"]], "fusedquantizedequantize (c++ function)": [[0, "_CPPv4I0E23FusedQuantizeDequantizevPKfPfNSt7int64_tERK24TensorQuantizationParamsiif"]], "quantizegroupwise (c++ function)": [[0, "_CPPv4I0_8layout_tE17QuantizeGroupwisevPKfiiiiPKfPKNSt7int32_tEP1T"]], "xor128 (c++ function)": [[0, "_CPPv46Xor128v"]], "requantizeoutputprocessingavx2 (c++ function)": [[0, "_CPPv4I_b_b_23QuantizationGranularity_b_b0_bE30requantizeOutputProcessingAvx2vPNSt7uint8_tEPKNSt7int32_tERK12block_type_tiiRK22requantizationParams_tI9BIAS_TYPEE"]], "requantizeoutputprocessinggconvavx512 (c++ function)": [[0, "_CPPv4I_b_b_23QuantizationGranularity_b_b_i0E37requantizeOutputProcessingGConvAvx512vPNSt7uint8_tEPKNSt7int32_tERK12block_type_tiiRK22requantizationParams_tI9BIAS_TYPEE"]], "bounds_check_indices_cuda (c++ function)": [[3, "_CPPv425bounds_check_indices_cudaR6TensorR6TensorR6Tensor7int64_tR6TensorRKNSt8optionalI6TensorEERKNSt8optionalI6TensorEEK7int64_tRKNSt8optionalI6TensorEEK7int64_tK7int64_tK6int8_t"]], "int_nbit_split_embedding_codegen_lookup_function (c++ function)": [[3, "_CPPv448int_nbit_split_embedding_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEE"]], "int_nbit_split_embedding_codegen_lookup_function_cpu (c++ function)": [[3, "_CPPv452int_nbit_split_embedding_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEE"]], "int_nbit_split_embedding_uvm_caching_codegen_lookup_function (c++ function)": [[3, "_CPPv460int_nbit_split_embedding_uvm_caching_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI6TensorEE"]], "int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu (c++ function)": [[3, "_CPPv464int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI6TensorEE"]], "pruned_array_lookup_cpu (c++ function)": [[3, "_CPPv423pruned_array_lookup_cpu6Tensor6Tensor6Tensor6Tensor"]], "pruned_array_lookup_cuda (c++ function)": [[3, "_CPPv424pruned_array_lookup_cuda6Tensor6Tensor6Tensor6Tensor"]], "pruned_hashmap_insert_unweighted_cpu (c++ function)": [[3, "_CPPv436pruned_hashmap_insert_unweighted_cpu6Tensor6Tensor6Tensor6Tensor6Tensor"]], "pruned_hashmap_lookup_cuda (c++ function)": [[3, "_CPPv426pruned_hashmap_lookup_cuda6Tensor6Tensor6Tensor6Tensor"]], "pruned_hashmap_lookup_unweighted_cpu (c++ function)": [[3, "_CPPv436pruned_hashmap_lookup_unweighted_cpu6Tensor6Tensor6Tensor6Tensor"]], "gqa_attn_splitk (c++ function)": [[4, "_CPPv415gqa_attn_splitkRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorEKdK7int64_tK7int64_tKbK7int64_t"]], "padding_fused_tbe_input_combine_cpu (c++ function)": [[5, "_CPPv435padding_fused_tbe_input_combine_cpuRKNSt6vectorIN2at6TensorEEERKNSt6vectorIN2at6TensorEEERKNSt6vectorIN2at6TensorEEERKN2at6TensorE7int64_t"]], "tbe_input_combine_cpu (c++ function)": [[5, "_CPPv421tbe_input_combine_cpuRKNSt6vectorIN2at6TensorEEERKNSt6vectorIN2at6TensorEEERKNSt6vectorIN2at6TensorEEERKN2at6TensorE"]], "batched_dense_vec_jagged_2d_mul (c++ function)": [[6, "_CPPv431batched_dense_vec_jagged_2d_mulRK6TensorRK6TensorRK6Tensor"]], "dense_to_jagged (c++ function)": [[6, "_CPPv415dense_to_jaggedRK6TensorRKNSt6vectorI6TensorEENSt8optionalIN2at6SymIntEEE"]], "jagged_1d_to_dense (c++ function)": [[6, "_CPPv418jagged_1d_to_dense6Tensor6TensorN3c106SymIntE7int64_t"]], "jagged_2d_to_dense (c++ function)": [[6, "_CPPv418jagged_2d_to_dense6Tensor6TensorN3c106SymIntE"]], "jagged_dense_elementwise_add (c++ function)": [[6, "_CPPv428jagged_dense_elementwise_addRK6TensorRKNSt6vectorI6TensorEERK6Tensor"]], "jagged_dense_elementwise_add_jagged_output (c++ function)": [[6, "_CPPv442jagged_dense_elementwise_add_jagged_outputRK6TensorRKNSt6vectorI6TensorEERK6Tensor"]], "jagged_dense_elementwise_add_jagged_output_cuda (c++ function)": [[6, "_CPPv447jagged_dense_elementwise_add_jagged_output_cudaRK6TensorRKNSt6vectorI6TensorEERK6Tensor"]], "jagged_dense_elementwise_mul (c++ function)": [[6, "_CPPv428jagged_dense_elementwise_mulRK6TensorRKNSt6vectorI6TensorEERK6Tensor"]], "jagged_to_padded_dense (c++ function)": [[6, "_CPPv422jagged_to_padded_denseRK6TensorRKNSt6vectorI6TensorEEKN3c1014SymIntArrayRefEKd"]], "jagged_to_padded_dense_forward (c++ function)": [[6, "_CPPv430jagged_to_padded_dense_forwardRK6TensorRKNSt6vectorI6TensorEEN3c1014SymIntArrayRefEKd"]], "recat_embedding_grad_output_cuda (c++ function)": [[7, "_CPPv432recat_embedding_grad_output_cuda6TensorRKNSt6vectorI7int64_tEE"]], "recat_embedding_grad_output_mixed_d_batch_cuda (c++ function)": [[7, "_CPPv446recat_embedding_grad_output_mixed_D_batch_cudaRK6TensorRK6TensorRK6Tensor"]], "recat_embedding_grad_output_mixed_d_cpu (c++ function)": [[7, "_CPPv439recat_embedding_grad_output_mixed_D_cpuRK6TensorRKNSt6vectorI7int64_tEE"]], "recat_embedding_grad_output_mixed_d_cuda (c++ function)": [[7, "_CPPv440recat_embedding_grad_output_mixed_D_cudaRK6TensorRKNSt6vectorI7int64_tEE"]], "is_uvm_tensor (c++ function)": [[8, "_CPPv413is_uvm_tensorRK6Tensor"]], "new_host_mapped_tensor (c++ function)": [[8, "_CPPv422new_host_mapped_tensorRK6TensorRKNSt6vectorINSt7int64_tEEE"]], "new_managed_tensor (c++ function)": [[8, "_CPPv418new_managed_tensorRK6TensorRKNSt6vectorINSt7int64_tEEE"]], "new_managed_tensor_meta (c++ function)": [[8, "_CPPv423new_managed_tensor_metaRK6TensorRKNSt6vectorINSt7int64_tEEE"]], "new_unified_tensor (c++ function)": [[8, "_CPPv418new_unified_tensorRK6TensorRKNSt6vectorINSt7int64_tEEEb"]], "new_unified_tensor_meta (c++ function)": [[8, "_CPPv423new_unified_tensor_metaRK6TensorRKNSt6vectorINSt7int64_tEEEb"]], "new_vanilla_managed_tensor (c++ function)": [[8, "_CPPv426new_vanilla_managed_tensorRK6TensorRKNSt6vectorINSt7int64_tEEE"]], "uvm_cuda_mem_advise (c++ function)": [[8, "_CPPv419uvm_cuda_mem_adviseRK6Tensor7int64_t"]], "uvm_cuda_mem_prefetch_async (c++ function)": [[8, "_CPPv427uvm_cuda_mem_prefetch_asyncRK6TensorNSt8optionalI6TensorEE"]], "uvm_mem_advice_dont_fork (c++ function)": [[8, "_CPPv424uvm_mem_advice_dont_forkRK6Tensor"]], "uvm_storage (c++ function)": [[8, "_CPPv411uvm_storageRK6Tensor"]], "uvm_to_cpu (c++ function)": [[8, "_CPPv410uvm_to_cpuRK6Tensor"]], "uvm_to_cpu_clone (c++ function)": [[8, "_CPPv416uvm_to_cpu_cloneRK6Tensor"]], "uvm_to_device (c++ function)": [[8, "_CPPv413uvm_to_deviceRK6TensorRK6Tensor"]], "all_to_one_device (c++ function)": [[9, "_CPPv417all_to_one_deviceNSt6vectorIN2at6TensorEEEN2at6DeviceE"]], "permute_pooled_embs_auto_grad (c++ function)": [[9, "_CPPv429permute_pooled_embs_auto_gradRK6TensorRK6TensorRK6TensorRK6TensorRK6Tensor"]], "permute_pooled_embs_auto_grad_cpu (c++ function)": [[9, "_CPPv433permute_pooled_embs_auto_grad_cpuRK6TensorRK6TensorRK6TensorRK6TensorRK6Tensor"]], "permute_pooled_embs_auto_grad_gpu (c++ function)": [[9, "_CPPv433permute_pooled_embs_auto_grad_gpuRK6TensorRK6TensorRK6TensorRK6TensorRK6Tensor"]], "permute_pooled_embs_auto_grad_split_cpu (c++ function)": [[9, "_CPPv439permute_pooled_embs_auto_grad_split_cpuRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorE"]], "permute_pooled_embs_auto_grad_split_gpu (c++ function)": [[9, "_CPPv439permute_pooled_embs_auto_grad_split_gpuRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorE"]], "permute_pooled_embs_cpu_impl (c++ function)": [[9, "_CPPv428permute_pooled_embs_cpu_implRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKb"]], "permute_pooled_embs_split_cpu (c++ function)": [[9, "_CPPv429permute_pooled_embs_split_cpuRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorE"]], "permute_pooled_embs_split_gpu (c++ function)": [[9, "_CPPv429permute_pooled_embs_split_gpuRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorE"]], "fp8quantizedtofloat_ref (c++ function)": [[10, "_CPPv423FP8QuantizedToFloat_refPCK7uint8_tK6size_tK6size_tPCfKiKi"]], "fp8rowwise_to_float_cpu (c++ function)": [[10, "_CPPv423FP8rowwise_to_float_cpuRK6TensorbK7int64_t"]], "floattofp8quantized_ref (c++ function)": [[10, "_CPPv423FloatToFP8Quantized_refPCKfK6size_tK6size_tPC7uint8_tKiKiKd"]], "_fp8rowwise_to_float_gpu (c++ function)": [[10, "_CPPv424_FP8rowwise_to_float_gpuRKN2at6TensorEbK7int64_t"]], "_bfloat16_to_float_gpu (c++ function)": [[10, "_CPPv422_bfloat16_to_float_gpuRKN2at6TensorE"]], "_float_to_fp8rowwise_gpu (c++ function)": [[10, "_CPPv424_float_to_FP8rowwise_gpuRK6TensorKb"]], "_float_to_bfloat16_gpu (c++ function)": [[10, "_CPPv422_float_to_bfloat16_gpuRKN2at6TensorE"]], "_float_to_fused8bitrowwise_cpu_out (c++ function)": [[10, "_CPPv434_float_to_fused8bitrowwise_cpu_outR6TensorRK6Tensor"]], "_float_to_fused8bitrowwise_gpu (c++ function)": [[10, "_CPPv430_float_to_fused8bitrowwise_gpuRK6Tensor"]], "_float_to_fusednbitrowwise_gpu (c++ function)": [[10, "_CPPv430_float_to_fusednbitrowwise_gpuRK6TensorK7int64_t"]], "_float_to_hfp8_gpu (c++ function)": [[10, "_CPPv418_float_to_hfp8_gpuRKN2at6TensorEK7int64_tK7int64_tKd"]], "_float_to_msfp_gpu (c++ function)": [[10, "_CPPv418_float_to_msfp_gpuRKN2at6TensorEK7int64_tK7int64_tK7int64_tK7int64_tKdKd"]], "_float_to_paddedfp8rowwise_gpu (c++ function)": [[10, "_CPPv430_float_to_paddedFP8rowwise_gpuRK6TensorKbK7int64_t"]], "_fused8bitrowwise_to_float_cpu_out (c++ function)": [[10, "_CPPv434_fused8bitrowwise_to_float_cpu_outR6TensorRK6Tensor"]], "_fused8bitrowwise_to_float_gpu (c++ function)": [[10, "_CPPv430_fused8bitrowwise_to_float_gpuRKN2at6TensorE"]], "_fused8bitrowwise_to_float_mixed_dim_gpu (c++ function)": [[10, "_CPPv440_fused8bitrowwise_to_float_mixed_dim_gpuRKN2at6TensorERKN2at6TensorEK7int64_t"]], "_fused8bitrowwise_to_half_gpu (c++ function)": [[10, "_CPPv429_fused8bitrowwise_to_half_gpuRKN2at6TensorE"]], "_fused8bitrowwise_to_single_or_half_precision_gpu (c++ function)": [[10, "_CPPv449_fused8bitrowwise_to_single_or_half_precision_gpuRKN2at6TensorEK7int64_tKbKb"]], "_fusednbitrowwise_to_float_gpu (c++ function)": [[10, "_CPPv430_fusednbitrowwise_to_float_gpuRKN2at6TensorEK7int64_t"]], "_fusednbitrowwise_to_half_gpu (c++ function)": [[10, "_CPPv429_fusednbitrowwise_to_half_gpuRKN2at6TensorEK7int64_t"]], "_fusednbitrowwise_to_single_or_half_precision_gpu (c++ function)": [[10, "_CPPv449_fusednbitrowwise_to_single_or_half_precision_gpuRKN2at6TensorEK7int64_tK7int64_t"]], "_half_to_fused8bitrowwise_gpu (c++ function)": [[10, "_CPPv429_half_to_fused8bitrowwise_gpuRK6Tensor"]], "_half_to_fusednbitrowwise_gpu (c++ function)": [[10, "_CPPv429_half_to_fusednbitrowwise_gpuRKN2at6TensorEK7int64_t"]], "_hfp8_to_float_gpu (c++ function)": [[10, "_CPPv418_hfp8_to_float_gpuRKN2at6TensorEK7int64_tK7int64_t"]], "_msfp_to_float_gpu (c++ function)": [[10, "_CPPv418_msfp_to_float_gpuRKN2at6TensorEK7int64_tK7int64_tK7int64_t"]], "_paddedfp8rowwise_to_float_gpu (c++ function)": [[10, "_CPPv430_paddedFP8rowwise_to_float_gpuRKN2at6TensorEKbK7int64_tK7int64_tK7int64_t"]], "_single_or_half_precision_to_fused8bitrowwise_gpu (c++ function)": [[10, "_CPPv449_single_or_half_precision_to_fused8bitrowwise_gpuRK6Tensor"]], "_single_or_half_precision_to_fusednbitrowwise_gpu (c++ function)": [[10, "_CPPv449_single_or_half_precision_to_fusednbitrowwise_gpuRK6TensorK7int64_t"]], "float_or_half_to_fused8bitrowwise_cpu (c++ function)": [[10, "_CPPv437float_or_half_to_fused8bitrowwise_cpuRK6Tensor"]], "float_to_fp8rowwise_cpu (c++ function)": [[10, "_CPPv423float_to_FP8rowwise_cpuRK6Tensorb"]], "float_to_fused8bitrowwise_cpu (c++ function)": [[10, "_CPPv429float_to_fused8bitrowwise_cpuRK6Tensor"]], "fused8bitrowwise_to_float_cpu (c++ function)": [[10, "_CPPv429fused8bitrowwise_to_float_cpuRK6Tensor"]], "fused8bitrowwise_to_float_or_half_cpu (c++ function)": [[10, "_CPPv437fused8bitrowwise_to_float_or_half_cpuRK6TensorK7int64_tKbKb"]], "fused8bitrowwise_to_half_cpu (c++ function)": [[10, "_CPPv428fused8bitrowwise_to_half_cpuRK6Tensor"]], "fusednbitrowwise_sbfront_to_float_cpu (c++ function)": [[10, "_CPPv437fusednbitrowwise_sbfront_to_float_cpuRK6TensorK7int64_t"]], "fusednbitrowwise_to_float_cpu (c++ function)": [[10, "_CPPv429fusednbitrowwise_to_float_cpuRK6TensorK7int64_t"]], "fusednbitrowwise_to_float_or_half_cpu (c++ function)": [[10, "_CPPv437fusednbitrowwise_to_float_or_half_cpuRK6TensorK7int64_tK7int64_t"]], "fusednbitrowwise_to_half_cpu (c++ function)": [[10, "_CPPv428fusednbitrowwise_to_half_cpuRK6TensorK7int64_t"]], "half_to_fused8bitrowwise_cpu (c++ function)": [[10, "_CPPv428half_to_fused8bitrowwise_cpuRK6Tensor"]], "expand_into_jagged_permute_cuda (c++ function)": [[11, "_CPPv431expand_into_jagged_permute_cudaRKN2at6TensorERKN2at6TensorERKN2at6TensorE7int64_t"]], "generic_histogram_binning_calibration_by_feature_cpu (c++ function)": [[11, "_CPPv452generic_histogram_binning_calibration_by_feature_cpuRKN2at6TensorERKN2at6TensorERKN2at6TensorE7int64_tRKN2at6TensorERKN2at6TensorERKN2at6TensorEd7int64_td"]], "direct_mapped_lxu_cache_lookup_cuda (c++ function)": [[12, "_CPPv435direct_mapped_lxu_cache_lookup_cudaN2at6TensorEN2at6TensorE7int64_tbNSt8optionalIN2at6TensorEEE"]], "get_unique_indices_cuda (c++ function)": [[12, "_CPPv423get_unique_indices_cudaRKN2at6TensorEK7int64_tKb"]], "get_unique_indices_with_inverse_cuda (c++ function)": [[12, "_CPPv436get_unique_indices_with_inverse_cudaRKN2at6TensorEK7int64_tKbKb"]], "host_lxu_cache_slot (c++ function)": [[12, "_CPPv419host_lxu_cache_slot7int64_t7int64_t"]], "linearize_cache_indices_cuda (c++ function)": [[12, "_CPPv428linearize_cache_indices_cudaRKN2at6TensorERKN2at6TensorERKN2at6TensorERKNSt8optionalIN2at6TensorEEEK7int64_tK7int64_t"]], "linearize_cache_indices_from_row_idx_cuda (c++ function)": [[12, "_CPPv441linearize_cache_indices_from_row_idx_cudaN2at6TensorEN2at6TensorEN2at6TensorE"]], "lru_cache_find_uncached_cuda (c++ function)": [[12, "_CPPv428lru_cache_find_uncached_cudaN2at6TensorEN2at6TensorE7int64_tN2at6TensorE7int64_tN2at6TensorEbN2at6TensorEbN2at6TensorEKb"]], "lxu_cache_flush_cuda (c++ function)": [[12, "_CPPv420lxu_cache_flush_cudaN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorEb"]], "lxu_cache_locations_update_cuda (c++ function)": [[12, "_CPPv431lxu_cache_locations_update_cudaN2at6TensorEN2at6TensorENSt8optionalIN2at6TensorEEE"]], "lxu_cache_locking_counter_decrement_cuda (c++ function)": [[12, "_CPPv440lxu_cache_locking_counter_decrement_cudaN2at6TensorEN2at6TensorE"]], "reset_weight_momentum_cuda (c++ function)": [[12, "_CPPv426reset_weight_momentum_cudaN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_t"]], "rocksdbwritemode (c++ enum)": [[13, "_CPPv416RocksdbWriteMode"]], "rocksdbwritemode::bwd_l1_cnflct_miss_write_back (c++ enumerator)": [[13, "_CPPv4N16RocksdbWriteMode29BWD_L1_CNFLCT_MISS_WRITE_BACKE"]], "rocksdbwritemode::flush (c++ enumerator)": [[13, "_CPPv4N16RocksdbWriteMode5FLUSHE"]], "rocksdbwritemode::fwd_l1_eviction (c++ enumerator)": [[13, "_CPPv4N16RocksdbWriteMode15FWD_L1_EVICTIONE"]], "rocksdbwritemode::fwd_rocksdb_read (c++ enumerator)": [[13, "_CPPv4N16RocksdbWriteMode16FWD_ROCKSDB_READE"]], "compact_indices_cuda (c++ function)": [[13, "_CPPv420compact_indices_cudaNSt6vectorI6TensorEE6TensorNSt6vectorI6TensorEE6Tensor6Tensor"]], "cuda_callback_func (c++ function)": [[13, "_CPPv418cuda_callback_func12cudaStream_t11cudaError_tPv"]], "hash_shard (c++ function)": [[13, "_CPPv410hash_shard7int64_t6size_t"]], "kv_db::cachecontext (c++ class)": [[13, "_CPPv4N5kv_db12CacheContextE"]], "kv_db::embeddingkvdb (c++ class)": [[13, "_CPPv4N5kv_db13EmbeddingKVDBE"]], "kv_db::queueitem (c++ struct)": [[13, "_CPPv4N5kv_db9QueueItemE"]], "l2_cache::cachelibcache (c++ class)": [[13, "_CPPv4N8l2_cache13CacheLibCacheE"]], "masked_index_put_cuda (c++ function)": [[13, "_CPPv421masked_index_put_cuda6Tensor6Tensor6Tensor6TensorKbK7int64_t"]], "masked_index_select_cuda (c++ function)": [[13, "_CPPv424masked_index_select_cuda6Tensor6Tensor6Tensor6TensorKbK7int64_t"]], "ps::embeddingparameterserver (c++ class)": [[13, "_CPPv4N2ps24EmbeddingParameterServerE"]], "ssd::embeddingrocksdb (c++ class)": [[13, "_CPPv4N3ssd16EmbeddingRocksDBE"]], "ssd_generate_row_addrs_cuda (c++ function)": [[13, "_CPPv427ssd_generate_row_addrs_cudaRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6Tensor"]], "ssd_update_row_addrs_cuda (c++ function)": [[13, "_CPPv425ssd_update_row_addrs_cudaRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6Tensor"]], "batched_dense_vec_jagged_2d_mul() (in module torch.ops.fbgemm)": [[18, "torch.ops.fbgemm.batched_dense_vec_jagged_2d_mul"]], "dense_to_jagged() (in module torch.ops.fbgemm)": [[18, "torch.ops.fbgemm.dense_to_jagged"]], "fbgemm_gpu": [[18, "module-fbgemm_gpu"], [19, "module-fbgemm_gpu"], [20, "module-fbgemm_gpu"], [21, "module-fbgemm_gpu"], [22, "module-fbgemm_gpu"]], "jagged_1d_to_dense() (in module torch.ops.fbgemm)": [[18, "torch.ops.fbgemm.jagged_1d_to_dense"]], "jagged_2d_to_dense() (in module torch.ops.fbgemm)": [[18, "torch.ops.fbgemm.jagged_2d_to_dense"]], "jagged_dense_dense_elementwise_add_jagged_output() (in module torch.ops.fbgemm)": [[18, "torch.ops.fbgemm.jagged_dense_dense_elementwise_add_jagged_output"]], "jagged_dense_elementwise_add() (in module torch.ops.fbgemm)": [[18, "torch.ops.fbgemm.jagged_dense_elementwise_add"]], "jagged_dense_elementwise_add_jagged_output() (in module torch.ops.fbgemm)": [[18, "torch.ops.fbgemm.jagged_dense_elementwise_add_jagged_output"]], "jagged_dense_elementwise_mul() (in module torch.ops.fbgemm)": [[18, "torch.ops.fbgemm.jagged_dense_elementwise_mul"]], "jagged_to_padded_dense() (in module torch.ops.fbgemm)": [[18, "torch.ops.fbgemm.jagged_to_padded_dense"]], "module": [[18, "module-fbgemm_gpu"], [19, "module-fbgemm_gpu"], [20, "module-fbgemm_gpu"], [21, "module-fbgemm_gpu"], [22, "module-fbgemm_gpu"]], "stacked_jagged_1d_to_dense() (in module torch.ops.fbgemm)": [[18, "torch.ops.fbgemm.stacked_jagged_1d_to_dense"]], "stacked_jagged_2d_to_dense() (in module torch.ops.fbgemm)": [[18, "torch.ops.fbgemm.stacked_jagged_2d_to_dense"]], "permutepooledembeddings (class in fbgemm_gpu.permute_pooled_embedding_modules)": [[19, "fbgemm_gpu.permute_pooled_embedding_modules.PermutePooledEmbeddings"]], "__call__() (fbgemm_gpu.permute_pooled_embedding_modules.permutepooledembeddings method)": [[19, "fbgemm_gpu.permute_pooled_embedding_modules.PermutePooledEmbeddings.__call__"]], "merge_pooled_embeddings() (in module torch.ops.fbgemm)": [[20, "torch.ops.fbgemm.merge_pooled_embeddings"]], "permute_pooled_embs() (in module torch.ops.fbgemm)": [[20, "torch.ops.fbgemm.permute_pooled_embs"]], "floatorhalftofusednbitrowwisequantizedsbhalf() (in module torch.ops.fbgemm)": [[21, "torch.ops.fbgemm.FloatOrHalfToFusedNBitRowwiseQuantizedSBHalf"]], "asynchronous_complete_cumsum() (in module torch.ops.fbgemm)": [[22, "torch.ops.fbgemm.asynchronous_complete_cumsum"]], "block_bucketize_sparse_features() (in module torch.ops.fbgemm)": [[22, "torch.ops.fbgemm.block_bucketize_sparse_features"]], "expand_into_jagged_permute() (in module torch.ops.fbgemm)": [[22, "torch.ops.fbgemm.expand_into_jagged_permute"]], "keyed_jagged_index_select_dim1() (in module torch.ops.fbgemm)": [[22, "torch.ops.fbgemm.keyed_jagged_index_select_dim1"]], "offsets_range() (in module torch.ops.fbgemm)": [[22, "torch.ops.fbgemm.offsets_range"]], "permute_1d_sparse_data() (in module torch.ops.fbgemm)": [[22, "torch.ops.fbgemm.permute_1D_sparse_data"]], "permute_2d_sparse_data() (in module torch.ops.fbgemm)": [[22, "torch.ops.fbgemm.permute_2D_sparse_data"]], "segment_sum_csr() (in module torch.ops.fbgemm)": [[22, "torch.ops.fbgemm.segment_sum_csr"]], "intnbittablebatchedembeddingbagscodegen (class in fbgemm_gpu.split_table_batched_embeddings_ops_inference)": [[23, "fbgemm_gpu.split_table_batched_embeddings_ops_inference.IntNBitTableBatchedEmbeddingBagsCodegen"]], "assign_embedding_weights() (fbgemm_gpu.split_table_batched_embeddings_ops_inference.intnbittablebatchedembeddingbagscodegen method)": [[23, "fbgemm_gpu.split_table_batched_embeddings_ops_inference.IntNBitTableBatchedEmbeddingBagsCodegen.assign_embedding_weights"]], "fill_random_weights() (fbgemm_gpu.split_table_batched_embeddings_ops_inference.intnbittablebatchedembeddingbagscodegen method)": [[23, "fbgemm_gpu.split_table_batched_embeddings_ops_inference.IntNBitTableBatchedEmbeddingBagsCodegen.fill_random_weights"]], "forward() (fbgemm_gpu.split_table_batched_embeddings_ops_inference.intnbittablebatchedembeddingbagscodegen method)": [[23, "fbgemm_gpu.split_table_batched_embeddings_ops_inference.IntNBitTableBatchedEmbeddingBagsCodegen.forward"]], "recompute_module_buffers() (fbgemm_gpu.split_table_batched_embeddings_ops_inference.intnbittablebatchedembeddingbagscodegen method)": [[23, "fbgemm_gpu.split_table_batched_embeddings_ops_inference.IntNBitTableBatchedEmbeddingBagsCodegen.recompute_module_buffers"]], "split_embedding_weights() (fbgemm_gpu.split_table_batched_embeddings_ops_inference.intnbittablebatchedembeddingbagscodegen method)": [[23, "fbgemm_gpu.split_table_batched_embeddings_ops_inference.IntNBitTableBatchedEmbeddingBagsCodegen.split_embedding_weights"]], "split_embedding_weights_with_scale_bias() (fbgemm_gpu.split_table_batched_embeddings_ops_inference.intnbittablebatchedembeddingbagscodegen method)": [[23, "fbgemm_gpu.split_table_batched_embeddings_ops_inference.IntNBitTableBatchedEmbeddingBagsCodegen.split_embedding_weights_with_scale_bias"]], "splittablebatchedembeddingbagscodegen (class in fbgemm_gpu.split_table_batched_embeddings_ops_training)": [[24, "fbgemm_gpu.split_table_batched_embeddings_ops_training.SplitTableBatchedEmbeddingBagsCodegen"]], "forward() (fbgemm_gpu.split_table_batched_embeddings_ops_training.splittablebatchedembeddingbagscodegen method)": [[24, "fbgemm_gpu.split_table_batched_embeddings_ops_training.SplitTableBatchedEmbeddingBagsCodegen.forward"]], "set_learning_rate() (fbgemm_gpu.split_table_batched_embeddings_ops_training.splittablebatchedembeddingbagscodegen method)": [[24, "fbgemm_gpu.split_table_batched_embeddings_ops_training.SplitTableBatchedEmbeddingBagsCodegen.set_learning_rate"]], "set_optimizer_step() (fbgemm_gpu.split_table_batched_embeddings_ops_training.splittablebatchedembeddingbagscodegen method)": [[24, "fbgemm_gpu.split_table_batched_embeddings_ops_training.SplitTableBatchedEmbeddingBagsCodegen.set_optimizer_step"]], "split_embedding_weights() (fbgemm_gpu.split_table_batched_embeddings_ops_training.splittablebatchedembeddingbagscodegen method)": [[24, "fbgemm_gpu.split_table_batched_embeddings_ops_training.SplitTableBatchedEmbeddingBagsCodegen.split_embedding_weights"]], "split_optimizer_states() (fbgemm_gpu.split_table_batched_embeddings_ops_training.splittablebatchedembeddingbagscodegen method)": [[24, "fbgemm_gpu.split_table_batched_embeddings_ops_training.SplitTableBatchedEmbeddingBagsCodegen.split_optimizer_states"]], "update_hyper_parameters() (fbgemm_gpu.split_table_batched_embeddings_ops_training.splittablebatchedembeddingbagscodegen method)": [[24, "fbgemm_gpu.split_table_batched_embeddings_ops_training.SplitTableBatchedEmbeddingBagsCodegen.update_hyper_parameters"]], "example_method (c++ function)": [[29, "_CPPv4I0_NSt6size_tEE14example_method7int32_t1Tf"]], "example_method() (in module fbgemm_gpu.docs.examples)": [[31, "fbgemm_gpu.docs.examples.example_method"]]}}) \ No newline at end of file +Search.setIndex({"docnames": ["fbgemm-cpp-api/QuantUtils", "fbgemm-cpp-api/tbe_cpu_autovec", "fbgemm-development/BuildInstructions", "fbgemm_gpu-cpp-api/embedding_ops", "fbgemm_gpu-cpp-api/experimental_ops", "fbgemm_gpu-cpp-api/input_combine", "fbgemm_gpu-cpp-api/jagged_tensor_ops", "fbgemm_gpu-cpp-api/layout_transform_ops", "fbgemm_gpu-cpp-api/memory_utils", "fbgemm_gpu-cpp-api/merge_pooled_embeddings", "fbgemm_gpu-cpp-api/quantize_ops", "fbgemm_gpu-cpp-api/sparse_ops", "fbgemm_gpu-cpp-api/split_table_batched_embeddings", "fbgemm_gpu-cpp-api/ssd_embedding_ops", "fbgemm_gpu-development/BuildInstructions", "fbgemm_gpu-development/InstallationInstructions", "fbgemm_gpu-development/TestInstructions", "fbgemm_gpu-overview/jagged-tensor-ops/JaggedTensorOps", "fbgemm_gpu-python-api/jagged_tensor_ops", "fbgemm_gpu-python-api/pooled_embedding_modules", "fbgemm_gpu-python-api/pooled_embedding_ops", "fbgemm_gpu-python-api/quantize_ops", "fbgemm_gpu-python-api/sparse_ops", "fbgemm_gpu-python-api/tbe_ops_inference", "fbgemm_gpu-python-api/tbe_ops_training", "fbgemm_gpu-stable-api/python_api", "general/ContactUs", "general/Contributing", "general/License", "general/documentation/Cpp", "general/documentation/Overview", "general/documentation/Python", "general/documentation/Sphinx", "index"], "filenames": ["fbgemm-cpp-api/QuantUtils.rst", "fbgemm-cpp-api/tbe_cpu_autovec.rst", "fbgemm-development/BuildInstructions.rst", "fbgemm_gpu-cpp-api/embedding_ops.rst", "fbgemm_gpu-cpp-api/experimental_ops.rst", "fbgemm_gpu-cpp-api/input_combine.rst", "fbgemm_gpu-cpp-api/jagged_tensor_ops.rst", "fbgemm_gpu-cpp-api/layout_transform_ops.rst", "fbgemm_gpu-cpp-api/memory_utils.rst", "fbgemm_gpu-cpp-api/merge_pooled_embeddings.rst", "fbgemm_gpu-cpp-api/quantize_ops.rst", "fbgemm_gpu-cpp-api/sparse_ops.rst", "fbgemm_gpu-cpp-api/split_table_batched_embeddings.rst", "fbgemm_gpu-cpp-api/ssd_embedding_ops.rst", "fbgemm_gpu-development/BuildInstructions.rst", "fbgemm_gpu-development/InstallationInstructions.rst", "fbgemm_gpu-development/TestInstructions.rst", "fbgemm_gpu-overview/jagged-tensor-ops/JaggedTensorOps.rst", "fbgemm_gpu-python-api/jagged_tensor_ops.rst", "fbgemm_gpu-python-api/pooled_embedding_modules.rst", "fbgemm_gpu-python-api/pooled_embedding_ops.rst", "fbgemm_gpu-python-api/quantize_ops.rst", "fbgemm_gpu-python-api/sparse_ops.rst", "fbgemm_gpu-python-api/tbe_ops_inference.rst", "fbgemm_gpu-python-api/tbe_ops_training.rst", "fbgemm_gpu-stable-api/python_api.rst", "general/ContactUs.rst", "general/Contributing.rst", "general/License.rst", "general/documentation/Cpp.rst", "general/documentation/Overview.rst", "general/documentation/Python.rst", "general/documentation/Sphinx.rst", "index.rst"], "titles": ["Quantization Utilities", "TBE CPU Autovectorization", "Build Instructions", "Embedding Operators", "Experimental Operators", "Combine Input Operators", "Jagged Tensor Operators", "Layout Transformation Operators", "CUDA Memory Operators", "Pooled Embeddings Operators", "Quantization Operators", "Sparse Data Operators", "Table Batched Embedding Operators", "SSD Embedding Operators", "Build Instructions", "Installation Instructions", "Test Instructions", "Jagged Tensor Operators", "Jagged Tensor Operators", "Pooled Embedding Modules", "Pooled Embedding Operators", "Quantization Operators", "Sparse Operators", "Table Batched Embedding (TBE) Inference Module", "Table Batched Embedding (TBE) Training Module", "FBGEMM_GPU Stable Python API", "Contact Us", "Contributing", "License", "Adding Documentation to C++ Code", "Documentation", "Adding Documentation to Python Code", "Sphinx Documentation Pointers", "FBGEMM and FBGEMM_GPU Documentation Homepage"], "terms": {"templat": [0, 1, 14, 29], "typenam": [0, 1, 29], "t": [0, 2, 4, 8, 11, 14, 23, 24, 27, 29, 30], "layout_t": 0, "layout": [0, 33], "kcx": 0, "void": [0, 3, 8, 10, 12, 13], "quantizegroupwis": 0, "const": [0, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 31], "float": [0, 1, 10, 23, 24, 29, 31], "src": 0, "int": [0, 1, 10, 19, 23, 24, 29, 31], "k": [0, 4], "c": [0, 12, 15, 17, 28, 30, 31, 32], "x": [0, 6, 13, 15, 17, 29, 31], "g": [0, 2, 11, 13, 14, 29, 31], "scale": [0, 1, 4, 10, 23], "std": [0, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14, 29, 31], "int32_t": [0, 1, 29, 31], "zero_point": 0, "dst": 0, "point": [0, 10, 29, 31], "data": [0, 1, 4, 8, 13, 17, 23, 24, 28, 33], "type": [0, 1, 2, 4, 10, 15, 17, 23, 24, 29], "paramet": [0, 1, 4, 8, 10, 11, 13, 19, 23, 24, 29, 30, 31], "output": [0, 1, 4, 6, 10, 11, 13, 19, 23, 24, 29, 31], "int8_t": [0, 3], "uint8_t": [0, 1, 10, 12], "ar": [0, 2, 6, 12, 13, 14, 15, 17, 19, 23, 24, 25, 28, 29, 30, 31], "support": [0, 2, 4, 13, 14, 15, 17, 23, 24, 25, 31, 33], "input": [0, 1, 4, 6, 8, 10, 11, 13, 17, 19, 23, 24, 29, 33], "tensor": [0, 3, 4, 5, 7, 8, 9, 10, 11, 12, 13, 19, 23, 24, 25, 30, 31, 33], "kxc": 0, "correspond": [0, 11, 12, 13, 15, 17, 24, 29, 31], "kcr": 0, "kctr": 0, "weight": [0, 1, 3, 11, 12, 13, 23, 24], "time": [0, 2, 14, 15, 17], "dimens": [0, 4, 6, 8, 11, 17, 19, 23, 24, 31], "krsc": 0, "ktrsc": 0, "channel": [0, 14, 15, 26], "number": [0, 1, 2, 4, 10, 11, 13, 14, 17, 19, 23, 24, 30], "r": [0, 14, 16, 24, 30], "": [0, 2, 8, 14, 16, 17, 27, 29, 30, 31], "group": [0, 4, 17, 29], "function": [0, 2, 13, 14, 23, 24, 25, 29, 31], "perform": [0, 2, 10, 11, 13, 17, 19, 23, 24, 25, 33], "channelwis": 0, "1": [0, 1, 2, 4, 11, 12, 13, 14, 15, 16, 17, 19, 23, 24, 30, 31, 32], "groupwis": 0, "per": [0, 17, 23, 24], "size": [0, 2, 4, 8, 10, 11, 17, 19, 23, 24], "should": [0, 10, 11, 12, 14, 15, 17, 23, 27, 29, 30, 31], "equal": [0, 17, 24, 31], "zero": [0, 23, 24, 31], "reprsent": 0, "fusedquantizedequant": 0, "int64_t": [0, 1, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13], "len": [0, 17, 24], "tensorquantizationparam": 0, "qparam": [0, 23], "thread_id": 0, "0": [0, 2, 4, 10, 11, 12, 13, 14, 15, 17, 19, 23, 24, 25, 31], "num_thread": 0, "noise_ratio": 0, "0f": 0, "fuse": [0, 10, 24], "integ": [0, 8, 10, 17, 24], "dequant": [0, 10], "kernel": [0, 2, 8, 10, 13, 16, 33], "acceler": 0, "awar": 0, "train": [0, 13, 25, 33], "fp32": [0, 1, 10, 23, 24], "valu": [0, 6, 8, 10, 11, 12, 13, 23, 24, 29, 30, 31], "u": [0, 14, 32, 33], "int8": [0, 23], "us": [0, 1, 2, 4, 8, 11, 13, 14, 15, 16, 17, 23, 24, 26, 27, 28, 29, 30, 31, 32, 33], "provid": [0, 2, 13, 14, 15, 16, 25, 28, 29, 30, 31, 33], "back": [0, 8, 12, 14, 15], "inputtyp": 0, "floatorhalftofusednbitrowwisequantizedsbhalf": [0, 21], "bit_rat": [0, 10], "size_t": [0, 10, 13, 29], "input_row": 0, "input_column": 0, "convert": [0, 8, 10, 13, 17, 31], "fp16": [0, 10, 23, 24], "rowwis": [0, 10, 24], "bitrat": 0, "specifi": [0, 2, 4, 10, 11, 13, 14, 23, 24], "bit": [0, 1, 10, 23], "bia": [0, 1, 4, 10, 23], "each": [0, 1, 4, 10, 11, 13, 14, 15, 17, 19, 23, 24, 31], "row": [0, 1, 6, 10, 12, 13, 17, 19, 23, 24, 31], "store": [0, 10, 11, 12, 13], "itself": [0, 17, 30], "end": [0, 1, 15, 17, 23, 32], "can": [0, 1, 2, 10, 11, 13, 14, 15, 17, 24, 25, 29, 30, 31, 32], "4": [0, 10, 14, 15, 17, 19, 23, 24, 31], "8": [0, 10, 14, 15, 17, 19, 24], "uint32_t": 0, "xor128": 0, "random": [0, 23], "gener": [0, 2, 11, 13, 14, 15, 19, 24, 29, 32], "9": [0, 13, 14, 15, 17, 19, 24], "base": [0, 2, 11, 12, 13, 14, 17, 24], "thi": [0, 2, 6, 8, 9, 10, 11, 13, 14, 15, 17, 19, 23, 24, 26, 27, 28, 29, 31, 32, 33], "paper": 0, "findminmax": 0, "m": [0, 14, 15, 16], "min": 0, "max": [0, 4, 24], "find": [0, 12, 14], "matrix": [0, 2, 33], "bool": [0, 1, 4, 8, 9, 10, 12, 13, 23, 24], "a_symmetr": 0, "b_symmetr": 0, "quantizationgranular": 0, "q_gran": 0, "has_bia": 0, "fuse_relu": 0, "bias_typ": 0, "direct": [0, 12, 15, 28, 29, 31, 32], "fals": [0, 1, 8, 13, 23, 24, 30], "requantizeoutputprocessingavx2": 0, "out": [0, 1, 14, 26, 28, 30], "inp": 0, "block_type_t": 0, "block": [0, 1, 29, 31, 32], "ld_out": 0, "ld_in": 0, "requantizationparams_t": 0, "requant": 0, "avx2": [0, 2], "i": [0, 1, 2, 4, 6, 8, 10, 11, 12, 13, 14, 15, 16, 17, 19, 23, 24, 27, 28, 29, 30, 31, 32, 33], "c_per_g": 0, "requantizeoutputprocessinggconvavx512": 0, "avx512": 0, "intyp": 1, "indextyp": 1, "offsettyp": 1, "outtyp": 1, "static": 1, "always_inlin": 1, "embeddingspmdm_autovec": 1, "block_siz": 1, "output_s": [1, 11], "index_s": 1, "data_s": 1, "indic": [1, 3, 12, 13, 17, 23, 24], "offsets_or_length": 1, "normalize_by_length": 1, "is_weight_posit": 1, "use_offset": 1, "output_strid": 1, "input_strid": 1, "no_bag": 1, "is_bf16_out": 1, "is_bf16_in": 1, "version": [1, 2, 15, 23, 25], "embeddingspmdm_ref": 1, "index": [1, 11, 12, 13, 14, 15, 17, 23, 24, 29, 31], "offset": [1, 3, 6, 11, 12, 23, 24], "element": [1, 10, 12, 13, 17, 24], "address": [1, 2, 13, 14], "sum": [1, 4, 11, 13, 19, 23, 24], "option": [1, 2, 3, 6, 8, 12, 14, 19, 23, 24], "null": 1, "non": [1, 4, 8, 23, 24], "whether": [1, 4, 8, 13, 14, 28], "normal": [1, 17], "length": [1, 4, 6, 11, 13, 19, 24, 31], "If": [1, 2, 13, 14, 15, 23, 24, 27, 29, 30, 31], "true": [1, 8, 13, 23, 24], "posit": [1, 4, 11, 13, 19], "set": [1, 8, 12, 13, 16, 17, 23, 24], "instead": [1, 14, 23, 30], "same": [1, 2, 4, 8, 11, 14, 17, 19, 24, 29, 30, 31], "scale_bias_last": [1, 10], "appear": [1, 15], "embed": [1, 2, 14, 15, 25, 30, 33], "bag": [1, 11, 24, 33], "bfloat16": [1, 10], "embeddingspmdmfp8_autovec": 1, "exponent_bit": 1, "exponent_bia": [1, 10], "expon": [1, 23], "note": [2, 12, 14, 15, 24, 29, 30, 31, 32], "The": [2, 4, 8, 10, 11, 13, 15, 16, 17, 19, 23, 24, 25, 27, 29, 30, 31, 32], "most": [2, 14, 15, 17, 30], "date": [2, 14, 15, 25, 30], "script": [2, 14, 15, 30], "bundl": [2, 14, 15, 30], "repo": [2, 14, 15, 30, 31], "under": [2, 14, 15, 27, 28, 30, 31], "setup_env": [2, 14, 15, 30], "bash": [2, 14, 15, 30], "step": [2, 13, 14, 15, 17, 24, 30, 31], "fbgemm_gpu": [2, 8, 14, 17, 19, 23, 24, 26, 27, 28, 29, 31], "follow": [2, 11, 14, 15, 17, 24, 25, 28, 29, 30, 31], "toolchain": [2, 14, 15], "run": [2, 14, 15, 19, 23, 30], "cpu": [2, 8, 9, 16, 23, 24, 30, 33], "higher": 2, "In": [2, 11, 13, 14, 15, 17, 27, 29, 31], "doe": [2, 3, 15, 29, 30, 31], "have": [2, 10, 11, 12, 14, 17, 24, 30], "ani": [2, 11, 14, 25, 27, 28, 30, 31], "intel": 2, "mkl": 2, "howev": [2, 14, 17, 25, 28], "comparison": 2, "some": [2, 14, 17, 30], "benchmark": 2, "found": [2, 14, 15, 25, 30], "path": [2, 13, 14, 16, 29, 32], "through": [2, 25, 27, 29, 31], "intel_mkl_dir": 2, "variabl": [2, 24], "built": [2, 14, 15, 30, 33], "report": [2, 15, 24], "otherwis": [2, 8, 13, 15, 23, 24, 28], "subset": 2, "all": [2, 11, 12, 13, 14, 15, 17, 19, 23, 24, 28, 30], "three": [2, 17], "git": [2, 14], "submodul": [2, 14], "custom": [2, 32], "desir": [2, 14, 17, 29], "thei": [2, 14, 30, 32], "asmjit_src_dir": 2, "cpuinfo_src_dir": 2, "googletest_source_dir": 2, "With": 2, "inner": [2, 17], "take": [2, 14, 23], "one": [2, 4, 10, 11, 12, 14, 15, 23, 24, 29, 31], "doesn": 2, "fit": [2, 28], "approach": 2, "so": [2, 11, 14, 15, 16, 17, 19], "implement": [2, 4, 10, 13, 14, 17, 24], "dynam": 2, "effici": [2, 33], "shape": [2, 4, 17, 19, 24], "specif": [2, 11, 13, 14, 23, 24, 28], "vector": [2, 5, 6, 7, 8, 9, 13, 31], "code": [2, 13, 14, 28, 30], "third": 2, "parti": 2, "call": [2, 8, 13, 15, 23], "detect": [2, 16], "runtim": [2, 14], "pytorch": [2, 13, 17, 26, 30, 31, 33], "project": [2, 27], "dispatch": [2, 8], "optim": [2, 10, 13, 24], "test": [2, 10, 14, 15, 25, 27, 33], "you": [2, 27, 29, 31], "don": [2, 11, 14, 30], "want": [2, 27], "togeth": [2, 29, 30], "default": [2, 11, 14, 15, 24], "turn": [2, 30], "off": [2, 15, 26], "simpli": [2, 14], "fbgemm_build_test": 2, "conda": [2, 16, 30], "For": [2, 15, 16, 17, 26, 28, 29, 30, 31, 32], "platform": [2, 14, 28], "machin": [2, 14, 15, 16, 33], "microsoft": [2, 10], "visual": 2, "studio": 2, "2019": 2, "newer": [2, 14], "recommend": [2, 6, 10, 14, 15, 17], "here": [2, 8, 14, 15, 27, 29, 30, 31, 32], "necessari": [2, 14, 24], "ninja": [2, 14], "etc": [2, 14, 23, 24], "n": [2, 10, 14, 15, 32], "env_nam": [2, 14, 15], "y": [2, 6, 14, 15, 30], "doxygen": [2, 29, 30], "make": [2, 12, 14, 27, 29, 30, 31], "openbla": 2, "packag": [2, 14, 16, 30], "onli": [2, 4, 10, 11, 12, 13, 16, 17, 24, 25, 27, 29, 30, 32], "clone": [2, 14], "along": [2, 14, 15, 19], "its": [2, 8, 10, 11, 14, 28, 30, 32], "insid": [2, 13, 14, 15, 16, 30, 32], "recurs": [2, 14], "http": [2, 14, 15, 27, 29, 30, 31], "github": [2, 14, 27], "com": [2, 14, 27], "cd": [2, 14, 16, 30], "assum": [2, 11, 24], "process": [2, 6, 13, 15, 17, 27, 31], "straightforward": 2, "creat": [2, 8, 14, 17, 27, 29, 31, 32], "directori": [2, 14, 16, 27, 29, 30], "mkdir": 2, "argument": [2, 11, 29, 30, 31], "build_arg": 2, "duse_sanit": 2, "dfbgemm_library_typ": 2, "share": [2, 8], "dpython_execut": 2, "which": [2, 11, 13, 14, 15, 17, 30], "python3": [2, 15], "document": [2, 8, 25, 27, 28], "dfbgemm_build_doc": 2, "ON": [2, 28], "j": [2, 17], "verbos": [2, 14], "As": [2, 11, 14, 15, 17], "write": [2, 13, 14, 15, 30, 31], "fail": [2, 15, 16, 29], "due": [2, 14], "known": [2, 14, 24], "regress": 2, "To": [2, 13, 14, 16, 32], "work": [2, 14, 15, 17, 27], "around": 2, "append": [2, 14, 29, 31], "export": [2, 14, 16], "prior": [2, 14, 15, 28], "cflag": 2, "wno": 2, "error": [2, 10, 15, 23, 24, 29, 30, 31], "mayb": 2, "uniniti": 2, "restrict": 2, "cxxflag": 2, "pleas": [2, 15, 27, 29, 31], "see": [2, 8, 14, 15, 17, 29, 31, 32], "77939": 2, "1094": 2, "1666": 2, "more": [2, 8, 14, 15, 24, 29, 31, 32], "detail": [2, 13, 15], "exactli": 2, "extra": 2, "need": [2, 13, 14, 15, 16, 17, 23, 27, 29, 31, 32], "ad": [2, 14, 27, 30], "invoc": [2, 14, 30], "llvm": [2, 14], "standard": [2, 14], "libc": [2, 14], "openmp": [2, 14], "libomp": 2, "locat": [2, 8, 12, 13, 14, 17], "cc_path": 2, "cxx_path": 2, "dcmake_c_compil": 2, "dcmake_cxx_compil": 2, "dcmake_c_flag": [2, 14], "fopenmp": 2, "stdlib": [2, 14], "conda_prefix": [2, 14], "includ": [2, 9, 13, 14, 28, 29, 31], "dcmake_cxx_flag": [2, 14], "likewis": 2, "also": [2, 13, 14, 24, 32], "veri": [2, 14, 29, 30, 31], "target": [2, 8, 10, 11, 14, 17, 29, 30, 31, 32], "architectur": [2, 14, 15], "bc": [2, 14], "x64": 2, "program": [2, 27], "file": [2, 14, 15, 26, 27, 29, 30, 31, 32], "x86": [2, 33], "enterpris": 2, "vc": 2, "auxiliari": 2, "vcvarsal": 2, "bat": 2, "build_dir": 2, "dfbgemm_build_benchmark": 2, "dcmake_build_typ": 2, "releas": [2, 25], "cl": 2, "ex": 2, "v": [2, 4, 6, 16], "int_nbit_split_embedding_codegen_lookup_funct": 3, "dev_weight": [3, 12], "uvm_weight": [3, 12], "weights_plac": [3, 12], "weights_offset": [3, 12], "weights_ti": [3, 12, 23], "d_offset": [3, 10, 12, 23], "total_d": [3, 12, 24], "max_int2_d": 3, "max_int4_d": 3, "max_int8_d": 3, "max_float16_d": 3, "max_float32_d": 3, "pooling_mod": [3, 23, 24], "indice_weight": 3, "output_dtyp": [3, 10, 23, 24], "lxu_cache_weight": [3, 12, 13], "lxu_cache_loc": [3, 12, 13], "row_align": [3, 12, 23], "max_float8_d": 3, "fp8_exponent_bit": [3, 23], "fp8_exponent_bia": [3, 23], "int_nbit_split_embedding_uvm_caching_codegen_lookup_funct": 3, "cache_hash_size_cumsum": [3, 12], "total_cache_hash_s": [3, 12], "cache_index_table_map": [3, 12], "lxu_cache_st": [3, 12], "lxu_stat": 3, "simlar": 3, "uvm_cach": 3, "lookup": [3, 12, 13, 23, 24], "pruned_hashmap_lookup_cuda": 3, "hash_tabl": 3, "hash_table_offset": 3, "pruned_array_lookup_cuda": 3, "index_remap": [3, 23], "index_remappings_offset": 3, "bounds_check_indices_cuda": 3, "rows_per_t": [3, 23], "bounds_check_mod": [3, 23, 24], "warn": [3, 23, 24, 29], "b_offset": [3, 12], "max_b": [3, 12], "b_t_map": 3, "info_b_num_bit": 3, "info_b_mask": 3, "bounds_check_vers": 3, "int_nbit_split_embedding_codegen_lookup_function_cpu": 3, "int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu": 3, "pruned_hashmap_insert_unweighted_cpu": 3, "dense_indic": 3, "pruned_hashmap_lookup_unweighted_cpu": 3, "pruned_array_lookup_cpu": 3, "tupl": [4, 5, 6, 11, 12, 13, 23, 24], "gqa_attn_splitk": 4, "xq": 4, "cache_k": 4, "cache_v": 4, "seq_posit": 4, "doubl": [4, 6, 10, 11], "qk_scale": 4, "num_split_k": 4, "kv_cache_quant_num_group": 4, "use_tensor_cor": 4, "cache_logical_dtype_int": 4, "decod": 4, "queri": 4, "split": [4, 23, 24], "w": [4, 16], "bf16": [4, 10], "int4": [4, 10, 23], "kv": 4, "cuda": [4, 9, 19, 23, 24, 33], "gqa": 4, "cach": [4, 12, 13, 14, 23, 24], "It": [4, 13, 14, 15, 17, 19], "current": [4, 13, 14, 15, 17, 23, 24], "context": 4, "16384": 4, "fix": [4, 11, 23, 24], "head": 4, "128": 4, "an": [4, 8, 11, 13, 15, 16, 17, 19, 23, 24, 29, 30, 31, 32], "arbitrari": [4, 13], "b": [4, 11, 14, 17, 24, 29, 30, 31, 32], "h_q": 4, "d": [4, 17, 32], "where": [4, 6, 8, 11, 13, 17, 19, 24], "batch": [4, 6, 11, 17, 19, 25, 33], "num": 4, "max_t": 4, "h_kv": 4, "sequenc": [4, 23, 24], "contain": [4, 8, 13, 14, 17, 19, 24, 31], "actual": [4, 14], "token": [4, 17], "appli": [4, 11, 14, 17, 24], "after": [4, 11, 13, 14, 15, 16, 17, 30, 31, 32], "qk": 4, "control": [4, 24], "amount": [4, 23, 24], "parallel": [4, 13], "wise": [4, 17, 23, 24], "fp8": [4, 10, 23], "quantiz": [4, 25, 33], "singl": [4, 8, 10, 13], "now": [4, 23], "core": 4, "wmma": 4, "instruct": [4, 27, 29, 30, 31, 33], "fast": 4, "kv_cach": 4, "2": [4, 10, 13, 14, 15, 16, 17, 19, 23, 24, 29, 31, 32], "return": [4, 8, 10, 11, 13, 19, 23, 24, 29, 30, 31], "A": [4, 8, 10, 13, 14, 15, 17, 19, 23, 24, 28, 29, 30, 31], "combin": [4, 33], "metadata": [4, 13, 24], "softmax": 4, "tbe_input_combine_cpu": 5, "indices_list": 5, "offsets_list": 5, "per_sample_weight": [5, 23, 24], "include_last_offset": 5, "padding_fused_tbe_input_combine_cpu": 5, "batch_siz": [5, 19], "solv": 6, "issu": [6, 8, 14, 15, 26], "when": [6, 11, 13, 14, 16, 17, 23, 24, 29, 30, 32], "differ": [6, 11, 13, 17, 24], "often": 6, "occur": [6, 13, 29], "spars": [6, 17, 23, 24, 25, 33], "featur": [6, 11, 14, 17, 19, 23, 24, 26], "system": [6, 14, 15, 17], "well": [6, 11, 14, 29], "natur": [6, 17], "languag": [6, 17, 32], "jagged_to_padded_dense_forward": 6, "c10": [6, 10], "symintarrayref": 6, "max_length": 6, "padding_valu": 6, "jagged_dense_elementwise_add_jagged_output_cuda": 6, "x_valu": 6, "x_offset": [6, 31], "dens": [6, 31], "jagged_to_padded_dens": [6, 18], "jagged_dense_elementwise_add": [6, 18], "jagged_dense_elementwise_mul": [6, 18], "batched_dense_vec_jagged_2d_mul": [6, 18], "a_valu": 6, "a_offset": 6, "dense_to_jag": [6, 18], "symint": 6, "total_l": 6, "jagged_dense_elementwise_add_jagged_output": [6, 18], "jagged_1d_to_dens": [6, 18], "max_l": 6, "jagged_2d_to_dens": [6, 14, 15, 18, 30, 31], "max_sequence_length": [6, 31], "recat_embedding_grad_output_cuda": 7, "grad_output": 7, "num_features_per_rank": 7, "recat_embedding_grad_output_mixed_d_cuda": 7, "dim_sum_per_rank": 7, "recat_embedding_grad_output_mixed_d_batch_cuda": 7, "cumsum_dim_sum_per_rank": 7, "recat_embedding_grad_output_mixed_d_cpu": 7, "new_managed_tensor": 8, "self": [8, 13, 23], "alloc": [8, 23, 24, 29], "unifi": [8, 23, 24], "manag": [8, 14, 15, 23, 24], "uvm": [8, 16, 23, 24], "Then": 8, "prefer": [8, 13, 15], "storag": [8, 10, 12, 13], "host": [8, 14, 23, 24], "establish": 8, "map": [8, 11, 12, 13, 17, 23, 24], "devic": [8, 9, 14, 19, 23, 24], "new": [8, 10, 12, 29, 30, 31], "new_managed_tensor_meta": 8, "placehold": 8, "meta": [8, 23, 28], "kei": [8, 13, 24], "empti": [8, 17, 32], "new_host_mapped_tensor": 8, "new_unified_tensor": 8, "is_host_map": 8, "either": [8, 10, 11, 13, 14, 15], "depend": [8, 10, 14, 15, 17], "new_unified_tensor_meta": 8, "new_vanilla_managed_tensor": 8, "allow": [8, 14], "automat": [8, 11, 16, 30], "uvm_storag": 8, "check": [8, 23, 24], "gpu": [8, 13, 14, 15, 16, 23, 24, 31, 33], "is_uvm_tensor": 8, "BUT": [8, 28], "uvm_to_cpu": 8, "effect": [8, 17], "move": [8, 13, 19], "from": [8, 10, 11, 12, 13, 14, 15, 16, 17, 19, 23, 24, 25, 27, 28, 29, 30, 31, 32], "uvm_to_devic": 8, "prototyp": 8, "whose": 8, "uvm_cuda_mem_advis": 8, "cuda_memory_advis": 8, "cudamemadvis": 8, "cudamemoryadvis": 8, "enum": [8, 10, 13], "avail": [8, 14, 15, 16, 23, 24, 30], "python": [8, 13, 14, 16, 29, 30, 32], "side": [8, 13, 14, 29, 31, 33], "namespac": 8, "over": [8, 14], "valid": 8, "inform": [8, 15, 17, 23, 24, 31, 32], "uvm_cuda_mem_prefetch_async": 8, "device_t": 8, "cudamemprefetchasync": 8, "prefetch": [8, 13, 24], "destin": 8, "uvm_mem_advice_dont_fork": 8, "madvis": 8, "madv_dontfork": 8, "workaround": 8, "driver": [8, 14], "un": 8, "page": [8, 15, 27, 32, 33], "tabl": [8, 11, 17, 25, 33], "fork": [8, 27], "caus": [8, 14, 15, 28, 30], "slowdown": 8, "next": [8, 13, 17, 29, 31], "access": [8, 13, 23, 24], "uvm_to_cpu_clon": 8, "copi": 8, "contigu": [8, 11], "thread": [8, 13], "memcpi": 8, "section": [9, 14, 15, 31], "variou": 9, "all_to_one_devic": 9, "inputtensor": 9, "target_devic": 9, "permute_pooled_embs_split_gpu": 9, "pooled_emb": [9, 19], "offset_dim_list": 9, "permute_list": 9, "inv_offset_dim_list": 9, "inv_permute_list": 9, "permute_pooled_embs_auto_grad_split_gpu": 9, "permute_pooled_embs_auto_grad_gpu": 9, "permute_pooled_embs_cpu_impl": 9, "allow_dupl": 9, "permute_pooled_embs_split_cpu": 9, "permute_pooled_embs_auto_grad_split_cpu": 9, "permute_pooled_embs_auto_grad": 9, "permute_pooled_embs_auto_grad_cpu": 9, "model": [10, 11], "techniqu": 10, "reduc": [10, 13], "larg": [10, 14], "order": [10, 17, 24, 27], "achiev": [10, 15], "better": [10, 13, 29], "small": 10, "loss": [10, 28], "accuraci": 10, "_float_to_bfloat16_gpu": 10, "brain": 10, "_bfloat16_to_float_gpu": 10, "_float_to_fp8rowwise_gpu": 10, "forward": [10, 23, 24], "dtype": [10, 19, 23, 24], "sparsetyp": [10, 23, 24], "throw": [10, 23, 24, 29], "_fp8rowwise_to_float_gpu": 10, "represent": [10, 17], "_float_to_fused8bitrowwise_gpu": 10, "_half_to_fused8bitrowwise_gpu": 10, "half": 10, "_single_or_half_precision_to_fused8bitrowwise_gpu": 10, "_fused8bitrowwise_to_float_gpu": 10, "_fused8bitrowwise_to_half_gpu": 10, "_fused8bitrowwise_to_single_or_half_precision_gpu": 10, "quant_padding_float_typ": 10, "_fused8bitrowwise_to_float_mixed_dim_gpu": 10, "kfloat": 10, "khalf": 10, "_float_to_fusednbitrowwise_gpu": 10, "_half_to_fusednbitrowwise_gpu": 10, "_single_or_half_precision_to_fusednbitrowwise_gpu": 10, "_fusednbitrowwise_to_float_gpu": 10, "_fusednbitrowwise_to_half_gpu": 10, "_fusednbitrowwise_to_single_or_half_precision_gpu": 10, "_float_to_hfp8_gpu": 10, "ebit": 10, "max_po": 10, "hybrid": 10, "hfp8": 10, "_hfp8_to_float_gpu": 10, "_float_to_msfp_gpu": 10, "bounding_box_s": 10, "mbit": 10, "min_po": 10, "msfp": 10, "_msfp_to_float_gpu": 10, "_float_to_paddedfp8rowwise_gpu": 10, "row_dim": 10, "pad": [10, 13, 17, 31], "_paddedfp8rowwise_to_float_gpu": 10, "output_last_dim": 10, "param": [10, 13, 29, 31], "_fused8bitrowwise_to_float_cpu_out": 10, "_float_to_fused8bitrowwise_cpu_out": 10, "float_to_fused8bitrowwise_cpu": 10, "half_to_fused8bitrowwise_cpu": 10, "float_or_half_to_fused8bitrowwise_cpu": 10, "fused8bitrowwise_to_float_cpu": 10, "fused8bitrowwise_to_half_cpu": 10, "fused8bitrowwise_to_float_or_half_cpu": 10, "float_to_fp8rowwise_cpu": 10, "fp8rowwise_to_float_cpu": 10, "fusednbitrowwise_to_float_cpu": 10, "fusednbitrowwise_sbfront_to_float_cpu": 10, "int2": [10, 23], "front": 10, "float32": [10, 19], "torch": [10, 13, 14, 15, 18, 19, 20, 21, 22, 23, 24, 30, 31], "quint4x2": 10, "quint2x4": 10, "quantizedcpu": 10, "backend": [10, 33], "purpos": [10, 17, 23, 24, 28], "becaus": [10, 14, 17, 24], "refer": [10, 14, 17, 30, 31], "rate": [10, 24], "hold": [10, 13, 17], "fusednbitrowwise_to_half_cpu": 10, "fusednbitrowwise_to_float_or_half_cpu": 10, "floattofp8quantized_ref": 10, "nrow": 10, "ncol": 10, "fp8quantizedtofloat_ref": 10, "expand_into_jagged_permute_cuda": 11, "permut": [11, 19], "input_offset": 11, "output_offset": 11, "expand_into_jagged_permut": [11, 22], "expand": 11, "case": [11, 14, 15, 17, 27], "ha": [11, 13, 15, 17, 24, 25, 27, 29, 30], "across": [11, 14, 19], "rank": [11, 17, 24], "level": 11, "exclus": [11, 13], "op": [11, 15, 18, 20, 21, 22, 31], "sit": 11, "we": [11, 13, 14, 17, 25, 27], "deriv": [11, 17, 28], "arrai": [11, 23, 31], "comput": [11, 14, 15, 23, 24], "formula": 11, "output_permut": 11, "table_offset": 11, "bag_offset": 11, "generic_histogram_binning_calibration_by_feature_cpu": 11, "logit": 11, "segment_valu": 11, "segment_length": 11, "num_seg": 11, "bin_num_exampl": 11, "bin_num_posit": 11, "bin_boundari": 11, "positive_weight": 11, "bin_ctr_in_use_aft": 11, "bin_ctr_weight_valu": 11, "divid": [11, 17], "predict": 11, "rang": [11, 13, 17], "e": [11, 13, 14, 17, 19, 29, 31, 32], "bin": [11, 14], "two": [11, 17, 24, 30], "exampl": [11, 13, 14, 15, 16, 19, 24, 29, 30, 31, 32], "fall": [11, 14, 15], "bucket": [11, 14], "basic": [11, 13, 31], "histogram": 11, "result": [11, 13, 14], "statist": [11, 23, 24], "real": 11, "ctr": 11, "num_po": 11, "num_exampl": 11, "final": 11, "calibr": 11, "pre": [11, 15], "cali": 11, "wai": [11, 23, 28], "within": [11, 23, 24, 25], "suffici": [11, 27, 30], "That": 11, "fine": 11, "grain": 11, "modul": [11, 14, 15, 25, 31], "theoret": 11, "layer": [11, 13], "uncalibr": 11, "extens": [11, 29, 30], "ectr": 11, "abov": [11, 13, 15, 17, 28, 29, 31, 32], "accept": [11, 27], "sort": [11, 12, 13, 14], "befor": [11, 13, 14, 32], "sigmoid": 11, "calibart": 11, "pass": [11, 23, 24, 27, 30], "lower": 11, "upper_bound": 11, "bound": [11, 17, 23, 24], "calibration_target": 11, "observ": 11, "statisct": 11, "final_calibrated_predict": 11, "bin_ctr_weight": 11, "bin_ctr": 11, "histogram_binning_calibration_cpu": 11, "lower_bound": 11, "keyjaggedtensor": 11, "num_bin": 11, "longer": [11, 26, 29], "still": [11, 14], "parambin_ctr_weight_valu": 11, "calibrated_predict": 11, "bin_id": 11, "get_unique_indices_cuda": 12, "linear_indic": 12, "max_indic": 12, "compute_count": 12, "dedupl": 12, "get_unique_indices_with_inverse_cuda": 12, "compute_inverse_indic": 12, "lru_cache_find_uncached_cuda": 12, "unique_indic": 12, "unique_indices_length": [12, 13], "time_stamp": 12, "lru_stat": 12, "gather_cache_stat": 12, "uvm_cache_stat": 12, "lock_cache_lin": 12, "lxu_cache_locking_count": 12, "lru": [12, 13, 23, 24], "uncach": [12, 13], "them": [12, 23], "host_lxu_cache_slot": 12, "h_in": 12, "cache_set": [12, 23, 24], "linearize_cache_indices_cuda": 12, "indices_base_offset": 12, "linear": [12, 13], "uniqu": [12, 13, 24, 32], "linearize_cache_indices_from_row_idx_cuda": 12, "update_table_indic": 12, "update_row_indic": 12, "format": [12, 19, 30, 31], "inplac": [12, 24], "updat": [12, 13, 14, 15, 16, 24, 27], "direct_mapped_lxu_cache_lookup_cuda": 12, "linear_cache_indic": 12, "invalid_index": 12, "fetch": [12, 13], "insert": [12, 13, 32], "timestep": 12, "lru_cache_populate_cuda": 12, "hash_size_cumsum": 12, "stochastic_round": [12, 24], "byte": [12, 13, 23], "lru_cache_populate_byte_cuda": 12, "assoc": 12, "variant": [12, 14, 15, 30], "direct_mapped_lru_cache_populate_byte_cuda": 12, "lxu_cache_miss_timestamp": 12, "lfu": [12, 23, 24], "lfu_cache_populate_cuda": 12, "lfu_stat": 12, "lfu_cache_populate_byte_cuda": 12, "look": [12, 24], "up": [12, 13, 16, 24, 25], "slot": [12, 13], "sentinel": [12, 13], "miss": [12, 13, 14], "lxu_cache_lookup_cuda": 12, "num_uniq_cache_indic": 12, "lxu_cache_locations_output": 12, "emulate_cache_miss": 12, "enforced_misses_per_256": 12, "lxu_cache_flush_cuda": 12, "flush": [12, 13], "reset_weight_momentum_cuda": 12, "momentum1_dev": 12, "momentum1_uvm": 12, "momentum1_plac": 12, "momentum1_offset": 12, "pruned_indic": 12, "pruned_indices_offset": 12, "logical_table_id": 12, "buffer_id": 12, "lxu_cache_locking_counter_decrement_cuda": 12, "decrement": 12, "counter": [12, 24], "lxu_cache_locations_update_cuda": 12, "lxu_cache_locations_new": 12, "rocksdbwritemod": 13, "rocksdb": 13, "mode": [13, 16, 23, 24], "offload": 13, "3": [13, 14, 15, 17, 19, 24, 28, 31], "iter": 13, "fwd_rocksdb_read": 13, "l2": [13, 24], "fwd": 13, "fwd_l1_evict": 13, "l1": 13, "eviciton": 13, "evict": 13, "bwd_l1_cnflct_miss_write_back": 13, "conflict": 13, "bwd": 13, "fill": [13, 23], "potenti": 13, "trigger": 13, "onc": [13, 15, 27], "full": [13, 14, 15, 32], "addition": 13, "do": [13, 14, 15, 24, 25, 27], "io": 13, "enumer": 13, "inlin": [13, 32], "hash_shard": 13, "id": [13, 15], "num_shard": 13, "hash": [13, 23], "shard": 13, "algorithm": [13, 23, 24], "cuda_callback_func": 13, "cudastream_t": 13, "stream": [13, 14, 24], "cudaerror_t": 13, "statu": 13, "functor": 13, "callback": 13, "cudastreamaddcallback": 13, "common": [13, 14, 15, 17, 31], "cudastreamcallback_t": 13, "cast": 13, "invok": [13, 14, 19, 24, 30], "delet": 13, "anoth": [13, 32], "none": [13, 19, 23, 24], "masked_index_put_cuda": 13, "count": 13, "use_pipelin": 13, "preferred_sm": 13, "similar": [13, 14, 17, 23, 24], "index_put": 13, "ignor": [13, 16, 23, 24, 30], "2d": [13, 17, 19, 24, 31], "put": [13, 30], "equival": [13, 17], "filter_": 13, "indices_": 13, "nonzero": 13, "flatten": 13, "1d": [13, 24, 31], "flag": [13, 14, 30], "overlap": 13, "other": [13, 15, 17, 22, 28, 29, 30, 31], "fraction": 13, "sm": 13, "resourc": 13, "competit": 13, "masked_index_select_cuda": 13, "index_select": 13, "ssd_generate_row_addrs_cuda": 13, "assigned_cache_slot": 13, "linear_index_inverse_indic": 13, "unique_indices_count_cumsum": 13, "cache_set_inverse_indic": 13, "inserted_ssd_weight": 13, "cache_set_sorted_unique_indic": 13, "memori": [13, 15, 23, 24, 33], "tbe": [13, 25, 33], "retriev": 13, "scratch": [13, 15], "hbm": [13, 23, 24], "lxu": 13, "associ": 13, "enabl": [13, 14, 16, 24], "conveni": 13, "first": [13, 14, 29, 31, 32], "pointer": [13, 30], "moreov": 13, "list": [13, 14, 17, 19, 23, 24, 28, 29, 31], "post": 13, "backward": [13, 24, 25], "origin": 13, "being": [13, 14, 30], "prefix": [13, 14, 32], "ssd_update_row_addrs_cuda": 13, "ssd_row_addrs_curr": 13, "inserted_ssd_weights_curr_next_map": 13, "lxu_cache_locations_curr": 13, "linear_index_inverse_indices_curr": 13, "unique_indices_count_cumsum_curr": 13, "cache_set_inverse_indices_curr": 13, "inserted_ssd_weights_next": 13, "unique_indices_length_curr": 13, "pipelin": [13, 24], "dure": [13, 14, 17, 24, 31], "reloc": 13, "correct": [13, 14], "between": [13, 17, 29, 30, 32], "been": [13, 14, 29], "compact_indices_cuda": 13, "compact_indic": 13, "compact_count": 13, "mask": 13, "compact": 13, "given": [13, 14, 17], "operat": 13, "remov": 13, "7": [13, 14, 15, 17, 19, 24], "5": [13, 14, 15, 17, 19, 23, 24], "repres": [13, 17, 19, 24], "keep": [13, 14], "class": [13, 19, 23, 24, 30, 31], "cachelibcach": 13, "cachelib_cach": 13, "h": [13, 14, 29], "cachelib": 13, "wrapper": 13, "cachlib": 13, "interact": 13, "maintain": 13, "relat": [13, 17, 23], "initi": 13, "state": [13, 14, 24], "logic": [13, 17, 29], "caller": 13, "reset": 13, "captur": 13, "delai": 13, "markus": 13, "boost": 13, "get": 13, "handl": [13, 17], "read": [13, 17], "done": [13, 14, 15], "embeddingparameterserv": 13, "public": [13, 27, 30], "embeddingkvdb": 13, "ps_table_batched_embed": 13, "servic": [13, 28], "tp": 13, "client": 13, "cachecontext": 13, "kv_db_table_batched_embed": 13, "l2cach": 13, "num_miss": 13, "cached_addr_list": 13, "prealloc": 13, "invalid": [13, 23, 24], "spot": 13, "stai": 13, "struct": 13, "queueitem": 13, "queue": 13, "item": [13, 19, 31], "background": 13, "read_handl": 13, "abstract": 13, "pair": [13, 32], "later": [13, 14], "separ": [13, 24, 30], "get_cach": 13, "monitor": 13, "checkout": 13, "explan": 13, "enable_shared_from_thi": 13, "execut": [13, 15, 16], "dram": [13, 23, 24], "remot": 13, "scalabl": 13, "without": [13, 14, 28], "blow": 13, "subclass": [13, 23], "embeddingrocksdb": 13, "ssd_table_batched_embed": 13, "fbgemm": [14, 18, 20, 21, 22, 26, 27, 28, 30, 31], "experiment": [14, 15, 33], "reproduc": [14, 15, 27, 28], "platform_nam": 14, "unam": 14, "miniconda_prefix": 14, "home": 14, "download": [14, 15], "wget": 14, "q": 14, "anaconda": 14, "miniconda3": 14, "latest": 14, "sh": 14, "o": [14, 15], "p": 14, "load": [14, 17, 23, 31], "shortcut": 14, "bashrc": 14, "command": [14, 15, 29, 30], "against": [14, 16], "env": [14, 15], "name": [14, 15, 23, 24, 28, 29, 31], "python_vers": 14, "12": [14, 15, 17, 19, 24], "upgrad": 14, "pyopenssl": 14, "22": [14, 17, 19], "requir": [14, 15, 16, 17, 24, 30, 31], "recent": [14, 15, 23, 24], "nvcc": 14, "capabl": [14, 16], "bare": 14, "metal": 14, "neither": [14, 28], "nor": [14, 28], "nvidia": [14, 24], "present": [14, 31], "sinc": [14, 17, 23], "pull": [14, 15, 30], "linux": [14, 15], "distribut": [14, 28], "ubuntu": 14, "04": 14, "11": [14, 15, 17, 19], "entrypoint": 14, "devel": 14, "ubuntu22": 14, "rest": [14, 15], "mai": [14, 15, 17, 28], "construct": [14, 15, 17, 23], "mechan": 14, "nvml": 14, "org": [14, 15, 31], "cuda_vers": 14, "label": 14, "verifi": [14, 15, 29, 31], "cuda_runtim": 14, "libnvidia": [14, 15], "ml": [14, 15], "libnccl": [14, 16], "printenv": 14, "extract": 14, "url": [14, 15], "builder": 14, "blob": 14, "main": [14, 27], "install_cuda": 14, "cudnn_url": 14, "redist": 14, "x86_64": 14, "26_cuda12": 14, "archiv": 14, "tar": 14, "xz": 14, "unpack": 14, "xvf": 14, "applic": [14, 15, 24, 29, 31], "alreadi": [14, 15, 27, 29, 31], "repositori": [14, 27], "cmake": 14, "configur": [14, 29], "amd": [14, 15], "minim": 14, "6": [14, 15, 17, 19], "termin": 14, "while": [14, 23, 30], "come": [14, 15], "reason": [14, 15, 30], "oper": [14, 15, 16, 24, 25], "guid": [14, 31], "disabl": 14, "apt": 14, "prompt": 14, "debian_frontend": 14, "noninteract": 14, "db": 14, "radeon": 14, "amdgpu": 14, "focal": 14, "install_5": 14, "50601": 14, "1_all": 14, "deb": 14, "usecas": 14, "hiplibsdk": 14, "dkm": 14, "hipifi": 14, "hip": 14, "dev": 14, "20": [14, 19], "sysroot": 14, "avoid": 14, "glibcxx": 14, "fbgemm_cpu": 14, "10": [14, 15, 17, 19, 24], "older": [14, 15], "accompani": [14, 30], "appropri": 14, "sysroot_linux": 14, "gcc_version": 14, "forg": [14, 30], "gxx_linux": 14, "64": [14, 17], "17": [14, 19], "binari": [14, 28], "cento": 14, "librari": [14, 30, 33], "libstdc": 14, "what": [14, 30], "libcxx_path": 14, "print": [14, 15, 19, 23, 24, 31], "objdump": 14, "tc": 14, "grep": 14, "glibc_": 14, "sed": 14, "vu": 14, "cat": 14, "glibcxx_": 14, "possibl": [14, 17, 27, 28], "just": 14, "minimum": [14, 29, 30, 31], "llvm_version": 14, "16": [14, 17, 19], "libcxx": 14, "outdat": 14, "aarch64": [14, 15], "cannot": 14, "explicitli": [14, 24, 25], "clangxx": 14, "rt": 14, "lib": [14, 15, 16], "ld_library_path": [14, 15, 16], "config": [14, 24], "var": 14, "nvcc_prepend_flag": 14, "correctli": [14, 15, 16, 29, 30], "xcompil": 14, "ccbin": 14, "clangxx_path": 14, "unsupport": 14, "even": [14, 28], "though": [14, 15], "libstd": 14, "mean": [14, 17, 23, 24, 25], "regardless": 14, "scenario": 14, "binpath": 14, "overrid": 14, "exist": [14, 29, 31], "ln": 14, "sf": 14, "path_to_either_gcc_or_clang": 14, "cc": 14, "These": 14, "stage": [14, 17], "click": 14, "hypothesi": [14, 15], "jinja2": 14, "ncurs": 14, "numpi": [14, 15], "scikit": [14, 15], "offici": 14, "homepag": 14, "authorit": [14, 15, 30], "how": [14, 15, 16, 19, 31], "nightli": [14, 15], "rc": 14, "alwai": 14, "reliabl": 14, "arriv": 14, "hour": 14, "than": [14, 15, 17], "window": 14, "silent": [14, 23, 24], "both": [14, 23, 24, 26, 28, 30], "place": [14, 23, 24], "artifact": 14, "select": 14, "thu": [14, 24], "import": [14, 15, 19, 24, 31, 32], "much": [14, 29], "determinist": 14, "whl": [14, 15], "cu121": [14, 15], "rocm5": [14, 15], "ensur": [14, 15, 25, 27], "properli": 14, "__version__": 14, "cuda_cmake_macro": 14, "gemm": 14, "via": [14, 24, 25], "manual": [14, 15, 29], "sha": 14, "pin": 14, "ci": [14, 15], "ci_commit_pin": 14, "txt": [14, 16, 30, 32], "dedb7bdf33": 14, "tag": [14, 29, 32], "fbgemm_vers": 14, "v1": [14, 25], "fbgemm_": 14, "addit": [14, 16, 17], "flow": [14, 24], "becom": 14, "stale": 14, "problem": 14, "re": [14, 15, 23], "attempt": 14, "failur": [14, 15], "clear": [14, 27], "py": [14, 15, 16, 30, 31], "clean": [14, 30], "must": [14, 15, 16, 17, 23, 24, 28, 32], "package_nam": 14, "fbgemm_gpu_": 14, "convent": 14, "major": [14, 25], "minor": 14, "py312": 14, "python_tag": 14, "determin": [14, 17, 23, 24], "processor": 14, "arch": 14, "python_plat_nam": 14, "manylinux2014_": 14, "maco": 14, "macosx_10_9_": 14, "arm64": 14, "macosx_11_0_": 14, "win_": 14, "cpu_onli": 14, "bdist_wheel": 14, "package_vari": 14, "plat": 14, "cxxprefix": 14, "presum": 14, "made": [14, 30], "debug": [14, 16], "assert": 14, "presenc": 14, "unabl": 14, "cudacxx": 14, "cuda_bin_path": 14, "cub": 14, "cub_dir": 14, "log": [14, 15], "nvcc_verbos": 14, "header": [14, 29, 32], "cudnn_include_dir": 14, "cudnn_librari": 14, "filepath": 14, "nvml_lib_path": 14, "nccl": [14, 16], "nccl_lib_path": 14, "sm70": [14, 15], "80": 14, "v100": [14, 15], "a100": [14, 15], "cuda_arch_list": 14, "unset": 14, "torch_cuda_arch_list": 14, "preced": 14, "dtorch_cuda_arch_list": 14, "By": [14, 27], "those": [14, 17, 23, 27, 31], "rocm_path": 14, "pytorch_rocm_arch": 14, "hipcc": 14, "hipcc_verbos": 14, "gfx906": 14, "gfx908": 14, "gfx90a": 14, "wiki": 14, "gentoo": 14, "rocminfo": 14, "gfx": 14, "dhip_root_dir": 14, "dtorch_use_hip_dsa": 14, "complet": [14, 27, 30], "lot": 14, "jinja": 14, "instanti": [14, 19], "sure": [14, 27, 29, 31], "accident": 14, "cours": 14, "fbgemm_gpu_lib_path": 14, "fbgemm_gpu_pi": [14, 15], "defin": [14, 17, 23, 29], "nm": 14, "gdcu": 14, "referenc": 14, "certain": 14, "gdc": 14, "merge_pooled_embed": [14, 15, 20], "isol": [15, 30], "build": [15, 16, 29, 31, 33], "accord": 15, "schedul": 15, "guarante": [15, 25], "conjunct": 15, "visit": 15, "sm80": 15, "respect": 15, "especi": 15, "displai": [15, 32], "setup": 15, "smi": 15, "515": 15, "76": 15, "persist": 15, "bu": [15, 32], "disp": 15, "volatil": 15, "uncorr": 15, "ecc": 15, "fan": 15, "temp": 15, "perf": 15, "pwr": 15, "usag": [15, 30, 31], "cap": 15, "util": [15, 33], "mig": 15, "a10g": 15, "00000000": 15, "00": 15, "1e": [15, 24], "31c": 15, "p0": 15, "59w": 15, "300w": 15, "0mib": 15, "23028mib": 15, "gi": 15, "pid": 15, "No": [15, 23, 24, 25], "expos": 15, "imag": 15, "launch": 15, "toolkit": 15, "interfac": 15, "concis": 15, "info": [15, 29, 31], "dieedg": 15, "avgpwr": 15, "sclk": 15, "mclk": 15, "pwrcap": 15, "vram": 15, "33": [15, 19], "0c": 15, "37": [15, 19], "0w": 15, "300mhz": 15, "1200mhz": 15, "auto": [15, 30], "290": 15, "32": [15, 19, 23], "39": [15, 19], "difficult": 15, "relev": [15, 29], "genai": 15, "triton_vers": 15, "45fff310c8": 15, "about": [15, 31], "link": [15, 25, 30], "encount": [15, 23, 24], "signatur": [15, 30], "traceback": 15, "last": 15, "root": [15, 27], "miniconda": 15, "mycondaenv": 15, "site": 15, "_op": [15, 30], "line": [15, 23, 31, 32], "565": 15, "__getattr__": 15, "overload_nam": 15, "_c": 15, "_jit_get_oper": 15, "qualified_op_nam": 15, "runtimeerror": 15, "except": [15, 29, 31], "wa": 15, "string": [15, 32], "post47": 15, "py3": 15, "egg": 15, "__init__": [15, 31], "21": [15, 19], "_fbgemm_gpu_doc": 15, "noqa": 15, "f401": 15, "e402": 15, "18": [15, 19], "569": 15, "rais": [15, 31], "attributeerror": [15, 31], "_opnamespac": 15, "object": [15, 17], "attribut": [15, 31], "cli": 15, "main_run": 15, "47": [15, 19], "_zn6fbgemm48floatorhalftofusednbitrowwisequantizedsbhalfavx2itli2eeevpkt_miph": 15, "libtorch": 15, "visibl": 15, "incorrectli": [15, 30], "declar": [15, 29], "were": 15, "pr": [15, 29, 30, 31], "1618": 15, "former": [15, 23], "resolv": 15, "latter": [15, 23], "seriou": 15, "tha": 15, "develop": [15, 30], "bench": 16, "good": [16, 28], "instal": [16, 30, 33], "pip": [16, 30], "pytest": 16, "rsx": 16, "pytestcollectionwarn": 16, "split_table_batched_embeddings_test": 16, "quantize_ops_test": 16, "sparse_ops_test": 16, "split_embedding_inference_converter_test": 16, "cuda_visible_devic": 16, "cuda_launch_block": 16, "involv": [16, 17], "rpath": 16, "fbgemm_test_with_rocm": 16, "hip_launch_block": 16, "split_table_batched_embeddings_benchmark": 16, "consecut": 17, "nestedtensor": 17, "raggedtensor": 17, "tensorflow": 17, "notabl": 17, "sentenc": 17, "maxlength": 17, "numel": 17, "greatest": 17, "divisor": 17, "smallest": 17, "sub": 17, "exclud": 17, "partit": 17, "impli": [17, 28], "denot": [17, 29, 31], "offest": 17, "outer": 17, "would": 17, "begin": 17, "maximum": [17, 31], "densor": 17, "form": [17, 28], "figur": 17, "below": [17, 25], "show": [17, 24, 30], "accomod": 17, "At": [17, 29, 30, 31], "multipl": [17, 23, 24, 31, 33], "hadamard": 17, "product": [17, 28], "bmatrix": 17, "rightarrow": 17, "25": [17, 19], "36": [17, 19], "49": 17, "81": 17, "50": 17, "operand": 17, "word": 17, "ax": 17, "properti": 17, "elementwis": 17, "start": [17, 25, 31, 32], "dim": [17, 19], "onto": 17, "part": 17, "everi": [17, 23, 24, 25], "converson": 17, "could": 17, "lead": 17, "smaller": 17, "expect": [17, 23], "happen": 17, "give": 17, "situat": 17, "like": 17, "dense_tensor": 17, "jagged_tensor": 17, "break": 17, "exact": 17, "usual": 17, "arg": [18, 20, 21, 22, 24, 31], "kwarg": [18, 20, 21, 22], "jagged_dense_dense_elementwise_add_jagged_output": 18, "stacked_jagged_1d_to_dens": 18, "stacked_jagged_2d_to_dens": 18, "permute_pooled_embedding_modul": 19, "permutepooledembed": 19, "embs_dim": 19, "sourc": [19, 23, 24, 27, 28, 29, 30, 31], "column": 19, "essenti": 19, "second": [19, 29, 31], "suppos": 19, "int64": [19, 23], "perm": 19, "arang": 19, "reshap": 19, "13": [19, 24], "14": [19, 31], "15": 19, "19": 19, "23": 19, "24": 19, "26": 19, "27": 19, "28": 19, "29": 19, "30": 19, "31": 19, "34": 19, "35": 19, "38": 19, "40": 19, "41": 19, "42": [19, 31], "43": 19, "44": 19, "45": 19, "46": 19, "describ": [19, 23, 24, 27], "__call__": 19, "b_local": 19, "total_global_d": 19, "local": [19, 29, 31], "total": [19, 23, 24], "global": [19, 23, 24], "permute_pooled_emb": 20, "permute_2d_sparse_data": 22, "permute_1d_sparse_data": 22, "asynchronous_complete_cumsum": 22, "offsets_rang": 22, "segment_sum_csr": 22, "keyed_jagged_index_select_dim1": 22, "block_bucketize_sparse_featur": 22, "split_table_batched_embeddings_ops_infer": 23, "intnbittablebatchedembeddingbagscodegen": 23, "embedding_spec": [23, 24], "str": [23, 24], "embeddingloc": [23, 24], "feature_table_map": [23, 24], "poolingmod": [23, 24], "boundscheckmod": [23, 24], "weight_list": 23, "pruning_hash_load_factor": 23, "use_array_for_index_remap": 23, "cache_algorithm": [23, 24], "cachealgorithm": [23, 24], "cache_load_factor": [23, 24], "cache_reserved_memori": [23, 24], "enforce_hbm": [23, 24], "record_cache_metr": [23, 24], "recordcachemetr": [23, 24], "gather_uvm_cache_stat": [23, 24], "cache_assoc": 23, "scale_bias_size_in_byt": 23, "cacheline_align": 23, "uvm_host_map": [23, 24], "reverse_qparam": 23, "feature_names_per_t": 23, "indices_dtyp": 23, "int32": 23, "nn": [23, 24], "embeddingbag": 23, "computedevic": [23, 24], "spec": [23, 24], "physic": [23, 24], "placement": [23, 24], "virtual": [23, 24], "managed_cach": [23, 24], "mtia": [23, 24], "remap": 23, "prune": 23, "pool": [23, 24, 25, 33], "union": [23, 24], "skip": [23, 24], "fatal": [23, 24], "messag": [23, 24], "adjust": [23, 24], "factor": [23, 24], "least": [23, 24], "frequent": [23, 24], "capac": [23, 24], "reserv": [23, 24, 28], "momentum": [23, 24], "record": [23, 24], "hit": [23, 24], "request": [23, 24, 26, 30], "record_cache_miss_count": [23, 24], "metric": [23, 24], "record_tablewise_cache_miss": [23, 24], "collect": [23, 24, 33], "align": [23, 29, 31], "default_scale_bias_size_in_byt": 23, "128b": 23, "boundari": 23, "malloc": [23, 24], "cudahostregist": [23, 24], "cudamallocmanag": [23, 24], "begn": 23, "remap_indic": 23, "assign_embedding_weight": 23, "q_weight_list": 23, "assign": 23, "split_embedding_weight": [23, 24], "scale_shift": 23, "fill_random_weight": 23, "buffer": 23, "overridden": 23, "although": 23, "recip": 23, "instanc": 23, "afterward": 23, "care": [23, 29], "regist": 23, "hook": 23, "recompute_module_buff": 23, "materi": [23, 28], "reset_weights_placements_and_offset": 23, "bounds_check_warn": 23, "right": [23, 28, 32], "split_scale_shift": 23, "split_embedding_weights_with_scale_bia": 23, "split_scale_bias_mod": 23, "scale_bia": 23, "split_table_batched_embeddings_ops_train": 24, "splittablebatchedembeddingbagscodegen": 24, "cache_precis": 24, "weights_precis": 24, "emboptimtyp": 24, "exact_sgd": 24, "gradient_clip": 24, "max_gradi": 24, "max_norm": 24, "learning_r": 24, "01": 24, "ep": 24, "08": 24, "weight_decai": 24, "weight_decay_mod": 24, "weightdecaymod": 24, "eta": 24, "001": 24, "beta1": 24, "beta2": 24, "999": 24, "ensemble_mod": 24, "ensemblemodedefinit": 24, "emainplace_mod": 24, "emainplacemodedefinit": 24, "counter_based_regular": 24, "counterbasedregularizationdefinit": 24, "cowclip_regular": 24, "cowclipdefinit": 24, "uvm_non_rowwise_momentum": 24, "use_experimental_tb": 24, "prefetch_pipelin": 24, "stats_reporter_config": 24, "tbestatsreporterconfig": 24, "table_nam": 24, "optimizer_state_dtyp": 24, "dict": 24, "multipass_prefetch_config": 24, "multipassprefetchconfig": 24, "global_weight_decai": 24, "globalweightdecaydefinit": 24, "optimtyp": 24, "adam": 24, "exact_adagrad": 24, "adagrad": 24, "exact_rowwise_adagrad": 24, "aadagrad": 24, "sgd": 24, "lamb": 24, "lars_sgd": 24, "lar": 24, "partial_rowwise_adam": 24, "partial": 24, "partial_rowwise_lamb": 24, "ensemble_rowwise_adagrad": 24, "ensembl": 24, "emainplace_rowwise_adagrad": 24, "ema": 24, "Not": 24, "gradient": 24, "stochast": 24, "round": 24, "clip": 24, "norm": 24, "learn": 24, "0e": 24, "epsilon": 24, "decai": 24, "decoupl": 24, "v2": 24, "polici": 24, "forward_stream": 24, "stat": 24, "multipass": 24, "feature_requires_grad": 24, "batch_size_per_feature_per_rank": 24, "total_unique_indic": 24, "vbe": 24, "user": 24, "autograd": 24, "chosen": 24, "conatin": 24, "sampl": 24, "unweight": 24, "multipli": 24, "f": 24, "split_table_batched_embeddings_ops_common": 24, "init_embedding_weights_uniform": 24, "9426": 24, "7046": 24, "4214": 24, "0419": 24, "1331": 24, "7856": 24, "8124": 24, "2021": 24, "5771": 24, "5911": 24, "7792": 24, "1068": 24, "6203": 24, "4813": 24, "1677": 24, "4790": 24, "5587": 24, "0941": 24, "5754": 24, "3475": 24, "8952": 24, "1964": 24, "0810": 24, "4174": 24, "2513": 24, "4039": 24, "3775": 24, "3273": 24, "5399": 24, "0229": 24, "1455": 24, "8770": 24, "9520": 24, "4593": 24, "7169": 24, "6307": 24, "1765": 24, "8757": 24, "8614": 24, "2051": 24, "0603": 24, "9980": 24, "7958": 24, "5826": 24, "long": 24, "5197": 24, "2957": 24, "3578": 24, "1487": 24, "4873": 24, "3044": 24, "9801": 24, "2769": 24, "7164": 24, "8528": 24, "7159": 24, "6719": 24, "0784": 24, "2016": 24, "2176": 24, "1988": 24, "3825": 24, "5008": 24, "8991": 24, "1405": 24, "2637": 24, "9427": 24, "8902": 24, "3754": 24, "5013": 24, "6105": 24, "9968": 24, "3057": 24, "7621": 24, "9821": 24, "7314": 24, "6195": 24, "grad_fn": 24, "cppnode": 24, "splitlookupfunction_sgd_op": 24, "set_learning_r": 24, "lr": 24, "set_optimizer_step": 24, "view": [24, 30], "split_optimizer_st": 24, "momentum1": 24, "momentum2": 24, "prev_it": 24, "cowclip": 24, "row_count": 24, "update_hyper_paramet": 24, "params_dict": 24, "hyper": 24, "extern": [24, 32], "outlin": [25, 27], "our": 25, "compat": 25, "thorough": 25, "futur": 25, "unless": 25, "announc": 25, "advanc": 25, "enhanc": 25, "comprehens": 25, "unit": 25, "framework": 25, "NOT": [25, 28], "commit": 25, "best": 25, "effort": 25, "basi": 25, "infer": [25, 33], "jag": [25, 31, 33], "question": 26, "concern": 26, "discuss": 26, "kick": 26, "regard": 26, "feel": 26, "free": 26, "reach": 26, "easi": 27, "transpar": 27, "activ": 27, "welcom": [27, 33], "your": [27, 30, 31], "branch": 27, "ve": 27, "add": [27, 29, 30, 31], "chang": [27, 29, 31], "api": [27, 29, 30, 31], "suit": 27, "lint": 27, "haven": 27, "submit": [27, 29, 31], "facebook": [27, 28, 33], "open": 27, "track": 27, "bug": 27, "descript": [27, 29, 30, 31, 32], "abl": 27, "bounti": 27, "safe": 27, "disclosur": 27, "secur": 27, "go": 27, "agre": 27, "tree": 27, "claus": 28, "bsd": 28, "softwar": 28, "copyright": 28, "inc": 28, "affili": 28, "redistribut": 28, "modif": 28, "permit": 28, "condit": 28, "met": 28, "retain": 28, "notic": 28, "disclaim": 28, "contributor": 28, "endors": 28, "promot": 28, "written": 28, "permiss": 28, "BY": 28, "THE": 28, "holder": 28, "AND": 28, "AS": 28, "express": [28, 32], "OR": 28, "warranti": 28, "limit": [28, 30], "TO": 28, "OF": 28, "merchant": 28, "FOR": 28, "particular": 28, "IN": 28, "NO": 28, "event": 28, "shall": 28, "BE": 28, "liabl": 28, "indirect": 28, "incident": 28, "special": 28, "exemplari": 28, "consequenti": 28, "damag": 28, "procur": 28, "substitut": 28, "profit": 28, "busi": 28, "interrupt": 28, "theori": 28, "liabil": 28, "contract": 28, "strict": 28, "tort": 28, "neglig": 28, "aris": 28, "IF": 28, "advis": 28, "SUCH": 28, "javadoc": 29, "style": [29, 31], "comment": [29, 30, 32], "sphinx": [29, 30, 31], "breath": 29, "kept": 29, "cpp": [29, 31, 32], "cu": 29, "cuh": 29, "everyth": 29, "ifndef": 29, "doxygen_this_will_be_skip": 29, "endif": 29, "hidden": 29, "html": [29, 30, 31], "descriptionss": 29, "publish": [29, 31], "docstr": [29, 30, 31], "method": [29, 30, 31], "organ": 29, "yet": 29, "top": [29, 33], "defgroup": 29, "directli": [29, 31], "behavior": [29, 31], "tparam": 29, "thrown": [29, 31], "ingroup": 29, "brief": 29, "short": 29, "example_method": [29, 31], "def": [29, 31], "foo": [29, 31], "lst": [29, 31], "And": [29, 31], "verbatim": [29, 31], "text": [29, 31, 32], "diagram": [29, 31], "unpars": 29, "prev": [29, 31], "usabl": [29, 31], "space": [29, 30, 31], "endcod": 29, "param1": [29, 31], "param2": 29, "bad_alloc": 29, "logic_error": 29, "href": 29, "www": [29, 31], "nl": 29, "cmdlink": 29, "On": [29, 31], "doxygengroup": 29, "rst": [29, 31, 32], "content": [29, 32, 33], "toctre": [29, 31], "ini": 29, "taken": 29, "doc": [29, 30, 31, 32], "netlifi": [29, 30, 31], "preview": [29, 31], "serv": 30, "yourself": 30, "shoe": 30, "who": 30, "understand": 30, "live": 30, "easier": 30, "leav": 30, "task": 30, "tool": 30, "graphviz": [30, 32], "assembl": 30, "prepend": 30, "sphinx_lint": 30, "technic": 30, "why": 30, "occasion": 30, "unresolv": 30, "might": 30, "opt": 30, "pycapsul": 30, "neg": 30, "silenc": 30, "nitpick": 30, "conf": 30, "domain": 30, "deploi": 30, "app": 30, "googl": 31, "c_size_t": 31, "ret": 31, "emplace_back": 31, "valueerror": 31, "restructuredtext": 31, "en": 31, "master": 31, "__": 31, "pep": 31, "0287": 31, "autofunct": 31, "toc": 31, "c_ulong": 31, "mani": 31, "attach": 31, "fact": 31, "helper": 31, "codebas": 31, "add_doc": 31, "forc": 31, "hoc": 31, "the_new_doc_modul": 31, "remain": 31, "render": [31, 32], "anchor": 32, "_doc": 32, "underscor": 32, "_": 32, "There": 32, "elsewher": 32, "ref": 32, "literalinclud": 32, "rel": 32, "enclos": 32, "bracket": 32, "skiplin": 32, "suppli": 32, "math": 32, "k_": 32, "k_n": 32, "expressino": 32, "int_a": 32, "frac": 32, "2v": 32, "dx": 32, "left": 32, "dv": 32, "_a": 32, "du": 32, "digraph": 32, "altern": 32, "dot": 32, "examplegraph": 32, "low": 33, "precis": 33, "high": 33, "convolut": 33, "server": 33, "transform": 33, "contribut": 33, "contact": 33, "licens": 33, "autovector": 33, "ssd": 33}, "objects": {"": [[13, 0, 1, "_CPPv4N16RocksdbWriteMode29BWD_L1_CNFLCT_MISS_WRITE_BACKE", "BWD_L1_CNFLCT_MISS_WRITE_BACK"], [13, 0, 1, "_CPPv4N16RocksdbWriteMode5FLUSHE", "FLUSH"], [10, 1, 1, "_CPPv423FP8QuantizedToFloat_refPCK7uint8_tK6size_tK6size_tPCfKiKi", "FP8QuantizedToFloat_ref"], [10, 2, 1, "_CPPv423FP8QuantizedToFloat_refPCK7uint8_tK6size_tK6size_tPCfKiKi", "FP8QuantizedToFloat_ref::ebits"], [10, 2, 1, "_CPPv423FP8QuantizedToFloat_refPCK7uint8_tK6size_tK6size_tPCfKiKi", "FP8QuantizedToFloat_ref::exponent_bias"], [10, 2, 1, "_CPPv423FP8QuantizedToFloat_refPCK7uint8_tK6size_tK6size_tPCfKiKi", "FP8QuantizedToFloat_ref::input"], [10, 2, 1, "_CPPv423FP8QuantizedToFloat_refPCK7uint8_tK6size_tK6size_tPCfKiKi", "FP8QuantizedToFloat_ref::ncols"], [10, 2, 1, "_CPPv423FP8QuantizedToFloat_refPCK7uint8_tK6size_tK6size_tPCfKiKi", "FP8QuantizedToFloat_ref::nrows"], [10, 2, 1, "_CPPv423FP8QuantizedToFloat_refPCK7uint8_tK6size_tK6size_tPCfKiKi", "FP8QuantizedToFloat_ref::output"], [10, 1, 1, "_CPPv423FP8rowwise_to_float_cpuRK6TensorbK7int64_t", "FP8rowwise_to_float_cpu"], [10, 2, 1, "_CPPv423FP8rowwise_to_float_cpuRK6TensorbK7int64_t", "FP8rowwise_to_float_cpu::forward"], [10, 2, 1, "_CPPv423FP8rowwise_to_float_cpuRK6TensorbK7int64_t", "FP8rowwise_to_float_cpu::input"], [10, 2, 1, "_CPPv423FP8rowwise_to_float_cpuRK6TensorbK7int64_t", "FP8rowwise_to_float_cpu::output_dtype"], [13, 0, 1, "_CPPv4N16RocksdbWriteMode15FWD_L1_EVICTIONE", "FWD_L1_EVICTION"], [13, 0, 1, "_CPPv4N16RocksdbWriteMode16FWD_ROCKSDB_READE", "FWD_ROCKSDB_READ"], [0, 1, 1, "_CPPv410FindMinMaxPKfPfPf7int64_t", "FindMinMax"], [0, 2, 1, "_CPPv410FindMinMaxPKfPfPf7int64_t", "FindMinMax::len"], [0, 2, 1, "_CPPv410FindMinMaxPKfPfPf7int64_t", "FindMinMax::m"], [0, 2, 1, "_CPPv410FindMinMaxPKfPfPf7int64_t", "FindMinMax::max"], [0, 2, 1, "_CPPv410FindMinMaxPKfPfPf7int64_t", "FindMinMax::min"], [0, 1, 1, "_CPPv4I0E44FloatOrHalfToFusedNBitRowwiseQuantizedSBHalfviPK9InputType6size_tiPNSt7uint8_tE", "FloatOrHalfToFusedNBitRowwiseQuantizedSBHalf"], [0, 3, 1, "_CPPv4I0E44FloatOrHalfToFusedNBitRowwiseQuantizedSBHalfviPK9InputType6size_tiPNSt7uint8_tE", "FloatOrHalfToFusedNBitRowwiseQuantizedSBHalf::InputType"], [0, 2, 1, "_CPPv4I0E44FloatOrHalfToFusedNBitRowwiseQuantizedSBHalfviPK9InputType6size_tiPNSt7uint8_tE", "FloatOrHalfToFusedNBitRowwiseQuantizedSBHalf::bit_rate"], [0, 2, 1, "_CPPv4I0E44FloatOrHalfToFusedNBitRowwiseQuantizedSBHalfviPK9InputType6size_tiPNSt7uint8_tE", "FloatOrHalfToFusedNBitRowwiseQuantizedSBHalf::input"], [0, 2, 1, "_CPPv4I0E44FloatOrHalfToFusedNBitRowwiseQuantizedSBHalfviPK9InputType6size_tiPNSt7uint8_tE", "FloatOrHalfToFusedNBitRowwiseQuantizedSBHalf::input_columns"], [0, 2, 1, "_CPPv4I0E44FloatOrHalfToFusedNBitRowwiseQuantizedSBHalfviPK9InputType6size_tiPNSt7uint8_tE", "FloatOrHalfToFusedNBitRowwiseQuantizedSBHalf::input_rows"], [0, 2, 1, "_CPPv4I0E44FloatOrHalfToFusedNBitRowwiseQuantizedSBHalfviPK9InputType6size_tiPNSt7uint8_tE", "FloatOrHalfToFusedNBitRowwiseQuantizedSBHalf::output"], [10, 1, 1, "_CPPv423FloatToFP8Quantized_refPCKfK6size_tK6size_tPC7uint8_tKiKiKd", "FloatToFP8Quantized_ref"], [10, 2, 1, "_CPPv423FloatToFP8Quantized_refPCKfK6size_tK6size_tPC7uint8_tKiKiKd", "FloatToFP8Quantized_ref::ebits"], [10, 2, 1, "_CPPv423FloatToFP8Quantized_refPCKfK6size_tK6size_tPC7uint8_tKiKiKd", "FloatToFP8Quantized_ref::exponent_bias"], [10, 2, 1, "_CPPv423FloatToFP8Quantized_refPCKfK6size_tK6size_tPC7uint8_tKiKiKd", "FloatToFP8Quantized_ref::input"], [10, 2, 1, "_CPPv423FloatToFP8Quantized_refPCKfK6size_tK6size_tPC7uint8_tKiKiKd", "FloatToFP8Quantized_ref::max_pos"], [10, 2, 1, "_CPPv423FloatToFP8Quantized_refPCKfK6size_tK6size_tPC7uint8_tKiKiKd", "FloatToFP8Quantized_ref::ncols"], [10, 2, 1, "_CPPv423FloatToFP8Quantized_refPCKfK6size_tK6size_tPC7uint8_tKiKiKd", "FloatToFP8Quantized_ref::nrows"], [10, 2, 1, "_CPPv423FloatToFP8Quantized_refPCKfK6size_tK6size_tPC7uint8_tKiKiKd", "FloatToFP8Quantized_ref::output"], [0, 1, 1, "_CPPv4I0E23FusedQuantizeDequantizevPKfPfNSt7int64_tERK24TensorQuantizationParamsiif", "FusedQuantizeDequantize"], [0, 3, 1, "_CPPv4I0E23FusedQuantizeDequantizevPKfPfNSt7int64_tERK24TensorQuantizationParamsiif", "FusedQuantizeDequantize::T"], [0, 2, 1, "_CPPv4I0E23FusedQuantizeDequantizevPKfPfNSt7int64_tERK24TensorQuantizationParamsiif", "FusedQuantizeDequantize::dst"], [0, 2, 1, "_CPPv4I0E23FusedQuantizeDequantizevPKfPfNSt7int64_tERK24TensorQuantizationParamsiif", "FusedQuantizeDequantize::len"], [0, 2, 1, "_CPPv4I0E23FusedQuantizeDequantizevPKfPfNSt7int64_tERK24TensorQuantizationParamsiif", "FusedQuantizeDequantize::noise_ratio"], [0, 2, 1, "_CPPv4I0E23FusedQuantizeDequantizevPKfPfNSt7int64_tERK24TensorQuantizationParamsiif", "FusedQuantizeDequantize::num_threads"], [0, 2, 1, "_CPPv4I0E23FusedQuantizeDequantizevPKfPfNSt7int64_tERK24TensorQuantizationParamsiif", "FusedQuantizeDequantize::qparams"], [0, 2, 1, "_CPPv4I0E23FusedQuantizeDequantizevPKfPfNSt7int64_tERK24TensorQuantizationParamsiif", "FusedQuantizeDequantize::src"], [0, 2, 1, "_CPPv4I0E23FusedQuantizeDequantizevPKfPfNSt7int64_tERK24TensorQuantizationParamsiif", "FusedQuantizeDequantize::thread_id"], [0, 1, 1, "_CPPv4I0_8layout_tE17QuantizeGroupwisevPKfiiiiPKfPKNSt7int32_tEP1T", "QuantizeGroupwise"], [0, 2, 1, "_CPPv4I0_8layout_tE17QuantizeGroupwisevPKfiiiiPKfPKNSt7int32_tEP1T", "QuantizeGroupwise::C"], [0, 2, 1, "_CPPv4I0_8layout_tE17QuantizeGroupwisevPKfiiiiPKfPKNSt7int32_tEP1T", "QuantizeGroupwise::G"], [0, 2, 1, "_CPPv4I0_8layout_tE17QuantizeGroupwisevPKfiiiiPKfPKNSt7int32_tEP1T", "QuantizeGroupwise::K"], [0, 3, 1, "_CPPv4I0_8layout_tE17QuantizeGroupwisevPKfiiiiPKfPKNSt7int32_tEP1T", "QuantizeGroupwise::LAYOUT"], [0, 3, 1, "_CPPv4I0_8layout_tE17QuantizeGroupwisevPKfiiiiPKfPKNSt7int32_tEP1T", "QuantizeGroupwise::T"], [0, 2, 1, "_CPPv4I0_8layout_tE17QuantizeGroupwisevPKfiiiiPKfPKNSt7int32_tEP1T", "QuantizeGroupwise::X"], [0, 2, 1, "_CPPv4I0_8layout_tE17QuantizeGroupwisevPKfiiiiPKfPKNSt7int32_tEP1T", "QuantizeGroupwise::dst"], [0, 2, 1, "_CPPv4I0_8layout_tE17QuantizeGroupwisevPKfiiiiPKfPKNSt7int32_tEP1T", "QuantizeGroupwise::scales"], [0, 2, 1, "_CPPv4I0_8layout_tE17QuantizeGroupwisevPKfiiiiPKfPKNSt7int32_tEP1T", "QuantizeGroupwise::src"], [0, 2, 1, "_CPPv4I0_8layout_tE17QuantizeGroupwisevPKfiiiiPKfPKNSt7int32_tEP1T", "QuantizeGroupwise::zero_points"], [13, 4, 1, "_CPPv416RocksdbWriteMode", "RocksdbWriteMode"], [13, 0, 1, "_CPPv4N16RocksdbWriteMode29BWD_L1_CNFLCT_MISS_WRITE_BACKE", "RocksdbWriteMode::BWD_L1_CNFLCT_MISS_WRITE_BACK"], [13, 0, 1, "_CPPv4N16RocksdbWriteMode5FLUSHE", "RocksdbWriteMode::FLUSH"], [13, 0, 1, "_CPPv4N16RocksdbWriteMode15FWD_L1_EVICTIONE", "RocksdbWriteMode::FWD_L1_EVICTION"], [13, 0, 1, "_CPPv4N16RocksdbWriteMode16FWD_ROCKSDB_READE", "RocksdbWriteMode::FWD_ROCKSDB_READ"], [0, 1, 1, "_CPPv46Xor128v", "Xor128"], [10, 1, 1, "_CPPv424_FP8rowwise_to_float_gpuRKN2at6TensorEbK7int64_t", "_FP8rowwise_to_float_gpu"], [10, 2, 1, "_CPPv424_FP8rowwise_to_float_gpuRKN2at6TensorEbK7int64_t", "_FP8rowwise_to_float_gpu::forward"], [10, 2, 1, "_CPPv424_FP8rowwise_to_float_gpuRKN2at6TensorEbK7int64_t", "_FP8rowwise_to_float_gpu::input"], [10, 2, 1, "_CPPv424_FP8rowwise_to_float_gpuRKN2at6TensorEbK7int64_t", "_FP8rowwise_to_float_gpu::output_dtype"], [10, 1, 1, "_CPPv422_bfloat16_to_float_gpuRKN2at6TensorE", "_bfloat16_to_float_gpu"], [10, 2, 1, "_CPPv422_bfloat16_to_float_gpuRKN2at6TensorE", "_bfloat16_to_float_gpu::input"], [10, 1, 1, "_CPPv424_float_to_FP8rowwise_gpuRK6TensorKb", "_float_to_FP8rowwise_gpu"], [10, 2, 1, "_CPPv424_float_to_FP8rowwise_gpuRK6TensorKb", "_float_to_FP8rowwise_gpu::forward"], [10, 2, 1, "_CPPv424_float_to_FP8rowwise_gpuRK6TensorKb", "_float_to_FP8rowwise_gpu::input"], [10, 1, 1, "_CPPv422_float_to_bfloat16_gpuRKN2at6TensorE", "_float_to_bfloat16_gpu"], [10, 2, 1, "_CPPv422_float_to_bfloat16_gpuRKN2at6TensorE", "_float_to_bfloat16_gpu::input"], [10, 1, 1, "_CPPv434_float_to_fused8bitrowwise_cpu_outR6TensorRK6Tensor", "_float_to_fused8bitrowwise_cpu_out"], [10, 2, 1, "_CPPv434_float_to_fused8bitrowwise_cpu_outR6TensorRK6Tensor", "_float_to_fused8bitrowwise_cpu_out::input"], [10, 2, 1, "_CPPv434_float_to_fused8bitrowwise_cpu_outR6TensorRK6Tensor", "_float_to_fused8bitrowwise_cpu_out::output"], [10, 1, 1, "_CPPv430_float_to_fused8bitrowwise_gpuRK6Tensor", "_float_to_fused8bitrowwise_gpu"], [10, 2, 1, "_CPPv430_float_to_fused8bitrowwise_gpuRK6Tensor", "_float_to_fused8bitrowwise_gpu::input"], [10, 1, 1, "_CPPv430_float_to_fusednbitrowwise_gpuRK6TensorK7int64_t", "_float_to_fusednbitrowwise_gpu"], [10, 2, 1, "_CPPv430_float_to_fusednbitrowwise_gpuRK6TensorK7int64_t", "_float_to_fusednbitrowwise_gpu::bit_rate"], [10, 2, 1, "_CPPv430_float_to_fusednbitrowwise_gpuRK6TensorK7int64_t", "_float_to_fusednbitrowwise_gpu::input"], [10, 1, 1, "_CPPv418_float_to_hfp8_gpuRKN2at6TensorEK7int64_tK7int64_tKd", "_float_to_hfp8_gpu"], [10, 2, 1, "_CPPv418_float_to_hfp8_gpuRKN2at6TensorEK7int64_tK7int64_tKd", "_float_to_hfp8_gpu::ebits"], [10, 2, 1, "_CPPv418_float_to_hfp8_gpuRKN2at6TensorEK7int64_tK7int64_tKd", "_float_to_hfp8_gpu::exponent_bias"], [10, 2, 1, "_CPPv418_float_to_hfp8_gpuRKN2at6TensorEK7int64_tK7int64_tKd", "_float_to_hfp8_gpu::input"], [10, 2, 1, "_CPPv418_float_to_hfp8_gpuRKN2at6TensorEK7int64_tK7int64_tKd", "_float_to_hfp8_gpu::max_pos"], [10, 1, 1, "_CPPv418_float_to_msfp_gpuRKN2at6TensorEK7int64_tK7int64_tK7int64_tK7int64_tKdKd", "_float_to_msfp_gpu"], [10, 2, 1, "_CPPv418_float_to_msfp_gpuRKN2at6TensorEK7int64_tK7int64_tK7int64_tK7int64_tKdKd", "_float_to_msfp_gpu::bias"], [10, 2, 1, "_CPPv418_float_to_msfp_gpuRKN2at6TensorEK7int64_tK7int64_tK7int64_tK7int64_tKdKd", "_float_to_msfp_gpu::bounding_box_size"], [10, 2, 1, "_CPPv418_float_to_msfp_gpuRKN2at6TensorEK7int64_tK7int64_tK7int64_tK7int64_tKdKd", "_float_to_msfp_gpu::ebits"], [10, 2, 1, "_CPPv418_float_to_msfp_gpuRKN2at6TensorEK7int64_tK7int64_tK7int64_tK7int64_tKdKd", "_float_to_msfp_gpu::input"], [10, 2, 1, "_CPPv418_float_to_msfp_gpuRKN2at6TensorEK7int64_tK7int64_tK7int64_tK7int64_tKdKd", "_float_to_msfp_gpu::max_pos"], [10, 2, 1, "_CPPv418_float_to_msfp_gpuRKN2at6TensorEK7int64_tK7int64_tK7int64_tK7int64_tKdKd", "_float_to_msfp_gpu::mbits"], [10, 2, 1, "_CPPv418_float_to_msfp_gpuRKN2at6TensorEK7int64_tK7int64_tK7int64_tK7int64_tKdKd", "_float_to_msfp_gpu::min_pos"], [10, 1, 1, "_CPPv430_float_to_paddedFP8rowwise_gpuRK6TensorKbK7int64_t", "_float_to_paddedFP8rowwise_gpu"], [10, 2, 1, "_CPPv430_float_to_paddedFP8rowwise_gpuRK6TensorKbK7int64_t", "_float_to_paddedFP8rowwise_gpu::forward"], [10, 2, 1, "_CPPv430_float_to_paddedFP8rowwise_gpuRK6TensorKbK7int64_t", "_float_to_paddedFP8rowwise_gpu::input"], [10, 2, 1, "_CPPv430_float_to_paddedFP8rowwise_gpuRK6TensorKbK7int64_t", "_float_to_paddedFP8rowwise_gpu::row_dim"], [10, 1, 1, "_CPPv434_fused8bitrowwise_to_float_cpu_outR6TensorRK6Tensor", "_fused8bitrowwise_to_float_cpu_out"], [10, 2, 1, "_CPPv434_fused8bitrowwise_to_float_cpu_outR6TensorRK6Tensor", "_fused8bitrowwise_to_float_cpu_out::input"], [10, 2, 1, "_CPPv434_fused8bitrowwise_to_float_cpu_outR6TensorRK6Tensor", "_fused8bitrowwise_to_float_cpu_out::output"], [10, 1, 1, "_CPPv430_fused8bitrowwise_to_float_gpuRKN2at6TensorE", "_fused8bitrowwise_to_float_gpu"], [10, 2, 1, "_CPPv430_fused8bitrowwise_to_float_gpuRKN2at6TensorE", "_fused8bitrowwise_to_float_gpu::input"], [10, 1, 1, "_CPPv440_fused8bitrowwise_to_float_mixed_dim_gpuRKN2at6TensorERKN2at6TensorEK7int64_t", "_fused8bitrowwise_to_float_mixed_dim_gpu"], [10, 2, 1, "_CPPv440_fused8bitrowwise_to_float_mixed_dim_gpuRKN2at6TensorERKN2at6TensorEK7int64_t", "_fused8bitrowwise_to_float_mixed_dim_gpu::D_offsets"], [10, 2, 1, "_CPPv440_fused8bitrowwise_to_float_mixed_dim_gpuRKN2at6TensorERKN2at6TensorEK7int64_t", "_fused8bitrowwise_to_float_mixed_dim_gpu::input"], [10, 2, 1, "_CPPv440_fused8bitrowwise_to_float_mixed_dim_gpuRKN2at6TensorERKN2at6TensorEK7int64_t", "_fused8bitrowwise_to_float_mixed_dim_gpu::output_dtype"], [10, 1, 1, "_CPPv429_fused8bitrowwise_to_half_gpuRKN2at6TensorE", "_fused8bitrowwise_to_half_gpu"], [10, 2, 1, "_CPPv429_fused8bitrowwise_to_half_gpuRKN2at6TensorE", "_fused8bitrowwise_to_half_gpu::input"], [10, 1, 1, "_CPPv449_fused8bitrowwise_to_single_or_half_precision_gpuRKN2at6TensorEK7int64_tKbKb", "_fused8bitrowwise_to_single_or_half_precision_gpu"], [10, 2, 1, "_CPPv449_fused8bitrowwise_to_single_or_half_precision_gpuRKN2at6TensorEK7int64_tKbKb", "_fused8bitrowwise_to_single_or_half_precision_gpu::input"], [10, 2, 1, "_CPPv449_fused8bitrowwise_to_single_or_half_precision_gpuRKN2at6TensorEK7int64_tKbKb", "_fused8bitrowwise_to_single_or_half_precision_gpu::output_dtype"], [10, 2, 1, "_CPPv449_fused8bitrowwise_to_single_or_half_precision_gpuRKN2at6TensorEK7int64_tKbKb", "_fused8bitrowwise_to_single_or_half_precision_gpu::quant_padding_float_type"], [10, 2, 1, "_CPPv449_fused8bitrowwise_to_single_or_half_precision_gpuRKN2at6TensorEK7int64_tKbKb", "_fused8bitrowwise_to_single_or_half_precision_gpu::scale_bias_last"], [10, 1, 1, "_CPPv430_fusednbitrowwise_to_float_gpuRKN2at6TensorEK7int64_t", "_fusednbitrowwise_to_float_gpu"], [10, 2, 1, "_CPPv430_fusednbitrowwise_to_float_gpuRKN2at6TensorEK7int64_t", "_fusednbitrowwise_to_float_gpu::bit_rate"], [10, 2, 1, "_CPPv430_fusednbitrowwise_to_float_gpuRKN2at6TensorEK7int64_t", "_fusednbitrowwise_to_float_gpu::input"], [10, 1, 1, "_CPPv429_fusednbitrowwise_to_half_gpuRKN2at6TensorEK7int64_t", "_fusednbitrowwise_to_half_gpu"], [10, 2, 1, "_CPPv429_fusednbitrowwise_to_half_gpuRKN2at6TensorEK7int64_t", "_fusednbitrowwise_to_half_gpu::bit_rate"], [10, 2, 1, "_CPPv429_fusednbitrowwise_to_half_gpuRKN2at6TensorEK7int64_t", "_fusednbitrowwise_to_half_gpu::input"], [10, 1, 1, "_CPPv449_fusednbitrowwise_to_single_or_half_precision_gpuRKN2at6TensorEK7int64_tK7int64_t", "_fusednbitrowwise_to_single_or_half_precision_gpu"], [10, 2, 1, "_CPPv449_fusednbitrowwise_to_single_or_half_precision_gpuRKN2at6TensorEK7int64_tK7int64_t", "_fusednbitrowwise_to_single_or_half_precision_gpu::bit_rate"], [10, 2, 1, "_CPPv449_fusednbitrowwise_to_single_or_half_precision_gpuRKN2at6TensorEK7int64_tK7int64_t", "_fusednbitrowwise_to_single_or_half_precision_gpu::input"], [10, 2, 1, "_CPPv449_fusednbitrowwise_to_single_or_half_precision_gpuRKN2at6TensorEK7int64_tK7int64_t", "_fusednbitrowwise_to_single_or_half_precision_gpu::output_dtype"], [10, 1, 1, "_CPPv429_half_to_fused8bitrowwise_gpuRK6Tensor", "_half_to_fused8bitrowwise_gpu"], [10, 2, 1, "_CPPv429_half_to_fused8bitrowwise_gpuRK6Tensor", "_half_to_fused8bitrowwise_gpu::input"], [10, 1, 1, "_CPPv429_half_to_fusednbitrowwise_gpuRKN2at6TensorEK7int64_t", "_half_to_fusednbitrowwise_gpu"], [10, 2, 1, "_CPPv429_half_to_fusednbitrowwise_gpuRKN2at6TensorEK7int64_t", "_half_to_fusednbitrowwise_gpu::bit_rate"], [10, 2, 1, "_CPPv429_half_to_fusednbitrowwise_gpuRKN2at6TensorEK7int64_t", "_half_to_fusednbitrowwise_gpu::input"], [10, 1, 1, "_CPPv418_hfp8_to_float_gpuRKN2at6TensorEK7int64_tK7int64_t", "_hfp8_to_float_gpu"], [10, 2, 1, "_CPPv418_hfp8_to_float_gpuRKN2at6TensorEK7int64_tK7int64_t", "_hfp8_to_float_gpu::ebits"], [10, 2, 1, "_CPPv418_hfp8_to_float_gpuRKN2at6TensorEK7int64_tK7int64_t", "_hfp8_to_float_gpu::exponent_bias"], [10, 2, 1, "_CPPv418_hfp8_to_float_gpuRKN2at6TensorEK7int64_tK7int64_t", "_hfp8_to_float_gpu::input"], [10, 1, 1, "_CPPv418_msfp_to_float_gpuRKN2at6TensorEK7int64_tK7int64_tK7int64_t", "_msfp_to_float_gpu"], [10, 2, 1, "_CPPv418_msfp_to_float_gpuRKN2at6TensorEK7int64_tK7int64_tK7int64_t", "_msfp_to_float_gpu::bias"], [10, 2, 1, "_CPPv418_msfp_to_float_gpuRKN2at6TensorEK7int64_tK7int64_tK7int64_t", "_msfp_to_float_gpu::ebits"], [10, 2, 1, "_CPPv418_msfp_to_float_gpuRKN2at6TensorEK7int64_tK7int64_tK7int64_t", "_msfp_to_float_gpu::input"], [10, 2, 1, "_CPPv418_msfp_to_float_gpuRKN2at6TensorEK7int64_tK7int64_tK7int64_t", "_msfp_to_float_gpu::mbits"], [10, 1, 1, "_CPPv430_paddedFP8rowwise_to_float_gpuRKN2at6TensorEKbK7int64_tK7int64_tK7int64_t", "_paddedFP8rowwise_to_float_gpu"], [10, 2, 1, "_CPPv430_paddedFP8rowwise_to_float_gpuRKN2at6TensorEKbK7int64_tK7int64_tK7int64_t", "_paddedFP8rowwise_to_float_gpu::forward"], [10, 2, 1, "_CPPv430_paddedFP8rowwise_to_float_gpuRKN2at6TensorEKbK7int64_tK7int64_tK7int64_t", "_paddedFP8rowwise_to_float_gpu::input"], [10, 2, 1, "_CPPv430_paddedFP8rowwise_to_float_gpuRKN2at6TensorEKbK7int64_tK7int64_tK7int64_t", "_paddedFP8rowwise_to_float_gpu::output_dtype"], [10, 2, 1, "_CPPv430_paddedFP8rowwise_to_float_gpuRKN2at6TensorEKbK7int64_tK7int64_tK7int64_t", "_paddedFP8rowwise_to_float_gpu::output_last_dim"], [10, 2, 1, "_CPPv430_paddedFP8rowwise_to_float_gpuRKN2at6TensorEKbK7int64_tK7int64_tK7int64_t", "_paddedFP8rowwise_to_float_gpu::row_dim"], [10, 1, 1, "_CPPv449_single_or_half_precision_to_fused8bitrowwise_gpuRK6Tensor", "_single_or_half_precision_to_fused8bitrowwise_gpu"], [10, 2, 1, "_CPPv449_single_or_half_precision_to_fused8bitrowwise_gpuRK6Tensor", "_single_or_half_precision_to_fused8bitrowwise_gpu::input"], [10, 1, 1, "_CPPv449_single_or_half_precision_to_fusednbitrowwise_gpuRK6TensorK7int64_t", "_single_or_half_precision_to_fusednbitrowwise_gpu"], [10, 2, 1, "_CPPv449_single_or_half_precision_to_fusednbitrowwise_gpuRK6TensorK7int64_t", "_single_or_half_precision_to_fusednbitrowwise_gpu::bit_rate"], [10, 2, 1, "_CPPv449_single_or_half_precision_to_fusednbitrowwise_gpuRK6TensorK7int64_t", "_single_or_half_precision_to_fusednbitrowwise_gpu::input"], [9, 1, 1, "_CPPv417all_to_one_deviceNSt6vectorIN2at6TensorEEEN2at6DeviceE", "all_to_one_device"], [9, 2, 1, "_CPPv417all_to_one_deviceNSt6vectorIN2at6TensorEEEN2at6DeviceE", "all_to_one_device::inputTensors"], [9, 2, 1, "_CPPv417all_to_one_deviceNSt6vectorIN2at6TensorEEEN2at6DeviceE", "all_to_one_device::target_device"], [6, 1, 1, "_CPPv431batched_dense_vec_jagged_2d_mulRK6TensorRK6TensorRK6Tensor", "batched_dense_vec_jagged_2d_mul"], [6, 2, 1, "_CPPv431batched_dense_vec_jagged_2d_mulRK6TensorRK6TensorRK6Tensor", "batched_dense_vec_jagged_2d_mul::a_offsets"], [6, 2, 1, "_CPPv431batched_dense_vec_jagged_2d_mulRK6TensorRK6TensorRK6Tensor", "batched_dense_vec_jagged_2d_mul::a_values"], [6, 2, 1, "_CPPv431batched_dense_vec_jagged_2d_mulRK6TensorRK6TensorRK6Tensor", "batched_dense_vec_jagged_2d_mul::v"], [3, 1, 1, "_CPPv425bounds_check_indices_cudaR6TensorR6TensorR6Tensor7int64_tR6TensorRKNSt8optionalI6TensorEERKNSt8optionalI6TensorEEK7int64_tRKNSt8optionalI6TensorEEK7int64_tK7int64_tK6int8_t", "bounds_check_indices_cuda"], [3, 2, 1, "_CPPv425bounds_check_indices_cudaR6TensorR6TensorR6Tensor7int64_tR6TensorRKNSt8optionalI6TensorEERKNSt8optionalI6TensorEEK7int64_tRKNSt8optionalI6TensorEEK7int64_tK7int64_tK6int8_t", "bounds_check_indices_cuda::B_offsets"], [3, 2, 1, "_CPPv425bounds_check_indices_cudaR6TensorR6TensorR6Tensor7int64_tR6TensorRKNSt8optionalI6TensorEERKNSt8optionalI6TensorEEK7int64_tRKNSt8optionalI6TensorEEK7int64_tK7int64_tK6int8_t", "bounds_check_indices_cuda::b_t_map"], [3, 2, 1, "_CPPv425bounds_check_indices_cudaR6TensorR6TensorR6Tensor7int64_tR6TensorRKNSt8optionalI6TensorEERKNSt8optionalI6TensorEEK7int64_tRKNSt8optionalI6TensorEEK7int64_tK7int64_tK6int8_t", "bounds_check_indices_cuda::bounds_check_mode"], [3, 2, 1, "_CPPv425bounds_check_indices_cudaR6TensorR6TensorR6Tensor7int64_tR6TensorRKNSt8optionalI6TensorEERKNSt8optionalI6TensorEEK7int64_tRKNSt8optionalI6TensorEEK7int64_tK7int64_tK6int8_t", "bounds_check_indices_cuda::bounds_check_version"], [3, 2, 1, "_CPPv425bounds_check_indices_cudaR6TensorR6TensorR6Tensor7int64_tR6TensorRKNSt8optionalI6TensorEERKNSt8optionalI6TensorEEK7int64_tRKNSt8optionalI6TensorEEK7int64_tK7int64_tK6int8_t", "bounds_check_indices_cuda::indices"], [3, 2, 1, "_CPPv425bounds_check_indices_cudaR6TensorR6TensorR6Tensor7int64_tR6TensorRKNSt8optionalI6TensorEERKNSt8optionalI6TensorEEK7int64_tRKNSt8optionalI6TensorEEK7int64_tK7int64_tK6int8_t", "bounds_check_indices_cuda::info_B_mask"], [3, 2, 1, "_CPPv425bounds_check_indices_cudaR6TensorR6TensorR6Tensor7int64_tR6TensorRKNSt8optionalI6TensorEERKNSt8optionalI6TensorEEK7int64_tRKNSt8optionalI6TensorEEK7int64_tK7int64_tK6int8_t", "bounds_check_indices_cuda::info_B_num_bits"], [3, 2, 1, "_CPPv425bounds_check_indices_cudaR6TensorR6TensorR6Tensor7int64_tR6TensorRKNSt8optionalI6TensorEERKNSt8optionalI6TensorEEK7int64_tRKNSt8optionalI6TensorEEK7int64_tK7int64_tK6int8_t", "bounds_check_indices_cuda::max_B"], [3, 2, 1, "_CPPv425bounds_check_indices_cudaR6TensorR6TensorR6Tensor7int64_tR6TensorRKNSt8optionalI6TensorEERKNSt8optionalI6TensorEEK7int64_tRKNSt8optionalI6TensorEEK7int64_tK7int64_tK6int8_t", "bounds_check_indices_cuda::offsets"], [3, 2, 1, "_CPPv425bounds_check_indices_cudaR6TensorR6TensorR6Tensor7int64_tR6TensorRKNSt8optionalI6TensorEERKNSt8optionalI6TensorEEK7int64_tRKNSt8optionalI6TensorEEK7int64_tK7int64_tK6int8_t", "bounds_check_indices_cuda::rows_per_table"], [3, 2, 1, "_CPPv425bounds_check_indices_cudaR6TensorR6TensorR6Tensor7int64_tR6TensorRKNSt8optionalI6TensorEERKNSt8optionalI6TensorEEK7int64_tRKNSt8optionalI6TensorEEK7int64_tK7int64_tK6int8_t", "bounds_check_indices_cuda::warning"], [3, 2, 1, "_CPPv425bounds_check_indices_cudaR6TensorR6TensorR6Tensor7int64_tR6TensorRKNSt8optionalI6TensorEERKNSt8optionalI6TensorEEK7int64_tRKNSt8optionalI6TensorEEK7int64_tK7int64_tK6int8_t", "bounds_check_indices_cuda::weights"], [13, 1, 1, "_CPPv420compact_indices_cudaNSt6vectorI6TensorEE6TensorNSt6vectorI6TensorEE6Tensor6Tensor", "compact_indices_cuda"], [13, 2, 1, "_CPPv420compact_indices_cudaNSt6vectorI6TensorEE6TensorNSt6vectorI6TensorEE6Tensor6Tensor", "compact_indices_cuda::compact_count"], [13, 2, 1, "_CPPv420compact_indices_cudaNSt6vectorI6TensorEE6TensorNSt6vectorI6TensorEE6Tensor6Tensor", "compact_indices_cuda::compact_indices"], [13, 2, 1, "_CPPv420compact_indices_cudaNSt6vectorI6TensorEE6TensorNSt6vectorI6TensorEE6Tensor6Tensor", "compact_indices_cuda::count"], [13, 2, 1, "_CPPv420compact_indices_cudaNSt6vectorI6TensorEE6TensorNSt6vectorI6TensorEE6Tensor6Tensor", "compact_indices_cuda::indices"], [13, 2, 1, "_CPPv420compact_indices_cudaNSt6vectorI6TensorEE6TensorNSt6vectorI6TensorEE6Tensor6Tensor", "compact_indices_cuda::masks"], [13, 1, 1, "_CPPv418cuda_callback_func12cudaStream_t11cudaError_tPv", "cuda_callback_func"], [13, 2, 1, "_CPPv418cuda_callback_func12cudaStream_t11cudaError_tPv", "cuda_callback_func::functor"], [13, 2, 1, "_CPPv418cuda_callback_func12cudaStream_t11cudaError_tPv", "cuda_callback_func::status"], [13, 2, 1, "_CPPv418cuda_callback_func12cudaStream_t11cudaError_tPv", "cuda_callback_func::stream"], [6, 1, 1, "_CPPv415dense_to_jaggedRK6TensorRKNSt6vectorI6TensorEENSt8optionalIN2at6SymIntEEE", "dense_to_jagged"], [6, 2, 1, "_CPPv415dense_to_jaggedRK6TensorRKNSt6vectorI6TensorEENSt8optionalIN2at6SymIntEEE", "dense_to_jagged::dense"], [6, 2, 1, "_CPPv415dense_to_jaggedRK6TensorRKNSt6vectorI6TensorEENSt8optionalIN2at6SymIntEEE", "dense_to_jagged::offsets"], [6, 2, 1, "_CPPv415dense_to_jaggedRK6TensorRKNSt6vectorI6TensorEENSt8optionalIN2at6SymIntEEE", "dense_to_jagged::total_L"], [12, 1, 1, "_CPPv435direct_mapped_lxu_cache_lookup_cudaN2at6TensorEN2at6TensorE7int64_tbNSt8optionalIN2at6TensorEEE", "direct_mapped_lxu_cache_lookup_cuda"], [12, 2, 1, "_CPPv435direct_mapped_lxu_cache_lookup_cudaN2at6TensorEN2at6TensorE7int64_tbNSt8optionalIN2at6TensorEEE", "direct_mapped_lxu_cache_lookup_cuda::gather_cache_stats"], [12, 2, 1, "_CPPv435direct_mapped_lxu_cache_lookup_cudaN2at6TensorEN2at6TensorE7int64_tbNSt8optionalIN2at6TensorEEE", "direct_mapped_lxu_cache_lookup_cuda::invalid_index"], [12, 2, 1, "_CPPv435direct_mapped_lxu_cache_lookup_cudaN2at6TensorEN2at6TensorE7int64_tbNSt8optionalIN2at6TensorEEE", "direct_mapped_lxu_cache_lookup_cuda::linear_cache_indices"], [12, 2, 1, "_CPPv435direct_mapped_lxu_cache_lookup_cudaN2at6TensorEN2at6TensorE7int64_tbNSt8optionalIN2at6TensorEEE", "direct_mapped_lxu_cache_lookup_cuda::lxu_cache_state"], [12, 2, 1, "_CPPv435direct_mapped_lxu_cache_lookup_cudaN2at6TensorEN2at6TensorE7int64_tbNSt8optionalIN2at6TensorEEE", "direct_mapped_lxu_cache_lookup_cuda::uvm_cache_stats"], [29, 1, 1, "_CPPv4I0_NSt6size_tEE14example_method7int32_t1Tf", "example_method"], [29, 3, 1, "_CPPv4I0_NSt6size_tEE14example_method7int32_t1Tf", "example_method::Alignment"], [29, 3, 1, "_CPPv4I0_NSt6size_tEE14example_method7int32_t1Tf", "example_method::T"], [29, 2, 1, "_CPPv4I0_NSt6size_tEE14example_method7int32_t1Tf", "example_method::param1"], [29, 2, 1, "_CPPv4I0_NSt6size_tEE14example_method7int32_t1Tf", "example_method::param2"], [11, 1, 1, "_CPPv431expand_into_jagged_permute_cudaRKN2at6TensorERKN2at6TensorERKN2at6TensorE7int64_t", "expand_into_jagged_permute_cuda"], [11, 2, 1, "_CPPv431expand_into_jagged_permute_cudaRKN2at6TensorERKN2at6TensorERKN2at6TensorE7int64_t", "expand_into_jagged_permute_cuda::input_offsets"], [11, 2, 1, "_CPPv431expand_into_jagged_permute_cudaRKN2at6TensorERKN2at6TensorERKN2at6TensorE7int64_t", "expand_into_jagged_permute_cuda::output_offsets"], [11, 2, 1, "_CPPv431expand_into_jagged_permute_cudaRKN2at6TensorERKN2at6TensorERKN2at6TensorE7int64_t", "expand_into_jagged_permute_cuda::output_size"], [11, 2, 1, "_CPPv431expand_into_jagged_permute_cudaRKN2at6TensorERKN2at6TensorERKN2at6TensorE7int64_t", "expand_into_jagged_permute_cuda::permute"], [10, 1, 1, "_CPPv437float_or_half_to_fused8bitrowwise_cpuRK6Tensor", "float_or_half_to_fused8bitrowwise_cpu"], [10, 2, 1, "_CPPv437float_or_half_to_fused8bitrowwise_cpuRK6Tensor", "float_or_half_to_fused8bitrowwise_cpu::input"], [10, 1, 1, "_CPPv423float_to_FP8rowwise_cpuRK6Tensorb", "float_to_FP8rowwise_cpu"], [10, 2, 1, "_CPPv423float_to_FP8rowwise_cpuRK6Tensorb", "float_to_FP8rowwise_cpu::forward"], [10, 2, 1, "_CPPv423float_to_FP8rowwise_cpuRK6Tensorb", "float_to_FP8rowwise_cpu::input"], [10, 1, 1, "_CPPv429float_to_fused8bitrowwise_cpuRK6Tensor", "float_to_fused8bitrowwise_cpu"], [10, 2, 1, "_CPPv429float_to_fused8bitrowwise_cpuRK6Tensor", "float_to_fused8bitrowwise_cpu::input"], [10, 1, 1, "_CPPv429fused8bitrowwise_to_float_cpuRK6Tensor", "fused8bitrowwise_to_float_cpu"], [10, 2, 1, "_CPPv429fused8bitrowwise_to_float_cpuRK6Tensor", "fused8bitrowwise_to_float_cpu::input"], [10, 1, 1, "_CPPv437fused8bitrowwise_to_float_or_half_cpuRK6TensorK7int64_tKbKb", "fused8bitrowwise_to_float_or_half_cpu"], [10, 2, 1, "_CPPv437fused8bitrowwise_to_float_or_half_cpuRK6TensorK7int64_tKbKb", "fused8bitrowwise_to_float_or_half_cpu::input"], [10, 2, 1, "_CPPv437fused8bitrowwise_to_float_or_half_cpuRK6TensorK7int64_tKbKb", "fused8bitrowwise_to_float_or_half_cpu::output_dtype"], [10, 2, 1, "_CPPv437fused8bitrowwise_to_float_or_half_cpuRK6TensorK7int64_tKbKb", "fused8bitrowwise_to_float_or_half_cpu::quant_padding_float_type"], [10, 2, 1, "_CPPv437fused8bitrowwise_to_float_or_half_cpuRK6TensorK7int64_tKbKb", "fused8bitrowwise_to_float_or_half_cpu::scale_bias_last"], [10, 1, 1, "_CPPv428fused8bitrowwise_to_half_cpuRK6Tensor", "fused8bitrowwise_to_half_cpu"], [10, 2, 1, "_CPPv428fused8bitrowwise_to_half_cpuRK6Tensor", "fused8bitrowwise_to_half_cpu::input"], [10, 1, 1, "_CPPv437fusednbitrowwise_sbfront_to_float_cpuRK6TensorK7int64_t", "fusednbitrowwise_sbfront_to_float_cpu"], [10, 2, 1, "_CPPv437fusednbitrowwise_sbfront_to_float_cpuRK6TensorK7int64_t", "fusednbitrowwise_sbfront_to_float_cpu::bit_rate"], [10, 2, 1, "_CPPv437fusednbitrowwise_sbfront_to_float_cpuRK6TensorK7int64_t", "fusednbitrowwise_sbfront_to_float_cpu::input"], [10, 1, 1, "_CPPv429fusednbitrowwise_to_float_cpuRK6TensorK7int64_t", "fusednbitrowwise_to_float_cpu"], [10, 2, 1, "_CPPv429fusednbitrowwise_to_float_cpuRK6TensorK7int64_t", "fusednbitrowwise_to_float_cpu::bit_rate"], [10, 2, 1, "_CPPv429fusednbitrowwise_to_float_cpuRK6TensorK7int64_t", "fusednbitrowwise_to_float_cpu::input"], [10, 1, 1, "_CPPv437fusednbitrowwise_to_float_or_half_cpuRK6TensorK7int64_tK7int64_t", "fusednbitrowwise_to_float_or_half_cpu"], [10, 2, 1, "_CPPv437fusednbitrowwise_to_float_or_half_cpuRK6TensorK7int64_tK7int64_t", "fusednbitrowwise_to_float_or_half_cpu::bit_rate"], [10, 2, 1, "_CPPv437fusednbitrowwise_to_float_or_half_cpuRK6TensorK7int64_tK7int64_t", "fusednbitrowwise_to_float_or_half_cpu::input"], [10, 2, 1, "_CPPv437fusednbitrowwise_to_float_or_half_cpuRK6TensorK7int64_tK7int64_t", "fusednbitrowwise_to_float_or_half_cpu::output_dtype"], [10, 1, 1, "_CPPv428fusednbitrowwise_to_half_cpuRK6TensorK7int64_t", "fusednbitrowwise_to_half_cpu"], [10, 2, 1, "_CPPv428fusednbitrowwise_to_half_cpuRK6TensorK7int64_t", "fusednbitrowwise_to_half_cpu::bit_rate"], [10, 2, 1, "_CPPv428fusednbitrowwise_to_half_cpuRK6TensorK7int64_t", "fusednbitrowwise_to_half_cpu::input"], [11, 1, 1, "_CPPv452generic_histogram_binning_calibration_by_feature_cpuRKN2at6TensorERKN2at6TensorERKN2at6TensorE7int64_tRKN2at6TensorERKN2at6TensorERKN2at6TensorEd7int64_td", "generic_histogram_binning_calibration_by_feature_cpu"], [11, 2, 1, "_CPPv452generic_histogram_binning_calibration_by_feature_cpuRKN2at6TensorERKN2at6TensorERKN2at6TensorE7int64_tRKN2at6TensorERKN2at6TensorERKN2at6TensorEd7int64_td", "generic_histogram_binning_calibration_by_feature_cpu::bin_boundaries"], [11, 2, 1, "_CPPv452generic_histogram_binning_calibration_by_feature_cpuRKN2at6TensorERKN2at6TensorERKN2at6TensorE7int64_tRKN2at6TensorERKN2at6TensorERKN2at6TensorEd7int64_td", "generic_histogram_binning_calibration_by_feature_cpu::bin_ctr_in_use_after"], [11, 2, 1, "_CPPv452generic_histogram_binning_calibration_by_feature_cpuRKN2at6TensorERKN2at6TensorERKN2at6TensorE7int64_tRKN2at6TensorERKN2at6TensorERKN2at6TensorEd7int64_td", "generic_histogram_binning_calibration_by_feature_cpu::bin_ctr_weight_value"], [11, 2, 1, "_CPPv452generic_histogram_binning_calibration_by_feature_cpuRKN2at6TensorERKN2at6TensorERKN2at6TensorE7int64_tRKN2at6TensorERKN2at6TensorERKN2at6TensorEd7int64_td", "generic_histogram_binning_calibration_by_feature_cpu::bin_num_examples"], [11, 2, 1, "_CPPv452generic_histogram_binning_calibration_by_feature_cpuRKN2at6TensorERKN2at6TensorERKN2at6TensorE7int64_tRKN2at6TensorERKN2at6TensorERKN2at6TensorEd7int64_td", "generic_histogram_binning_calibration_by_feature_cpu::bin_num_positives"], [11, 2, 1, "_CPPv452generic_histogram_binning_calibration_by_feature_cpuRKN2at6TensorERKN2at6TensorERKN2at6TensorE7int64_tRKN2at6TensorERKN2at6TensorERKN2at6TensorEd7int64_td", "generic_histogram_binning_calibration_by_feature_cpu::logit"], [11, 2, 1, "_CPPv452generic_histogram_binning_calibration_by_feature_cpuRKN2at6TensorERKN2at6TensorERKN2at6TensorE7int64_tRKN2at6TensorERKN2at6TensorERKN2at6TensorEd7int64_td", "generic_histogram_binning_calibration_by_feature_cpu::num_segments"], [11, 2, 1, "_CPPv452generic_histogram_binning_calibration_by_feature_cpuRKN2at6TensorERKN2at6TensorERKN2at6TensorE7int64_tRKN2at6TensorERKN2at6TensorERKN2at6TensorEd7int64_td", "generic_histogram_binning_calibration_by_feature_cpu::positive_weight"], [11, 2, 1, "_CPPv452generic_histogram_binning_calibration_by_feature_cpuRKN2at6TensorERKN2at6TensorERKN2at6TensorE7int64_tRKN2at6TensorERKN2at6TensorERKN2at6TensorEd7int64_td", "generic_histogram_binning_calibration_by_feature_cpu::segment_lengths"], [11, 2, 1, "_CPPv452generic_histogram_binning_calibration_by_feature_cpuRKN2at6TensorERKN2at6TensorERKN2at6TensorE7int64_tRKN2at6TensorERKN2at6TensorERKN2at6TensorEd7int64_td", "generic_histogram_binning_calibration_by_feature_cpu::segment_value"], [12, 1, 1, "_CPPv423get_unique_indices_cudaRKN2at6TensorEK7int64_tKb", "get_unique_indices_cuda"], [12, 2, 1, "_CPPv423get_unique_indices_cudaRKN2at6TensorEK7int64_tKb", "get_unique_indices_cuda::compute_count"], [12, 2, 1, "_CPPv423get_unique_indices_cudaRKN2at6TensorEK7int64_tKb", "get_unique_indices_cuda::linear_indices"], [12, 2, 1, "_CPPv423get_unique_indices_cudaRKN2at6TensorEK7int64_tKb", "get_unique_indices_cuda::max_indices"], [12, 1, 1, "_CPPv436get_unique_indices_with_inverse_cudaRKN2at6TensorEK7int64_tKbKb", "get_unique_indices_with_inverse_cuda"], [12, 2, 1, "_CPPv436get_unique_indices_with_inverse_cudaRKN2at6TensorEK7int64_tKbKb", "get_unique_indices_with_inverse_cuda::compute_count"], [12, 2, 1, "_CPPv436get_unique_indices_with_inverse_cudaRKN2at6TensorEK7int64_tKbKb", "get_unique_indices_with_inverse_cuda::compute_inverse_indices"], [12, 2, 1, "_CPPv436get_unique_indices_with_inverse_cudaRKN2at6TensorEK7int64_tKbKb", "get_unique_indices_with_inverse_cuda::linear_indices"], [12, 2, 1, "_CPPv436get_unique_indices_with_inverse_cudaRKN2at6TensorEK7int64_tKbKb", "get_unique_indices_with_inverse_cuda::max_indices"], [4, 1, 1, "_CPPv415gqa_attn_splitkRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorEKdK7int64_tK7int64_tKbK7int64_t", "gqa_attn_splitk"], [4, 2, 1, "_CPPv415gqa_attn_splitkRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorEKdK7int64_tK7int64_tKbK7int64_t", "gqa_attn_splitk::XQ"], [4, 2, 1, "_CPPv415gqa_attn_splitkRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorEKdK7int64_tK7int64_tKbK7int64_t", "gqa_attn_splitk::cache_K"], [4, 2, 1, "_CPPv415gqa_attn_splitkRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorEKdK7int64_tK7int64_tKbK7int64_t", "gqa_attn_splitk::cache_V"], [4, 2, 1, "_CPPv415gqa_attn_splitkRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorEKdK7int64_tK7int64_tKbK7int64_t", "gqa_attn_splitk::cache_logical_dtype_int"], [4, 2, 1, "_CPPv415gqa_attn_splitkRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorEKdK7int64_tK7int64_tKbK7int64_t", "gqa_attn_splitk::kv_cache_quant_num_groups"], [4, 2, 1, "_CPPv415gqa_attn_splitkRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorEKdK7int64_tK7int64_tKbK7int64_t", "gqa_attn_splitk::num_split_ks"], [4, 2, 1, "_CPPv415gqa_attn_splitkRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorEKdK7int64_tK7int64_tKbK7int64_t", "gqa_attn_splitk::qk_scale"], [4, 2, 1, "_CPPv415gqa_attn_splitkRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorEKdK7int64_tK7int64_tKbK7int64_t", "gqa_attn_splitk::seq_positions"], [4, 2, 1, "_CPPv415gqa_attn_splitkRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorEKdK7int64_tK7int64_tKbK7int64_t", "gqa_attn_splitk::use_tensor_cores"], [10, 1, 1, "_CPPv428half_to_fused8bitrowwise_cpuRK6Tensor", "half_to_fused8bitrowwise_cpu"], [10, 2, 1, "_CPPv428half_to_fused8bitrowwise_cpuRK6Tensor", "half_to_fused8bitrowwise_cpu::input"], [13, 1, 1, "_CPPv410hash_shard7int64_t6size_t", "hash_shard"], [13, 2, 1, "_CPPv410hash_shard7int64_t6size_t", "hash_shard::id"], [13, 2, 1, "_CPPv410hash_shard7int64_t6size_t", "hash_shard::num_shards"], [12, 1, 1, "_CPPv419host_lxu_cache_slot7int64_t7int64_t", "host_lxu_cache_slot"], [12, 2, 1, "_CPPv419host_lxu_cache_slot7int64_t7int64_t", "host_lxu_cache_slot::C"], [12, 2, 1, "_CPPv419host_lxu_cache_slot7int64_t7int64_t", "host_lxu_cache_slot::h_in"], [3, 1, 1, "_CPPv448int_nbit_split_embedding_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function"], [3, 2, 1, "_CPPv448int_nbit_split_embedding_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function::D_offsets"], [3, 2, 1, "_CPPv448int_nbit_split_embedding_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function::dev_weights"], [3, 2, 1, "_CPPv448int_nbit_split_embedding_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function::fp8_exponent_bias"], [3, 2, 1, "_CPPv448int_nbit_split_embedding_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function::fp8_exponent_bits"], [3, 2, 1, "_CPPv448int_nbit_split_embedding_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function::indice_weights"], [3, 2, 1, "_CPPv448int_nbit_split_embedding_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function::indices"], [3, 2, 1, "_CPPv448int_nbit_split_embedding_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function::lxu_cache_locations"], [3, 2, 1, "_CPPv448int_nbit_split_embedding_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function::lxu_cache_weights"], [3, 2, 1, "_CPPv448int_nbit_split_embedding_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function::max_float16_D"], [3, 2, 1, "_CPPv448int_nbit_split_embedding_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function::max_float32_D"], [3, 2, 1, "_CPPv448int_nbit_split_embedding_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function::max_float8_D"], [3, 2, 1, "_CPPv448int_nbit_split_embedding_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function::max_int2_D"], [3, 2, 1, "_CPPv448int_nbit_split_embedding_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function::max_int4_D"], [3, 2, 1, "_CPPv448int_nbit_split_embedding_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function::max_int8_D"], [3, 2, 1, "_CPPv448int_nbit_split_embedding_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function::offsets"], [3, 2, 1, "_CPPv448int_nbit_split_embedding_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function::output_dtype"], [3, 2, 1, "_CPPv448int_nbit_split_embedding_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function::pooling_mode"], [3, 2, 1, "_CPPv448int_nbit_split_embedding_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function::row_alignment"], [3, 2, 1, "_CPPv448int_nbit_split_embedding_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function::total_D"], [3, 2, 1, "_CPPv448int_nbit_split_embedding_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function::uvm_weights"], [3, 2, 1, "_CPPv448int_nbit_split_embedding_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function::weights_offsets"], [3, 2, 1, "_CPPv448int_nbit_split_embedding_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function::weights_placements"], [3, 2, 1, "_CPPv448int_nbit_split_embedding_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function::weights_tys"], [3, 1, 1, "_CPPv452int_nbit_split_embedding_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function_cpu"], [3, 2, 1, "_CPPv452int_nbit_split_embedding_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function_cpu::D_offsets"], [3, 2, 1, "_CPPv452int_nbit_split_embedding_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function_cpu::dev_weights"], [3, 2, 1, "_CPPv452int_nbit_split_embedding_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function_cpu::fp8_exponent_bias"], [3, 2, 1, "_CPPv452int_nbit_split_embedding_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function_cpu::fp8_exponent_bits"], [3, 2, 1, "_CPPv452int_nbit_split_embedding_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function_cpu::indice_weights"], [3, 2, 1, "_CPPv452int_nbit_split_embedding_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function_cpu::indices"], [3, 2, 1, "_CPPv452int_nbit_split_embedding_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function_cpu::lxu_cache_locations"], [3, 2, 1, "_CPPv452int_nbit_split_embedding_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function_cpu::lxu_cache_weights"], [3, 2, 1, "_CPPv452int_nbit_split_embedding_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function_cpu::max_float16_D"], [3, 2, 1, "_CPPv452int_nbit_split_embedding_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function_cpu::max_float32_D"], [3, 2, 1, "_CPPv452int_nbit_split_embedding_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function_cpu::max_float8_D"], [3, 2, 1, "_CPPv452int_nbit_split_embedding_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function_cpu::max_int2_D"], [3, 2, 1, "_CPPv452int_nbit_split_embedding_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function_cpu::max_int4_D"], [3, 2, 1, "_CPPv452int_nbit_split_embedding_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function_cpu::max_int8_D"], [3, 2, 1, "_CPPv452int_nbit_split_embedding_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function_cpu::offsets"], [3, 2, 1, "_CPPv452int_nbit_split_embedding_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function_cpu::output_dtype"], [3, 2, 1, "_CPPv452int_nbit_split_embedding_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function_cpu::pooling_mode"], [3, 2, 1, "_CPPv452int_nbit_split_embedding_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function_cpu::row_alignment"], [3, 2, 1, "_CPPv452int_nbit_split_embedding_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function_cpu::total_D"], [3, 2, 1, "_CPPv452int_nbit_split_embedding_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function_cpu::uvm_weights"], [3, 2, 1, "_CPPv452int_nbit_split_embedding_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function_cpu::weights_offsets"], [3, 2, 1, "_CPPv452int_nbit_split_embedding_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function_cpu::weights_placements"], [3, 2, 1, "_CPPv452int_nbit_split_embedding_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function_cpu::weights_tys"], [3, 1, 1, "_CPPv460int_nbit_split_embedding_uvm_caching_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function"], [3, 2, 1, "_CPPv460int_nbit_split_embedding_uvm_caching_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function::D_offsets"], [3, 2, 1, "_CPPv460int_nbit_split_embedding_uvm_caching_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function::cache_hash_size_cumsum"], [3, 2, 1, "_CPPv460int_nbit_split_embedding_uvm_caching_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function::cache_index_table_map"], [3, 2, 1, "_CPPv460int_nbit_split_embedding_uvm_caching_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function::dev_weights"], [3, 2, 1, "_CPPv460int_nbit_split_embedding_uvm_caching_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function::fp8_exponent_bias"], [3, 2, 1, "_CPPv460int_nbit_split_embedding_uvm_caching_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function::fp8_exponent_bits"], [3, 2, 1, "_CPPv460int_nbit_split_embedding_uvm_caching_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function::indice_weights"], [3, 2, 1, "_CPPv460int_nbit_split_embedding_uvm_caching_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function::indices"], [3, 2, 1, "_CPPv460int_nbit_split_embedding_uvm_caching_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function::lxu_cache_locations"], [3, 2, 1, "_CPPv460int_nbit_split_embedding_uvm_caching_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function::lxu_cache_state"], [3, 2, 1, "_CPPv460int_nbit_split_embedding_uvm_caching_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function::lxu_cache_weights"], [3, 2, 1, "_CPPv460int_nbit_split_embedding_uvm_caching_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function::lxu_state"], [3, 2, 1, "_CPPv460int_nbit_split_embedding_uvm_caching_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function::max_float16_D"], [3, 2, 1, "_CPPv460int_nbit_split_embedding_uvm_caching_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function::max_float32_D"], [3, 2, 1, "_CPPv460int_nbit_split_embedding_uvm_caching_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function::max_float8_D"], [3, 2, 1, "_CPPv460int_nbit_split_embedding_uvm_caching_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function::max_int2_D"], [3, 2, 1, "_CPPv460int_nbit_split_embedding_uvm_caching_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function::max_int4_D"], [3, 2, 1, "_CPPv460int_nbit_split_embedding_uvm_caching_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function::max_int8_D"], [3, 2, 1, "_CPPv460int_nbit_split_embedding_uvm_caching_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function::offsets"], [3, 2, 1, "_CPPv460int_nbit_split_embedding_uvm_caching_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function::output_dtype"], [3, 2, 1, "_CPPv460int_nbit_split_embedding_uvm_caching_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function::pooling_mode"], [3, 2, 1, "_CPPv460int_nbit_split_embedding_uvm_caching_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function::row_alignment"], [3, 2, 1, "_CPPv460int_nbit_split_embedding_uvm_caching_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function::total_D"], [3, 2, 1, "_CPPv460int_nbit_split_embedding_uvm_caching_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function::total_cache_hash_size"], [3, 2, 1, "_CPPv460int_nbit_split_embedding_uvm_caching_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function::uvm_weights"], [3, 2, 1, "_CPPv460int_nbit_split_embedding_uvm_caching_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function::weights_offsets"], [3, 2, 1, "_CPPv460int_nbit_split_embedding_uvm_caching_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function::weights_placements"], [3, 2, 1, "_CPPv460int_nbit_split_embedding_uvm_caching_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function::weights_tys"], [3, 1, 1, "_CPPv464int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu"], [3, 2, 1, "_CPPv464int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu::D_offsets"], [3, 2, 1, "_CPPv464int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu::cache_hash_size_cumsum"], [3, 2, 1, "_CPPv464int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu::cache_index_table_map"], [3, 2, 1, "_CPPv464int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu::dev_weights"], [3, 2, 1, "_CPPv464int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu::fp8_exponent_bias"], [3, 2, 1, "_CPPv464int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu::fp8_exponent_bits"], [3, 2, 1, "_CPPv464int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu::indice_weights"], [3, 2, 1, "_CPPv464int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu::indices"], [3, 2, 1, "_CPPv464int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu::lxu_cache_locations"], [3, 2, 1, "_CPPv464int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu::lxu_cache_state"], [3, 2, 1, "_CPPv464int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu::lxu_cache_weights"], [3, 2, 1, "_CPPv464int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu::lxu_state"], [3, 2, 1, "_CPPv464int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu::max_float16_D"], [3, 2, 1, "_CPPv464int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu::max_float32_D"], [3, 2, 1, "_CPPv464int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu::max_float8_D"], [3, 2, 1, "_CPPv464int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu::max_int2_D"], [3, 2, 1, "_CPPv464int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu::max_int4_D"], [3, 2, 1, "_CPPv464int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu::max_int8_D"], [3, 2, 1, "_CPPv464int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu::offsets"], [3, 2, 1, "_CPPv464int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu::output_dtype"], [3, 2, 1, "_CPPv464int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu::pooling_mode"], [3, 2, 1, "_CPPv464int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu::row_alignment"], [3, 2, 1, "_CPPv464int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu::total_D"], [3, 2, 1, "_CPPv464int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu::total_cache_hash_size"], [3, 2, 1, "_CPPv464int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu::uvm_weights"], [3, 2, 1, "_CPPv464int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu::weights_offsets"], [3, 2, 1, "_CPPv464int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu::weights_placements"], [3, 2, 1, "_CPPv464int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu::weights_tys"], [8, 1, 1, "_CPPv413is_uvm_tensorRK6Tensor", "is_uvm_tensor"], [8, 2, 1, "_CPPv413is_uvm_tensorRK6Tensor", "is_uvm_tensor::self"], [6, 1, 1, "_CPPv418jagged_1d_to_dense6Tensor6TensorN3c106SymIntE7int64_t", "jagged_1d_to_dense"], [6, 2, 1, "_CPPv418jagged_1d_to_dense6Tensor6TensorN3c106SymIntE7int64_t", "jagged_1d_to_dense::max_L"], [6, 2, 1, "_CPPv418jagged_1d_to_dense6Tensor6TensorN3c106SymIntE7int64_t", "jagged_1d_to_dense::offsets"], [6, 2, 1, "_CPPv418jagged_1d_to_dense6Tensor6TensorN3c106SymIntE7int64_t", "jagged_1d_to_dense::padding_value"], [6, 2, 1, "_CPPv418jagged_1d_to_dense6Tensor6TensorN3c106SymIntE7int64_t", "jagged_1d_to_dense::values"], [6, 1, 1, "_CPPv418jagged_2d_to_dense6Tensor6TensorN3c106SymIntE", "jagged_2d_to_dense"], [6, 2, 1, "_CPPv418jagged_2d_to_dense6Tensor6TensorN3c106SymIntE", "jagged_2d_to_dense::max_sequence_length"], [6, 2, 1, "_CPPv418jagged_2d_to_dense6Tensor6TensorN3c106SymIntE", "jagged_2d_to_dense::offsets"], [6, 2, 1, "_CPPv418jagged_2d_to_dense6Tensor6TensorN3c106SymIntE", "jagged_2d_to_dense::values"], [6, 1, 1, "_CPPv428jagged_dense_elementwise_addRK6TensorRKNSt6vectorI6TensorEERK6Tensor", "jagged_dense_elementwise_add"], [6, 2, 1, "_CPPv428jagged_dense_elementwise_addRK6TensorRKNSt6vectorI6TensorEERK6Tensor", "jagged_dense_elementwise_add::x_offsets"], [6, 2, 1, "_CPPv428jagged_dense_elementwise_addRK6TensorRKNSt6vectorI6TensorEERK6Tensor", "jagged_dense_elementwise_add::x_values"], [6, 2, 1, "_CPPv428jagged_dense_elementwise_addRK6TensorRKNSt6vectorI6TensorEERK6Tensor", "jagged_dense_elementwise_add::y"], [6, 1, 1, "_CPPv442jagged_dense_elementwise_add_jagged_outputRK6TensorRKNSt6vectorI6TensorEERK6Tensor", "jagged_dense_elementwise_add_jagged_output"], [6, 2, 1, "_CPPv442jagged_dense_elementwise_add_jagged_outputRK6TensorRKNSt6vectorI6TensorEERK6Tensor", "jagged_dense_elementwise_add_jagged_output::x_offsets"], [6, 2, 1, "_CPPv442jagged_dense_elementwise_add_jagged_outputRK6TensorRKNSt6vectorI6TensorEERK6Tensor", "jagged_dense_elementwise_add_jagged_output::x_values"], [6, 2, 1, "_CPPv442jagged_dense_elementwise_add_jagged_outputRK6TensorRKNSt6vectorI6TensorEERK6Tensor", "jagged_dense_elementwise_add_jagged_output::y"], [6, 1, 1, "_CPPv447jagged_dense_elementwise_add_jagged_output_cudaRK6TensorRKNSt6vectorI6TensorEERK6Tensor", "jagged_dense_elementwise_add_jagged_output_cuda"], [6, 2, 1, "_CPPv447jagged_dense_elementwise_add_jagged_output_cudaRK6TensorRKNSt6vectorI6TensorEERK6Tensor", "jagged_dense_elementwise_add_jagged_output_cuda::x_offsets"], [6, 2, 1, "_CPPv447jagged_dense_elementwise_add_jagged_output_cudaRK6TensorRKNSt6vectorI6TensorEERK6Tensor", "jagged_dense_elementwise_add_jagged_output_cuda::x_values"], [6, 2, 1, "_CPPv447jagged_dense_elementwise_add_jagged_output_cudaRK6TensorRKNSt6vectorI6TensorEERK6Tensor", "jagged_dense_elementwise_add_jagged_output_cuda::y"], [6, 1, 1, "_CPPv428jagged_dense_elementwise_mulRK6TensorRKNSt6vectorI6TensorEERK6Tensor", "jagged_dense_elementwise_mul"], [6, 2, 1, "_CPPv428jagged_dense_elementwise_mulRK6TensorRKNSt6vectorI6TensorEERK6Tensor", "jagged_dense_elementwise_mul::x_offsets"], [6, 2, 1, "_CPPv428jagged_dense_elementwise_mulRK6TensorRKNSt6vectorI6TensorEERK6Tensor", "jagged_dense_elementwise_mul::x_values"], [6, 2, 1, "_CPPv428jagged_dense_elementwise_mulRK6TensorRKNSt6vectorI6TensorEERK6Tensor", "jagged_dense_elementwise_mul::y"], [6, 1, 1, "_CPPv422jagged_to_padded_denseRK6TensorRKNSt6vectorI6TensorEEKN3c1014SymIntArrayRefEKd", "jagged_to_padded_dense"], [6, 2, 1, "_CPPv422jagged_to_padded_denseRK6TensorRKNSt6vectorI6TensorEEKN3c1014SymIntArrayRefEKd", "jagged_to_padded_dense::max_lengths"], [6, 2, 1, "_CPPv422jagged_to_padded_denseRK6TensorRKNSt6vectorI6TensorEEKN3c1014SymIntArrayRefEKd", "jagged_to_padded_dense::offsets"], [6, 2, 1, "_CPPv422jagged_to_padded_denseRK6TensorRKNSt6vectorI6TensorEEKN3c1014SymIntArrayRefEKd", "jagged_to_padded_dense::padding_value"], [6, 2, 1, "_CPPv422jagged_to_padded_denseRK6TensorRKNSt6vectorI6TensorEEKN3c1014SymIntArrayRefEKd", "jagged_to_padded_dense::values"], [6, 1, 1, "_CPPv430jagged_to_padded_dense_forwardRK6TensorRKNSt6vectorI6TensorEEN3c1014SymIntArrayRefEKd", "jagged_to_padded_dense_forward"], [6, 2, 1, "_CPPv430jagged_to_padded_dense_forwardRK6TensorRKNSt6vectorI6TensorEEN3c1014SymIntArrayRefEKd", "jagged_to_padded_dense_forward::max_lengths"], [6, 2, 1, "_CPPv430jagged_to_padded_dense_forwardRK6TensorRKNSt6vectorI6TensorEEN3c1014SymIntArrayRefEKd", "jagged_to_padded_dense_forward::offsets"], [6, 2, 1, "_CPPv430jagged_to_padded_dense_forwardRK6TensorRKNSt6vectorI6TensorEEN3c1014SymIntArrayRefEKd", "jagged_to_padded_dense_forward::padding_value"], [6, 2, 1, "_CPPv430jagged_to_padded_dense_forwardRK6TensorRKNSt6vectorI6TensorEEN3c1014SymIntArrayRefEKd", "jagged_to_padded_dense_forward::values"], [13, 5, 1, "_CPPv4N5kv_db12CacheContextE", "kv_db::CacheContext"], [13, 5, 1, "_CPPv4N5kv_db13EmbeddingKVDBE", "kv_db::EmbeddingKVDB"], [13, 5, 1, "_CPPv4N5kv_db9QueueItemE", "kv_db::QueueItem"], [13, 5, 1, "_CPPv4N8l2_cache13CacheLibCacheE", "l2_cache::CacheLibCache"], [12, 1, 1, "_CPPv428linearize_cache_indices_cudaRKN2at6TensorERKN2at6TensorERKN2at6TensorERKNSt8optionalIN2at6TensorEEEK7int64_tK7int64_t", "linearize_cache_indices_cuda"], [12, 2, 1, "_CPPv428linearize_cache_indices_cudaRKN2at6TensorERKN2at6TensorERKN2at6TensorERKNSt8optionalIN2at6TensorEEEK7int64_tK7int64_t", "linearize_cache_indices_cuda::B_offsets"], [12, 2, 1, "_CPPv428linearize_cache_indices_cudaRKN2at6TensorERKN2at6TensorERKN2at6TensorERKNSt8optionalIN2at6TensorEEEK7int64_tK7int64_t", "linearize_cache_indices_cuda::cache_hash_size_cumsum"], [12, 2, 1, "_CPPv428linearize_cache_indices_cudaRKN2at6TensorERKN2at6TensorERKN2at6TensorERKNSt8optionalIN2at6TensorEEEK7int64_tK7int64_t", "linearize_cache_indices_cuda::indices"], [12, 2, 1, "_CPPv428linearize_cache_indices_cudaRKN2at6TensorERKN2at6TensorERKN2at6TensorERKNSt8optionalIN2at6TensorEEEK7int64_tK7int64_t", "linearize_cache_indices_cuda::indices_base_offset"], [12, 2, 1, "_CPPv428linearize_cache_indices_cudaRKN2at6TensorERKN2at6TensorERKN2at6TensorERKNSt8optionalIN2at6TensorEEEK7int64_tK7int64_t", "linearize_cache_indices_cuda::max_B"], [12, 2, 1, "_CPPv428linearize_cache_indices_cudaRKN2at6TensorERKN2at6TensorERKN2at6TensorERKNSt8optionalIN2at6TensorEEEK7int64_tK7int64_t", "linearize_cache_indices_cuda::offsets"], [12, 1, 1, "_CPPv441linearize_cache_indices_from_row_idx_cudaN2at6TensorEN2at6TensorEN2at6TensorE", "linearize_cache_indices_from_row_idx_cuda"], [12, 2, 1, "_CPPv441linearize_cache_indices_from_row_idx_cudaN2at6TensorEN2at6TensorEN2at6TensorE", "linearize_cache_indices_from_row_idx_cuda::cache_hash_size_cumsum"], [12, 2, 1, "_CPPv441linearize_cache_indices_from_row_idx_cudaN2at6TensorEN2at6TensorEN2at6TensorE", "linearize_cache_indices_from_row_idx_cuda::update_row_indices"], [12, 2, 1, "_CPPv441linearize_cache_indices_from_row_idx_cudaN2at6TensorEN2at6TensorEN2at6TensorE", "linearize_cache_indices_from_row_idx_cuda::update_table_indices"], [12, 1, 1, "_CPPv428lru_cache_find_uncached_cudaN2at6TensorEN2at6TensorE7int64_tN2at6TensorE7int64_tN2at6TensorEbN2at6TensorEbN2at6TensorEKb", "lru_cache_find_uncached_cuda"], [12, 2, 1, "_CPPv428lru_cache_find_uncached_cudaN2at6TensorEN2at6TensorE7int64_tN2at6TensorE7int64_tN2at6TensorEbN2at6TensorEbN2at6TensorEKb", "lru_cache_find_uncached_cuda::compute_inverse_indices"], [12, 2, 1, "_CPPv428lru_cache_find_uncached_cudaN2at6TensorEN2at6TensorE7int64_tN2at6TensorE7int64_tN2at6TensorEbN2at6TensorEbN2at6TensorEKb", "lru_cache_find_uncached_cuda::gather_cache_stats"], [12, 2, 1, "_CPPv428lru_cache_find_uncached_cudaN2at6TensorEN2at6TensorE7int64_tN2at6TensorE7int64_tN2at6TensorEbN2at6TensorEbN2at6TensorEKb", "lru_cache_find_uncached_cuda::lock_cache_line"], [12, 2, 1, "_CPPv428lru_cache_find_uncached_cudaN2at6TensorEN2at6TensorE7int64_tN2at6TensorE7int64_tN2at6TensorEbN2at6TensorEbN2at6TensorEKb", "lru_cache_find_uncached_cuda::lru_state"], [12, 2, 1, "_CPPv428lru_cache_find_uncached_cudaN2at6TensorEN2at6TensorE7int64_tN2at6TensorE7int64_tN2at6TensorEbN2at6TensorEbN2at6TensorEKb", "lru_cache_find_uncached_cuda::lxu_cache_locking_counter"], [12, 2, 1, "_CPPv428lru_cache_find_uncached_cudaN2at6TensorEN2at6TensorE7int64_tN2at6TensorE7int64_tN2at6TensorEbN2at6TensorEbN2at6TensorEKb", "lru_cache_find_uncached_cuda::lxu_cache_state"], [12, 2, 1, "_CPPv428lru_cache_find_uncached_cudaN2at6TensorEN2at6TensorE7int64_tN2at6TensorE7int64_tN2at6TensorEbN2at6TensorEbN2at6TensorEKb", "lru_cache_find_uncached_cuda::max_indices"], [12, 2, 1, "_CPPv428lru_cache_find_uncached_cudaN2at6TensorEN2at6TensorE7int64_tN2at6TensorE7int64_tN2at6TensorEbN2at6TensorEbN2at6TensorEKb", "lru_cache_find_uncached_cuda::time_stamp"], [12, 2, 1, "_CPPv428lru_cache_find_uncached_cudaN2at6TensorEN2at6TensorE7int64_tN2at6TensorE7int64_tN2at6TensorEbN2at6TensorEbN2at6TensorEKb", "lru_cache_find_uncached_cuda::unique_indices"], [12, 2, 1, "_CPPv428lru_cache_find_uncached_cudaN2at6TensorEN2at6TensorE7int64_tN2at6TensorE7int64_tN2at6TensorEbN2at6TensorEbN2at6TensorEKb", "lru_cache_find_uncached_cuda::unique_indices_length"], [12, 2, 1, "_CPPv428lru_cache_find_uncached_cudaN2at6TensorEN2at6TensorE7int64_tN2at6TensorE7int64_tN2at6TensorEbN2at6TensorEbN2at6TensorEKb", "lru_cache_find_uncached_cuda::uvm_cache_stats"], [12, 1, 1, "_CPPv420lxu_cache_flush_cudaN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorEb", "lxu_cache_flush_cuda"], [12, 2, 1, "_CPPv420lxu_cache_flush_cudaN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorEb", "lxu_cache_flush_cuda::D_offsets"], [12, 2, 1, "_CPPv420lxu_cache_flush_cudaN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorEb", "lxu_cache_flush_cuda::cache_hash_size_cumsum"], [12, 2, 1, "_CPPv420lxu_cache_flush_cudaN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorEb", "lxu_cache_flush_cuda::cache_index_table_map"], [12, 2, 1, "_CPPv420lxu_cache_flush_cudaN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorEb", "lxu_cache_flush_cuda::lxu_cache_state"], [12, 2, 1, "_CPPv420lxu_cache_flush_cudaN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorEb", "lxu_cache_flush_cuda::lxu_cache_weights"], [12, 2, 1, "_CPPv420lxu_cache_flush_cudaN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorEb", "lxu_cache_flush_cuda::stochastic_rounding"], [12, 2, 1, "_CPPv420lxu_cache_flush_cudaN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorEb", "lxu_cache_flush_cuda::total_D"], [12, 2, 1, "_CPPv420lxu_cache_flush_cudaN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorEb", "lxu_cache_flush_cuda::uvm_weights"], [12, 2, 1, "_CPPv420lxu_cache_flush_cudaN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorEb", "lxu_cache_flush_cuda::weights_offsets"], [12, 1, 1, "_CPPv431lxu_cache_locations_update_cudaN2at6TensorEN2at6TensorENSt8optionalIN2at6TensorEEE", "lxu_cache_locations_update_cuda"], [12, 2, 1, "_CPPv431lxu_cache_locations_update_cudaN2at6TensorEN2at6TensorENSt8optionalIN2at6TensorEEE", "lxu_cache_locations_update_cuda::lxu_cache_locations"], [12, 2, 1, "_CPPv431lxu_cache_locations_update_cudaN2at6TensorEN2at6TensorENSt8optionalIN2at6TensorEEE", "lxu_cache_locations_update_cuda::lxu_cache_locations_new"], [12, 2, 1, "_CPPv431lxu_cache_locations_update_cudaN2at6TensorEN2at6TensorENSt8optionalIN2at6TensorEEE", "lxu_cache_locations_update_cuda::num_uniq_cache_indices"], [12, 1, 1, "_CPPv440lxu_cache_locking_counter_decrement_cudaN2at6TensorEN2at6TensorE", "lxu_cache_locking_counter_decrement_cuda"], [12, 2, 1, "_CPPv440lxu_cache_locking_counter_decrement_cudaN2at6TensorEN2at6TensorE", "lxu_cache_locking_counter_decrement_cuda::lxu_cache_locations"], [12, 2, 1, "_CPPv440lxu_cache_locking_counter_decrement_cudaN2at6TensorEN2at6TensorE", "lxu_cache_locking_counter_decrement_cuda::lxu_cache_locking_counter"], [13, 1, 1, "_CPPv421masked_index_put_cuda6Tensor6Tensor6Tensor6TensorKbK7int64_t", "masked_index_put_cuda"], [13, 2, 1, "_CPPv421masked_index_put_cuda6Tensor6Tensor6Tensor6TensorKbK7int64_t", "masked_index_put_cuda::count"], [13, 2, 1, "_CPPv421masked_index_put_cuda6Tensor6Tensor6Tensor6TensorKbK7int64_t", "masked_index_put_cuda::indices"], [13, 2, 1, "_CPPv421masked_index_put_cuda6Tensor6Tensor6Tensor6TensorKbK7int64_t", "masked_index_put_cuda::preferred_sms"], [13, 2, 1, "_CPPv421masked_index_put_cuda6Tensor6Tensor6Tensor6TensorKbK7int64_t", "masked_index_put_cuda::self"], [13, 2, 1, "_CPPv421masked_index_put_cuda6Tensor6Tensor6Tensor6TensorKbK7int64_t", "masked_index_put_cuda::use_pipeline"], [13, 2, 1, "_CPPv421masked_index_put_cuda6Tensor6Tensor6Tensor6TensorKbK7int64_t", "masked_index_put_cuda::values"], [13, 1, 1, "_CPPv424masked_index_select_cuda6Tensor6Tensor6Tensor6TensorKbK7int64_t", "masked_index_select_cuda"], [13, 2, 1, "_CPPv424masked_index_select_cuda6Tensor6Tensor6Tensor6TensorKbK7int64_t", "masked_index_select_cuda::count"], [13, 2, 1, "_CPPv424masked_index_select_cuda6Tensor6Tensor6Tensor6TensorKbK7int64_t", "masked_index_select_cuda::indices"], [13, 2, 1, "_CPPv424masked_index_select_cuda6Tensor6Tensor6Tensor6TensorKbK7int64_t", "masked_index_select_cuda::preferred_sms"], [13, 2, 1, "_CPPv424masked_index_select_cuda6Tensor6Tensor6Tensor6TensorKbK7int64_t", "masked_index_select_cuda::self"], [13, 2, 1, "_CPPv424masked_index_select_cuda6Tensor6Tensor6Tensor6TensorKbK7int64_t", "masked_index_select_cuda::use_pipeline"], [13, 2, 1, "_CPPv424masked_index_select_cuda6Tensor6Tensor6Tensor6TensorKbK7int64_t", "masked_index_select_cuda::values"], [8, 1, 1, "_CPPv422new_host_mapped_tensorRK6TensorRKNSt6vectorINSt7int64_tEEE", "new_host_mapped_tensor"], [8, 2, 1, "_CPPv422new_host_mapped_tensorRK6TensorRKNSt6vectorINSt7int64_tEEE", "new_host_mapped_tensor::self"], [8, 2, 1, "_CPPv422new_host_mapped_tensorRK6TensorRKNSt6vectorINSt7int64_tEEE", "new_host_mapped_tensor::sizes"], [8, 1, 1, "_CPPv418new_managed_tensorRK6TensorRKNSt6vectorINSt7int64_tEEE", "new_managed_tensor"], [8, 2, 1, "_CPPv418new_managed_tensorRK6TensorRKNSt6vectorINSt7int64_tEEE", "new_managed_tensor::self"], [8, 2, 1, "_CPPv418new_managed_tensorRK6TensorRKNSt6vectorINSt7int64_tEEE", "new_managed_tensor::sizes"], [8, 1, 1, "_CPPv423new_managed_tensor_metaRK6TensorRKNSt6vectorINSt7int64_tEEE", "new_managed_tensor_meta"], [8, 2, 1, "_CPPv423new_managed_tensor_metaRK6TensorRKNSt6vectorINSt7int64_tEEE", "new_managed_tensor_meta::self"], [8, 2, 1, "_CPPv423new_managed_tensor_metaRK6TensorRKNSt6vectorINSt7int64_tEEE", "new_managed_tensor_meta::sizes"], [8, 1, 1, "_CPPv418new_unified_tensorRK6TensorRKNSt6vectorINSt7int64_tEEEb", "new_unified_tensor"], [8, 2, 1, "_CPPv418new_unified_tensorRK6TensorRKNSt6vectorINSt7int64_tEEEb", "new_unified_tensor::is_host_mapped"], [8, 2, 1, "_CPPv418new_unified_tensorRK6TensorRKNSt6vectorINSt7int64_tEEEb", "new_unified_tensor::self"], [8, 2, 1, "_CPPv418new_unified_tensorRK6TensorRKNSt6vectorINSt7int64_tEEEb", "new_unified_tensor::sizes"], [8, 1, 1, "_CPPv423new_unified_tensor_metaRK6TensorRKNSt6vectorINSt7int64_tEEEb", "new_unified_tensor_meta"], [8, 2, 1, "_CPPv423new_unified_tensor_metaRK6TensorRKNSt6vectorINSt7int64_tEEEb", "new_unified_tensor_meta::is_host_mapped"], [8, 2, 1, "_CPPv423new_unified_tensor_metaRK6TensorRKNSt6vectorINSt7int64_tEEEb", "new_unified_tensor_meta::self"], [8, 2, 1, "_CPPv423new_unified_tensor_metaRK6TensorRKNSt6vectorINSt7int64_tEEEb", "new_unified_tensor_meta::sizes"], [8, 1, 1, "_CPPv426new_vanilla_managed_tensorRK6TensorRKNSt6vectorINSt7int64_tEEE", "new_vanilla_managed_tensor"], [8, 2, 1, "_CPPv426new_vanilla_managed_tensorRK6TensorRKNSt6vectorINSt7int64_tEEE", "new_vanilla_managed_tensor::self"], [8, 2, 1, "_CPPv426new_vanilla_managed_tensorRK6TensorRKNSt6vectorINSt7int64_tEEE", "new_vanilla_managed_tensor::sizes"], [5, 1, 1, "_CPPv435padding_fused_tbe_input_combine_cpuRKNSt6vectorIN2at6TensorEEERKNSt6vectorIN2at6TensorEEERKNSt6vectorIN2at6TensorEEERKN2at6TensorE7int64_t", "padding_fused_tbe_input_combine_cpu"], [5, 2, 1, "_CPPv435padding_fused_tbe_input_combine_cpuRKNSt6vectorIN2at6TensorEEERKNSt6vectorIN2at6TensorEEERKNSt6vectorIN2at6TensorEEERKN2at6TensorE7int64_t", "padding_fused_tbe_input_combine_cpu::batch_size"], [5, 2, 1, "_CPPv435padding_fused_tbe_input_combine_cpuRKNSt6vectorIN2at6TensorEEERKNSt6vectorIN2at6TensorEEERKNSt6vectorIN2at6TensorEEERKN2at6TensorE7int64_t", "padding_fused_tbe_input_combine_cpu::include_last_offsets"], [5, 2, 1, "_CPPv435padding_fused_tbe_input_combine_cpuRKNSt6vectorIN2at6TensorEEERKNSt6vectorIN2at6TensorEEERKNSt6vectorIN2at6TensorEEERKN2at6TensorE7int64_t", "padding_fused_tbe_input_combine_cpu::indices_list"], [5, 2, 1, "_CPPv435padding_fused_tbe_input_combine_cpuRKNSt6vectorIN2at6TensorEEERKNSt6vectorIN2at6TensorEEERKNSt6vectorIN2at6TensorEEERKN2at6TensorE7int64_t", "padding_fused_tbe_input_combine_cpu::offsets_list"], [5, 2, 1, "_CPPv435padding_fused_tbe_input_combine_cpuRKNSt6vectorIN2at6TensorEEERKNSt6vectorIN2at6TensorEEERKNSt6vectorIN2at6TensorEEERKN2at6TensorE7int64_t", "padding_fused_tbe_input_combine_cpu::per_sample_weights"], [9, 1, 1, "_CPPv429permute_pooled_embs_auto_gradRK6TensorRK6TensorRK6TensorRK6TensorRK6Tensor", "permute_pooled_embs_auto_grad"], [9, 2, 1, "_CPPv429permute_pooled_embs_auto_gradRK6TensorRK6TensorRK6TensorRK6TensorRK6Tensor", "permute_pooled_embs_auto_grad::inv_offset_dim_list"], [9, 2, 1, "_CPPv429permute_pooled_embs_auto_gradRK6TensorRK6TensorRK6TensorRK6TensorRK6Tensor", "permute_pooled_embs_auto_grad::inv_permute_list"], [9, 2, 1, "_CPPv429permute_pooled_embs_auto_gradRK6TensorRK6TensorRK6TensorRK6TensorRK6Tensor", "permute_pooled_embs_auto_grad::offset_dim_list"], [9, 2, 1, "_CPPv429permute_pooled_embs_auto_gradRK6TensorRK6TensorRK6TensorRK6TensorRK6Tensor", "permute_pooled_embs_auto_grad::permute_list"], [9, 2, 1, "_CPPv429permute_pooled_embs_auto_gradRK6TensorRK6TensorRK6TensorRK6TensorRK6Tensor", "permute_pooled_embs_auto_grad::pooled_embs"], [9, 1, 1, "_CPPv433permute_pooled_embs_auto_grad_cpuRK6TensorRK6TensorRK6TensorRK6TensorRK6Tensor", "permute_pooled_embs_auto_grad_cpu"], [9, 2, 1, "_CPPv433permute_pooled_embs_auto_grad_cpuRK6TensorRK6TensorRK6TensorRK6TensorRK6Tensor", "permute_pooled_embs_auto_grad_cpu::inv_offset_dim_list"], [9, 2, 1, "_CPPv433permute_pooled_embs_auto_grad_cpuRK6TensorRK6TensorRK6TensorRK6TensorRK6Tensor", "permute_pooled_embs_auto_grad_cpu::inv_permute_list"], [9, 2, 1, "_CPPv433permute_pooled_embs_auto_grad_cpuRK6TensorRK6TensorRK6TensorRK6TensorRK6Tensor", "permute_pooled_embs_auto_grad_cpu::offset_dim_list"], [9, 2, 1, "_CPPv433permute_pooled_embs_auto_grad_cpuRK6TensorRK6TensorRK6TensorRK6TensorRK6Tensor", "permute_pooled_embs_auto_grad_cpu::permute_list"], [9, 2, 1, "_CPPv433permute_pooled_embs_auto_grad_cpuRK6TensorRK6TensorRK6TensorRK6TensorRK6Tensor", "permute_pooled_embs_auto_grad_cpu::pooled_embs"], [9, 1, 1, "_CPPv433permute_pooled_embs_auto_grad_gpuRK6TensorRK6TensorRK6TensorRK6TensorRK6Tensor", "permute_pooled_embs_auto_grad_gpu"], [9, 2, 1, "_CPPv433permute_pooled_embs_auto_grad_gpuRK6TensorRK6TensorRK6TensorRK6TensorRK6Tensor", "permute_pooled_embs_auto_grad_gpu::inv_offset_dim_list"], [9, 2, 1, "_CPPv433permute_pooled_embs_auto_grad_gpuRK6TensorRK6TensorRK6TensorRK6TensorRK6Tensor", "permute_pooled_embs_auto_grad_gpu::inv_permute_list"], [9, 2, 1, "_CPPv433permute_pooled_embs_auto_grad_gpuRK6TensorRK6TensorRK6TensorRK6TensorRK6Tensor", "permute_pooled_embs_auto_grad_gpu::offset_dim_list"], [9, 2, 1, "_CPPv433permute_pooled_embs_auto_grad_gpuRK6TensorRK6TensorRK6TensorRK6TensorRK6Tensor", "permute_pooled_embs_auto_grad_gpu::permute_list"], [9, 2, 1, "_CPPv433permute_pooled_embs_auto_grad_gpuRK6TensorRK6TensorRK6TensorRK6TensorRK6Tensor", "permute_pooled_embs_auto_grad_gpu::pooled_embs"], [9, 1, 1, "_CPPv439permute_pooled_embs_auto_grad_split_cpuRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorE", "permute_pooled_embs_auto_grad_split_cpu"], [9, 2, 1, "_CPPv439permute_pooled_embs_auto_grad_split_cpuRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorE", "permute_pooled_embs_auto_grad_split_cpu::inv_offset_dim_list"], [9, 2, 1, "_CPPv439permute_pooled_embs_auto_grad_split_cpuRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorE", "permute_pooled_embs_auto_grad_split_cpu::inv_permute_list"], [9, 2, 1, "_CPPv439permute_pooled_embs_auto_grad_split_cpuRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorE", "permute_pooled_embs_auto_grad_split_cpu::offset_dim_list"], [9, 2, 1, "_CPPv439permute_pooled_embs_auto_grad_split_cpuRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorE", "permute_pooled_embs_auto_grad_split_cpu::permute_list"], [9, 2, 1, "_CPPv439permute_pooled_embs_auto_grad_split_cpuRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorE", "permute_pooled_embs_auto_grad_split_cpu::pooled_embs"], [9, 1, 1, "_CPPv439permute_pooled_embs_auto_grad_split_gpuRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorE", "permute_pooled_embs_auto_grad_split_gpu"], [9, 2, 1, "_CPPv439permute_pooled_embs_auto_grad_split_gpuRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorE", "permute_pooled_embs_auto_grad_split_gpu::inv_offset_dim_list"], [9, 2, 1, "_CPPv439permute_pooled_embs_auto_grad_split_gpuRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorE", "permute_pooled_embs_auto_grad_split_gpu::inv_permute_list"], [9, 2, 1, "_CPPv439permute_pooled_embs_auto_grad_split_gpuRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorE", "permute_pooled_embs_auto_grad_split_gpu::offset_dim_list"], [9, 2, 1, "_CPPv439permute_pooled_embs_auto_grad_split_gpuRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorE", "permute_pooled_embs_auto_grad_split_gpu::permute_list"], [9, 2, 1, "_CPPv439permute_pooled_embs_auto_grad_split_gpuRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorE", "permute_pooled_embs_auto_grad_split_gpu::pooled_embs"], [9, 1, 1, "_CPPv428permute_pooled_embs_cpu_implRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKb", "permute_pooled_embs_cpu_impl"], [9, 2, 1, "_CPPv428permute_pooled_embs_cpu_implRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKb", "permute_pooled_embs_cpu_impl::allow_duplicates"], [9, 2, 1, "_CPPv428permute_pooled_embs_cpu_implRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKb", "permute_pooled_embs_cpu_impl::inv_offset_dim_list"], [9, 2, 1, "_CPPv428permute_pooled_embs_cpu_implRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKb", "permute_pooled_embs_cpu_impl::inv_permute_list"], [9, 2, 1, "_CPPv428permute_pooled_embs_cpu_implRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKb", "permute_pooled_embs_cpu_impl::offset_dim_list"], [9, 2, 1, "_CPPv428permute_pooled_embs_cpu_implRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKb", "permute_pooled_embs_cpu_impl::permute_list"], [9, 2, 1, "_CPPv428permute_pooled_embs_cpu_implRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKb", "permute_pooled_embs_cpu_impl::pooled_embs"], [9, 1, 1, "_CPPv429permute_pooled_embs_split_cpuRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorE", "permute_pooled_embs_split_cpu"], [9, 2, 1, "_CPPv429permute_pooled_embs_split_cpuRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorE", "permute_pooled_embs_split_cpu::inv_offset_dim_list"], [9, 2, 1, "_CPPv429permute_pooled_embs_split_cpuRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorE", "permute_pooled_embs_split_cpu::inv_permute_list"], [9, 2, 1, "_CPPv429permute_pooled_embs_split_cpuRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorE", "permute_pooled_embs_split_cpu::offset_dim_list"], [9, 2, 1, "_CPPv429permute_pooled_embs_split_cpuRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorE", "permute_pooled_embs_split_cpu::permute_list"], [9, 2, 1, "_CPPv429permute_pooled_embs_split_cpuRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorE", "permute_pooled_embs_split_cpu::pooled_embs"], [9, 1, 1, "_CPPv429permute_pooled_embs_split_gpuRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorE", "permute_pooled_embs_split_gpu"], [9, 2, 1, "_CPPv429permute_pooled_embs_split_gpuRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorE", "permute_pooled_embs_split_gpu::inv_offset_dim_list"], [9, 2, 1, "_CPPv429permute_pooled_embs_split_gpuRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorE", "permute_pooled_embs_split_gpu::inv_permute_list"], [9, 2, 1, "_CPPv429permute_pooled_embs_split_gpuRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorE", "permute_pooled_embs_split_gpu::offset_dim_list"], [9, 2, 1, "_CPPv429permute_pooled_embs_split_gpuRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorE", "permute_pooled_embs_split_gpu::permute_list"], [9, 2, 1, "_CPPv429permute_pooled_embs_split_gpuRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorE", "permute_pooled_embs_split_gpu::pooled_embs"], [3, 1, 1, "_CPPv423pruned_array_lookup_cpu6Tensor6Tensor6Tensor6Tensor", "pruned_array_lookup_cpu"], [3, 2, 1, "_CPPv423pruned_array_lookup_cpu6Tensor6Tensor6Tensor6Tensor", "pruned_array_lookup_cpu::index_remappings"], [3, 2, 1, "_CPPv423pruned_array_lookup_cpu6Tensor6Tensor6Tensor6Tensor", "pruned_array_lookup_cpu::index_remappings_offsets"], [3, 2, 1, "_CPPv423pruned_array_lookup_cpu6Tensor6Tensor6Tensor6Tensor", "pruned_array_lookup_cpu::indices"], [3, 2, 1, "_CPPv423pruned_array_lookup_cpu6Tensor6Tensor6Tensor6Tensor", "pruned_array_lookup_cpu::offsets"], [3, 1, 1, "_CPPv424pruned_array_lookup_cuda6Tensor6Tensor6Tensor6Tensor", "pruned_array_lookup_cuda"], [3, 2, 1, "_CPPv424pruned_array_lookup_cuda6Tensor6Tensor6Tensor6Tensor", "pruned_array_lookup_cuda::index_remappings"], [3, 2, 1, "_CPPv424pruned_array_lookup_cuda6Tensor6Tensor6Tensor6Tensor", "pruned_array_lookup_cuda::index_remappings_offsets"], [3, 2, 1, "_CPPv424pruned_array_lookup_cuda6Tensor6Tensor6Tensor6Tensor", "pruned_array_lookup_cuda::indices"], [3, 2, 1, "_CPPv424pruned_array_lookup_cuda6Tensor6Tensor6Tensor6Tensor", "pruned_array_lookup_cuda::offsets"], [3, 1, 1, "_CPPv436pruned_hashmap_insert_unweighted_cpu6Tensor6Tensor6Tensor6Tensor6Tensor", "pruned_hashmap_insert_unweighted_cpu"], [3, 2, 1, "_CPPv436pruned_hashmap_insert_unweighted_cpu6Tensor6Tensor6Tensor6Tensor6Tensor", "pruned_hashmap_insert_unweighted_cpu::dense_indices"], [3, 2, 1, "_CPPv436pruned_hashmap_insert_unweighted_cpu6Tensor6Tensor6Tensor6Tensor6Tensor", "pruned_hashmap_insert_unweighted_cpu::hash_table"], [3, 2, 1, "_CPPv436pruned_hashmap_insert_unweighted_cpu6Tensor6Tensor6Tensor6Tensor6Tensor", "pruned_hashmap_insert_unweighted_cpu::hash_table_offsets"], [3, 2, 1, "_CPPv436pruned_hashmap_insert_unweighted_cpu6Tensor6Tensor6Tensor6Tensor6Tensor", "pruned_hashmap_insert_unweighted_cpu::indices"], [3, 2, 1, "_CPPv436pruned_hashmap_insert_unweighted_cpu6Tensor6Tensor6Tensor6Tensor6Tensor", "pruned_hashmap_insert_unweighted_cpu::offsets"], [3, 1, 1, "_CPPv426pruned_hashmap_lookup_cuda6Tensor6Tensor6Tensor6Tensor", "pruned_hashmap_lookup_cuda"], [3, 2, 1, "_CPPv426pruned_hashmap_lookup_cuda6Tensor6Tensor6Tensor6Tensor", "pruned_hashmap_lookup_cuda::hash_table"], [3, 2, 1, "_CPPv426pruned_hashmap_lookup_cuda6Tensor6Tensor6Tensor6Tensor", "pruned_hashmap_lookup_cuda::hash_table_offsets"], [3, 2, 1, "_CPPv426pruned_hashmap_lookup_cuda6Tensor6Tensor6Tensor6Tensor", "pruned_hashmap_lookup_cuda::indices"], [3, 2, 1, "_CPPv426pruned_hashmap_lookup_cuda6Tensor6Tensor6Tensor6Tensor", "pruned_hashmap_lookup_cuda::offsets"], [3, 1, 1, "_CPPv436pruned_hashmap_lookup_unweighted_cpu6Tensor6Tensor6Tensor6Tensor", "pruned_hashmap_lookup_unweighted_cpu"], [3, 2, 1, "_CPPv436pruned_hashmap_lookup_unweighted_cpu6Tensor6Tensor6Tensor6Tensor", "pruned_hashmap_lookup_unweighted_cpu::hash_table"], [3, 2, 1, "_CPPv436pruned_hashmap_lookup_unweighted_cpu6Tensor6Tensor6Tensor6Tensor", "pruned_hashmap_lookup_unweighted_cpu::hash_table_offsets"], [3, 2, 1, "_CPPv436pruned_hashmap_lookup_unweighted_cpu6Tensor6Tensor6Tensor6Tensor", "pruned_hashmap_lookup_unweighted_cpu::indices"], [3, 2, 1, "_CPPv436pruned_hashmap_lookup_unweighted_cpu6Tensor6Tensor6Tensor6Tensor", "pruned_hashmap_lookup_unweighted_cpu::offsets"], [13, 5, 1, "_CPPv4N2ps24EmbeddingParameterServerE", "ps::EmbeddingParameterServer"], [7, 1, 1, "_CPPv432recat_embedding_grad_output_cuda6TensorRKNSt6vectorI7int64_tEE", "recat_embedding_grad_output_cuda"], [7, 2, 1, "_CPPv432recat_embedding_grad_output_cuda6TensorRKNSt6vectorI7int64_tEE", "recat_embedding_grad_output_cuda::grad_output"], [7, 2, 1, "_CPPv432recat_embedding_grad_output_cuda6TensorRKNSt6vectorI7int64_tEE", "recat_embedding_grad_output_cuda::num_features_per_rank"], [7, 1, 1, "_CPPv446recat_embedding_grad_output_mixed_D_batch_cudaRK6TensorRK6TensorRK6Tensor", "recat_embedding_grad_output_mixed_D_batch_cuda"], [7, 2, 1, "_CPPv446recat_embedding_grad_output_mixed_D_batch_cudaRK6TensorRK6TensorRK6Tensor", "recat_embedding_grad_output_mixed_D_batch_cuda::cumsum_dim_sum_per_rank"], [7, 2, 1, "_CPPv446recat_embedding_grad_output_mixed_D_batch_cudaRK6TensorRK6TensorRK6Tensor", "recat_embedding_grad_output_mixed_D_batch_cuda::dim_sum_per_rank"], [7, 2, 1, "_CPPv446recat_embedding_grad_output_mixed_D_batch_cudaRK6TensorRK6TensorRK6Tensor", "recat_embedding_grad_output_mixed_D_batch_cuda::grad_output"], [7, 1, 1, "_CPPv439recat_embedding_grad_output_mixed_D_cpuRK6TensorRKNSt6vectorI7int64_tEE", "recat_embedding_grad_output_mixed_D_cpu"], [7, 2, 1, "_CPPv439recat_embedding_grad_output_mixed_D_cpuRK6TensorRKNSt6vectorI7int64_tEE", "recat_embedding_grad_output_mixed_D_cpu::dim_sum_per_rank"], [7, 2, 1, "_CPPv439recat_embedding_grad_output_mixed_D_cpuRK6TensorRKNSt6vectorI7int64_tEE", "recat_embedding_grad_output_mixed_D_cpu::grad_output"], [7, 1, 1, "_CPPv440recat_embedding_grad_output_mixed_D_cudaRK6TensorRKNSt6vectorI7int64_tEE", "recat_embedding_grad_output_mixed_D_cuda"], [7, 2, 1, "_CPPv440recat_embedding_grad_output_mixed_D_cudaRK6TensorRKNSt6vectorI7int64_tEE", "recat_embedding_grad_output_mixed_D_cuda::dim_sum_per_rank"], [7, 2, 1, "_CPPv440recat_embedding_grad_output_mixed_D_cudaRK6TensorRKNSt6vectorI7int64_tEE", "recat_embedding_grad_output_mixed_D_cuda::grad_output"], [0, 1, 1, "_CPPv4I_b_b_23QuantizationGranularity_b_b0_bE30requantizeOutputProcessingAvx2vPNSt7uint8_tEPKNSt7int32_tERK12block_type_tiiRK22requantizationParams_tI9BIAS_TYPEE", "requantizeOutputProcessingAvx2"], [0, 3, 1, "_CPPv4I_b_b_23QuantizationGranularity_b_b0_bE30requantizeOutputProcessingAvx2vPNSt7uint8_tEPKNSt7int32_tERK12block_type_tiiRK22requantizationParams_tI9BIAS_TYPEE", "requantizeOutputProcessingAvx2::A_SYMMETRIC"], [0, 3, 1, "_CPPv4I_b_b_23QuantizationGranularity_b_b0_bE30requantizeOutputProcessingAvx2vPNSt7uint8_tEPKNSt7int32_tERK12block_type_tiiRK22requantizationParams_tI9BIAS_TYPEE", "requantizeOutputProcessingAvx2::BIAS_TYPE"], [0, 3, 1, "_CPPv4I_b_b_23QuantizationGranularity_b_b0_bE30requantizeOutputProcessingAvx2vPNSt7uint8_tEPKNSt7int32_tERK12block_type_tiiRK22requantizationParams_tI9BIAS_TYPEE", "requantizeOutputProcessingAvx2::B_SYMMETRIC"], [0, 3, 1, "_CPPv4I_b_b_23QuantizationGranularity_b_b0_bE30requantizeOutputProcessingAvx2vPNSt7uint8_tEPKNSt7int32_tERK12block_type_tiiRK22requantizationParams_tI9BIAS_TYPEE", "requantizeOutputProcessingAvx2::DIRECT"], [0, 3, 1, "_CPPv4I_b_b_23QuantizationGranularity_b_b0_bE30requantizeOutputProcessingAvx2vPNSt7uint8_tEPKNSt7int32_tERK12block_type_tiiRK22requantizationParams_tI9BIAS_TYPEE", "requantizeOutputProcessingAvx2::FUSE_RELU"], [0, 3, 1, "_CPPv4I_b_b_23QuantizationGranularity_b_b0_bE30requantizeOutputProcessingAvx2vPNSt7uint8_tEPKNSt7int32_tERK12block_type_tiiRK22requantizationParams_tI9BIAS_TYPEE", "requantizeOutputProcessingAvx2::HAS_BIAS"], [0, 3, 1, "_CPPv4I_b_b_23QuantizationGranularity_b_b0_bE30requantizeOutputProcessingAvx2vPNSt7uint8_tEPKNSt7int32_tERK12block_type_tiiRK22requantizationParams_tI9BIAS_TYPEE", "requantizeOutputProcessingAvx2::Q_GRAN"], [0, 2, 1, "_CPPv4I_b_b_23QuantizationGranularity_b_b0_bE30requantizeOutputProcessingAvx2vPNSt7uint8_tEPKNSt7int32_tERK12block_type_tiiRK22requantizationParams_tI9BIAS_TYPEE", "requantizeOutputProcessingAvx2::block"], [0, 2, 1, "_CPPv4I_b_b_23QuantizationGranularity_b_b0_bE30requantizeOutputProcessingAvx2vPNSt7uint8_tEPKNSt7int32_tERK12block_type_tiiRK22requantizationParams_tI9BIAS_TYPEE", "requantizeOutputProcessingAvx2::inp"], [0, 2, 1, "_CPPv4I_b_b_23QuantizationGranularity_b_b0_bE30requantizeOutputProcessingAvx2vPNSt7uint8_tEPKNSt7int32_tERK12block_type_tiiRK22requantizationParams_tI9BIAS_TYPEE", "requantizeOutputProcessingAvx2::ld_in"], [0, 2, 1, "_CPPv4I_b_b_23QuantizationGranularity_b_b0_bE30requantizeOutputProcessingAvx2vPNSt7uint8_tEPKNSt7int32_tERK12block_type_tiiRK22requantizationParams_tI9BIAS_TYPEE", "requantizeOutputProcessingAvx2::ld_out"], [0, 2, 1, "_CPPv4I_b_b_23QuantizationGranularity_b_b0_bE30requantizeOutputProcessingAvx2vPNSt7uint8_tEPKNSt7int32_tERK12block_type_tiiRK22requantizationParams_tI9BIAS_TYPEE", "requantizeOutputProcessingAvx2::out"], [0, 2, 1, "_CPPv4I_b_b_23QuantizationGranularity_b_b0_bE30requantizeOutputProcessingAvx2vPNSt7uint8_tEPKNSt7int32_tERK12block_type_tiiRK22requantizationParams_tI9BIAS_TYPEE", "requantizeOutputProcessingAvx2::r"], [0, 1, 1, "_CPPv4I_b_b_23QuantizationGranularity_b_b_i0E37requantizeOutputProcessingGConvAvx512vPNSt7uint8_tEPKNSt7int32_tERK12block_type_tiiRK22requantizationParams_tI9BIAS_TYPEE", "requantizeOutputProcessingGConvAvx512"], [0, 3, 1, "_CPPv4I_b_b_23QuantizationGranularity_b_b_i0E37requantizeOutputProcessingGConvAvx512vPNSt7uint8_tEPKNSt7int32_tERK12block_type_tiiRK22requantizationParams_tI9BIAS_TYPEE", "requantizeOutputProcessingGConvAvx512::A_SYMMETRIC"], [0, 3, 1, "_CPPv4I_b_b_23QuantizationGranularity_b_b_i0E37requantizeOutputProcessingGConvAvx512vPNSt7uint8_tEPKNSt7int32_tERK12block_type_tiiRK22requantizationParams_tI9BIAS_TYPEE", "requantizeOutputProcessingGConvAvx512::BIAS_TYPE"], [0, 3, 1, "_CPPv4I_b_b_23QuantizationGranularity_b_b_i0E37requantizeOutputProcessingGConvAvx512vPNSt7uint8_tEPKNSt7int32_tERK12block_type_tiiRK22requantizationParams_tI9BIAS_TYPEE", "requantizeOutputProcessingGConvAvx512::B_SYMMETRIC"], [0, 3, 1, "_CPPv4I_b_b_23QuantizationGranularity_b_b_i0E37requantizeOutputProcessingGConvAvx512vPNSt7uint8_tEPKNSt7int32_tERK12block_type_tiiRK22requantizationParams_tI9BIAS_TYPEE", "requantizeOutputProcessingGConvAvx512::C_PER_G"], [0, 3, 1, "_CPPv4I_b_b_23QuantizationGranularity_b_b_i0E37requantizeOutputProcessingGConvAvx512vPNSt7uint8_tEPKNSt7int32_tERK12block_type_tiiRK22requantizationParams_tI9BIAS_TYPEE", "requantizeOutputProcessingGConvAvx512::FUSE_RELU"], [0, 3, 1, "_CPPv4I_b_b_23QuantizationGranularity_b_b_i0E37requantizeOutputProcessingGConvAvx512vPNSt7uint8_tEPKNSt7int32_tERK12block_type_tiiRK22requantizationParams_tI9BIAS_TYPEE", "requantizeOutputProcessingGConvAvx512::HAS_BIAS"], [0, 3, 1, "_CPPv4I_b_b_23QuantizationGranularity_b_b_i0E37requantizeOutputProcessingGConvAvx512vPNSt7uint8_tEPKNSt7int32_tERK12block_type_tiiRK22requantizationParams_tI9BIAS_TYPEE", "requantizeOutputProcessingGConvAvx512::Q_GRAN"], [0, 2, 1, "_CPPv4I_b_b_23QuantizationGranularity_b_b_i0E37requantizeOutputProcessingGConvAvx512vPNSt7uint8_tEPKNSt7int32_tERK12block_type_tiiRK22requantizationParams_tI9BIAS_TYPEE", "requantizeOutputProcessingGConvAvx512::block"], [0, 2, 1, "_CPPv4I_b_b_23QuantizationGranularity_b_b_i0E37requantizeOutputProcessingGConvAvx512vPNSt7uint8_tEPKNSt7int32_tERK12block_type_tiiRK22requantizationParams_tI9BIAS_TYPEE", "requantizeOutputProcessingGConvAvx512::inp"], [0, 2, 1, "_CPPv4I_b_b_23QuantizationGranularity_b_b_i0E37requantizeOutputProcessingGConvAvx512vPNSt7uint8_tEPKNSt7int32_tERK12block_type_tiiRK22requantizationParams_tI9BIAS_TYPEE", "requantizeOutputProcessingGConvAvx512::ld_in"], [0, 2, 1, "_CPPv4I_b_b_23QuantizationGranularity_b_b_i0E37requantizeOutputProcessingGConvAvx512vPNSt7uint8_tEPKNSt7int32_tERK12block_type_tiiRK22requantizationParams_tI9BIAS_TYPEE", "requantizeOutputProcessingGConvAvx512::ld_out"], [0, 2, 1, "_CPPv4I_b_b_23QuantizationGranularity_b_b_i0E37requantizeOutputProcessingGConvAvx512vPNSt7uint8_tEPKNSt7int32_tERK12block_type_tiiRK22requantizationParams_tI9BIAS_TYPEE", "requantizeOutputProcessingGConvAvx512::out"], [0, 2, 1, "_CPPv4I_b_b_23QuantizationGranularity_b_b_i0E37requantizeOutputProcessingGConvAvx512vPNSt7uint8_tEPKNSt7int32_tERK12block_type_tiiRK22requantizationParams_tI9BIAS_TYPEE", "requantizeOutputProcessingGConvAvx512::r"], [12, 1, 1, "_CPPv426reset_weight_momentum_cudaN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_t", "reset_weight_momentum_cuda"], [12, 2, 1, "_CPPv426reset_weight_momentum_cudaN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_t", "reset_weight_momentum_cuda::D_offsets"], [12, 2, 1, "_CPPv426reset_weight_momentum_cudaN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_t", "reset_weight_momentum_cuda::buffer_ids"], [12, 2, 1, "_CPPv426reset_weight_momentum_cudaN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_t", "reset_weight_momentum_cuda::cache_hash_size_cumsum"], [12, 2, 1, "_CPPv426reset_weight_momentum_cudaN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_t", "reset_weight_momentum_cuda::dev_weights"], [12, 2, 1, "_CPPv426reset_weight_momentum_cudaN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_t", "reset_weight_momentum_cuda::logical_table_ids"], [12, 2, 1, "_CPPv426reset_weight_momentum_cudaN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_t", "reset_weight_momentum_cuda::lxu_cache_state"], [12, 2, 1, "_CPPv426reset_weight_momentum_cudaN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_t", "reset_weight_momentum_cuda::lxu_cache_weights"], [12, 2, 1, "_CPPv426reset_weight_momentum_cudaN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_t", "reset_weight_momentum_cuda::momentum1_dev"], [12, 2, 1, "_CPPv426reset_weight_momentum_cudaN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_t", "reset_weight_momentum_cuda::momentum1_offsets"], [12, 2, 1, "_CPPv426reset_weight_momentum_cudaN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_t", "reset_weight_momentum_cuda::momentum1_placements"], [12, 2, 1, "_CPPv426reset_weight_momentum_cudaN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_t", "reset_weight_momentum_cuda::momentum1_uvm"], [12, 2, 1, "_CPPv426reset_weight_momentum_cudaN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_t", "reset_weight_momentum_cuda::pruned_indices"], [12, 2, 1, "_CPPv426reset_weight_momentum_cudaN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_t", "reset_weight_momentum_cuda::pruned_indices_offsets"], [12, 2, 1, "_CPPv426reset_weight_momentum_cudaN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_t", "reset_weight_momentum_cuda::total_cache_hash_size"], [12, 2, 1, "_CPPv426reset_weight_momentum_cudaN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_t", "reset_weight_momentum_cuda::uvm_weights"], [12, 2, 1, "_CPPv426reset_weight_momentum_cudaN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_t", "reset_weight_momentum_cuda::weights_offsets"], [12, 2, 1, "_CPPv426reset_weight_momentum_cudaN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_t", "reset_weight_momentum_cuda::weights_placements"], [13, 5, 1, "_CPPv4N3ssd16EmbeddingRocksDBE", "ssd::EmbeddingRocksDB"], [13, 1, 1, "_CPPv427ssd_generate_row_addrs_cudaRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6Tensor", "ssd_generate_row_addrs_cuda"], [13, 2, 1, "_CPPv427ssd_generate_row_addrs_cudaRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6Tensor", "ssd_generate_row_addrs_cuda::assigned_cache_slots"], [13, 2, 1, "_CPPv427ssd_generate_row_addrs_cudaRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6Tensor", "ssd_generate_row_addrs_cuda::cache_set_inverse_indices"], [13, 2, 1, "_CPPv427ssd_generate_row_addrs_cudaRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6Tensor", "ssd_generate_row_addrs_cuda::cache_set_sorted_unique_indices"], [13, 2, 1, "_CPPv427ssd_generate_row_addrs_cudaRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6Tensor", "ssd_generate_row_addrs_cuda::inserted_ssd_weights"], [13, 2, 1, "_CPPv427ssd_generate_row_addrs_cudaRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6Tensor", "ssd_generate_row_addrs_cuda::linear_index_inverse_indices"], [13, 2, 1, "_CPPv427ssd_generate_row_addrs_cudaRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6Tensor", "ssd_generate_row_addrs_cuda::lxu_cache_locations"], [13, 2, 1, "_CPPv427ssd_generate_row_addrs_cudaRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6Tensor", "ssd_generate_row_addrs_cuda::lxu_cache_weights"], [13, 2, 1, "_CPPv427ssd_generate_row_addrs_cudaRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6Tensor", "ssd_generate_row_addrs_cuda::unique_indices_count_cumsum"], [13, 2, 1, "_CPPv427ssd_generate_row_addrs_cudaRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6Tensor", "ssd_generate_row_addrs_cuda::unique_indices_length"], [13, 1, 1, "_CPPv425ssd_update_row_addrs_cudaRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6Tensor", "ssd_update_row_addrs_cuda"], [13, 2, 1, "_CPPv425ssd_update_row_addrs_cudaRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6Tensor", "ssd_update_row_addrs_cuda::cache_set_inverse_indices_curr"], [13, 2, 1, "_CPPv425ssd_update_row_addrs_cudaRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6Tensor", "ssd_update_row_addrs_cuda::inserted_ssd_weights_curr_next_map"], [13, 2, 1, "_CPPv425ssd_update_row_addrs_cudaRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6Tensor", "ssd_update_row_addrs_cuda::inserted_ssd_weights_next"], [13, 2, 1, "_CPPv425ssd_update_row_addrs_cudaRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6Tensor", "ssd_update_row_addrs_cuda::linear_index_inverse_indices_curr"], [13, 2, 1, "_CPPv425ssd_update_row_addrs_cudaRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6Tensor", "ssd_update_row_addrs_cuda::lxu_cache_locations_curr"], [13, 2, 1, "_CPPv425ssd_update_row_addrs_cudaRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6Tensor", "ssd_update_row_addrs_cuda::lxu_cache_weights"], [13, 2, 1, "_CPPv425ssd_update_row_addrs_cudaRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6Tensor", "ssd_update_row_addrs_cuda::ssd_row_addrs_curr"], [13, 2, 1, "_CPPv425ssd_update_row_addrs_cudaRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6Tensor", "ssd_update_row_addrs_cuda::unique_indices_count_cumsum_curr"], [13, 2, 1, "_CPPv425ssd_update_row_addrs_cudaRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6Tensor", "ssd_update_row_addrs_cuda::unique_indices_length_curr"], [5, 1, 1, "_CPPv421tbe_input_combine_cpuRKNSt6vectorIN2at6TensorEEERKNSt6vectorIN2at6TensorEEERKNSt6vectorIN2at6TensorEEERKN2at6TensorE", "tbe_input_combine_cpu"], [5, 2, 1, "_CPPv421tbe_input_combine_cpuRKNSt6vectorIN2at6TensorEEERKNSt6vectorIN2at6TensorEEERKNSt6vectorIN2at6TensorEEERKN2at6TensorE", "tbe_input_combine_cpu::include_last_offsets"], [5, 2, 1, "_CPPv421tbe_input_combine_cpuRKNSt6vectorIN2at6TensorEEERKNSt6vectorIN2at6TensorEEERKNSt6vectorIN2at6TensorEEERKN2at6TensorE", "tbe_input_combine_cpu::indices_list"], [5, 2, 1, "_CPPv421tbe_input_combine_cpuRKNSt6vectorIN2at6TensorEEERKNSt6vectorIN2at6TensorEEERKNSt6vectorIN2at6TensorEEERKN2at6TensorE", "tbe_input_combine_cpu::offsets_list"], [5, 2, 1, "_CPPv421tbe_input_combine_cpuRKNSt6vectorIN2at6TensorEEERKNSt6vectorIN2at6TensorEEERKNSt6vectorIN2at6TensorEEERKN2at6TensorE", "tbe_input_combine_cpu::per_sample_weights"], [8, 1, 1, "_CPPv419uvm_cuda_mem_adviseRK6Tensor7int64_t", "uvm_cuda_mem_advise"], [8, 2, 1, "_CPPv419uvm_cuda_mem_adviseRK6Tensor7int64_t", "uvm_cuda_mem_advise::cuda_memory_advise"], [8, 2, 1, "_CPPv419uvm_cuda_mem_adviseRK6Tensor7int64_t", "uvm_cuda_mem_advise::self"], [8, 1, 1, "_CPPv427uvm_cuda_mem_prefetch_asyncRK6TensorNSt8optionalI6TensorEE", "uvm_cuda_mem_prefetch_async"], [8, 2, 1, "_CPPv427uvm_cuda_mem_prefetch_asyncRK6TensorNSt8optionalI6TensorEE", "uvm_cuda_mem_prefetch_async::device_t"], [8, 2, 1, "_CPPv427uvm_cuda_mem_prefetch_asyncRK6TensorNSt8optionalI6TensorEE", "uvm_cuda_mem_prefetch_async::self"], [8, 1, 1, "_CPPv424uvm_mem_advice_dont_forkRK6Tensor", "uvm_mem_advice_dont_fork"], [8, 2, 1, "_CPPv424uvm_mem_advice_dont_forkRK6Tensor", "uvm_mem_advice_dont_fork::self"], [8, 1, 1, "_CPPv411uvm_storageRK6Tensor", "uvm_storage"], [8, 2, 1, "_CPPv411uvm_storageRK6Tensor", "uvm_storage::self"], [8, 1, 1, "_CPPv410uvm_to_cpuRK6Tensor", "uvm_to_cpu"], [8, 2, 1, "_CPPv410uvm_to_cpuRK6Tensor", "uvm_to_cpu::self"], [8, 1, 1, "_CPPv416uvm_to_cpu_cloneRK6Tensor", "uvm_to_cpu_clone"], [8, 2, 1, "_CPPv416uvm_to_cpu_cloneRK6Tensor", "uvm_to_cpu_clone::self"], [8, 1, 1, "_CPPv413uvm_to_deviceRK6TensorRK6Tensor", "uvm_to_device"], [8, 2, 1, "_CPPv413uvm_to_deviceRK6TensorRK6Tensor", "uvm_to_device::prototype"], [8, 2, 1, "_CPPv413uvm_to_deviceRK6TensorRK6Tensor", "uvm_to_device::self"], [22, 6, 0, "-", "fbgemm_gpu"]], "fbgemm_gpu.docs.examples": [[31, 7, 1, "", "example_method"]], "fbgemm_gpu.permute_pooled_embedding_modules": [[19, 8, 1, "", "PermutePooledEmbeddings"]], "fbgemm_gpu.permute_pooled_embedding_modules.PermutePooledEmbeddings": [[19, 9, 1, "", "__call__"]], "fbgemm_gpu.split_table_batched_embeddings_ops_inference": [[23, 8, 1, "", "IntNBitTableBatchedEmbeddingBagsCodegen"]], "fbgemm_gpu.split_table_batched_embeddings_ops_inference.IntNBitTableBatchedEmbeddingBagsCodegen": [[23, 9, 1, "", "assign_embedding_weights"], [23, 9, 1, "", "fill_random_weights"], [23, 9, 1, "", "forward"], [23, 9, 1, "", "recompute_module_buffers"], [23, 9, 1, "", "split_embedding_weights"], [23, 9, 1, "", "split_embedding_weights_with_scale_bias"]], "fbgemm_gpu.split_table_batched_embeddings_ops_training": [[24, 8, 1, "", "SplitTableBatchedEmbeddingBagsCodegen"]], "fbgemm_gpu.split_table_batched_embeddings_ops_training.SplitTableBatchedEmbeddingBagsCodegen": [[24, 9, 1, "", "forward"], [24, 9, 1, "", "set_learning_rate"], [24, 9, 1, "", "set_optimizer_step"], [24, 9, 1, "", "split_embedding_weights"], [24, 9, 1, "", "split_optimizer_states"], [24, 9, 1, "", "update_hyper_parameters"]], "torch.ops.fbgemm": [[21, 7, 1, "", "FloatOrHalfToFusedNBitRowwiseQuantizedSBHalf"], [22, 7, 1, "", "asynchronous_complete_cumsum"], [18, 7, 1, "", "batched_dense_vec_jagged_2d_mul"], [22, 7, 1, "", "block_bucketize_sparse_features"], [18, 7, 1, "", "dense_to_jagged"], [22, 7, 1, "", "expand_into_jagged_permute"], [18, 7, 1, "", "jagged_1d_to_dense"], [18, 7, 1, "", "jagged_2d_to_dense"], [18, 7, 1, "", "jagged_dense_dense_elementwise_add_jagged_output"], [18, 7, 1, "", "jagged_dense_elementwise_add"], [18, 7, 1, "", "jagged_dense_elementwise_add_jagged_output"], [18, 7, 1, "", "jagged_dense_elementwise_mul"], [18, 7, 1, "", "jagged_to_padded_dense"], [22, 7, 1, "", "keyed_jagged_index_select_dim1"], [20, 7, 1, "", "merge_pooled_embeddings"], [22, 7, 1, "", "offsets_range"], [22, 7, 1, "", "permute_1D_sparse_data"], [22, 7, 1, "", "permute_2D_sparse_data"], [20, 7, 1, "", "permute_pooled_embs"], [22, 7, 1, "", "segment_sum_csr"], [18, 7, 1, "", "stacked_jagged_1d_to_dense"], [18, 7, 1, "", "stacked_jagged_2d_to_dense"]]}, "objtypes": {"0": "cpp:enumerator", "1": "cpp:function", "2": "cpp:functionParam", "3": "cpp:templateParam", "4": "cpp:enum", "5": "cpp:class", "6": "py:module", "7": "py:function", "8": "py:class", "9": "py:method"}, "objnames": {"0": ["cpp", "enumerator", "C++ enumerator"], "1": ["cpp", "function", "C++ function"], "2": ["cpp", "functionParam", "C++ function parameter"], "3": ["cpp", "templateParam", "C++ template parameter"], "4": ["cpp", "enum", "C++ enum"], "5": ["cpp", "class", "C++ class"], "6": ["py", "module", "Python module"], "7": ["py", "function", "Python function"], "8": ["py", "class", "Python class"], "9": ["py", "method", "Python method"]}, "titleterms": {"quantiz": [0, 10, 21], "util": 0, "refer": [0, 32], "implement": [0, 1], "method": [0, 1], "avx": 0, "2": 0, "512": 0, "tbe": [1, 23, 24], "cpu": [1, 3, 6, 7, 10, 11, 14, 15], "autovector": 1, "fp8": 1, "16": 1, "32": 1, "autovec": 1, "build": [2, 14, 30], "instruct": [2, 14, 15, 16], "fbgemm": [2, 15, 33], "requir": 2, "hardwar": 2, "softwar": 2, "depend": 2, "asmjit": 2, "cpuinfo": 2, "googletest": 2, "set": [2, 14, 15, 30], "up": [2, 14, 15, 30], "an": [2, 14], "isol": [2, 14], "environ": [2, 14, 15, 16, 30], "instal": [2, 14, 15], "tool": [2, 14], "c": [2, 14, 29, 33], "compil": [2, 14], "other": [2, 14, 18, 19, 20, 21, 23, 24, 32], "librari": [2, 15], "prepar": [2, 14], "linux": 2, "maco": 2, "cmake": 2, "gcc": [2, 14], "issu": [2, 27], "12": 2, "clang": [2, 14], "bazel": 2, "window": 2, "embed": [3, 9, 12, 13, 19, 20, 23, 24], "oper": [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 17, 18, 20, 21, 22, 33], "cuda": [3, 6, 7, 8, 10, 11, 13, 14, 15, 16], "experiment": 4, "attent": 4, "combin": [5, 17], "input": 5, "jag": [6, 17, 18], "tensor": [6, 17, 18], "layout": 7, "transform": 7, "memori": 8, "pool": [9, 19, 20], "merg": 9, "permut": 9, "spars": [11, 22], "data": 11, "tabl": [12, 15, 23, 24], "batch": [12, 23, 24], "ssd": 13, "miniconda": 14, "conda": [14, 15], "onli": [14, 15], "genai": 14, "docker": [14, 15], "imag": 14, "cudnn": 14, "cutlass": 14, "rocm": [14, 15, 16], "miopen": 14, "symlink": 14, "pytorch": [14, 15], "through": [14, 15], "pip": [14, 15], "post": [14, 15], "check": [14, 15], "triton": [14, 15], "pre": 14, "setup": [14, 16], "The": 14, "process": 14, "wheel": 14, "variabl": 14, "For": 14, "develop": [14, 33], "undefin": [14, 15], "symbol": [14, 15], "glibc": 14, "version": 14, "compat": [14, 15], "releas": 15, "nvidia": 15, "driver": 15, "contain": 15, "runtim": 15, "amdgpu": 15, "python": [15, 25, 31, 33], "fbgemm_gpu": [15, 16, 25, 30, 33], "packag": 15, "public": 15, "pypi": 15, "test": 16, "run": 16, "variant": 16, "benchmark": 16, "high": 17, "level": 17, "overview": [17, 33], "format": 17, "valu": 17, "offset": 17, "max": 17, "length": 17, "exampl": 17, "arithmet": 17, "convers": 17, "dens": 17, "stabl": [18, 19, 20, 21, 22, 23, 24, 25, 33], "api": [18, 19, 20, 21, 22, 23, 24, 25, 33], "modul": [19, 23, 24, 33], "infer": 23, "train": 24, "contact": 26, "u": 26, "github": 26, "slack": 26, "contribut": 27, "code": [27, 29, 31, 32], "conduct": 27, "pull": 27, "request": 27, "contributor": 27, "licens": [27, 28], "agreement": 27, "cla": 27, "ad": [29, 31, 32], "document": [29, 30, 31, 32, 33], "gener": [30, 31, 33], "guidelin": 30, "specif": 30, "guid": 30, "toolchain": 30, "lint": 30, "deploy": 30, "preview": 30, "todo": 31, "auto": 31, "sphinx": 32, "pointer": 32, "section": 32, "referenc": 32, "sourc": 32, "latex": 32, "graph": 32, "homepag": 33, "info": 33}, "envversion": {"sphinx.domains.c": 2, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 8, "sphinx.domains.index": 1, "sphinx.domains.javascript": 2, "sphinx.domains.math": 2, "sphinx.domains.python": 3, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.intersphinx": 1, "sphinx.ext.todo": 2, "sphinx.ext.viewcode": 1, "sphinx": 57}, "alltitles": {"Jagged Tensor Operators": [[17, "jagged-tensor-operators"], [6, "jagged-tensor-operators"], [18, "module-fbgemm_gpu"]], "High Level Overview": [[17, "high-level-overview"]], "Jagged Tensor Format": [[17, "jagged-tensor-format"]], "Values": [[17, "values"]], "Offsets": [[17, "offsets"]], "Max Lengths": [[17, "max-lengths"]], "Jagged Tensor Example": [[17, "jagged-tensor-example"]], "Jagged Tensor Operations": [[17, "jagged-tensor-operations"]], "Arithmetic Operations": [[17, "arithmetic-operations"]], "Conversion Operations": [[17, "conversion-operations"]], "Jagged to Dense": [[17, "jagged-to-dense"]], "Dense to Jagged": [[17, "dense-to-jagged"]], "Combined Arithmetic + Conversion Operations": [[17, "combined-arithmetic-conversion-operations"]], "Test Instructions": [[16, "test-instructions"]], "Setup the FBGEMM_GPU Test Environment": [[16, "setup-the-fbgemm-gpu-test-environment"]], "Running FBGEMM_GPU Tests": [[16, "running-fbgemm-gpu-tests"]], "Testing with the CUDA Variant": [[16, "testing-with-the-cuda-variant"]], "Testing with the ROCm Variant": [[16, "testing-with-the-rocm-variant"]], "Running FBGEMM_GPU Benchmarks": [[16, "running-fbgemm-gpu-benchmarks"]], "Pooled Embedding Modules": [[19, "module-fbgemm_gpu"]], "Stable API": [[19, "stable-api"], [23, "stable-api"], [24, "stable-api"], [18, "stable-api"], [20, "stable-api"], [21, "stable-api"], [22, "stable-api"]], "Other API": [[19, "other-api"], [23, "other-api"], [24, "other-api"], [18, "other-api"], [20, "other-api"], [21, "other-api"]], "Table Batched Embedding (TBE) Inference Module": [[23, "table-batched-embedding-tbe-inference-module"]], "FBGEMM_GPU Stable Python API": [[25, "fbgemm-gpu-stable-python-api"]], "Stable APIs": [[25, "stable-apis"]], "Table Batched Embedding (TBE) Training Module": [[24, "table-batched-embedding-tbe-training-module"]], "Contributing": [[27, "contributing"]], "Code of Conduct": [[27, "code-of-conduct"]], "Pull Requests": [[27, "pull-requests"]], "Contributor License Agreement (\u201cCLA\u201d)": [[27, "contributor-license-agreement-cla"]], "Issues": [[27, "issues"]], "License": [[27, "license"], [28, "license"]], "Contact Us": [[26, "contact-us"]], "GitHub": [[26, "github"]], "Slack": [[26, "slack"]], "Sphinx Documentation Pointers": [[32, "sphinx-documentation-pointers"]], "References Other Sections of the Documentation": [[32, "references-other-sections-of-the-documentation"]], "Referencing the Source Code": [[32, "referencing-the-source-code"]], "Adding LaTeX": [[32, "adding-latex"]], "Adding Graphs": [[32, "adding-graphs"]], "FBGEMM and FBGEMM_GPU Documentation Homepage": [[33, "fbgemm-and-fbgemm-gpu-documentation-homepage"]], "General Info": [[33, null]], "FBGEMM Development": [[33, null]], "FBGEMM_GPU Development": [[33, null]], "FBGEMM_GPU Overview": [[33, null]], "FBGEMM Stable API": [[33, null]], "FBGEMM C++ API": [[33, null]], "FBGEMM_GPU C++ API": [[33, null]], "FBGEMM_GPU Python Operators API": [[33, null]], "FBGEMM_GPU Python Modules API": [[33, null]], "Layout Transformation Operators": [[7, "layout-transformation-operators"]], "CUDA Operators": [[7, "cuda-operators"], [6, "cuda-operators"], [3, "cuda-operators"], [13, "cuda-operators"], [10, "cuda-operators"], [11, "cuda-operators"]], "CPU Operators": [[7, "cpu-operators"], [6, "cpu-operators"], [3, "cpu-operators"], [10, "cpu-operators"], [11, "cpu-operators"]], "Experimental Operators": [[4, "experimental-operators"]], "Attention Operators": [[4, "attention-operators"]], "Combine Input Operators": [[5, "combine-input-operators"]], "Quantization Utilities": [[0, "quantization-utilities"]], "Reference Implementation Methods": [[0, "reference-implementation-methods"]], "AVX-2 Implementation Methods": [[0, "avx-2-implementation-methods"]], "AVX-512 Implementation Methods": [[0, "avx-512-implementation-methods"]], "Embedding Operators": [[3, "embedding-operators"]], "Build Instructions": [[2, "build-instructions"], [14, "build-instructions"]], "FBGEMM Requirements": [[2, "fbgemm-requirements"]], "Hardware Requirements": [[2, "hardware-requirements"]], "Software Dependencies": [[2, "software-dependencies"]], "asmjit": [[2, "asmjit"]], "cpuinfo": [[2, "cpuinfo"]], "GoogleTest": [[2, "googletest"]], "Set Up an Isolated Build Environment": [[2, "set-up-an-isolated-build-environment"], [14, "set-up-an-isolated-build-environment"]], "Install the Build Tools": [[2, "install-the-build-tools"], [14, "install-the-build-tools"]], "C/C++ Compiler": [[2, "c-c-compiler"]], "Other Build Tools": [[2, "other-build-tools"], [14, "other-build-tools"]], "Build the FBGEMM Library": [[2, "build-the-fbgemm-library"]], "Preparing the Build": [[2, "preparing-the-build"], [14, "preparing-the-build"]], "Building on Linux and macOS (CMake + GCC)": [[2, "building-on-linux-and-macos-cmake-gcc"]], "Build Issues with GCC 12+": [[2, "build-issues-with-gcc-12"]], "Building on Linux and macOS (CMake + Clang)": [[2, "building-on-linux-and-macos-cmake-clang"]], "Building on Linux (Bazel)": [[2, "building-on-linux-bazel"]], "Building on Windows": [[2, "building-on-windows"]], "TBE CPU Autovectorization": [[1, "tbe-cpu-autovectorization"]], "FP8/16/32 Autovec Implementation Methods": [[1, "fp8-16-32-autovec-implementation-methods"]], "Installation Instructions": [[15, "installation-instructions"]], "FBGEMM Releases Compatibility Table": [[15, "fbgemm-releases-compatibility-table"]], "Set Up CPU-Only Environment": [[15, "set-up-cpu-only-environment"]], "Set Up CUDA Environment": [[15, "set-up-cuda-environment"]], "Install NVIDIA Drivers": [[15, "install-nvidia-drivers"]], "Set Up the CUDA Docker Container and Conda Environment": [[15, "set-up-the-cuda-docker-container-and-conda-environment"]], "Install the CUDA Runtime": [[15, "install-the-cuda-runtime"]], "Set Up ROCm Environment": [[15, "set-up-rocm-environment"]], "Install AMDGPU Drivers": [[15, "install-amdgpu-drivers"]], "Set Up the ROCm Docker Container and Conda Environment": [[15, "set-up-the-rocm-docker-container-and-conda-environment"]], "Install Python Libraries": [[15, "install-python-libraries"]], "Install PyTorch": [[15, "install-pytorch"], [14, "install-pytorch"]], "Install Triton": [[15, "install-triton"]], "Install the FBGEMM_GPU Package": [[15, "install-the-fbgemm-gpu-package"]], "Install through PyTorch PIP": [[15, "install-through-pytorch-pip"]], "Install through Public PyPI": [[15, "install-through-public-pypi"]], "Post-Installation Checks": [[15, "post-installation-checks"]], "Undefined Symbols": [[15, "undefined-symbols"]], "Install Miniconda": [[14, "install-miniconda"]], "Set Up the Conda Environment": [[14, "set-up-the-conda-environment"]], "Set Up for CPU-Only Build": [[14, "set-up-for-cpu-only-build"]], "Set Up for CUDA / GenAI-Only Build": [[14, "set-up-for-cuda-genai-only-build"]], "CUDA Docker Image": [[14, "cuda-docker-image"]], "Install CUDA": [[14, "install-cuda"]], "Install cuDNN": [[14, "install-cudnn"]], "Install CUTLASS": [[14, "install-cutlass"]], "Set Up for ROCm Build": [[14, "set-up-for-rocm-build"]], "ROCm Docker Image": [[14, "rocm-docker-image"]], "Install ROCm": [[14, "install-rocm"]], "Install MIOpen": [[14, "install-miopen"]], "C/C++ Compiler (GCC)": [[14, "c-c-compiler-gcc"]], "C/C++ Compiler (Clang)": [[14, "c-c-compiler-clang"]], "Compiler Symlinks": [[14, "compiler-symlinks"]], "Installation Through Conda": [[14, "installation-through-conda"]], "Installation Through PyTorch PIP": [[14, "installation-through-pytorch-pip"]], "Post-Install Checks": [[14, "post-install-checks"]], "Install PyTorch-Triton": [[14, "install-pytorch-triton"]], "Other Pre-Build Setup": [[14, "other-pre-build-setup"]], "The Build Process": [[14, "the-build-process"]], "Set Wheel Build Variables": [[14, "set-wheel-build-variables"]], "CPU-Only Build": [[14, "cpu-only-build"]], "CUDA Build": [[14, "cuda-build"]], "GenAI-Only Build": [[14, "genai-only-build"]], "ROCm Build": [[14, "rocm-build"]], "Post-Build Checks (For Developers)": [[14, "post-build-checks-for-developers"]], "Undefined Symbols Check": [[14, "undefined-symbols-check"]], "GLIBC Version Compatibility Check": [[14, "glibc-version-compatibility-check"]], "Table Batched Embedding Operators": [[12, "table-batched-embedding-operators"]], "SSD Embedding Operators": [[13, "ssd-embedding-operators"]], "Adding Documentation to Python Code": [[31, "adding-documentation-to-python-code"]], "Todo": [[31, "id1"]], "Adding Documentation to Auto-Generated Python Code": [[31, "adding-documentation-to-auto-generated-python-code"]], "Adding Documentation to C++ Code": [[29, "adding-documentation-to-c-code"]], "Documentation": [[30, "documentation"]], "General Documentation Guidelines": [[30, "general-documentation-guidelines"]], "Specific Documentation Guides": [[30, "specific-documentation-guides"]], "Building the Documentation": [[30, "building-the-documentation"]], "Set Up Build Environment": [[30, "set-up-build-environment"]], "Build FBGEMM_GPU": [[30, "build-fbgemm-gpu"]], "Set Up the Documentation Toolchain": [[30, "set-up-the-documentation-toolchain"]], "Build the Documentation": [[30, "build-the-documentation"]], "Linting the Documentation": [[30, "linting-the-documentation"]], "Deployment Preview": [[30, "deployment-preview"]], "Pooled Embeddings Operators": [[9, "pooled-embeddings-operators"]], "Merge Operators": [[9, "merge-operators"]], "Permutation Operators": [[9, "permutation-operators"]], "CUDA Memory Operators": [[8, "cuda-memory-operators"]], "Quantization Operators": [[10, "quantization-operators"], [21, "module-fbgemm_gpu"]], "Sparse Data Operators": [[11, "sparse-data-operators"]], "Pooled Embedding Operators": [[20, "module-fbgemm_gpu"]], "Sparse Operators": [[22, "module-fbgemm_gpu"]]}, "indexentries": {"findminmax (c++ function)": [[0, "_CPPv410FindMinMaxPKfPfPf7int64_t"]], "floatorhalftofusednbitrowwisequantizedsbhalf (c++ function)": [[0, "_CPPv4I0E44FloatOrHalfToFusedNBitRowwiseQuantizedSBHalfviPK9InputType6size_tiPNSt7uint8_tE"]], "fusedquantizedequantize (c++ function)": [[0, "_CPPv4I0E23FusedQuantizeDequantizevPKfPfNSt7int64_tERK24TensorQuantizationParamsiif"]], "quantizegroupwise (c++ function)": [[0, "_CPPv4I0_8layout_tE17QuantizeGroupwisevPKfiiiiPKfPKNSt7int32_tEP1T"]], "xor128 (c++ function)": [[0, "_CPPv46Xor128v"]], "requantizeoutputprocessingavx2 (c++ function)": [[0, "_CPPv4I_b_b_23QuantizationGranularity_b_b0_bE30requantizeOutputProcessingAvx2vPNSt7uint8_tEPKNSt7int32_tERK12block_type_tiiRK22requantizationParams_tI9BIAS_TYPEE"]], "requantizeoutputprocessinggconvavx512 (c++ function)": [[0, "_CPPv4I_b_b_23QuantizationGranularity_b_b_i0E37requantizeOutputProcessingGConvAvx512vPNSt7uint8_tEPKNSt7int32_tERK12block_type_tiiRK22requantizationParams_tI9BIAS_TYPEE"]], "bounds_check_indices_cuda (c++ function)": [[3, "_CPPv425bounds_check_indices_cudaR6TensorR6TensorR6Tensor7int64_tR6TensorRKNSt8optionalI6TensorEERKNSt8optionalI6TensorEEK7int64_tRKNSt8optionalI6TensorEEK7int64_tK7int64_tK6int8_t"]], "int_nbit_split_embedding_codegen_lookup_function (c++ function)": [[3, "_CPPv448int_nbit_split_embedding_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEE"]], "int_nbit_split_embedding_codegen_lookup_function_cpu (c++ function)": [[3, "_CPPv452int_nbit_split_embedding_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEE"]], "int_nbit_split_embedding_uvm_caching_codegen_lookup_function (c++ function)": [[3, "_CPPv460int_nbit_split_embedding_uvm_caching_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI6TensorEE"]], "int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu (c++ function)": [[3, "_CPPv464int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tNSt8optionalI6TensorEE7int64_tNSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI7int64_tEENSt8optionalI6TensorEENSt8optionalI6TensorEENSt8optionalI6TensorEE"]], "pruned_array_lookup_cpu (c++ function)": [[3, "_CPPv423pruned_array_lookup_cpu6Tensor6Tensor6Tensor6Tensor"]], "pruned_array_lookup_cuda (c++ function)": [[3, "_CPPv424pruned_array_lookup_cuda6Tensor6Tensor6Tensor6Tensor"]], "pruned_hashmap_insert_unweighted_cpu (c++ function)": [[3, "_CPPv436pruned_hashmap_insert_unweighted_cpu6Tensor6Tensor6Tensor6Tensor6Tensor"]], "pruned_hashmap_lookup_cuda (c++ function)": [[3, "_CPPv426pruned_hashmap_lookup_cuda6Tensor6Tensor6Tensor6Tensor"]], "pruned_hashmap_lookup_unweighted_cpu (c++ function)": [[3, "_CPPv436pruned_hashmap_lookup_unweighted_cpu6Tensor6Tensor6Tensor6Tensor"]], "gqa_attn_splitk (c++ function)": [[4, "_CPPv415gqa_attn_splitkRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorEKdK7int64_tK7int64_tKbK7int64_t"]], "padding_fused_tbe_input_combine_cpu (c++ function)": [[5, "_CPPv435padding_fused_tbe_input_combine_cpuRKNSt6vectorIN2at6TensorEEERKNSt6vectorIN2at6TensorEEERKNSt6vectorIN2at6TensorEEERKN2at6TensorE7int64_t"]], "tbe_input_combine_cpu (c++ function)": [[5, "_CPPv421tbe_input_combine_cpuRKNSt6vectorIN2at6TensorEEERKNSt6vectorIN2at6TensorEEERKNSt6vectorIN2at6TensorEEERKN2at6TensorE"]], "batched_dense_vec_jagged_2d_mul (c++ function)": [[6, "_CPPv431batched_dense_vec_jagged_2d_mulRK6TensorRK6TensorRK6Tensor"]], "dense_to_jagged (c++ function)": [[6, "_CPPv415dense_to_jaggedRK6TensorRKNSt6vectorI6TensorEENSt8optionalIN2at6SymIntEEE"]], "jagged_1d_to_dense (c++ function)": [[6, "_CPPv418jagged_1d_to_dense6Tensor6TensorN3c106SymIntE7int64_t"]], "jagged_2d_to_dense (c++ function)": [[6, "_CPPv418jagged_2d_to_dense6Tensor6TensorN3c106SymIntE"]], "jagged_dense_elementwise_add (c++ function)": [[6, "_CPPv428jagged_dense_elementwise_addRK6TensorRKNSt6vectorI6TensorEERK6Tensor"]], "jagged_dense_elementwise_add_jagged_output (c++ function)": [[6, "_CPPv442jagged_dense_elementwise_add_jagged_outputRK6TensorRKNSt6vectorI6TensorEERK6Tensor"]], "jagged_dense_elementwise_add_jagged_output_cuda (c++ function)": [[6, "_CPPv447jagged_dense_elementwise_add_jagged_output_cudaRK6TensorRKNSt6vectorI6TensorEERK6Tensor"]], "jagged_dense_elementwise_mul (c++ function)": [[6, "_CPPv428jagged_dense_elementwise_mulRK6TensorRKNSt6vectorI6TensorEERK6Tensor"]], "jagged_to_padded_dense (c++ function)": [[6, "_CPPv422jagged_to_padded_denseRK6TensorRKNSt6vectorI6TensorEEKN3c1014SymIntArrayRefEKd"]], "jagged_to_padded_dense_forward (c++ function)": [[6, "_CPPv430jagged_to_padded_dense_forwardRK6TensorRKNSt6vectorI6TensorEEN3c1014SymIntArrayRefEKd"]], "recat_embedding_grad_output_cuda (c++ function)": [[7, "_CPPv432recat_embedding_grad_output_cuda6TensorRKNSt6vectorI7int64_tEE"]], "recat_embedding_grad_output_mixed_d_batch_cuda (c++ function)": [[7, "_CPPv446recat_embedding_grad_output_mixed_D_batch_cudaRK6TensorRK6TensorRK6Tensor"]], "recat_embedding_grad_output_mixed_d_cpu (c++ function)": [[7, "_CPPv439recat_embedding_grad_output_mixed_D_cpuRK6TensorRKNSt6vectorI7int64_tEE"]], "recat_embedding_grad_output_mixed_d_cuda (c++ function)": [[7, "_CPPv440recat_embedding_grad_output_mixed_D_cudaRK6TensorRKNSt6vectorI7int64_tEE"]], "is_uvm_tensor (c++ function)": [[8, "_CPPv413is_uvm_tensorRK6Tensor"]], "new_host_mapped_tensor (c++ function)": [[8, "_CPPv422new_host_mapped_tensorRK6TensorRKNSt6vectorINSt7int64_tEEE"]], "new_managed_tensor (c++ function)": [[8, "_CPPv418new_managed_tensorRK6TensorRKNSt6vectorINSt7int64_tEEE"]], "new_managed_tensor_meta (c++ function)": [[8, "_CPPv423new_managed_tensor_metaRK6TensorRKNSt6vectorINSt7int64_tEEE"]], "new_unified_tensor (c++ function)": [[8, "_CPPv418new_unified_tensorRK6TensorRKNSt6vectorINSt7int64_tEEEb"]], "new_unified_tensor_meta (c++ function)": [[8, "_CPPv423new_unified_tensor_metaRK6TensorRKNSt6vectorINSt7int64_tEEEb"]], "new_vanilla_managed_tensor (c++ function)": [[8, "_CPPv426new_vanilla_managed_tensorRK6TensorRKNSt6vectorINSt7int64_tEEE"]], "uvm_cuda_mem_advise (c++ function)": [[8, "_CPPv419uvm_cuda_mem_adviseRK6Tensor7int64_t"]], "uvm_cuda_mem_prefetch_async (c++ function)": [[8, "_CPPv427uvm_cuda_mem_prefetch_asyncRK6TensorNSt8optionalI6TensorEE"]], "uvm_mem_advice_dont_fork (c++ function)": [[8, "_CPPv424uvm_mem_advice_dont_forkRK6Tensor"]], "uvm_storage (c++ function)": [[8, "_CPPv411uvm_storageRK6Tensor"]], "uvm_to_cpu (c++ function)": [[8, "_CPPv410uvm_to_cpuRK6Tensor"]], "uvm_to_cpu_clone (c++ function)": [[8, "_CPPv416uvm_to_cpu_cloneRK6Tensor"]], "uvm_to_device (c++ function)": [[8, "_CPPv413uvm_to_deviceRK6TensorRK6Tensor"]], "all_to_one_device (c++ function)": [[9, "_CPPv417all_to_one_deviceNSt6vectorIN2at6TensorEEEN2at6DeviceE"]], "permute_pooled_embs_auto_grad (c++ function)": [[9, "_CPPv429permute_pooled_embs_auto_gradRK6TensorRK6TensorRK6TensorRK6TensorRK6Tensor"]], "permute_pooled_embs_auto_grad_cpu (c++ function)": [[9, "_CPPv433permute_pooled_embs_auto_grad_cpuRK6TensorRK6TensorRK6TensorRK6TensorRK6Tensor"]], "permute_pooled_embs_auto_grad_gpu (c++ function)": [[9, "_CPPv433permute_pooled_embs_auto_grad_gpuRK6TensorRK6TensorRK6TensorRK6TensorRK6Tensor"]], "permute_pooled_embs_auto_grad_split_cpu (c++ function)": [[9, "_CPPv439permute_pooled_embs_auto_grad_split_cpuRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorE"]], "permute_pooled_embs_auto_grad_split_gpu (c++ function)": [[9, "_CPPv439permute_pooled_embs_auto_grad_split_gpuRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorE"]], "permute_pooled_embs_cpu_impl (c++ function)": [[9, "_CPPv428permute_pooled_embs_cpu_implRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKb"]], "permute_pooled_embs_split_cpu (c++ function)": [[9, "_CPPv429permute_pooled_embs_split_cpuRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorE"]], "permute_pooled_embs_split_gpu (c++ function)": [[9, "_CPPv429permute_pooled_embs_split_gpuRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorE"]], "fp8quantizedtofloat_ref (c++ function)": [[10, "_CPPv423FP8QuantizedToFloat_refPCK7uint8_tK6size_tK6size_tPCfKiKi"]], "fp8rowwise_to_float_cpu (c++ function)": [[10, "_CPPv423FP8rowwise_to_float_cpuRK6TensorbK7int64_t"]], "floattofp8quantized_ref (c++ function)": [[10, "_CPPv423FloatToFP8Quantized_refPCKfK6size_tK6size_tPC7uint8_tKiKiKd"]], "_fp8rowwise_to_float_gpu (c++ function)": [[10, "_CPPv424_FP8rowwise_to_float_gpuRKN2at6TensorEbK7int64_t"]], "_bfloat16_to_float_gpu (c++ function)": [[10, "_CPPv422_bfloat16_to_float_gpuRKN2at6TensorE"]], "_float_to_fp8rowwise_gpu (c++ function)": [[10, "_CPPv424_float_to_FP8rowwise_gpuRK6TensorKb"]], "_float_to_bfloat16_gpu (c++ function)": [[10, "_CPPv422_float_to_bfloat16_gpuRKN2at6TensorE"]], "_float_to_fused8bitrowwise_cpu_out (c++ function)": [[10, "_CPPv434_float_to_fused8bitrowwise_cpu_outR6TensorRK6Tensor"]], "_float_to_fused8bitrowwise_gpu (c++ function)": [[10, "_CPPv430_float_to_fused8bitrowwise_gpuRK6Tensor"]], "_float_to_fusednbitrowwise_gpu (c++ function)": [[10, "_CPPv430_float_to_fusednbitrowwise_gpuRK6TensorK7int64_t"]], "_float_to_hfp8_gpu (c++ function)": [[10, "_CPPv418_float_to_hfp8_gpuRKN2at6TensorEK7int64_tK7int64_tKd"]], "_float_to_msfp_gpu (c++ function)": [[10, "_CPPv418_float_to_msfp_gpuRKN2at6TensorEK7int64_tK7int64_tK7int64_tK7int64_tKdKd"]], "_float_to_paddedfp8rowwise_gpu (c++ function)": [[10, "_CPPv430_float_to_paddedFP8rowwise_gpuRK6TensorKbK7int64_t"]], "_fused8bitrowwise_to_float_cpu_out (c++ function)": [[10, "_CPPv434_fused8bitrowwise_to_float_cpu_outR6TensorRK6Tensor"]], "_fused8bitrowwise_to_float_gpu (c++ function)": [[10, "_CPPv430_fused8bitrowwise_to_float_gpuRKN2at6TensorE"]], "_fused8bitrowwise_to_float_mixed_dim_gpu (c++ function)": [[10, "_CPPv440_fused8bitrowwise_to_float_mixed_dim_gpuRKN2at6TensorERKN2at6TensorEK7int64_t"]], "_fused8bitrowwise_to_half_gpu (c++ function)": [[10, "_CPPv429_fused8bitrowwise_to_half_gpuRKN2at6TensorE"]], "_fused8bitrowwise_to_single_or_half_precision_gpu (c++ function)": [[10, "_CPPv449_fused8bitrowwise_to_single_or_half_precision_gpuRKN2at6TensorEK7int64_tKbKb"]], "_fusednbitrowwise_to_float_gpu (c++ function)": [[10, "_CPPv430_fusednbitrowwise_to_float_gpuRKN2at6TensorEK7int64_t"]], "_fusednbitrowwise_to_half_gpu (c++ function)": [[10, "_CPPv429_fusednbitrowwise_to_half_gpuRKN2at6TensorEK7int64_t"]], "_fusednbitrowwise_to_single_or_half_precision_gpu (c++ function)": [[10, "_CPPv449_fusednbitrowwise_to_single_or_half_precision_gpuRKN2at6TensorEK7int64_tK7int64_t"]], "_half_to_fused8bitrowwise_gpu (c++ function)": [[10, "_CPPv429_half_to_fused8bitrowwise_gpuRK6Tensor"]], "_half_to_fusednbitrowwise_gpu (c++ function)": [[10, "_CPPv429_half_to_fusednbitrowwise_gpuRKN2at6TensorEK7int64_t"]], "_hfp8_to_float_gpu (c++ function)": [[10, "_CPPv418_hfp8_to_float_gpuRKN2at6TensorEK7int64_tK7int64_t"]], "_msfp_to_float_gpu (c++ function)": [[10, "_CPPv418_msfp_to_float_gpuRKN2at6TensorEK7int64_tK7int64_tK7int64_t"]], "_paddedfp8rowwise_to_float_gpu (c++ function)": [[10, "_CPPv430_paddedFP8rowwise_to_float_gpuRKN2at6TensorEKbK7int64_tK7int64_tK7int64_t"]], "_single_or_half_precision_to_fused8bitrowwise_gpu (c++ function)": [[10, "_CPPv449_single_or_half_precision_to_fused8bitrowwise_gpuRK6Tensor"]], "_single_or_half_precision_to_fusednbitrowwise_gpu (c++ function)": [[10, "_CPPv449_single_or_half_precision_to_fusednbitrowwise_gpuRK6TensorK7int64_t"]], "float_or_half_to_fused8bitrowwise_cpu (c++ function)": [[10, "_CPPv437float_or_half_to_fused8bitrowwise_cpuRK6Tensor"]], "float_to_fp8rowwise_cpu (c++ function)": [[10, "_CPPv423float_to_FP8rowwise_cpuRK6Tensorb"]], "float_to_fused8bitrowwise_cpu (c++ function)": [[10, "_CPPv429float_to_fused8bitrowwise_cpuRK6Tensor"]], "fused8bitrowwise_to_float_cpu (c++ function)": [[10, "_CPPv429fused8bitrowwise_to_float_cpuRK6Tensor"]], "fused8bitrowwise_to_float_or_half_cpu (c++ function)": [[10, "_CPPv437fused8bitrowwise_to_float_or_half_cpuRK6TensorK7int64_tKbKb"]], "fused8bitrowwise_to_half_cpu (c++ function)": [[10, "_CPPv428fused8bitrowwise_to_half_cpuRK6Tensor"]], "fusednbitrowwise_sbfront_to_float_cpu (c++ function)": [[10, "_CPPv437fusednbitrowwise_sbfront_to_float_cpuRK6TensorK7int64_t"]], "fusednbitrowwise_to_float_cpu (c++ function)": [[10, "_CPPv429fusednbitrowwise_to_float_cpuRK6TensorK7int64_t"]], "fusednbitrowwise_to_float_or_half_cpu (c++ function)": [[10, "_CPPv437fusednbitrowwise_to_float_or_half_cpuRK6TensorK7int64_tK7int64_t"]], "fusednbitrowwise_to_half_cpu (c++ function)": [[10, "_CPPv428fusednbitrowwise_to_half_cpuRK6TensorK7int64_t"]], "half_to_fused8bitrowwise_cpu (c++ function)": [[10, "_CPPv428half_to_fused8bitrowwise_cpuRK6Tensor"]], "expand_into_jagged_permute_cuda (c++ function)": [[11, "_CPPv431expand_into_jagged_permute_cudaRKN2at6TensorERKN2at6TensorERKN2at6TensorE7int64_t"]], "generic_histogram_binning_calibration_by_feature_cpu (c++ function)": [[11, "_CPPv452generic_histogram_binning_calibration_by_feature_cpuRKN2at6TensorERKN2at6TensorERKN2at6TensorE7int64_tRKN2at6TensorERKN2at6TensorERKN2at6TensorEd7int64_td"]], "direct_mapped_lxu_cache_lookup_cuda (c++ function)": [[12, "_CPPv435direct_mapped_lxu_cache_lookup_cudaN2at6TensorEN2at6TensorE7int64_tbNSt8optionalIN2at6TensorEEE"]], "get_unique_indices_cuda (c++ function)": [[12, "_CPPv423get_unique_indices_cudaRKN2at6TensorEK7int64_tKb"]], "get_unique_indices_with_inverse_cuda (c++ function)": [[12, "_CPPv436get_unique_indices_with_inverse_cudaRKN2at6TensorEK7int64_tKbKb"]], "host_lxu_cache_slot (c++ function)": [[12, "_CPPv419host_lxu_cache_slot7int64_t7int64_t"]], "linearize_cache_indices_cuda (c++ function)": [[12, "_CPPv428linearize_cache_indices_cudaRKN2at6TensorERKN2at6TensorERKN2at6TensorERKNSt8optionalIN2at6TensorEEEK7int64_tK7int64_t"]], "linearize_cache_indices_from_row_idx_cuda (c++ function)": [[12, "_CPPv441linearize_cache_indices_from_row_idx_cudaN2at6TensorEN2at6TensorEN2at6TensorE"]], "lru_cache_find_uncached_cuda (c++ function)": [[12, "_CPPv428lru_cache_find_uncached_cudaN2at6TensorEN2at6TensorE7int64_tN2at6TensorE7int64_tN2at6TensorEbN2at6TensorEbN2at6TensorEKb"]], "lxu_cache_flush_cuda (c++ function)": [[12, "_CPPv420lxu_cache_flush_cudaN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorEb"]], "lxu_cache_locations_update_cuda (c++ function)": [[12, "_CPPv431lxu_cache_locations_update_cudaN2at6TensorEN2at6TensorENSt8optionalIN2at6TensorEEE"]], "lxu_cache_locking_counter_decrement_cuda (c++ function)": [[12, "_CPPv440lxu_cache_locking_counter_decrement_cudaN2at6TensorEN2at6TensorE"]], "reset_weight_momentum_cuda (c++ function)": [[12, "_CPPv426reset_weight_momentum_cudaN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_t"]], "rocksdbwritemode (c++ enum)": [[13, "_CPPv416RocksdbWriteMode"]], "rocksdbwritemode::bwd_l1_cnflct_miss_write_back (c++ enumerator)": [[13, "_CPPv4N16RocksdbWriteMode29BWD_L1_CNFLCT_MISS_WRITE_BACKE"]], "rocksdbwritemode::flush (c++ enumerator)": [[13, "_CPPv4N16RocksdbWriteMode5FLUSHE"]], "rocksdbwritemode::fwd_l1_eviction (c++ enumerator)": [[13, "_CPPv4N16RocksdbWriteMode15FWD_L1_EVICTIONE"]], "rocksdbwritemode::fwd_rocksdb_read (c++ enumerator)": [[13, "_CPPv4N16RocksdbWriteMode16FWD_ROCKSDB_READE"]], "compact_indices_cuda (c++ function)": [[13, "_CPPv420compact_indices_cudaNSt6vectorI6TensorEE6TensorNSt6vectorI6TensorEE6Tensor6Tensor"]], "cuda_callback_func (c++ function)": [[13, "_CPPv418cuda_callback_func12cudaStream_t11cudaError_tPv"]], "hash_shard (c++ function)": [[13, "_CPPv410hash_shard7int64_t6size_t"]], "kv_db::cachecontext (c++ class)": [[13, "_CPPv4N5kv_db12CacheContextE"]], "kv_db::embeddingkvdb (c++ class)": [[13, "_CPPv4N5kv_db13EmbeddingKVDBE"]], "kv_db::queueitem (c++ struct)": [[13, "_CPPv4N5kv_db9QueueItemE"]], "l2_cache::cachelibcache (c++ class)": [[13, "_CPPv4N8l2_cache13CacheLibCacheE"]], "masked_index_put_cuda (c++ function)": [[13, "_CPPv421masked_index_put_cuda6Tensor6Tensor6Tensor6TensorKbK7int64_t"]], "masked_index_select_cuda (c++ function)": [[13, "_CPPv424masked_index_select_cuda6Tensor6Tensor6Tensor6TensorKbK7int64_t"]], "ps::embeddingparameterserver (c++ class)": [[13, "_CPPv4N2ps24EmbeddingParameterServerE"]], "ssd::embeddingrocksdb (c++ class)": [[13, "_CPPv4N3ssd16EmbeddingRocksDBE"]], "ssd_generate_row_addrs_cuda (c++ function)": [[13, "_CPPv427ssd_generate_row_addrs_cudaRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6Tensor"]], "ssd_update_row_addrs_cuda (c++ function)": [[13, "_CPPv425ssd_update_row_addrs_cudaRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6Tensor"]], "batched_dense_vec_jagged_2d_mul() (in module torch.ops.fbgemm)": [[18, "torch.ops.fbgemm.batched_dense_vec_jagged_2d_mul"]], "dense_to_jagged() (in module torch.ops.fbgemm)": [[18, "torch.ops.fbgemm.dense_to_jagged"]], "fbgemm_gpu": [[18, "module-fbgemm_gpu"], [19, "module-fbgemm_gpu"], [20, "module-fbgemm_gpu"], [21, "module-fbgemm_gpu"], [22, "module-fbgemm_gpu"]], "jagged_1d_to_dense() (in module torch.ops.fbgemm)": [[18, "torch.ops.fbgemm.jagged_1d_to_dense"]], "jagged_2d_to_dense() (in module torch.ops.fbgemm)": [[18, "torch.ops.fbgemm.jagged_2d_to_dense"]], "jagged_dense_dense_elementwise_add_jagged_output() (in module torch.ops.fbgemm)": [[18, "torch.ops.fbgemm.jagged_dense_dense_elementwise_add_jagged_output"]], "jagged_dense_elementwise_add() (in module torch.ops.fbgemm)": [[18, "torch.ops.fbgemm.jagged_dense_elementwise_add"]], "jagged_dense_elementwise_add_jagged_output() (in module torch.ops.fbgemm)": [[18, "torch.ops.fbgemm.jagged_dense_elementwise_add_jagged_output"]], "jagged_dense_elementwise_mul() (in module torch.ops.fbgemm)": [[18, "torch.ops.fbgemm.jagged_dense_elementwise_mul"]], "jagged_to_padded_dense() (in module torch.ops.fbgemm)": [[18, "torch.ops.fbgemm.jagged_to_padded_dense"]], "module": [[18, "module-fbgemm_gpu"], [19, "module-fbgemm_gpu"], [20, "module-fbgemm_gpu"], [21, "module-fbgemm_gpu"], [22, "module-fbgemm_gpu"]], "stacked_jagged_1d_to_dense() (in module torch.ops.fbgemm)": [[18, "torch.ops.fbgemm.stacked_jagged_1d_to_dense"]], "stacked_jagged_2d_to_dense() (in module torch.ops.fbgemm)": [[18, "torch.ops.fbgemm.stacked_jagged_2d_to_dense"]], "permutepooledembeddings (class in fbgemm_gpu.permute_pooled_embedding_modules)": [[19, "fbgemm_gpu.permute_pooled_embedding_modules.PermutePooledEmbeddings"]], "__call__() (fbgemm_gpu.permute_pooled_embedding_modules.permutepooledembeddings method)": [[19, "fbgemm_gpu.permute_pooled_embedding_modules.PermutePooledEmbeddings.__call__"]], "merge_pooled_embeddings() (in module torch.ops.fbgemm)": [[20, "torch.ops.fbgemm.merge_pooled_embeddings"]], "permute_pooled_embs() (in module torch.ops.fbgemm)": [[20, "torch.ops.fbgemm.permute_pooled_embs"]], "floatorhalftofusednbitrowwisequantizedsbhalf() (in module torch.ops.fbgemm)": [[21, "torch.ops.fbgemm.FloatOrHalfToFusedNBitRowwiseQuantizedSBHalf"]], "asynchronous_complete_cumsum() (in module torch.ops.fbgemm)": [[22, "torch.ops.fbgemm.asynchronous_complete_cumsum"]], "block_bucketize_sparse_features() (in module torch.ops.fbgemm)": [[22, "torch.ops.fbgemm.block_bucketize_sparse_features"]], "expand_into_jagged_permute() (in module torch.ops.fbgemm)": [[22, "torch.ops.fbgemm.expand_into_jagged_permute"]], "keyed_jagged_index_select_dim1() (in module torch.ops.fbgemm)": [[22, "torch.ops.fbgemm.keyed_jagged_index_select_dim1"]], "offsets_range() (in module torch.ops.fbgemm)": [[22, "torch.ops.fbgemm.offsets_range"]], "permute_1d_sparse_data() (in module torch.ops.fbgemm)": [[22, "torch.ops.fbgemm.permute_1D_sparse_data"]], "permute_2d_sparse_data() (in module torch.ops.fbgemm)": [[22, "torch.ops.fbgemm.permute_2D_sparse_data"]], "segment_sum_csr() (in module torch.ops.fbgemm)": [[22, "torch.ops.fbgemm.segment_sum_csr"]], "intnbittablebatchedembeddingbagscodegen (class in fbgemm_gpu.split_table_batched_embeddings_ops_inference)": [[23, "fbgemm_gpu.split_table_batched_embeddings_ops_inference.IntNBitTableBatchedEmbeddingBagsCodegen"]], "assign_embedding_weights() (fbgemm_gpu.split_table_batched_embeddings_ops_inference.intnbittablebatchedembeddingbagscodegen method)": [[23, "fbgemm_gpu.split_table_batched_embeddings_ops_inference.IntNBitTableBatchedEmbeddingBagsCodegen.assign_embedding_weights"]], "fill_random_weights() (fbgemm_gpu.split_table_batched_embeddings_ops_inference.intnbittablebatchedembeddingbagscodegen method)": [[23, "fbgemm_gpu.split_table_batched_embeddings_ops_inference.IntNBitTableBatchedEmbeddingBagsCodegen.fill_random_weights"]], "forward() (fbgemm_gpu.split_table_batched_embeddings_ops_inference.intnbittablebatchedembeddingbagscodegen method)": [[23, "fbgemm_gpu.split_table_batched_embeddings_ops_inference.IntNBitTableBatchedEmbeddingBagsCodegen.forward"]], "recompute_module_buffers() (fbgemm_gpu.split_table_batched_embeddings_ops_inference.intnbittablebatchedembeddingbagscodegen method)": [[23, "fbgemm_gpu.split_table_batched_embeddings_ops_inference.IntNBitTableBatchedEmbeddingBagsCodegen.recompute_module_buffers"]], "split_embedding_weights() (fbgemm_gpu.split_table_batched_embeddings_ops_inference.intnbittablebatchedembeddingbagscodegen method)": [[23, "fbgemm_gpu.split_table_batched_embeddings_ops_inference.IntNBitTableBatchedEmbeddingBagsCodegen.split_embedding_weights"]], "split_embedding_weights_with_scale_bias() (fbgemm_gpu.split_table_batched_embeddings_ops_inference.intnbittablebatchedembeddingbagscodegen method)": [[23, "fbgemm_gpu.split_table_batched_embeddings_ops_inference.IntNBitTableBatchedEmbeddingBagsCodegen.split_embedding_weights_with_scale_bias"]], "splittablebatchedembeddingbagscodegen (class in fbgemm_gpu.split_table_batched_embeddings_ops_training)": [[24, "fbgemm_gpu.split_table_batched_embeddings_ops_training.SplitTableBatchedEmbeddingBagsCodegen"]], "forward() (fbgemm_gpu.split_table_batched_embeddings_ops_training.splittablebatchedembeddingbagscodegen method)": [[24, "fbgemm_gpu.split_table_batched_embeddings_ops_training.SplitTableBatchedEmbeddingBagsCodegen.forward"]], "set_learning_rate() (fbgemm_gpu.split_table_batched_embeddings_ops_training.splittablebatchedembeddingbagscodegen method)": [[24, "fbgemm_gpu.split_table_batched_embeddings_ops_training.SplitTableBatchedEmbeddingBagsCodegen.set_learning_rate"]], "set_optimizer_step() (fbgemm_gpu.split_table_batched_embeddings_ops_training.splittablebatchedembeddingbagscodegen method)": [[24, "fbgemm_gpu.split_table_batched_embeddings_ops_training.SplitTableBatchedEmbeddingBagsCodegen.set_optimizer_step"]], "split_embedding_weights() (fbgemm_gpu.split_table_batched_embeddings_ops_training.splittablebatchedembeddingbagscodegen method)": [[24, "fbgemm_gpu.split_table_batched_embeddings_ops_training.SplitTableBatchedEmbeddingBagsCodegen.split_embedding_weights"]], "split_optimizer_states() (fbgemm_gpu.split_table_batched_embeddings_ops_training.splittablebatchedembeddingbagscodegen method)": [[24, "fbgemm_gpu.split_table_batched_embeddings_ops_training.SplitTableBatchedEmbeddingBagsCodegen.split_optimizer_states"]], "update_hyper_parameters() (fbgemm_gpu.split_table_batched_embeddings_ops_training.splittablebatchedembeddingbagscodegen method)": [[24, "fbgemm_gpu.split_table_batched_embeddings_ops_training.SplitTableBatchedEmbeddingBagsCodegen.update_hyper_parameters"]], "example_method (c++ function)": [[29, "_CPPv4I0_NSt6size_tEE14example_method7int32_t1Tf"]], "example_method() (in module fbgemm_gpu.docs.examples)": [[31, "fbgemm_gpu.docs.examples.example_method"]]}}) \ No newline at end of file