From 6d40ff1442572b8808961689c5ecb4ee5c975456 Mon Sep 17 00:00:00 2001 From: Torch-TensorRT Github Bot Date: Wed, 23 Oct 2024 02:12:50 +0000 Subject: [PATCH] docs: [Automated] Regenerating documenation for 92bf700 Signed-off-by: Torch-TensorRT Github Bot --- .../classtorch__tensorrt_1_1DataType.html | 5 +- ...rch__tensorrt_1_1Device_1_1DeviceType.html | 5 +- .../classtorch__tensorrt_1_1TensorFormat.html | 5 +- ...ensorrt_1_1ptq_1_1Int8CacheCalibrator.html | 5 +- ...ch__tensorrt_1_1ptq_1_1Int8Calibrator.html | 5 +- ...8h_1a18d295a837ac71add5578860b55e5502.html | 5 +- ...8h_1a282fd3c0b1c3a215148ae372070e1268.html | 5 +- ...8h_1a31398a6d4d27e28817afb0f0139e909e.html | 5 +- ...8h_1a35703561b26b1a9d2738ad7d58b27827.html | 5 +- ...8h_1abd1465eb38256d3f22cc1426b23d516b.html | 5 +- ...8h_1abe87b341f562fd1cf40b7672e4d759da.html | 5 +- ...8h_1ad19939408f7be171a74a89928b36eb59.html | 5 +- ...8h_1adad592a7b1b7eed529cdf6acd584c883.html | 5 +- docs/_cpp_api/dir_cpp.html | 5 +- docs/_cpp_api/dir_cpp_include.html | 5 +- .../dir_cpp_include_torch_tensorrt.html | 5 +- ...ng_1a130f65408ad8cbaee060f05e8db69558.html | 5 +- ...rt_1a3fbe5d72e4fc624dbd038853079620eb.html | 5 +- ..._cpp_include_torch_tensorrt_logging.h.html | 5 +- ...e_cpp_include_torch_tensorrt_macros.h.html | 5 +- ...file_cpp_include_torch_tensorrt_ptq.h.html | 5 +- ...clude_torch_tensorrt_torch_tensorrt.h.html | 5 +- ...ng_1a0593f776f469c20469e2f729fc7861a3.html | 5 +- ...ng_1a0c012cb374addd90eb1f42eaec570650.html | 5 +- ...ng_1a56e110feaaba2c3fd44bd201fd21a76a.html | 5 +- ...ng_1a7cb50492421ea9de4e3db895819df6f2.html | 5 +- ...ng_1ac46ac0901cb97e3ae6e93b45f24e90b8.html | 5 +- ...ng_1ad2efd47b6c3689e58ccc595680579ae5.html | 5 +- ...ng_1af8f3443813315af7901903d25dd495cc.html | 5 +- ...tq_1a226e3c83379d1012cde8578c1c86b16c.html | 5 +- ...tq_1a6186e305f47c1d94b6130ef6c7f7e178.html | 5 +- ...pt_1a5b405fd3bf3c8fc2e2a54cbbab979797.html | 5 +- ...pt_1a6e19490a08fb1553c9dd347a5ae79db9.html | 5 +- ...pt_1a81f9783517335dda877d8cfcf38987c9.html | 5 +- ...pt_1ae8d56472106eeef37fbe51ff7f40c9b2.html | 5 +- ...rt_1ac4ab8313ae72c2c899ea31548b528528.html | 5 +- ...rt_1ad1acd06eaeaffbbcf6e7ebf426891384.html | 5 +- ...rt_1ad6a4ee8ca6c8f6e5519eb1128ec7f4a1.html | 5 +- docs/_cpp_api/namespace_torch_tensorrt.html | 5 +- .../namespace_torch_tensorrt__logging.html | 5 +- .../namespace_torch_tensorrt__ptq.html | 5 +- ...namespace_torch_tensorrt__torchscript.html | 5 +- ..._cpp_include_torch_tensorrt_logging.h.html | 5 +- ...e_cpp_include_torch_tensorrt_macros.h.html | 5 +- ...file_cpp_include_torch_tensorrt_ptq.h.html | 5 +- ...clude_torch_tensorrt_torch_tensorrt.h.html | 5 +- .../structtorch__tensorrt_1_1Device.html | 5 +- .../structtorch__tensorrt_1_1GraphInputs.html | 5 +- .../structtorch__tensorrt_1_1Input.html | 5 +- ...ensorrt_1_1torchscript_1_1CompileSpec.html | 5 +- docs/_cpp_api/torch_tensort_cpp.html | 5 +- docs/_cpp_api/unabridged_orphan.html | 5 +- .../weight_streaming_example.ipynb | 104 ++ .../_rendered_examples_jupyter.zip | Bin 117861 -> 126838 bytes .../_rendered_examples_python.zip | Bin 89587 -> 96893 bytes .../weight_streaming_example.py | 174 +++ ...phx_glr_weight_streaming_example_thumb.png | Bin 0 -> 26794 bytes docs/_modules/index.html | 5 +- docs/_modules/torch_tensorrt/_Device.html | 5 +- docs/_modules/torch_tensorrt/_Input.html | 5 +- docs/_modules/torch_tensorrt/_compile.html | 5 +- docs/_modules/torch_tensorrt/_enums.html | 5 +- .../torch_tensorrt/dynamo/_compiler.html | 15 +- .../torch_tensorrt/dynamo/_exporter.html | 5 +- 
.../torch_tensorrt/dynamo/_refit.html | 5 +- .../torch_tensorrt/dynamo/_settings.html | 11 +- .../torch_tensorrt/dynamo/_tracer.html | 5 +- .../runtime/_MutableTorchTensorRTModule.html | 5 +- .../runtime/_PythonTorchTensorRTModule.html | 44 +- .../dynamo/runtime/_TorchTensorRTModule.html | 27 +- docs/_modules/torch_tensorrt/fx/fx2trt.html | 5 +- .../torch_tensorrt/fx/input_tensor_spec.html | 5 +- docs/_modules/torch_tensorrt/fx/lower.html | 5 +- .../torch_tensorrt/fx/trt_module.html | 5 +- docs/_modules/torch_tensorrt/logging.html | 5 +- .../runtime/_multi_device_safe_mode.html | 5 +- .../torch_tensorrt/ts/_compile_spec.html | 5 +- .../_modules/torch_tensorrt/ts/_compiler.html | 5 +- docs/_modules/torch_tensorrt/ts/ptq.html | 5 +- docs/_sources/index.rst.txt | 2 + .../_rendered_examples/dynamo/index.rst.txt | 18 + .../dynamo/weight_streaming_example.rst.txt | 249 +++++ .../_rendered_examples/index.rst.txt | 17 + docs/_static/documentation_options.js | 2 +- docs/cli/torchtrtc.html | 5 +- docs/contributors/conversion.html | 5 +- docs/contributors/dynamo_converters.html | 5 +- docs/contributors/lowering.html | 5 +- docs/contributors/partitioning.html | 5 +- docs/contributors/phases.html | 5 +- docs/contributors/runtime.html | 5 +- docs/contributors/system_overview.html | 5 +- docs/contributors/ts_converters.html | 5 +- docs/contributors/useful_links.html | 5 +- .../writing_dynamo_aten_lowering_passes.html | 5 +- docs/dynamo/dynamo_export.html | 5 +- docs/dynamo/torch_compile.html | 12 +- docs/fx/getting_started_with_fx_path.html | 5 +- docs/genindex.html | 5 +- docs/getting_started/installation.html | 5 +- docs/getting_started/jetpack.html | 5 +- docs/getting_started/quick_start.html | 5 +- docs/index.html | 6 +- docs/indices/supported_ops.html | 5 +- docs/objects.inv | Bin 32090 -> 32294 bytes docs/py-modindex.html | 5 +- docs/py_api/dynamo.html | 11 +- docs/py_api/fx.html | 5 +- docs/py_api/logging.html | 5 +- docs/py_api/ptq.html | 5 +- docs/py_api/runtime.html | 9 +- docs/py_api/torch_tensorrt.html | 5 +- docs/py_api/ts.html | 7 +- docs/search.html | 5 +- docs/searchindex.js | 2 +- docs/sg_execution_times.html | 5 +- .../pytorch-sphinx-theme/docs/changelog.html | 5 +- .../docs/configuring.html | 5 +- .../pytorch-sphinx-theme/docs/demo/api.html | 5 +- .../pytorch-sphinx-theme/docs/demo/demo.html | 7 +- .../docs/demo/lists_tables.html | 5 +- .../pytorch-sphinx-theme/docs/demo/long.html | 5 +- .../docs/demo/structure.html | 5 +- docs/src/pytorch-sphinx-theme/docs/index.html | 5 +- .../pytorch-sphinx-theme/docs/installing.html | 5 +- ...creating_torchscript_module_in_python.html | 5 +- docs/ts/getting_started_with_cpp_api.html | 5 +- docs/ts/getting_started_with_python_api.html | 5 +- docs/ts/ptq.html | 5 +- .../ts/torchscript_frontend_from_pytorch.html | 5 +- .../dynamo/converter_overloading.html | 5 +- .../dynamo/custom_kernel_plugins.html | 5 +- .../dynamo/engine_caching_bert_example.html | 5 +- .../dynamo/engine_caching_example.html | 5 +- .../_rendered_examples/dynamo/index.html | 8 +- .../mutable_torchtrt_module_example.html | 9 +- .../dynamo/refit_engine_example.html | 5 +- .../dynamo/torch_compile_advanced_usage.html | 5 +- .../dynamo/torch_compile_resnet_example.html | 5 +- .../torch_compile_stable_diffusion.html | 5 +- .../torch_compile_transformers_example.html | 5 +- .../dynamo/torch_export_cudagraphs.html | 5 +- .../dynamo/torch_export_gpt2.html | 5 +- .../dynamo/torch_export_llama2.html | 5 +- .../_rendered_examples/dynamo/vgg16_ptq.html | 5 +- 
.../dynamo/weight_streaming_example.html | 992 ++++++++++++++++++ docs/tutorials/_rendered_examples/index.html | 8 +- docs/tutorials/notebooks.html | 5 +- .../serving_torch_tensorrt_with_triton.html | 5 +- docs/user_guide/dynamic_shapes.html | 5 +- docs/user_guide/mixed_precision.html | 5 +- docs/user_guide/runtime.html | 5 +- docs/user_guide/saving_models.html | 5 +- docs/user_guide/torch_tensorrt_explained.html | 5 +- docs/user_guide/using_dla.html | 5 +- 155 files changed, 2081 insertions(+), 298 deletions(-) create mode 100644 docs/_downloads/3e4586a9107efae8f87a361bd207b6e0/weight_streaming_example.ipynb create mode 100644 docs/_downloads/b26ba3d33b5fc57e738fb2f26cabe4e8/weight_streaming_example.py create mode 100644 docs/_images/sphx_glr_weight_streaming_example_thumb.png create mode 100644 docs/_sources/tutorials/_rendered_examples/dynamo/weight_streaming_example.rst.txt create mode 100644 docs/tutorials/_rendered_examples/dynamo/weight_streaming_example.html diff --git a/docs/_cpp_api/classtorch__tensorrt_1_1DataType.html b/docs/_cpp_api/classtorch__tensorrt_1_1DataType.html index 164d195394..87519bd3c9 100644 --- a/docs/_cpp_api/classtorch__tensorrt_1_1DataType.html +++ b/docs/_cpp_api/classtorch__tensorrt_1_1DataType.html @@ -10,7 +10,7 @@ - Class DataType — Torch-TensorRT v2.6.0.dev0+b6ed1c5 documentation + Class DataType — Torch-TensorRT v2.6.0.dev0+92bf700 documentation @@ -275,7 +275,7 @@
- v2.6.0.dev0+b6ed1c5 + v2.6.0.dev0+92bf700
@@ -330,6 +330,7 @@
  • Overloading Torch-TensorRT Converters with Custom Converters
  • Using Custom Kernels within TensorRT Engines with Torch-TensorRT
  • Mutable Torch TensorRT Module
  • +
  • Weight Streaming
  • Dynamo Frontend

      @@ -554,6 +555,7 @@

      Source code for torch_tensorrt.dynamo._compiler

      < custom_engine_cache: Optional[BaseEngineCache] = _defaults.CUSTOM_ENGINE_CACHE, use_explicit_typing: bool = _defaults.USE_EXPLICIT_TYPING, use_fp32_acc: bool = _defaults.USE_FP32_ACC, + enable_weight_streaming: bool = _defaults.ENABLE_WEIGHT_STREAMING, **kwargs: Any, ) -> torch.fx.GraphModule: """Compile an ExportedProgram module for NVIDIA GPUs using TensorRT @@ -626,6 +628,7 @@

      Source code for torch_tensorrt.dynamo._compiler

      < custom_engine_cache (Optional[BaseEngineCache]): Engine cache instance to use for saving and loading engines. Users can provide their own engine cache by inheriting from BaseEngineCache. If used, engine_cache_dir and engine_cache_size will be ignored. use_explicit_typing (bool): This flag enables strong typing in TensorRT compilation which respects the precisions set in the Pytorch model. This is useful when users have mixed precision graphs. use_fp32_acc (bool): This option inserts cast to FP32 nodes around matmul layers and TensorRT ensures the accumulation of matmul happens in FP32. Use this only when FP16 precision is configured in enabled_precisions. + enable_weight_streaming (bool): Enable weight streaming. **kwargs: Any, Returns: torch.fx.GraphModule: Compiled FX Module, when run it will execute via TensorRT @@ -679,6 +682,10 @@

      Source code for torch_tensorrt.dynamo._compiler

      < This flag inserts casts around matmul layers and ensures TensorRT executes the matmul layers in FP16 with FP32 accumulation." ) + if enable_weight_streaming and not use_explicit_typing: + raise AssertionError( + "When enable_weight_streaming is enabled, it requires use_explicit_typing to be set to True" + ) # Aliasing inputs to arg_inputs for better understanding if not arg_inputs and not inputs: raise AssertionError("'arg_inputs' and 'inputs' should not both be None.") @@ -755,6 +762,7 @@
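For context, the new check ties the two flags together: weight streaming can only be requested alongside explicit typing. A minimal invocation sketch, mirroring the weight streaming tutorial rendered later in this patch (exported_program and example_inputs are placeholders):

    import torch_tensorrt

    # Placeholder ExportedProgram and example inputs; see the Llama-2 tutorial
    # below for a complete end-to-end example.
    trt_gm = torch_tensorrt.dynamo.compile(
        exported_program,
        inputs=example_inputs,
        use_explicit_typing=True,      # required, otherwise the AssertionError above is raised
        enable_weight_streaming=True,
    )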

      Source code for torch_tensorrt.dynamo._compiler

      < "reuse_cached_engines": reuse_cached_engines, "use_explicit_typing": use_explicit_typing, "use_fp32_acc": use_fp32_acc, + "enable_weight_streaming": enable_weight_streaming, } settings = CompilationSettings(**compilation_options) @@ -1013,6 +1021,7 @@

      Source code for torch_tensorrt.dynamo._compiler

      < timing_cache_path: str = _defaults.TIMING_CACHE_PATH, use_explicit_typing: bool = _defaults.USE_EXPLICIT_TYPING, use_fp32_acc: bool = _defaults.USE_FP32_ACC, + enable_weight_streaming: bool = _defaults.ENABLE_WEIGHT_STREAMING, **kwargs: Any, ) -> bytes: """Convert an ExportedProgram to a serialized TensorRT engine @@ -1073,6 +1082,7 @@

      Source code for torch_tensorrt.dynamo._compiler

      < timing_cache_path (str): Path to the timing cache if it exists (or) where it will be saved after compilation use_explicit_typing (bool): This flag enables strong typing in TensorRT compilation which respects the precisions set in the Pytorch model. This is useful when users have mixed precision graphs. use_fp32_acc (bool): This option inserts cast to FP32 nodes around matmul layers and TensorRT ensures the accumulation of matmul happens in FP32. Use this only when FP16 precision is configured in enabled_precisions. + enable_weight_streaming (bool): Enable weight streaming. Returns: bytes: Serialized TensorRT engine, can either be saved to a file or deserialized via TensorRT APIs """ @@ -1148,6 +1158,7 @@

      Source code for torch_tensorrt.dynamo._compiler

      < "timing_cache_path": timing_cache_path, "use_explicit_typing": use_explicit_typing, "use_fp32_acc": use_fp32_acc, + "enable_weight_streaming": enable_weight_streaming, } settings = CompilationSettings(**compilation_options) diff --git a/docs/_modules/torch_tensorrt/dynamo/_exporter.html b/docs/_modules/torch_tensorrt/dynamo/_exporter.html index 8663f56da0..1245f63fcc 100644 --- a/docs/_modules/torch_tensorrt/dynamo/_exporter.html +++ b/docs/_modules/torch_tensorrt/dynamo/_exporter.html @@ -9,7 +9,7 @@ - torch_tensorrt.dynamo._exporter — Torch-TensorRT v2.6.0.dev0+b6ed1c5 documentation + torch_tensorrt.dynamo._exporter — Torch-TensorRT v2.6.0.dev0+92bf700 documentation @@ -272,7 +272,7 @@
      - v2.6.0.dev0+b6ed1c5 + v2.6.0.dev0+92bf700
      @@ -327,6 +327,7 @@
    • Overloading Torch-TensorRT Converters with Custom Converters
    • Using Custom Kernels within TensorRT Engines with Torch-TensorRT
    • Mutable Torch TensorRT Module
    • +
    • Weight Streaming

    Dynamo Frontend


      @@ -478,6 +479,7 @@

      Source code for torch_tensorrt.dynamo._settings

      < DLA_SRAM_SIZE, DRYRUN, ENABLE_EXPERIMENTAL_DECOMPOSITIONS, + ENABLE_WEIGHT_STREAMING, ENABLED_PRECISIONS, ENGINE_CAPABILITY, HARDWARE_COMPATIBLE, @@ -546,6 +548,7 @@

      Source code for torch_tensorrt.dynamo._settings

      < reuse_cached_engines (bool): Whether to load the compiled TRT engines from storage use_strong_typing (bool): This flag enables strong typing in TensorRT compilation which respects the precisions set in the Pytorch model. This is useful when users have mixed precision graphs. use_fp32_acc (bool): This option inserts cast to FP32 nodes around matmul layers and TensorRT ensures the accumulation of matmul happens in FP32. Use this only when FP16 precision is configured in enabled_precisions. + enable_weight_streaming (bool): Enable weight streaming. """ enabled_precisions: Set[dtype] = field(default_factory=lambda: ENABLED_PRECISIONS) @@ -581,7 +584,8 @@

      Source code for torch_tensorrt.dynamo._settings

      < cache_built_engines: bool = CACHE_BUILT_ENGINES reuse_cached_engines: bool = REUSE_CACHED_ENGINES use_explicit_typing: bool = USE_EXPLICIT_TYPING - use_fp32_acc: bool = USE_FP32_ACC
      + use_fp32_acc: bool = USE_FP32_ACC + enable_weight_streaming: bool = ENABLE_WEIGHT_STREAMING
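For reference, a hedged sketch of the new field on the settings dataclass (assuming CompilationSettings is importable from torch_tensorrt.dynamo, as in this module):

    from torch_tensorrt.dynamo import CompilationSettings

    # enable_weight_streaming defaults to ENABLE_WEIGHT_STREAMING (off) and, per
    # the hunk below, is treated as engine-invariant by the engine cache.
    settings = CompilationSettings(
        use_explicit_typing=True,
        enable_weight_streaming=True,
    )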
      _SETTINGS_TO_BE_ENGINE_INVARIANT = ( @@ -594,6 +598,7 @@

      Source code for torch_tensorrt.dynamo._settings

      < "make_refittable", "engine_capability", "hardware_compatible", + "enable_weight_streaming", ) diff --git a/docs/_modules/torch_tensorrt/dynamo/_tracer.html b/docs/_modules/torch_tensorrt/dynamo/_tracer.html index 008f58a266..a940592cfa 100644 --- a/docs/_modules/torch_tensorrt/dynamo/_tracer.html +++ b/docs/_modules/torch_tensorrt/dynamo/_tracer.html @@ -9,7 +9,7 @@ - torch_tensorrt.dynamo._tracer — Torch-TensorRT v2.6.0.dev0+b6ed1c5 documentation + torch_tensorrt.dynamo._tracer — Torch-TensorRT v2.6.0.dev0+92bf700 documentation @@ -272,7 +272,7 @@
      - v2.6.0.dev0+b6ed1c5 + v2.6.0.dev0+92bf700
      @@ -327,6 +327,7 @@
    • Overloading Torch-TensorRT Converters with Custom Converters
    • Using Custom Kernels within TensorRT Engines with Torch-TensorRT
    • Mutable Torch TensorRT Module
    • +
    • Weight Streaming

    Dynamo Frontend

      diff --git a/docs/_modules/torch_tensorrt/dynamo/runtime/_MutableTorchTensorRTModule.html b/docs/_modules/torch_tensorrt/dynamo/runtime/_MutableTorchTensorRTModule.html index 45289f99c5..a43d6ef316 100644 --- a/docs/_modules/torch_tensorrt/dynamo/runtime/_MutableTorchTensorRTModule.html +++ b/docs/_modules/torch_tensorrt/dynamo/runtime/_MutableTorchTensorRTModule.html @@ -9,7 +9,7 @@ - torch_tensorrt.dynamo.runtime._MutableTorchTensorRTModule — Torch-TensorRT v2.6.0.dev0+b6ed1c5 documentation + torch_tensorrt.dynamo.runtime._MutableTorchTensorRTModule — Torch-TensorRT v2.6.0.dev0+92bf700 documentation @@ -272,7 +272,7 @@
      - v2.6.0.dev0+b6ed1c5 + v2.6.0.dev0+92bf700
      @@ -327,6 +327,7 @@
    • Overloading Torch-TensorRT Converters with Custom Converters
    • Using Custom Kernels within TensorRT Engines with Torch-TensorRT
    • Mutable Torch TensorRT Module
    • +
    • Weight Streaming

    Dynamo Frontend

      diff --git a/docs/_modules/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.html b/docs/_modules/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.html index 474f41edb3..6c1b1e3836 100644 --- a/docs/_modules/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.html +++ b/docs/_modules/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.html @@ -9,7 +9,7 @@ - torch_tensorrt.dynamo.runtime._PythonTorchTensorRTModule — Torch-TensorRT v2.6.0.dev0+b6ed1c5 documentation + torch_tensorrt.dynamo.runtime._PythonTorchTensorRTModule — Torch-TensorRT v2.6.0.dev0+92bf700 documentation @@ -272,7 +272,7 @@
      - v2.6.0.dev0+b6ed1c5 + v2.6.0.dev0+92bf700
      @@ -327,6 +327,7 @@
    • Overloading Torch-TensorRT Converters with Custom Converters
    • Using Custom Kernels within TensorRT Engines with Torch-TensorRT
    • Mutable Torch TensorRT Module
    • +
    • Weight Streaming

    Dynamo Frontend

      @@ -529,6 +530,7 @@

      Source code for torch_tensorrt.dynamo.runtime._PythonTorchTensorRTModule

      ) """ + self.context: Any super(PythonTorchTensorRTModule, self).__init__() self._register_state_dict_hook(PythonTorchTensorRTModule._on_state_dict) @@ -574,6 +576,42 @@

      Source code for torch_tensorrt.dynamo.runtime._PythonTorchTensorRTModule

      if self.serialized_engine is not None and not self.settings.lazy_engine_init: self.setup_engine() + def get_streamable_device_memory_budget(self) -> Any: + return self.engine.streamable_weights_size + + def get_automatic_device_memory_budget(self) -> Any: + return self.engine.get_weight_streaming_automatic_budget() + + def get_device_memory_budget(self) -> Any: + return self.engine.weight_streaming_budget_v2 + + def set_device_memory_budget(self, budget_bytes: int) -> int: + # Recreating the context because weight streaming budget cannot be modified while there are active context. + if self.context is not None: + del self.context + budget_bytes = self._set_device_memory_budget(budget_bytes) + self.context = self.engine.create_execution_context() + return budget_bytes + + def _set_device_memory_budget(self, budget_bytes: int) -> int: + # Disable weight streaming for invalid budget size + if budget_bytes < 0: + budget_bytes = self.get_streamable_device_memory_budget() + self.engine.weight_streaming_budget_v2 = budget_bytes + if self.engine.weight_streaming_budget_v2 != budget_bytes: + logger.error(f"Failed to set weight streaming budget to {budget_bytes}") + budget_bytes = self.engine.weight_streaming_budget_v2 + if self.get_streamable_device_memory_budget() == budget_bytes: + logger.warning("Weight streaming is disabled") + + return budget_bytes + + def set_default_device_memory_budget(self) -> int: + budget_bytes = self.get_automatic_device_memory_budget() + # Set automatic weight streaming budget as default when context is created + logger.debug(f"Weight streaming budget set to {budget_bytes}B") + return self._set_device_memory_budget(budget_bytes) + def setup_engine(self) -> None: assert ( self.target_platform == Platform.current_platform() @@ -582,6 +620,8 @@
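A hedged usage sketch for the budget accessors added above, assuming trt_module is an initialized PythonTorchTensorRTModule compiled with enable_weight_streaming=True:

    # Total bytes of weights that can be streamed between host and device.
    streamable = trt_module.get_streamable_device_memory_budget()

    # Budget size TensorRT would choose automatically.
    automatic = trt_module.get_automatic_device_memory_budget()

    # Keep roughly half of the streamable weights resident on the GPU; the setter
    # recreates the execution context and returns the budget actually applied.
    applied = trt_module.set_device_memory_budget(streamable // 2)
    print(f"streamable={streamable}B automatic={automatic}B applied={applied}B")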

      Source code for torch_tensorrt.dynamo.runtime._PythonTorchTensorRTModule

      self.initialized = True runtime = trt.Runtime(TRT_LOGGER) self.engine = runtime.deserialize_cuda_engine(self.serialized_engine) + if self.settings.enable_weight_streaming: + self.set_default_device_memory_budget() self.context = self.engine.create_execution_context() assert self.engine.num_io_tensors == ( diff --git a/docs/_modules/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.html b/docs/_modules/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.html index 1fdbfb4d26..36f8ba3396 100644 --- a/docs/_modules/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.html +++ b/docs/_modules/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.html @@ -9,7 +9,7 @@ - torch_tensorrt.dynamo.runtime._TorchTensorRTModule — Torch-TensorRT v2.6.0.dev0+b6ed1c5 documentation + torch_tensorrt.dynamo.runtime._TorchTensorRTModule — Torch-TensorRT v2.6.0.dev0+92bf700 documentation @@ -272,7 +272,7 @@
      - v2.6.0.dev0+b6ed1c5 + v2.6.0.dev0+92bf700
      @@ -327,6 +327,7 @@
    • Overloading Torch-TensorRT Converters with Custom Converters
    • Using Custom Kernels within TensorRT Engines with Torch-TensorRT
    • Mutable Torch TensorRT Module
    • +
    • Weight Streaming

    Dynamo Frontend

      @@ -633,6 +634,28 @@

      Source code for torch_tensorrt.dynamo.runtime._TorchTensorRTModule

      return engine_info + def get_streamable_device_memory_budget(self) -> Any: + return self.engine.streamable_device_memory_budget + + def get_automatic_device_memory_budget(self) -> Any: + return self.engine.automatic_device_memory_budget + + def get_device_memory_budget(self) -> Any: + return self.engine.device_memory_budget + + def set_device_memory_budget(self, budget_bytes: int) -> int: + # Disable weight streaming for invalid budget size + if budget_bytes < 0: + budget_bytes = self.get_streamable_device_memory_budget() + self.engine.device_memory_budget = budget_bytes + if self.engine.device_memory_budget != budget_bytes: + logger.error(f"Failed to set weight streaming budget to {budget_bytes}") + budget_bytes = self.engine.device_memory_budget + if self.get_streamable_device_memory_budget() == budget_bytes: + logger.warning("Weight streaming is disabled") + + return budget_bytes + def setup_engine(self) -> None: """ Setup engine for a module which has deferred engine setup. diff --git a/docs/_modules/torch_tensorrt/fx/fx2trt.html b/docs/_modules/torch_tensorrt/fx/fx2trt.html index 1cf8e03350..f05ef2725b 100644 --- a/docs/_modules/torch_tensorrt/fx/fx2trt.html +++ b/docs/_modules/torch_tensorrt/fx/fx2trt.html @@ -9,7 +9,7 @@ - torch_tensorrt.fx.fx2trt — Torch-TensorRT v2.6.0.dev0+b6ed1c5 documentation + torch_tensorrt.fx.fx2trt — Torch-TensorRT v2.6.0.dev0+92bf700 documentation @@ -272,7 +272,7 @@
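The C++-runtime wrapper exposes the same module-level API, so a hedged sketch looks identical (trt_module here is a TorchTensorRTModule):

    # A negative budget falls back to the full streamable size which, per the
    # warning above, effectively disables weight streaming.
    trt_module.set_device_memory_budget(-1)

    # Re-enable streaming with the automatically determined budget.
    trt_module.set_device_memory_budget(
        trt_module.get_automatic_device_memory_budget()
    )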
      - v2.6.0.dev0+b6ed1c5 + v2.6.0.dev0+92bf700
      @@ -327,6 +327,7 @@
    • Overloading Torch-TensorRT Converters with Custom Converters
    • Using Custom Kernels within TensorRT Engines with Torch-TensorRT
    • Mutable Torch TensorRT Module
    • +
    • Weight Streaming

    Dynamo Frontend


      diff --git a/docs/_sources/index.rst.txt b/docs/_sources/index.rst.txt index b4ede94404..5d88c8ecae 100644 --- a/docs/_sources/index.rst.txt +++ b/docs/_sources/index.rst.txt @@ -66,6 +66,7 @@ Tutorials * :ref:`converter_overloading` * :ref:`custom_kernel_plugins` * :ref:`mutable_torchtrt_module_example` +* :ref:`weight_streaming_example` .. toctree:: :caption: Tutorials @@ -82,6 +83,7 @@ Tutorials tutorials/_rendered_examples/dynamo/converter_overloading tutorials/_rendered_examples/dynamo/custom_kernel_plugins tutorials/_rendered_examples/dynamo/mutable_torchtrt_module_example + tutorials/_rendered_examples/dynamo/weight_streaming_example Dynamo Frontend ---------------- diff --git a/docs/_sources/tutorials/_rendered_examples/dynamo/index.rst.txt b/docs/_sources/tutorials/_rendered_examples/dynamo/index.rst.txt index cb28b2b62b..d3f55ac92c 100644 --- a/docs/_sources/tutorials/_rendered_examples/dynamo/index.rst.txt +++ b/docs/_sources/tutorials/_rendered_examples/dynamo/index.rst.txt @@ -217,6 +217,23 @@ Model Zoo +.. raw:: html + +
      + +.. only:: html + + .. image:: /tutorials/_rendered_examples/dynamo/images/thumb/sphx_glr_weight_streaming_example_thumb.png + :alt: + + :ref:`sphx_glr_tutorials__rendered_examples_dynamo_weight_streaming_example.py` + +.. raw:: html + +
      Weight Streaming
      +
      + + .. raw:: html
      @@ -287,6 +304,7 @@ Model Zoo /tutorials/_rendered_examples/dynamo/torch_export_gpt2 /tutorials/_rendered_examples/dynamo/torch_export_llama2 /tutorials/_rendered_examples/dynamo/converter_overloading + /tutorials/_rendered_examples/dynamo/weight_streaming_example /tutorials/_rendered_examples/dynamo/vgg16_ptq /tutorials/_rendered_examples/dynamo/engine_caching_example /tutorials/_rendered_examples/dynamo/custom_kernel_plugins diff --git a/docs/_sources/tutorials/_rendered_examples/dynamo/weight_streaming_example.rst.txt b/docs/_sources/tutorials/_rendered_examples/dynamo/weight_streaming_example.rst.txt new file mode 100644 index 0000000000..40c8eb0686 --- /dev/null +++ b/docs/_sources/tutorials/_rendered_examples/dynamo/weight_streaming_example.rst.txt @@ -0,0 +1,249 @@ + +.. DO NOT EDIT. +.. THIS FILE WAS AUTOMATICALLY GENERATED BY SPHINX-GALLERY. +.. TO MAKE CHANGES, EDIT THE SOURCE PYTHON FILE: +.. "tutorials/_rendered_examples/dynamo/weight_streaming_example.py" +.. LINE NUMBERS ARE GIVEN BELOW. + +.. only:: html + + .. note:: + :class: sphx-glr-download-link-note + + :ref:`Go to the end ` + to download the full example code + +.. rst-class:: sphx-glr-example-title + +.. _sphx_glr_tutorials__rendered_examples_dynamo_weight_streaming_example.py: + + +.. _weight_streaming_example: + +Weight Streaming +======================= + +Weight streaming in TensorRT is a powerful feature designed to overcome GPU memory limitations +when working with large models. It enables running models larger than available GPU memory +by streaming weight data from host (CPU) memory to GPU memory during inference. + +Streaming larger amounts of memory will likely result in lower performance. But if +streaming weights allows the user to run larger batch sizes and it can lead to higher throughput. +This increased throughput can sometimes outweigh the slowdown caused by streaming weights. +The optimal amount of memory to stream varies depending on the specific model and hardware. +Experimenting with different memory limits can help find the best balance between streaming +overhead and batch size benefits. + +This example uses a pre-trained Llama-2 model and show how to use weight streaming feature with +Torch-TensorRT. + 1. compile option - build trt engine with weight streaming feature + 2. runtime api - weight streaming budget control by context manager + +.. GENERATED FROM PYTHON SOURCE LINES 25-27 + +Imports and Model Definition +---------------------------------- + +.. GENERATED FROM PYTHON SOURCE LINES 27-85 + +.. code-block:: python + + + import copy + import timeit + + import numpy as np + import torch + import torch_tensorrt + from transformers import AutoModelForCausalLM + from utils import export_llm + + + def time_generate(model, inputs, output_seq_length, iterations=10): + """ + Measure the time for generating a sentence over certain number of iterations + """ + # We only support single input (B x seq_len) for LLMs now + input_seq = inputs[0] + with torch.no_grad(): + timings = [] + for _ in range(iterations): + start_time = timeit.default_timer() + inputs_copy = copy.copy(input_seq) + # Greedy decoding of the model. This generates up to max_tokens. 
+ while inputs_copy.shape[1] <= output_seq_length: + outputs = model(inputs_copy) + logits = outputs.logits + next_token_logits = logits[:, -1, :] + next_tokens = torch.argmax(next_token_logits, dim=-1) + inputs_copy = torch.cat([inputs_copy, next_tokens[:, None]], dim=-1) + torch.cuda.synchronize() + end_time = timeit.default_timer() + timings.append(end_time - start_time) + + times = np.array(timings) + time_mean_ms = np.mean(times) * 1000 + + return time_mean_ms + + + # Load the LLaMA-2 model + DEVICE = torch.device("cuda:0") + llama_path = "meta-llama/Llama-2-7b-chat-hf" + with torch.no_grad(): + model = AutoModelForCausalLM.from_pretrained( + llama_path, use_cache=False, attn_implementation="eager" + ).eval() + + # Set input and output sequence lengths + isl = 128 + osl = 256 + + # Create random input tensors + input_tensors = [torch.randint(0, 5, (1, isl), dtype=torch.int64).cuda()] + # Convert the model to half precision (FP16) + model = model.half() + # Exports the LLM model into an ExportedProgram with dynamic shapes. + llama2_ep = export_llm(model, input_tensors[0], max_seq_len=osl) + + +.. GENERATED FROM PYTHON SOURCE LINES 86-93 + +Compiler option +---------------------------------- + +enable_weight_streaming=True option and use_explicit_typing=True are required to build +the engine with weight streaming feature. use_explicit_typing=True option creates a +`strongly typed network `_ and only float32 precision is allowed in enabled_precisions option + + +.. GENERATED FROM PYTHON SOURCE LINES 93-108 + +.. code-block:: python + + + # Create a TensorRT-compiled model + trt_model = torch_tensorrt.dynamo.compile( + llama2_ep, + inputs=input_tensors, + enabled_precisions={torch.float32}, + truncate_double=True, + device=DEVICE, + use_explicit_typing=True, + enable_weight_streaming=True, + ) + + # Warm up for 3 iterations + _ = time_generate(trt_model, input_tensors, osl, 3) + + +.. GENERATED FROM PYTHON SOURCE LINES 109-115 + +Running with automatic budget size +---------------------------------- + +Once you specify the enable_weight_streaming compile option, automatic budget size is configured. +This automatic size may not always provide the optimal solution because the automatically determined +budget lacks insight into the user's specific memory constraints and usage patterns + +.. GENERATED FROM PYTHON SOURCE LINES 115-128 + +.. code-block:: python + + + # Weight streaming context to get current weight budget information + weight_streaming_ctx = torch_tensorrt.runtime.weight_streaming(trt_model) + # Measure the mean latency of the model with weight streaming + mean_latency = time_generate(trt_model, input_tensors, osl, 1) + # Calculate the percentage of current weight budget used + weight_budget_pct = ( + weight_streaming_ctx.device_budget / weight_streaming_ctx.total_device_budget * 100 + ) + print( + f"Set weight streaming budget as {weight_budget_pct}%. {weight_streaming_ctx.device_budget} bytes out of {weight_streaming_ctx.total_device_budget}. mean latency = {mean_latency} ms" + ) + + +.. GENERATED FROM PYTHON SOURCE LINES 129-137 + +Running with weight streaming context manager +---------------------------------- + +Weight streaming budget can be limited by using weight streaming context manager. +The permissible range for the budget size is from 0 to ctx.total_device_budget. +0 means maximum memory savings occur by using minimum amounts of memory. Value +equal to ctx.total_device_budget will disable weight streaming. 
+If multiple trt engines are created, budgets are distributed proportionally + +.. GENERATED FROM PYTHON SOURCE LINES 137-175 + +.. code-block:: python + + + # Use a context manager for weight streaming + with torch_tensorrt.runtime.weight_streaming(trt_model) as weight_streaming_ctx: + # Get the total size of streamable weights in the engine + streamable_budget = weight_streaming_ctx.total_device_budget + + # Scenario 1: Automatic weight streaming budget + # Get the automatically determined weight streaming budget + requested_budget = weight_streaming_ctx.get_automatic_weight_streaming_budget() + # Set the device budget to the automatically determined value + weight_streaming_ctx.device_budget = requested_budget + # Measure the mean latency with automatic budget + mean_latency = time_generate(trt_model, input_tensors, osl, 1) + # Calculate the percentage of the weight budget used + weight_budget_pct = ( + weight_streaming_ctx.device_budget + / weight_streaming_ctx.total_device_budget + * 100 + ) + print( + f"Set auto weight streaming budget as {weight_budget_pct}%. {weight_streaming_ctx.device_budget} bytes out of {weight_streaming_ctx.total_device_budget}. mean latency = {mean_latency} ms" + ) + + # Scenario 2: Manual 10% weight streaming budget + # Set the budget to 10% of the total streamable weights + requested_budget = int(streamable_budget * 0.1) + weight_streaming_ctx.device_budget = requested_budget + # Measure the mean latency with 10% budget + mean_latency = time_generate(trt_model, input_tensors, osl, 1) + # Calculate the percentage of the weight budget used + weight_budget_pct = ( + weight_streaming_ctx.device_budget + / weight_streaming_ctx.total_device_budget + * 100 + ) + print( + f"Set weight streaming budget as {weight_budget_pct}%. {weight_streaming_ctx.device_budget} bytes out of {weight_streaming_ctx.total_device_budget}. mean latency = {mean_latency} ms" + ) + + +.. rst-class:: sphx-glr-timing + + **Total running time of the script:** ( 0 minutes 0.000 seconds) + + +.. _sphx_glr_download_tutorials__rendered_examples_dynamo_weight_streaming_example.py: + +.. only:: html + + .. container:: sphx-glr-footer sphx-glr-footer-example + + + + + .. container:: sphx-glr-download sphx-glr-download-python + + :download:`Download Python source code: weight_streaming_example.py ` + + .. container:: sphx-glr-download sphx-glr-download-jupyter + + :download:`Download Jupyter notebook: weight_streaming_example.ipynb ` + + +.. only:: html + + .. rst-class:: sphx-glr-signature + + `Gallery generated by Sphinx-Gallery `_ diff --git a/docs/_sources/tutorials/_rendered_examples/index.rst.txt b/docs/_sources/tutorials/_rendered_examples/index.rst.txt index 0acd41a003..6a994d6a40 100644 --- a/docs/_sources/tutorials/_rendered_examples/index.rst.txt +++ b/docs/_sources/tutorials/_rendered_examples/index.rst.txt @@ -229,6 +229,23 @@ Model Zoo
      +.. raw:: html + +
      + +.. only:: html + + .. image:: /tutorials/_rendered_examples/dynamo/images/thumb/sphx_glr_weight_streaming_example_thumb.png + :alt: + + :ref:`sphx_glr_tutorials__rendered_examples_dynamo_weight_streaming_example.py` + +.. raw:: html + +
      Weight Streaming
      +
      + + .. raw:: html
      diff --git a/docs/_static/documentation_options.js b/docs/_static/documentation_options.js index c1161d20b7..37712f63f2 100644 --- a/docs/_static/documentation_options.js +++ b/docs/_static/documentation_options.js @@ -1,6 +1,6 @@ var DOCUMENTATION_OPTIONS = { URL_ROOT: document.getElementById("documentation_options").getAttribute('data-url_root'), - VERSION: 'v2.6.0.dev0+b6ed1c5', + VERSION: 'v2.6.0.dev0+92bf700', LANGUAGE: 'en', COLLAPSE_INDEX: false, BUILDER: 'html', diff --git a/docs/cli/torchtrtc.html b/docs/cli/torchtrtc.html index 2b6944c8de..abc7bd0ddf 100644 --- a/docs/cli/torchtrtc.html +++ b/docs/cli/torchtrtc.html @@ -10,7 +10,7 @@ - torchtrtc — Torch-TensorRT v2.6.0.dev0+b6ed1c5 documentation + torchtrtc — Torch-TensorRT v2.6.0.dev0+92bf700 documentation @@ -275,7 +275,7 @@
      - v2.6.0.dev0+b6ed1c5 + v2.6.0.dev0+92bf700
      @@ -330,6 +330,7 @@
    • Overloading Torch-TensorRT Converters with Custom Converters
    • Using Custom Kernels within TensorRT Engines with Torch-TensorRT
    • Mutable Torch TensorRT Module
    • +
    • Weight Streaming

    Dynamo Frontend


      diff --git a/docs/dynamo/torch_compile.html b/docs/dynamo/torch_compile.html index e795e36837..d64c94984f 100644 --- a/docs/dynamo/torch_compile.html +++ b/docs/dynamo/torch_compile.html @@ -10,7 +10,7 @@ - TensorRT Backend for torch.compile — Torch-TensorRT v2.6.0.dev0+b6ed1c5 documentation + TensorRT Backend for torch.compile — Torch-TensorRT v2.6.0.dev0+92bf700 documentation @@ -40,7 +40,7 @@ - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

      Weight Streaming

      +

      Weight streaming in TensorRT is a powerful feature designed to overcome GPU memory limitations +when working with large models. It enables running models larger than available GPU memory +by streaming weight data from host (CPU) memory to GPU memory during inference.

      +

Streaming larger amounts of memory will likely result in lower performance. However, if +streaming weights allows the user to run larger batch sizes, it can lead to higher throughput. +This increased throughput can sometimes outweigh the slowdown caused by streaming weights. +The optimal amount of memory to stream varies depending on the specific model and hardware. +Experimenting with different memory limits can help find the best balance between streaming +overhead and batch size benefits.

      +

This example uses a pre-trained Llama-2 model and shows how to use the weight streaming feature with +Torch-TensorRT.

      +
      +
        +
1. compile option - build trt engine with weight streaming feature
2. runtime api - weight streaming budget control by context manager
      +
      +
      +

      Imports and Model Definition

      +
      import copy
      +import timeit
      +
      +import numpy as np
      +import torch
      +import torch_tensorrt
      +from transformers import AutoModelForCausalLM
      +from utils import export_llm
      +
      +
      +def time_generate(model, inputs, output_seq_length, iterations=10):
      +    """
      +    Measure the time for generating a sentence over certain number of iterations
      +    """
      +    # We only support single input (B x seq_len) for LLMs now
      +    input_seq = inputs[0]
      +    with torch.no_grad():
      +        timings = []
      +        for _ in range(iterations):
      +            start_time = timeit.default_timer()
      +            inputs_copy = copy.copy(input_seq)
      +            # Greedy decoding of the model. This generates up to max_tokens.
      +            while inputs_copy.shape[1] <= output_seq_length:
      +                outputs = model(inputs_copy)
      +                logits = outputs.logits
      +                next_token_logits = logits[:, -1, :]
      +                next_tokens = torch.argmax(next_token_logits, dim=-1)
      +                inputs_copy = torch.cat([inputs_copy, next_tokens[:, None]], dim=-1)
      +            torch.cuda.synchronize()
      +            end_time = timeit.default_timer()
      +            timings.append(end_time - start_time)
      +
      +    times = np.array(timings)
      +    time_mean_ms = np.mean(times) * 1000
      +
      +    return time_mean_ms
      +
      +
      +# Load the LLaMA-2 model
      +DEVICE = torch.device("cuda:0")
      +llama_path = "meta-llama/Llama-2-7b-chat-hf"
      +with torch.no_grad():
      +    model = AutoModelForCausalLM.from_pretrained(
      +        llama_path, use_cache=False, attn_implementation="eager"
      +    ).eval()
      +
      +# Set input and output sequence lengths
      +isl = 128
      +osl = 256
      +
      +# Create random input tensors
      +input_tensors = [torch.randint(0, 5, (1, isl), dtype=torch.int64).cuda()]
      +# Convert the model to half precision (FP16)
      +model = model.half()
      +# Exports the LLM model into an ExportedProgram with dynamic shapes.
      +llama2_ep = export_llm(model, input_tensors[0], max_seq_len=osl)
      +
      +
      +
      +
      +

      Compiler option

      +

The enable_weight_streaming=True and use_explicit_typing=True options are required to build +the engine with the weight streaming feature. The use_explicit_typing=True option creates a +strongly typed network, and only float32 precision is allowed in the enabled_precisions option.

      +
      # Create a TensorRT-compiled model
      +trt_model = torch_tensorrt.dynamo.compile(
      +    llama2_ep,
      +    inputs=input_tensors,
      +    enabled_precisions={torch.float32},
      +    truncate_double=True,
      +    device=DEVICE,
      +    use_explicit_typing=True,
      +    enable_weight_streaming=True,
      +)
      +
      +# Warm up for 3 iterations
      +_ = time_generate(trt_model, input_tensors, osl, 3)
      +
      +
      +
      +
      +

      Running with automatic budget size

      +

Once you specify the enable_weight_streaming compile option, an automatic budget size is configured. +This automatic size may not always provide the optimal solution because the automatically determined +budget lacks insight into the user's specific memory constraints and usage patterns.

      +
      # Weight streaming context to get current weight budget information
      +weight_streaming_ctx = torch_tensorrt.runtime.weight_streaming(trt_model)
      +# Measure the mean latency of the model with weight streaming
      +mean_latency = time_generate(trt_model, input_tensors, osl, 1)
      +# Calculate the percentage of current weight budget used
      +weight_budget_pct = (
      +    weight_streaming_ctx.device_budget / weight_streaming_ctx.total_device_budget * 100
      +)
      +print(
      +    f"Set weight streaming budget as {weight_budget_pct}%. {weight_streaming_ctx.device_budget} bytes out of {weight_streaming_ctx.total_device_budget}. mean latency = {mean_latency} ms"
      +)
      +
      +
      +
      +
      +

      Running with weight streaming context manager

      +

The weight streaming budget can be limited by using the weight streaming context manager. +The permissible range for the budget size is from 0 to ctx.total_device_budget. +A value of 0 yields maximum memory savings by keeping the minimum amount of weights on the device, while a value +equal to ctx.total_device_budget disables weight streaming. +If multiple TRT engines are created, the budget is distributed proportionally among them.

      +
      # Use a context manager for weight streaming
      +with torch_tensorrt.runtime.weight_streaming(trt_model) as weight_streaming_ctx:
      +    # Get the total size of streamable weights in the engine
      +    streamable_budget = weight_streaming_ctx.total_device_budget
      +
      +    # Scenario 1: Automatic weight streaming budget
      +    # Get the automatically determined weight streaming budget
      +    requested_budget = weight_streaming_ctx.get_automatic_weight_streaming_budget()
      +    # Set the device budget to the automatically determined value
      +    weight_streaming_ctx.device_budget = requested_budget
      +    # Measure the mean latency with automatic budget
      +    mean_latency = time_generate(trt_model, input_tensors, osl, 1)
      +    # Calculate the percentage of the weight budget used
      +    weight_budget_pct = (
      +        weight_streaming_ctx.device_budget
      +        / weight_streaming_ctx.total_device_budget
      +        * 100
      +    )
      +    print(
      +        f"Set auto weight streaming budget as {weight_budget_pct}%. {weight_streaming_ctx.device_budget} bytes out of {weight_streaming_ctx.total_device_budget}. mean latency = {mean_latency} ms"
      +    )
      +
      +    # Scenario 2: Manual 10% weight streaming budget
      +    # Set the budget to 10% of the total streamable weights
      +    requested_budget = int(streamable_budget * 0.1)
      +    weight_streaming_ctx.device_budget = requested_budget
      +    # Measure the mean latency with 10% budget
      +    mean_latency = time_generate(trt_model, input_tensors, osl, 1)
      +    # Calculate the percentage of the weight budget used
      +    weight_budget_pct = (
      +        weight_streaming_ctx.device_budget
      +        / weight_streaming_ctx.total_device_budget
      +        * 100
      +    )
      +    print(
      +        f"Set weight streaming budget as {weight_budget_pct}%. {weight_streaming_ctx.device_budget} bytes out of {weight_streaming_ctx.total_device_budget}. mean latency = {mean_latency} ms"
      +    )
      +
      +
      +

      Total running time of the script: ( 0 minutes 0.000 seconds)

      + +

      Gallery generated by Sphinx-Gallery

      + + + + + + + + \ No newline at end of file diff --git a/docs/tutorials/_rendered_examples/index.html b/docs/tutorials/_rendered_examples/index.html index 9362f6cad7..3dc6a3b133 100644 --- a/docs/tutorials/_rendered_examples/index.html +++ b/docs/tutorials/_rendered_examples/index.html @@ -10,7 +10,7 @@ - Torch-TensorRT Tutorials — Torch-TensorRT v2.6.0.dev0+b6ed1c5 documentation + Torch-TensorRT Tutorials — Torch-TensorRT v2.6.0.dev0+92bf700 documentation @@ -273,7 +273,7 @@
      - v2.6.0.dev0+b6ed1c5 + v2.6.0.dev0+92bf700
      @@ -328,6 +328,7 @@
    • Overloading Torch-TensorRT Converters with Custom Converters
    • Using Custom Kernels within TensorRT Engines with Torch-TensorRT
    • Mutable Torch TensorRT Module
    • +
    • Weight Streaming

    Dynamo Frontend
