From 0899f3494bb751a15d99522a846e87fe4c032433 Mon Sep 17 00:00:00 2001
From: Svetlana Karslioglu
Date: Mon, 30 Sep 2024 11:25:07 -0700
Subject: [PATCH] Update torchao to 0.4.0 and fix GPU quantization tutorial

---
 .ci/docker/requirements.txt                            |  2 +-
 prototype_source/gpu_quantization_torchao_tutorial.py  | 10 +++++-----
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/.ci/docker/requirements.txt b/.ci/docker/requirements.txt
index 2384fb1b00..14104155b7 100644
--- a/.ci/docker/requirements.txt
+++ b/.ci/docker/requirements.txt
@@ -68,5 +68,5 @@ iopath
 pygame==2.6.0
 pycocotools
 semilearn==0.3.2
-torchao==0.0.3
+torchao==0.4.0
 segment_anything==1.0
diff --git a/prototype_source/gpu_quantization_torchao_tutorial.py b/prototype_source/gpu_quantization_torchao_tutorial.py
index 4050a88e56..8767f4aca6 100644
--- a/prototype_source/gpu_quantization_torchao_tutorial.py
+++ b/prototype_source/gpu_quantization_torchao_tutorial.py
@@ -44,7 +44,7 @@
 #
 
 import torch
-from torchao.quantization import change_linear_weights_to_int8_dqtensors
+from torchao.quantization.quant_api import quantize_, int8_dynamic_activation_int8_weight
 from segment_anything import sam_model_registry
 from torch.utils.benchmark import Timer
 
@@ -156,9 +156,9 @@ def get_sam_model(only_one_block=False, batchsize=1):
 # in memory bound situations where the benefit comes from loading less
 # weight data, rather than doing less computation. The torchao APIs:
 #
-# ``change_linear_weights_to_int8_dqtensors``,
-# ``change_linear_weights_to_int8_woqtensors`` or
-# ``change_linear_weights_to_int4_woqtensors``
+# ``int8_dynamic_activation_int8_weight()``,
+# ``int8_weight_only()`` or
+# ``int4_weight_only()``
 #
 # can be used to easily apply the desired quantization technique and then
 # once the model is compiled with ``torch.compile`` with ``max-autotune``, quantization is
@@ -185,7 +185,7 @@ def get_sam_model(only_one_block=False, batchsize=1):
 model, image = get_sam_model(only_one_block, batchsize)
 model = model.to(torch.bfloat16)
 image = image.to(torch.bfloat16)
-change_linear_weights_to_int8_dqtensors(model)
+quantize_(model, int8_dynamic_activation_int8_weight())
 model_c = torch.compile(model, mode='max-autotune')
 quant_res = benchmark(model_c, image)
 print(f"bf16 compiled runtime of the quantized block is {quant_res['time']:0.2f}ms and peak memory {quant_res['memory']: 0.2f}GB")
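
Note (not part of the patch): a minimal sketch of how the torchao 0.4.0-style API adopted above is used. quantize_() modifies the model in place, replacing the weights of eligible nn.Linear modules with int8 dynamically quantized tensors, after which the model can be compiled as usual. The toy Sequential model, tensor shapes, and CUDA device below are illustrative assumptions; the tutorial itself quantizes a Segment Anything (SAM) block.

    import torch
    from torchao.quantization.quant_api import quantize_, int8_dynamic_activation_int8_weight

    # Stand-in model for illustration only; any model containing nn.Linear layers works.
    model = torch.nn.Sequential(
        torch.nn.Linear(1024, 1024),
        torch.nn.ReLU(),
        torch.nn.Linear(1024, 1024),
    ).to(torch.bfloat16).cuda()

    # torchao 0.0.3 style: change_linear_weights_to_int8_dqtensors(model)
    # torchao 0.4.0 style: pass the quantization method to quantize_, which mutates the model
    quantize_(model, int8_dynamic_activation_int8_weight())

    # Compile as in the tutorial so the quantized kernels are fused and autotuned
    model_c = torch.compile(model, mode="max-autotune")
    out = model_c(torch.randn(16, 1024, dtype=torch.bfloat16, device="cuda"))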