From 19ea412dcd1400a1ca6c4f8812d41408596e9836 Mon Sep 17 00:00:00 2001
From: Nikita Savelyev
Date: Mon, 21 Oct 2024 10:52:08 +0200
Subject: [PATCH] Remove experimental weight compression files, rename weight lowering dispatcher module

---
 .../quantization/compression_primitives.py | 328 ---------
 .../weight_lowering/dispatched_functions.py | 4 +-
 ...t_lowering_dispatcher.py => dispatcher.py} | 0
 .../weight_lowering/ov_backend.py | 2 +-
 .../weight_lowering/tensor_backend.py | 2 +-
 .../weight_compression/weight_lowering__.py | 635 ------------------
 run_weight_compression.py | 373 ----------
 weight_compression.py | 236 -------
 8 files changed, 4 insertions(+), 1576 deletions(-)
 delete mode 100644 nncf/openvino/quantization/compression_primitives.py
 rename nncf/quantization/algorithms/weight_compression/weight_lowering/{weight_lowering_dispatcher.py => dispatcher.py} (100%)
 delete mode 100644 nncf/quantization/algorithms/weight_compression/weight_lowering__.py
 delete mode 100644 run_weight_compression.py
 delete mode 100644 weight_compression.py

diff --git a/nncf/openvino/quantization/compression_primitives.py b/nncf/openvino/quantization/compression_primitives.py
deleted file mode 100644
index 2b0c4e95086..00000000000
--- a/nncf/openvino/quantization/compression_primitives.py
+++ /dev/null
@@ -1,328 +0,0 @@
-# Copyright (c) 2024 Intel Corporation
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-# http://www.apache.org/licenses/LICENSE-2.0
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import inspect -import os -from dataclasses import dataclass -from typing import List, Optional, Tuple - -import numpy as np -import openvino as ov -from openvino.runtime import opset13 as opset - -import nncf -from nncf import CompressWeightsMode -from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig - - -@dataclass -class PrimitiveParameters: - dynamic: bool = False - recompile: bool = False - release_memory: bool = True - share_outputs: bool = True - input_dtype: str = "fp32" - - -class CompressionPrimitiveCache: - _cache = {} - - -COMPRESSION_PRIMITIVE_CACHE = CompressionPrimitiveCache() - - -def clear_cache(): - COMPRESSION_PRIMITIVE_CACHE._cache = {} - - -def cache_results(func): - def wrapper(*args, **kwargs): - sig = inspect.signature(func) - new_kwargs = {name: arg for name, arg in zip(sig.parameters, args)} - new_kwargs.update(kwargs) - cache_key = (func.__name__, frozenset(new_kwargs.items())) - recompile = new_kwargs.get("params", PrimitiveParameters()).recompile - cache = COMPRESSION_PRIMITIVE_CACHE._cache - if not recompile and cache_key in cache: - return cache[cache_key] - result = func(cache, *args, **kwargs) - cache[cache_key] = result - return result - - return wrapper - - -@cache_results -def get_compress_weight_primitive( - config: WeightCompressionConfig, - weight_shape: Tuple, - scale_shape: Optional[Tuple] = None, - zero_point_shape: Optional[Tuple] = None, - reduction_axes: Optional[Tuple] = None, - params: Optional[PrimitiveParameters] = None, -): - if scale_shape is None and zero_point_shape is not None: - raise Exception("Zero point shape can only be provided if scale shape is provided.") - if (scale_shape is None) != (reduction_axes is not None): - raise Exception("Either one of scale_shape or reduction_axes must be provided at the same time.") - - if params is None: - params = PrimitiveParameters() - if config.mode in [CompressWeightsMode.INT4_ASYM, CompressWeightsMode.INT4_SYM]: - params.dynamic = False - - if params.dynamic: - weight_shape = (-1,) * len(weight_shape) - if scale_shape is not None: - scale_shape = (-1,) * (len(scale_shape) - 1) + (1,) - if zero_point_shape is not None: - zero_point_shape = (-1,) * (len(zero_point_shape) - 1) + (1,) - - return _build_compress_model( - config, - params, - weight_shape, - scale_shape, - zero_point_shape, - reduction_axes, - return_nodes=False, - ) - - -@cache_results -def get_compress_decompress_weight_primitive( - config: WeightCompressionConfig, - weight_shape: Tuple, - scale_shape: Optional[Tuple], - zero_point_shape: Optional[Tuple] = None, - params: Optional[PrimitiveParameters] = None, -): - if params is None: - params = PrimitiveParameters() - if config.mode in [CompressWeightsMode.INT4_ASYM, CompressWeightsMode.INT4_SYM]: - params.dynamic = False - - if params.dynamic: - weight_shape = (-1,) * len(weight_shape) - scale_shape = (-1,) * (len(scale_shape) - 1) + (1,) - if zero_point_shape is not None: - zero_point_shape = (-1,) * (len(zero_point_shape) - 1) + (1,) - - return _build_compress_decompress_model( - config, - params, - weight_shape, - scale_shape, - zero_point_shape, - ) - - -def _build_compress_decompress_model( - config: WeightCompressionConfig, - params: PrimitiveParameters, - weight_shape: Tuple, - scale_shape: Tuple, - zero_point_shape: Optional[Tuple] = None, -): - ov_parameters, ov_results = _build_compress_model( - config, params, weight_shape, scale_shape, zero_point_shape, reduction_axes=None, return_nodes=True - ) - return 
_get_compress_decompress_model( - config, - params, - ov_parameters, - ov_results, - ) - - -def _build_compress_model( - config: WeightCompressionConfig, - params: PrimitiveParameters, - weight_shape: Tuple, - scale_shape: Optional[Tuple] = None, - zero_point_shape: Optional[Tuple] = None, - reduction_axes: Optional[Tuple] = None, - return_nodes: bool = False, -): - if params.input_dtype == "fp32": - input_dtype = ov.Type.f32 - elif params.input_dtype == "fp16": - input_dtype = ov.Type.f16 - elif params.input_dtype == "bf16": - input_dtype = ov.Type.bf16 - else: - raise Exception - weight = opset.parameter(weight_shape, name="w", dtype=input_dtype) - ov_parameters = [weight] - - if scale_shape is not None: - # Compute only the compressed weight - - scale = opset.parameter(scale_shape, name="s", dtype=ov.Type.f32) - ov_parameters.append(scale) - - zero_point = None - if config.mode in [CompressWeightsMode.INT8_ASYM, config.mode.INT4_ASYM]: - zero_point = opset.parameter(zero_point_shape, name="zp", dtype=ov.Type.i32) - ov_parameters.append(zero_point) - else: - # Compute compressed weight, scale and, possibly, zero point - - group_size = config.group_size - if group_size != -1: - if isinstance(reduction_axes, tuple) and len(reduction_axes) == 1: - reduction_axes = reduction_axes[0] - if not isinstance(reduction_axes, int): - raise NotImplementedError( - f"Group-wise quantization expects a single reduction axis, but given: {reduction_axes}." - ) - channel_size = weight.shape[reduction_axes] - if channel_size % group_size != 0: - raise nncf.ValidationError( - f"Channel size {channel_size} should be divisible by size of group {group_size}" - ) - - num_groups_per_channel = channel_size // group_size - shape = list(weight.shape) # [a1, r, a2] - "r" refers to number of channels along reduction axis - shape[reduction_axes : reduction_axes + 1] = (num_groups_per_channel, group_size) - weight = opset.reshape(weight, shape, special_zero=False) - reduction_axes += 1 - - mode = config.mode - num_bits = config.num_bits - eps = np.finfo(np.float32).eps - if mode in [CompressWeightsMode.INT8_ASYM, CompressWeightsMode.INT4_ASYM]: - min_values = opset.reduce_min( - weight, reduction_axes=reduction_axes, keep_dims=True - ) # [a1, r, a2] -> [a1, 1, a2] - max_values = opset.reduce_max( - weight, reduction_axes=reduction_axes, keep_dims=True - ) # [a1, r, a2] -> [a1, 1, a2] - min_values, max_values = opset.convert(min_values, ov.Type.f32), opset.convert(max_values, ov.Type.f32) - - level_low = 0 - level_high = 2**num_bits - 1 - levels = level_high - level_low + 1 - scale = (max_values - min_values) / opset.constant(levels - 1, ov.Type.f32) - scale = opset.select(opset.abs(scale) < eps, eps, scale) - - zero_point = opset.constant(level_low, ov.Type.f32) - opset.round(min_values / scale) - zero_point = opset.clamp(zero_point, level_low, level_high) - else: - zero_point = None - level_high = opset.constant(2 ** (num_bits - 1), ov.Type.f32) - - w_abs_min = opset.abs(opset.reduce_min(weight, reduction_axes=reduction_axes, keep_dims=True)) - w_max = opset.reduce_max(weight, reduction_axes=reduction_axes, keep_dims=True) - w_abs_min, w_max = opset.convert(w_abs_min, ov.Type.f32), opset.convert(w_max, ov.Type.f32) - - scale = opset.select(w_abs_min >= w_max, w_abs_min, opset.constant(0, ov.Type.f32) - w_max) - scale /= level_high - scale = opset.select(opset.abs(scale) < eps, eps, scale) - - return _get_compress_model( - config, - params, - ov_parameters, - weight, - scale, - zero_point, - return_nodes, - ) - - -def 
_get_compress_model( - config: WeightCompressionConfig, - params: PrimitiveParameters, - ov_parameters: List[ov._pyopenvino.op.Parameter], - w: ov.runtime.Node, - s: ov.runtime.Node, - zp: Optional[ov.runtime.Node] = None, - return_nodes: Optional[bool] = False, -): - if w.get_element_type() != ov.Type.f32: - w = opset.convert(w, ov.Type.f32) - - compressed_w = w / s - - num_bits = config.num_bits - if config.mode in [CompressWeightsMode.INT8_ASYM, config.mode.INT4_ASYM]: - # dtype = ov.Type.u8 - dtype = ov.Type.u8 if config.mode == CompressWeightsMode.INT8_ASYM else ov.Type.u4 - level_low = 0 - level_high = 2**num_bits - 1 - compressed_w += opset.convert(zp, ov.Type.f32) - elif config.mode in [CompressWeightsMode.INT8_SYM, config.mode.INT4_SYM]: - # dtype = ov.Type.i8 - dtype = ov.Type.i8 if config.mode == CompressWeightsMode.INT8_SYM else ov.Type.u4 - level_low = -(2 ** (num_bits - 1)) - level_high = 2 ** (num_bits - 1) - 1 - else: - raise Exception - - compressed_w = opset.clamp(opset.round(compressed_w), level_low, level_high) - compressed_w = opset.convert(compressed_w, dtype, name="compressed_weights") - - ov_results = [compressed_w] - if len(ov_parameters) == 1: - ov_results.append(s) - if zp is not None: - ov_results.append(opset.convert(zp, compressed_w.get_element_type())) - - if return_nodes: - return ov_parameters, ov_results - - model = ov.Model(ov_results, ov_parameters) - compiled_model = ov.compile_model(model, device_name="CPU") - - def infer(inputs): - infer_request = compiled_model.create_infer_request() - infer_request.infer(inputs, share_outputs=params.share_outputs) - outputs = [infer_request.get_output_tensor(i) for i in range(len(infer_request.results))] - if params.release_memory: - compiled_model.release_memory() - return outputs - - return infer - - -def _get_compress_decompress_model( - config: WeightCompressionConfig, - params: PrimitiveParameters, - parameters: List[ov._pyopenvino.op.Parameter], - results: List[ov._pyopenvino.Node], -): - if config.mode in [CompressWeightsMode.INT8_ASYM, config.mode.INT4_ASYM]: - if len(results) == 1: - compressed_w = results[0] - s, zp = parameters[1], parameters[2] - else: - compressed_w, s, zp = results - decompressed_w = (compressed_w - zp) * s - else: - if len(results) == 1: - compressed_w = results[0] - s = parameters[1] - else: - compressed_w, s = results - decompressed_w = compressed_w * s - - model = ov.Model([decompressed_w], parameters) - compiled_model = ov.compile_model(model, device_name="CPU") - - def infer(inputs): - infer_request = compiled_model.create_infer_request() - infer_request.infer(inputs, share_outputs=params.share_outputs) - outputs = [infer_request.get_output_tensor(i) for i in range(len(infer_request.results))] - if params.release_memory: - compiled_model.release_memory() - return outputs - - return infer diff --git a/nncf/quantization/algorithms/weight_compression/weight_lowering/dispatched_functions.py b/nncf/quantization/algorithms/weight_compression/weight_lowering/dispatched_functions.py index 7dbe02a0108..77749556788 100644 --- a/nncf/quantization/algorithms/weight_compression/weight_lowering/dispatched_functions.py +++ b/nncf/quantization/algorithms/weight_compression/weight_lowering/dispatched_functions.py @@ -13,8 +13,8 @@ from nncf.tensor import Tensor from ..config import WeightCompressionConfig -from .weight_lowering_dispatcher import ov_available_backend_selector -from .weight_lowering_dispatcher import weight_lowering_dispatcher +from .dispatcher import ov_available_backend_selector 
+from .dispatcher import weight_lowering_dispatcher @weight_lowering_dispatcher(ov_available_backend_selector) diff --git a/nncf/quantization/algorithms/weight_compression/weight_lowering/weight_lowering_dispatcher.py b/nncf/quantization/algorithms/weight_compression/weight_lowering/dispatcher.py similarity index 100% rename from nncf/quantization/algorithms/weight_compression/weight_lowering/weight_lowering_dispatcher.py rename to nncf/quantization/algorithms/weight_compression/weight_lowering/dispatcher.py diff --git a/nncf/quantization/algorithms/weight_compression/weight_lowering/ov_backend.py b/nncf/quantization/algorithms/weight_compression/weight_lowering/ov_backend.py index ac27fc60a38..d5a71a03394 100644 --- a/nncf/quantization/algorithms/weight_compression/weight_lowering/ov_backend.py +++ b/nncf/quantization/algorithms/weight_compression/weight_lowering/ov_backend.py @@ -36,7 +36,7 @@ from .dispatched_functions import calculate_quantized_dequantized_weight from .dispatched_functions import do_int_quantization -from .weight_lowering_dispatcher import WeightLoweringBackend +from .dispatcher import WeightLoweringBackend @dataclass diff --git a/nncf/quantization/algorithms/weight_compression/weight_lowering/tensor_backend.py b/nncf/quantization/algorithms/weight_compression/weight_lowering/tensor_backend.py index 47889dddb14..95833f6b37f 100644 --- a/nncf/quantization/algorithms/weight_compression/weight_lowering/tensor_backend.py +++ b/nncf/quantization/algorithms/weight_compression/weight_lowering/tensor_backend.py @@ -28,7 +28,7 @@ from .common import reshape_weight_for_grouped_quantization from .dispatched_functions import calculate_quantized_dequantized_weight from .dispatched_functions import do_int_quantization -from .weight_lowering_dispatcher import WeightLoweringBackend +from .dispatcher import WeightLoweringBackend ReductionAxes = Tuple[int, ...] diff --git a/nncf/quantization/algorithms/weight_compression/weight_lowering__.py b/nncf/quantization/algorithms/weight_compression/weight_lowering__.py deleted file mode 100644 index 795e5308046..00000000000 --- a/nncf/quantization/algorithms/weight_compression/weight_lowering__.py +++ /dev/null @@ -1,635 +0,0 @@ -# Copyright (c) 2024 Intel Corporation -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import logging -import os -from dataclasses import dataclass -from typing import Optional, Tuple - -import numpy as np - -import nncf -from nncf.common.logging.logger import log_once -from nncf.parameters import CompressWeightsMode -from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig -from nncf.quantization.fake_quantize import calculate_scale_zero_point -from nncf.tensor import Tensor -from nncf.tensor import functions as fns -from nncf.tensor.definitions import TensorBackend -from nncf.tensor.definitions import TensorDataType -from nncf.utils import is_openvino_available - -ReductionAxes = Tuple[int, ...] 
- -NF4_QUANTILES = np.array( - [ - -1.0, - -0.6961928009986877, - -0.5250730514526367, - -0.39491748809814453, - -0.28444138169288635, - -0.18477343022823334, - -0.09105003625154495, - 0.0, - 0.07958029955625534, - 0.16093020141124725, - 0.24611230194568634, - 0.33791524171829224, - 0.44070982933044434, - 0.5626170039176941, - 0.7229568362236023, - 1.0, - ], - dtype=np.float32, -) - -CENTER_OF_NF4_QUANTILES = (NF4_QUANTILES[1:] + NF4_QUANTILES[:-1]) / 2 - - -@dataclass -class CompressedWeight: - """ - Compressed weight and decompression parameters. - - :param tensor: The tensor with compressed weight. - :param scale: The decompression scale, in practice it is dequantization scale for the INT quantization. - :param zero_point: The zero-point, it is the value of the compression type corresponding to the value 0 - in the non-compression realm. Applicable for INT quantization. - """ - - tensor: Tensor - scale: Tensor - zero_point: Optional[Tensor] = None - - -def reshape_weight_for_grouped_quantization( - weight: Tensor, reduction_axes: ReductionAxes, group_size: int -) -> Tuple[Tensor, int]: - """ - Reshapes weight for group-wise quantization and return a reduction axis for collecting statistics per group - dimension. Having a transposed weight with shapes [c_out, c_in] and group size = 128, shape of reshaped weight is - [c_out, c_in // 128, 128], reduction axis = 1 and the returned reduction axis = 2. - - :param weight: Weight array to compress. - :param reduction_axes: Axes, along which to reduce (collect) different statistics (e.g. min, max). - :param group_size: Number of weights (e.g. 128) in the channel dimension that share quantization parameters (scale). - :return: reshaped weight and new reduction axis. - """ - assert group_size != -1 - if isinstance(reduction_axes, tuple) and len(reduction_axes) == 1: - reduction_axes = reduction_axes[0] - if not isinstance(reduction_axes, int): - raise NotImplementedError( - f"Group-wise quantization expects a single reduction axis, but given: {reduction_axes}." - ) - channel_size = weight.shape[reduction_axes] - if channel_size % group_size != 0: - raise nncf.ValidationError(f"Channel size {channel_size} should be divisible by size of group {group_size}") - - num_groups_per_channel = channel_size // group_size - shape = list(weight.shape) # [a1, r, a2] - "r" refers to number of channels along reduction axis - shape[reduction_axes : reduction_axes + 1] = (num_groups_per_channel, group_size) - reshaped_weight = weight.reshape(shape) - reduction_axes += 1 - return reshaped_weight, reduction_axes - - -def calculate_nf4_scale(weight: Tensor, reduction_axes: ReductionAxes) -> Tensor: - """ - Calculates the scale for nf4 quantization. - - :param weight: Weight array to compress. - :param reduction_axes: Axes along which to reduce (collect) different statistics (e.g., min, max). - :return: Scale tensor of float32 type for nf4 quantization. - """ - if weight.dtype != TensorDataType.float32: - weight = weight.astype(TensorDataType.float32) - - scale = fns.max(fns.abs(weight), axis=reduction_axes, keepdims=True) - - # NOTE: adding machine epsilon to avoid division by zero - eps = fns.finfo(weight).eps - scale = fns.where(fns.abs(scale) < eps, eps, scale) - - return scale - - -def calculate_e2m1_scale(weight: Tensor, reduction_axes: ReductionAxes, max_val=6.0) -> Tensor: - """ - Calculates the scale for e2m1 quantization. - - :param weight: Weight array to compress. 
- :param reduction_axes: Axes along which to reduce (collect) different statistics (e.g., min, max). - :param max_val: Maximal value of e2m1 type. - :param to_e8m0: Defines convert scale to e8m0 or not. - :return: Scale tensor of float32 type for e2m1 quantization. - """ - scale = calculate_nf4_scale(weight, reduction_axes) / max_val - - scale = fns.log2(scale) - scale = fns.ceil(scale) - scale = fns.clip(scale, -127, 127) - scale = 2**scale - - return scale - - -def calculate_signed_scale( - weight: Tensor, - reduction_axes: ReductionAxes, - num_bits=4, - invert_division: Optional[bool] = False, -) -> Tensor: - """ - Calculates the signed scale for symmetric quantization. - - :param weight: Weight array to compress. - :param reduction_axes: Axes along which to reduce (collect) different statistics (e.g., min, max). - :param num_bits: number of bits in compression. - :return: Scale tensor. - """ - level_high = 2 ** (num_bits - 1) - - w_abs_min = fns.abs(fns.min(weight, axis=reduction_axes, keepdims=True)) - w_max = fns.max(weight, axis=reduction_axes, keepdims=True) - - scale = fns.where(w_abs_min >= w_max, w_abs_min, -w_max) - if invert_division: - scale *= 1.0 / level_high - else: - scale /= level_high - - eps = fns.finfo(scale).eps - scale = fns.where(fns.abs(scale) < eps, eps, scale) - - return scale - - -def calculate_normalized_weight(weight: Tensor, scale: Tensor) -> Tensor: - """ - Normalizes the weight tensor using the provided scale. - - :param weight: Weight tensor to normalize. - :param scale: Scale tensor used for normalization. - :return: Normalized weight tensor. - """ - if weight.dtype != TensorDataType.float32: - weight = weight.astype(TensorDataType.float32) - if scale.dtype != TensorDataType.float32: - scale = scale.astype(TensorDataType.float32) - - return weight / scale - - -def do_nf4_quantization(weight: Tensor, scale: Tensor, is_normalized_weight: bool = False) -> Tensor: - """ - Performs NF4 quantization. The floating point values are represented by floating point scale and look-up with - 16 floating-point values on [-1, 1]. Scale normalizes original values to [-1, 1] interval and look-up table - "rounds" or "quantize" to the closest quant. - - :param weight: Weight tensor to quantize. - :param scale: Scale tensor used for normalization. - :param is_normalized_weight: Whether weight was scaled to [-1, 1] interval. Defaults to False. - :return: Tensor with floating-point values, where each of them corresponds to 1 out of 16 quants on [-1, 1]. - """ - norm_weight = weight if is_normalized_weight else calculate_normalized_weight(weight, scale) - center_nf4_quantiles = fns.from_numpy(CENTER_OF_NF4_QUANTILES, backend=norm_weight.backend) - indexes = fns.searchsorted(center_nf4_quantiles, norm_weight) - nf4_quantiles = fns.from_numpy(NF4_QUANTILES, backend=indexes.backend) - nf4_weight = nf4_quantiles[indexes] - return nf4_weight - - -def do_nf4_dequantization(nf4_weight: Tensor, scale: Tensor, reduction_axis: int = -1) -> Tensor: - """ - Decompresses the NF4 quantized weight tensor. - - :param nf4_weight: Tensor with floating-point values, - where each of them corresponds to 1 out of 16 quants on [-1, 1]. - :param scale: Scale tensor used for decompression. - :param reduction_axis: axis along which weights were reshaped for group quantization and will be reshaped back to - original shapes. If equals to -1, weights are not reshaped, assumed not a group quantization. Defaults to -1. - :return: Decompressed weight tensor. 
- """ - decompressed_weight = nf4_weight * scale - if reduction_axis != -1: - decompressed_weight = ungroup_weights(decompressed_weight, reduction_axis) - return decompressed_weight - - -def calculate_normalized_weight_and_fp4_scale( - weight: Tensor, - reduction_axes: ReductionAxes, - group_size: int = -1, - precomputed_scale: Tensor = None, - mode: CompressWeightsMode = CompressWeightsMode.NF4, -) -> Tuple[Tensor, Tensor]: - """ - Calculates scale for fp4 (nf4, e2m1) quantization and normalizes weights by the scale. - Weights are reshaped in case of positive value of group size. - - :param weight: Weight array to compress. - :param reduction_axes: Axes, along which to reduce (collect) different statistics (e.g. min, max). - :param group_size: Number of weights (e.g. 128) in the channel dimension that share quantization parameters (scale). - The value -1 means no grouping. Defaults to -1. - :param precomputed_scale: Precomputed scale. - :return: Normalized weight tensor of float32 type and nf4 scale tensor of float32 type. - """ - assert mode in [CompressWeightsMode.NF4, CompressWeightsMode.E2M1] - if weight.dtype != TensorDataType.float32: - weight = weight.astype(TensorDataType.float32) - - if group_size != -1: - # weights are reshaped: [a1, r, a2] -> [a1, r//gs, gs, a2] - weight, reduction_axes = reshape_weight_for_grouped_quantization(weight, reduction_axes, group_size) - - if mode == CompressWeightsMode.NF4: - scale = calculate_nf4_scale(weight, reduction_axes) if precomputed_scale is None else precomputed_scale - if mode == CompressWeightsMode.E2M1: - scale = calculate_e2m1_scale(weight, reduction_axes) if precomputed_scale is None else precomputed_scale - norm_weight = calculate_normalized_weight(weight, scale) - return norm_weight, scale - - -def calculate_integer_quantization_params( - weight: Tensor, - reduction_axes: ReductionAxes, - config: WeightCompressionConfig, - invert_division: Optional[bool] = False, -) -> Tuple[Tensor, Tensor]: - """ - Calculates the scale and zero point for uniform quantization (INT4, INT8), when the range of values is divided into - equal intervals, and each interval is assigned a quant. - - :param weight: Weight array to compress. - :param reduction_axes: Axes, along which to reduce (collect) different statistics (e.g. min, max). - :param config: Weight compression configuration. - :return: Scale and zero point tensors. 
- """ - mode = config.mode - assert config.is_integer(), "The function supports integer quantization only" - num_bits = config.num_bits - - if weight.dtype != TensorDataType.float32: - weight = weight.astype(TensorDataType.float32) - - if mode in [CompressWeightsMode.INT8_ASYM, CompressWeightsMode.INT4_ASYM]: - level_low = 0 - level_high = 2**num_bits - 1 - min_values = fns.min(weight, axis=reduction_axes, keepdims=True) # [a1, r, a2] -> [a1, 1, a2] - max_values = fns.max(weight, axis=reduction_axes, keepdims=True) # [a1, r, a2] -> [a1, 1, a2] - scale, zero_point = calculate_scale_zero_point( - min_values, - max_values, - level_low, - level_high, - narrow_range=False, - invert_division=invert_division, - ) - return scale, zero_point - - scale = calculate_signed_scale(weight, reduction_axes, num_bits, invert_division=invert_division) - return scale, None - - -def compare_np_to_ov(config, w, w_ov, s, s_ov, zp=None, zp_ov=None): - def compare(gt, x, label, threshold): - diff = gt.astype(TensorDataType.float32) - x.astype(TensorDataType.float32) - max_diff = fns.abs(diff).max() - mean_abs_diff = fns.mean(fns.abs(diff)) - mean_rel_diff = fns.mean(fns.abs(diff) / fns.maximum(gt.astype(TensorDataType.float32), 1)) - too_large = max_diff > threshold - if too_large: - print(f"{label}: Max diff: {max_diff}. Mean abs diff: {mean_abs_diff}. Mean rel diff: {mean_rel_diff}.") - return True - return False - - # --invert-numpy-division should be enabled - diff_too_large = False - if w_ov is not None: - sym_mode = config.mode in [CompressWeightsMode.INT8_SYM, CompressWeightsMode.INT4_SYM] - diff_too_large = compare(w, w_ov, "Weight", threshold=0.0 if sym_mode else 1.0) - if s_ov is not None: - diff_too_large = compare(s, s_ov, "Scale", 0.0) or diff_too_large - if zp_ov is not None: - diff_too_large = compare(zp, zp_ov, "Zero point", 0.0) or diff_too_large - if diff_too_large: - exit(1) - - -def calculate_quantized_weight( - weight: Tensor, - config: WeightCompressionConfig, - scale: Tensor, - zero_point: Optional[Tensor] = None, - reduction_axes: Optional[Tuple] = None, - invert_division=False, -) -> Tensor: - """ - Quantizes the weight tensor using the provided scale and zero point. - - :param weight: Weight tensor to quantize. - :param config: Weight compression configuration. - :param scale: Scale tensor used for quantization. - :param zero_point: Zero point tensor used for quantization. - :param invert_division: apply division `a/b` as `a*(1/b)`. - :return: Quantized weight tensor of uint8 or int8 type. 
- """ - - asym_quant = config.mode in [CompressWeightsMode.INT8_ASYM, CompressWeightsMode.INT4_ASYM] - - if weight.backend == TensorBackend.numpy and not is_openvino_available(): - log_once(logging.INFO, "Compression time may improve after installing OpenVINO") - - if hasattr(weight.data, "flags"): - assert weight.data.flags["C_CONTIGUOUS"] - - NUMPY_COMPRESSION = bool(int(os.environ.get("NUMPY_COMPRESSION", "0"))) - END_TO_END_COMPRESSION = bool(int(os.environ.get("END_TO_END_COMPRESSION", "0"))) - COMPARE_WITH_NUMPY = bool(int(os.environ.get("COMPARE_WITH_NUMPY", "0"))) - INPUT_DTYPE = os.environ.get("INPUT_DTYPE", "fp32") - ov_compression = ( - weight.backend in [TensorBackend.numpy, TensorBackend.ov] and is_openvino_available() and not NUMPY_COMPRESSION - ) - compressed_weights_ov, scale_ov, zero_point_ov = None, None, None - if ov_compression: - from nncf.openvino.quantization.compression_primitives import OV_COMPRESSION_PRIMITIVE_CACHE - - if INPUT_DTYPE == "bf16": - import openvino as ov - - assert weight.data.dtype == np.float16 - weight_data = ov.Tensor(weight.data, weight.data.shape, ov.Type.bf16) - else: - weight_data = weight.data - input_tensors = (weight_data,) - if not END_TO_END_COMPRESSION: - zero_point_shape = None if zero_point is None else zero_point.shape - compiled_model, compress_weight_primitive = OV_COMPRESSION_PRIMITIVE_CACHE.get_compress_weight_primitive( - config, weight.shape, scale.shape, zero_point_shape - ) - input_tensors += (scale.data,) - if zero_point is not None: - input_tensors += (zero_point.data,) - compressed_weights_ov = Tensor(compress_weight_primitive(input_tensors)[0]) - else: - compiled_model, compress_weight_primitive = ( - OV_COMPRESSION_PRIMITIVE_CACHE.get_compress_weight_primitive_end_to_end( - config, weight.shape, reduction_axes - ) - ) - results = compress_weight_primitive(input_tensors) - results = [Tensor(results[i]) for i in range(len(results))] - if asym_quant: - compressed_weights_ov, scale_ov, zero_point_ov = results - else: - compressed_weights_ov, scale_ov = results - - RELEASE_MEMORY = bool(int(os.environ.get("RELEASE_MEMORY", "0"))) - if RELEASE_MEMORY: - compiled_model.release_memory() - if not ov_compression or COMPARE_WITH_NUMPY: - if weight.dtype != TensorDataType.float32: - weight = weight.astype(TensorDataType.float32) - - if INPUT_DTYPE == "bf16" and COMPARE_WITH_NUMPY: - # We need such workaround because `weight` actually contains bf16 data - MODEL_PATH = os.environ.get("MODEL_PATH") - CURRENT_NODE_NAME = os.environ.get("CURRENT_NODE_NAME") - import openvino as ov - - model = ov.Core().read_model(MODEL_PATH) - name_to_node_mapping = {node.get_friendly_name(): node for node in model.get_ordered_ops()} - weight_node = name_to_node_mapping[CURRENT_NODE_NAME] - weight = Tensor(weight_node.get_data(dtype=np.float32)) - - if COMPARE_WITH_NUMPY and scale is None: - if config.group_size != -1: - # weights are reshaped from [a1, r, a2] to [a1, r//gs, gs, a2] - weight, reduction_axes = reshape_weight_for_grouped_quantization( - weight, reduction_axes, config.group_size - ) - if scale is None or zero_point is None: - scale, zero_point = calculate_integer_quantization_params( - weight, reduction_axes, config, invert_division - ) - assert scale.dtype == TensorDataType.float32 - - num_bits = config.num_bits - level_low = 0 if asym_quant else -(2 ** (num_bits - 1)) - level_high = 2**num_bits - 1 if asym_quant else 2 ** (num_bits - 1) - 1 - - compressed_weights = weight * (1.0 / scale) if invert_division else weight / scale - if 
zero_point is not None: - compressed_weights += zero_point.astype(weight.dtype) - compressed_weights = fns.round(compressed_weights) - compressed_weights = fns.clip(compressed_weights, level_low, level_high) - - if COMPARE_WITH_NUMPY: - compare_np_to_ov( - config, compressed_weights, compressed_weights_ov, scale, scale_ov, zero_point, zero_point_ov - ) - - if compressed_weights_ov is not None: - compressed_weights = compressed_weights_ov - - dtype = TensorDataType.uint8 if asym_quant else TensorDataType.int8 - if isinstance(compressed_weights.data, np.ndarray) and compressed_weights.dtype != dtype: - compressed_weights = compressed_weights.astype(dtype) - if scale_ov is not None: - scale, zero_point = scale_ov, zero_point_ov - - return compressed_weights, scale, zero_point - - -def calculate_quantized_dequantized_weight( - weight: Tensor, config: WeightCompressionConfig, scale: Tensor, zero_point: Optional[Tensor] = None -) -> Tensor: - - if weight.backend == TensorBackend.numpy and not is_openvino_available(): - log_once(logging.INFO, "Compression time may improve after installing OpenVINO") - - if weight.backend == TensorBackend.numpy and is_openvino_available(): - from nncf.openvino.quantization.compression_primitives import OV_COMPRESSION_PRIMITIVE_CACHE - - zero_point_shape = None if zero_point is None else zero_point.shape - compress_decompress_weight_primitive = OV_COMPRESSION_PRIMITIVE_CACHE.get_compress_decompress_weight_primitive( - config, weight.shape, scale.shape, zero_point_shape - ) - input_tensors = weight.data, scale.data - if zero_point is not None: - input_tensors += (zero_point.data,) - decompressed_weight = Tensor(compress_decompress_weight_primitive(input_tensors)) - else: - compressed_weight = calculate_quantized_weight(weight, config, scale, zero_point) - decompressed_weight = do_int_dequantization(compressed_weight, scale, zero_point) - return decompressed_weight - - -def do_int_quantization( - weight: Tensor, - reduction_axes: ReductionAxes, - config: WeightCompressionConfig, - precomputed_scale: Tensor = None, - precomputed_zero_point: Tensor = None, - invert_division=False, -) -> Tuple[Tensor, Tensor, Tensor]: - """ - The method quantizes the given weights to integer data type uniformly in accordance with the compression config. - The config defines a quantization mode: - INT8_SYM mode refers to signed int8 symmetric weight compression without zero point - - quantization to [-128, 127] range. - INT8_ASYM mode refers to unsigned int8 asymmetric weight compression with a typical non-fixed zero-point - - quantization to [0, 255] range. - INT4_ASYM mode refers to unsigned int4 asymmetric weight compression with a typical non-fixed zero-point - - quantization to [0, 15] range. - INT4_SYM mode refers to signed int4 symmetric weight compression without zero point - - quantization to [-8, 7] range. - NF4 or E2M1 mode requires a dedicated procedure and it is not supported in this method. - One of the parameter of compression config is a group size. Quantization is per-channel, if group size equals to -1, - otherwise it's per-group, i.e. group size number of weights in the channel dimension share quantization parameters - (scales). - - :param weight: Weight array to compress. - :param reduction_axes: Axes, along which to reduce (collect) different statistics (e.g. min, max). - :param config: Information on how to compress (quantize) a specific weight. - :param precomputed_scale: Precomputed scale. - :param precomputed_zero_point: Precomputed zero point. 
- :param invert_scale: applies inversion for scale and then multiply by weights instead of division. - Need as reference implementation for OV. - :return: The compressed weights tensor of uint8 (asymmetric mode) or int8 (symmetric mode) type, - scale tensor of float32 type and zero point tensor of int32 type that was used for its quantization. - """ - assert config.is_integer(), "The function supports integer quantization only" - group_size = config.group_size - - INPUT_DTYPE = os.environ.get("INPUT_DTYPE", "fp32") - if weight.dtype != TensorDataType.float32 and INPUT_DTYPE == "fp32": - weight = weight.astype(TensorDataType.float32) - - END_TO_END_COMPRESSION = bool(int(os.environ.get("END_TO_END_COMPRESSION", "0"))) - COMPARE_WITH_NUMPY = bool(int(os.environ.get("COMPARE_WITH_NUMPY", "0"))) - if not END_TO_END_COMPRESSION and not COMPARE_WITH_NUMPY: - if group_size != -1: - # weights are reshaped from [a1, r, a2] to [a1, r//gs, gs, a2] - weight, reduction_axes = reshape_weight_for_grouped_quantization(weight, reduction_axes, group_size) - - if precomputed_zero_point is None or precomputed_scale is None: - scale, zero_point = calculate_integer_quantization_params(weight, reduction_axes, config, invert_division) - if precomputed_scale is not None: - scale = precomputed_scale - if precomputed_zero_point is not None: - zero_point = precomputed_zero_point - else: - scale = zero_point = None - - compressed_weights, scale, zero_point = calculate_quantized_weight( - weight, config, scale, zero_point, reduction_axes, invert_division - ) - return compressed_weights, scale, zero_point - - -def get_integer_quantization_error( - weight: Tensor, reduction_axes: ReductionAxes, config: WeightCompressionConfig -) -> float: - """ - Calculates a quantity characterizing the difference between floating point weights and fake quantized - (compressed and decompressed) to integer ones. - - :param weight: Weight array to compress. - :param reduction_axes: Axes, along which to reduce (collect) different statistics (e.g. min, max). - :param config: Information on how to compress (quantize) a specific weight. - :return: The quantity characterizing the error of integer quantization. - """ - orig_shape = weight.shape - - if weight.dtype != TensorDataType.float32: - weight = weight.astype(TensorDataType.float32) - - compressed_weights, scale, zero_point = do_int_quantization(weight, reduction_axes, config) - decompressed_weight = do_int_dequantization(compressed_weights, scale, zero_point) - - decompressed_weight = decompressed_weight.reshape(orig_shape) - diff = (decompressed_weight - weight) ** 2 - layer_err = fns.mean(diff, axis=reduction_axes) - val = fns.max(layer_err) - return val.item() - - -def compress_weight( - weight: Tensor, - reduction_axes: ReductionAxes, - config: WeightCompressionConfig, - precomputed_scale: Tensor = None, - precomputed_zero_point: Tensor = None, -): - """ - Compress weight using compression configuration. - - :param weight: The weight to compress. - :param reduction_axes: Axes, along which to reduce (collect) different statistics (e.g. min, max). - :param config: Compression configuration. - :param precomputed_scale: Precomputed scale. - :param precomputed_zero_point: Precomputed zero point. 
- :return: The compressed weight and decompression parameters as instance of CompressedWeight - """ - if not config.is_integer(): - compressed_weight, scale = calculate_normalized_weight_and_fp4_scale( - weight, reduction_axes, config.group_size, precomputed_scale, config.mode - ) - return CompressedWeight(compressed_weight, scale) - - INVERT_NUMPY_DIVISION = bool(int(os.environ.get("INVERT_NUMPY_DIVISION", "0"))) - compressed_weight, scale, zero_point = do_int_quantization( - weight, reduction_axes, config, precomputed_scale, precomputed_zero_point, invert_division=INVERT_NUMPY_DIVISION - ) - - return CompressedWeight(compressed_weight, scale, zero_point) - - -def ungroup_weights(weights: Tensor, reduction_axis: int) -> Tensor: - """ - Reshapes weights used for group quantization back to original shape. - - :param weights: The weight to reshape. - :param reduction_axis: The axis, along which weights were reshaped for group quantization and will be reshaped back - to original shapes. If equals to -1, weights are not reshaped, assumed not a group quantization. Default to -1. - :return: Reshaped weight. - """ - shape = list(weights.shape) # [a1, r, a2] - "r" refers to number of channels along reduction axis - shape[reduction_axis] = shape[reduction_axis] * shape[reduction_axis + 1] - shape[reduction_axis + 1] = 1 - reshaped_weight = weights.reshape(shape) - reshaped_weight = fns.squeeze(reshaped_weight) - weights = reshaped_weight - return weights - - -def do_int_dequantization( - compressed_weights: Tensor, scale: Tensor, zero_point: Optional[Tensor] = None, reduction_axis: int = -1 -) -> Tensor: - """ - The method dequantizes the given weights to float point data type in accordance with the scale and - zero_point data type. - - :param compressed_weights: compressed weights. - :param scale: scale in compression/quantization. - :param zero_point: zero point in compression/quantization. - :param reduction_axis: axis along which weights were reshaped for group quantization and will be reshaped back to - original shapes. If equals to -1: weights are not reshaped, assumed not a group quantization. Default to -1. - :return: dequantized/decompressed weights. 
- """ - decompressed_weight = compressed_weights - zero_point if zero_point is not None else compressed_weights - decompressed_weight = decompressed_weight.astype(scale.dtype) * scale - - if reduction_axis > -1: - decompressed_weight = ungroup_weights(decompressed_weight, reduction_axis) - - return decompressed_weight diff --git a/run_weight_compression.py b/run_weight_compression.py deleted file mode 100644 index 0413034449d..00000000000 --- a/run_weight_compression.py +++ /dev/null @@ -1,373 +0,0 @@ -import os -import shutil -import subprocess -import threading -import time -from pathlib import Path - - -def stream_handler(stream, target_file): - for line in iter(stream.readline, ''): - print(line, end='') - target_file.write(line) - - -parent_model_dir = Path("/home/nsavel/workspace/openvino.genai/llm_bench/python/models") -parent_log_dir = Path("compression_logs") - -experiment_params = [ - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--numpy"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --recompile"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --release-memory"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --recompile --share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --release-memory --share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --dynamic"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --dynamic --recompile"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --dynamic --release-memory"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --dynamic --share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --dynamic --recompile --share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --dynamic --release-memory --share-outputs"), - # - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--numpy"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --recompile"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --release-memory"), - # (parent_model_dir / 
"tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --recompile --share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --release-memory --share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --dynamic"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --dynamic --recompile"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --dynamic --release-memory"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --dynamic --share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --dynamic --recompile --share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --dynamic --release-memory --share-outputs"), - # - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--numpy"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --recompile"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --release-memory"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --recompile --share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --release-memory --share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --dynamic"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --dynamic --recompile"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --dynamic --release-memory"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --dynamic --share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --dynamic --recompile --share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --dynamic --release-memory --share-outputs"), - # - # - # (parent_model_dir / 
"phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--numpy"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --recompile"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --release-memory"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --recompile --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --release-memory --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --dynamic"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --dynamic --recompile"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --dynamic --release-memory"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --dynamic --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --dynamic --recompile --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --dynamic --release-memory --share-outputs"), - # - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--numpy"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --recompile"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --release-memory"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --recompile --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --release-memory --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --dynamic"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --dynamic --recompile"), - # (parent_model_dir / 
"phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --dynamic --release-memory"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --dynamic --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --dynamic --recompile --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --dynamic --release-memory --share-outputs"), - # - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--numpy"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --recompile"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --release-memory"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --recompile --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --release-memory --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --dynamic"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --dynamic --recompile"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --dynamic --release-memory"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --dynamic --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --dynamic --recompile --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --dynamic --release-memory --share-outputs"), - # - # - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--numpy"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --recompile"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --release-memory"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --share-outputs"), - # 
(parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --recompile --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --release-memory --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --dynamic"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --dynamic --recompile"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --dynamic --release-memory"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --dynamic --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --dynamic --recompile --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --dynamic --release-memory --share-outputs"), - # - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--numpy"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --recompile"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --release-memory"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --recompile --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --release-memory --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --dynamic"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --dynamic --recompile"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --dynamic --release-memory"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --dynamic --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --dynamic --recompile --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --dynamic --release-memory --share-outputs"), - # - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / 
"recompile_vs_release-memory/int8/llama3-8b", "--numpy"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --recompile"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --release-memory"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --recompile --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --release-memory --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --dynamic"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --dynamic --recompile"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --dynamic --release-memory"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --dynamic --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --dynamic --recompile --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --dynamic --release-memory --share-outputs"), - # - # - # - # - (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --numpy"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym "), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --recompile"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --release-memory"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --share-outputs"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --recompile --share-outputs"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --release-memory --share-outputs"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --end-to-end"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --end-to-end --recompile"), - 
(parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --end-to-end --release-memory"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --end-to-end --share-outputs"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --end-to-end --recompile --share-outputs"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --end-to-end --release-memory --share-outputs"), - - (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --numpy"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym "), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --recompile"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --release-memory"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --share-outputs"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --recompile --share-outputs"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --release-memory --share-outputs"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --end-to-end"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --end-to-end --recompile"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --end-to-end --release-memory"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --end-to-end --share-outputs"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --end-to-end --recompile --share-outputs"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --end-to-end --release-memory --share-outputs"), - - (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --numpy"), - (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym"), - (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --recompile"), - (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", 
parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --release-memory"), - (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --share-outputs"), - (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --recompile --share-outputs"), - (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --release-memory --share-outputs"), - (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --end-to-end"), - (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --end-to-end --recompile"), - (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --end-to-end --release-memory"), - (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --end-to-end --share-outputs"), - (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --end-to-end --recompile --share-outputs"), - (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --end-to-end --release-memory --share-outputs"), - - - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --numpy"), - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym "), - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --recompile"), - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --release-memory"), - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --share-outputs"), - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --recompile --share-outputs"), - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --release-memory --share-outputs"), - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --end-to-end"), - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --end-to-end --recompile"), - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --end-to-end --release-memory"), - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", 
parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --end-to-end --share-outputs"), - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --end-to-end --recompile --share-outputs"), - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --end-to-end --release-memory --share-outputs"), - - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --numpy"), - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym "), - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --recompile"), - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --release-memory"), - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --share-outputs"), - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --recompile --share-outputs"), - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --release-memory --share-outputs"), - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --end-to-end"), - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --end-to-end --recompile"), - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --end-to-end --release-memory"), - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --end-to-end --share-outputs"), - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --end-to-end --recompile --share-outputs"), - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --end-to-end --release-memory --share-outputs"), - - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --numpy"), - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym"), - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --recompile"), - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --release-memory"), - (parent_model_dir / 
"phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --share-outputs"), - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --recompile --share-outputs"), - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --release-memory --share-outputs"), - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --end-to-end"), - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --end-to-end --recompile"), - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --end-to-end --release-memory"), - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --end-to-end --share-outputs"), - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --end-to-end --recompile --share-outputs"), - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --end-to-end --release-memory --share-outputs"), - - - (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --numpy"), - (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym "), - (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --recompile"), - (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --release-memory"), - (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --share-outputs"), - (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --recompile --share-outputs"), - (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --release-memory --share-outputs"), - (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --end-to-end"), - (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --end-to-end --recompile"), - (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --end-to-end --release-memory"), - (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --end-to-end 
--share-outputs"), - (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --end-to-end --recompile --share-outputs"), - (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --end-to-end --release-memory --share-outputs"), - - (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --numpy"), - (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym "), - (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --recompile"), - (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --release-memory"), - (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --share-outputs"), - (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --recompile --share-outputs"), - (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --release-memory --share-outputs"), - (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --end-to-end"), - (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --end-to-end --recompile"), - (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --end-to-end --release-memory"), - (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --end-to-end --share-outputs"), - (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --end-to-end --recompile --share-outputs"), - (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --end-to-end --release-memory --share-outputs"), - - (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --numpy"), - (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym"), - (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --recompile"), - (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --release-memory"), - (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym 
--share-outputs"), - (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --recompile --share-outputs"), - (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --release-memory --share-outputs"), - (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --end-to-end"), - (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --end-to-end --recompile"), - (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --end-to-end --release-memory"), - (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --end-to-end --share-outputs"), - (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --end-to-end --recompile --share-outputs"), - (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --end-to-end --release-memory --share-outputs"), - # - # - # - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "optimal_configurations/int8/tiny-llama", "--save-model --numpy"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "optimal_configurations/int8/tiny-llama", "--save-model --end-to-end --release-memory"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "optimal_configurations/int8/tiny-llama", "--save-model --numpy"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "optimal_configurations/int8/tiny-llama", "--save-model --end-to-end --release-memory"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "optimal_configurations/int8/tiny-llama", "--save-model --numpy"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "optimal_configurations/int8/tiny-llama", "--save-model --end-to-end --release-memory"), - # - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "optimal_configurations/int8/phi3", "--numpy"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "optimal_configurations/int8/phi3", "--end-to-end --release-memory"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "optimal_configurations/int8/phi3", "--numpy"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "optimal_configurations/int8/phi3", "--end-to-end --release-memory"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "optimal_configurations/int8/phi3", "--numpy"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "optimal_configurations/int8/phi3", "--end-to-end --release-memory"), - # - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-fp32", parent_log_dir / "optimal_configurations/int8/llama3-8b", "--numpy"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-fp32", parent_log_dir / "optimal_configurations/int8/llama3-8b", 
"--end-to-end --release-memory"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-fp16", parent_log_dir / "optimal_configurations/int8/llama3-8b", "--numpy"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-fp16", parent_log_dir / "optimal_configurations/int8/llama3-8b", "--end-to-end --release-memory"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-bf16", parent_log_dir / "optimal_configurations/int8/llama3-8b", "--numpy"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-bf16", parent_log_dir / "optimal_configurations/int8/llama3-8b", "--end-to-end --release-memory"), - # - # - # - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "optimal_configurations/int4/tiny-llama", "--save-model --compression-mode int4_asym --numpy"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "optimal_configurations/int4/tiny-llama", "--save-model --compression-mode int4_asym --end-to-end"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "optimal_configurations/int4/tiny-llama", "--save-model --compression-mode int4_asym --numpy"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "optimal_configurations/int4/tiny-llama", "--save-model --compression-mode int4_asym --end-to-end --release-memory --share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "optimal_configurations/int4/tiny-llama", "--save-model --compression-mode int4_asym --numpy"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "optimal_configurations/int4/tiny-llama", "--save-model --compression-mode int4_asym --end-to-end --release-memory --share-outputs"), - # - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "optimal_configurations/int4/phi3", "--compression-mode int4_asym --numpy"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "optimal_configurations/int4/phi3", "--compression-mode int4_asym --end-to-end"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "optimal_configurations/int4/phi3", "--compression-mode int4_asym --numpy"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "optimal_configurations/int4/phi3", "--compression-mode int4_asym --end-to-end --release-memory --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "optimal_configurations/int4/phi3", "--compression-mode int4_asym --numpy"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "optimal_configurations/int4/phi3", "--compression-mode int4_asym --end-to-end --release-memory --share-outputs"), - # - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-fp32", parent_log_dir / "optimal_configurations/int4/llama3-8b", "--compression-mode int4_asym --numpy"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-fp32", parent_log_dir / "optimal_configurations/int4/llama3-8b", "--compression-mode int4_asym --end-to-end"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-fp16", parent_log_dir / "optimal_configurations/int4/llama3-8b", "--compression-mode int4_asym --numpy"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-fp16", parent_log_dir / "optimal_configurations/int4/llama3-8b", "--compression-mode int4_asym --end-to-end --release-memory --share-outputs"), - # (parent_model_dir / 
"Meta-Llama-3-8B/pytorch/dldt/optimum-bf16", parent_log_dir / "optimal_configurations/int4/llama3-8b", "--compression-mode int4_asym --numpy"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-bf16", parent_log_dir / "optimal_configurations/int4/llama3-8b", "--compression-mode int4_asym --end-to-end --release-memory --share-outputs"), - - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "int4_acc/tiny-llama", "--save-model --compression-mode int4_asym --numpy"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "int4_acc/tiny-llama", "--save-model --compression-mode int4_asym"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "int4_acc/tiny-llama", "--save-model --compression-mode int4_asym --end-to-end"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--save-model --compression-mode int4_asym --numpy"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--save-model --compression-mode int4_asym --end-to-end"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--save-model --compression-mode int4_asym --numpy"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--save-model --compression-mode int4_asym --end-to-end"), -] - -for model_dir, log_dir, params in experiment_params: - model_path = model_dir / "openvino_model.xml" - cmd = f"/home/nsavel/venvs/nncf/bin/python weight_compression.py --model-path {model_path} --log-dir {log_dir} {params}" - - log_dir.mkdir(parents=True, exist_ok=True) - with open(log_dir / "log.txt", "a") as log_file: - process = subprocess.Popen( - cmd, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - shell=True, - universal_newlines=True, - preexec_fn=os.setsid, - ) - - stdout_thread = threading.Thread(target=stream_handler, args=(process.stdout, log_file)) - stderr_thread = threading.Thread(target=stream_handler, args=(process.stderr, log_file)) - - stdout_thread.start() - stderr_thread.start() - - stdout_thread.join() - stderr_thread.join() - - process.wait() - time.sleep(10) - -evaluated_paths = set() -for _, log_dir, _ in experiment_params: - for model_path in log_dir.rglob("**/*"): - model_path: Path - if model_path.suffix != ".xml": - continue - if model_path.absolute() in evaluated_paths: - continue - evaluated_paths.add(model_path.absolute()) - - model_dir = model_path.parent.absolute() - cmd = f"/home/nsavel/venvs/lm-evaluation-harness/bin/lm_eval --model openvino --model_args pretrained={model_dir},trust_remote_code=True --tasks wikitext --output_path {model_dir}" - process = subprocess.Popen(cmd, shell=True) - process.wait() diff --git a/weight_compression.py b/weight_compression.py deleted file mode 100644 index ed4ff33b696..00000000000 --- a/weight_compression.py +++ /dev/null @@ -1,236 +0,0 @@ -# Copyright (c) 2024 Intel Corporation -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import argparse
-import gc
-import os
-import shutil
-import time
-from functools import partial
-from pathlib import Path
-
-import openvino as ov
-
-import nncf
-from nncf.openvino.quantization.compression_primitives import OV_COMPRESSION_PRIMITIVE_CACHE
-from tools.memory_monitor import MemoryMonitor
-from tools.memory_monitor import MemoryType
-
-
-def parse_arguments():
-    parser = argparse.ArgumentParser()
-
-    parser.add_argument("--model-path", type=str, required=True, help="Path where the model is stored")
-
-    parser.add_argument("--log-dir", default="./compression_logs", type=str, help="Directory where logs will be saved")
-
-    parser.add_argument("--compression-mode", default="int8_asym", type=str, choices=["int8_asym", "int8_sym", "int4_asym", "int4_sym",], help="Weight compression mode")
-
-    parser.add_argument("--numpy", action="store_true", help="Enable numpy compression")
-
-    parser.add_argument("--dynamic", action="store_true", help="Enable compression with dynamic-shaped OV models")
-
-    parser.add_argument("--end-to-end", action="store_true", help="Enable end-to-end OV compression")
-
-    parser.add_argument("--input-dtype", type=str, choices=["fp32", "fp16", "bf16"], default=None, help="OV model input dtype")
-
-    parser.add_argument("--fp32-output", action="store_true", help="Output in fp32 instead of (u)int8")
-
-    parser.add_argument("--recompile", action="store_true", help="Recompile model every time")
-
-    parser.add_argument("--share-outputs", action="store_true", help="Share OV model outputs")
-
-    parser.add_argument("--save-model", action="store_true", help="Save compressed model")
-
-    parser.add_argument("--compare-with-numpy", action="store_true", help="Compare compressed weight with the one computed with NumPy")
-
-    parser.add_argument("--invert-numpy-division", action="store_true", help="Invert division when compressing with NumPy")
-
-    parser.add_argument("--release-memory", action="store_true", help="Release memory")
-
-    return parser.parse_args()
-
-
-def log(mm, fz, log_dir):
-    mm.save_memory_logs(
-        *mm.get_data(memory_from_zero=fz), save_dir=Path(log_dir), filename_suffix="_from-zero" if fz else ""
-    )
-
-
-def count_node_dtypes(model):
-    # Get the main dtype of weight constants
-    node_count_per_dtype = dict(f32=0, f16=0, bf16=0)
-    for node in model.get_ordered_ops():
-        friendly_name = node.get_friendly_name()
-        if node.get_type_name() != "Constant" or ".weight" not in friendly_name:
-            continue
-        const_dtype = node.get_element_type().get_type_name()
-        if const_dtype in node_count_per_dtype:
-            node_count_per_dtype[const_dtype] = node_count_per_dtype[const_dtype] + 1
-    return node_count_per_dtype
-
-
-def main(args):
-    model_path = Path(args.model_path)
-    log_dir = Path(args.log_dir)
-
-    numpy_compression = args.numpy
-    dynamic_compression = args.dynamic
-    end_to_end_compression = args.end_to_end
-    input_dtype = args.input_dtype
-    fp32_output = args.fp32_output
-    recompile = args.recompile
-    share_outputs = args.share_outputs
-    save_model = args.save_model
-    compare_with_numpy = args.compare_with_numpy
-    invert_numpy_division = args.invert_numpy_division or compare_with_numpy
-    release_memory = args.release_memory
-
-    log_dir_suffix = f"{model_path.parent.name}_"
-    if numpy_compression:
-        log_dir_suffix = f"{log_dir_suffix}numpy"
-        if invert_numpy_division:
-            log_dir_suffix += "_inverted"
-    else:
-        log_dir_suffix = f"{log_dir_suffix}{'end-to-end_' if end_to_end_compression else ''}"
-        log_dir_suffix = f"{log_dir_suffix}{'ov-dynamic' if dynamic_compression else 'ov-static'}"
-        log_dir_suffix = f"{log_dir_suffix}_{'output-fp32' if fp32_output else 'output-i8'}"
-        if input_dtype is not None:
-            log_dir_suffix = f"{log_dir_suffix}_{f'input-{input_dtype}'}"
-        if recompile:
-            log_dir_suffix = f"{log_dir_suffix}_recompile"
-        if release_memory:
-            log_dir_suffix = f"{log_dir_suffix}_release-memory"
-        if share_outputs:
-            log_dir_suffix = f"{log_dir_suffix}_share-outputs"
-    print(f"Log dir suffix: {log_dir_suffix}")
-
-    memory_monitors = []
-    for memory_type, mem_from_zero in [(MemoryType.RSS, False), (MemoryType.SYSTEM, False), (MemoryType.SYSTEM, True)]:
-        memory_monitor = MemoryMonitor(interval=1e-2, memory_type=memory_type, include_child_processes=bool(0))
-        memory_monitor.start(at_exit_fn=partial(log, memory_monitor, mem_from_zero, log_dir / log_dir_suffix))
-        memory_monitors.append(memory_monitor)
-
-    core = ov.Core()
-    # core.set_property({"ENABLE_MMAP": "NO"})
-    model = core.read_model(model_path)
-
-    node_count_per_dtype = count_node_dtypes(model)
-    assert max(node_count_per_dtype.values()) == sum(node_count_per_dtype.values()), "Not all consts have the same type"
-    node_count_per_dtype = sorted([(v, k) for k, v in node_count_per_dtype.items()], reverse=True)
-    model_dtype = dict(f32="fp32", f16="fp16", bf16="bf16")[node_count_per_dtype[0][1]]
-
-    # Update input dtype based on model
-    input_dtype = input_dtype or model_dtype
-
-    os.environ["MODEL_PATH"] = str(model_path)
-    os.environ["NUMPY_COMPRESSION"] = f"{int(numpy_compression)}"
-    os.environ["DYNAMIC_COMPRESSION"] = f"{int(dynamic_compression)}"
-    os.environ["END_TO_END_COMPRESSION"] = f"{int(end_to_end_compression)}"
-    os.environ["INPUT_DTYPE"] = input_dtype
-    os.environ["FP32_OUTPUT"] = f"{int(fp32_output)}"
-    os.environ["RECOMPILE"] = f"{int(recompile)}"
-    os.environ["SHARE_OUTPUTS"] = f"{int(share_outputs)}"
-    os.environ["COMPARE_WITH_NUMPY"] = f"{int(compare_with_numpy)}"
-    os.environ["INVERT_NUMPY_DIVISION"] = f"{int(invert_numpy_division)}"
-    os.environ["RELEASE_MEMORY"] = f"{int(release_memory)}"
-
-    start_time = time.perf_counter()
-    if args.compression_mode == "int8_asym":
-        compression_mode = nncf.CompressWeightsMode.INT8_ASYM
-    elif args.compression_mode == "int8_sym":
-        compression_mode = nncf.CompressWeightsMode.INT8_SYM
-    elif args.compression_mode == "int4_asym":
-        compression_mode = nncf.CompressWeightsMode.INT4_ASYM
-    elif args.compression_mode == "int4_sym":
-        compression_mode = nncf.CompressWeightsMode.INT4_SYM
-    else:
-        raise ValueError(f"Unknown weight compression mode argument: {args.compression_mode}")
-    compressed_model = nncf.compress_weights(model, mode=compression_mode)
-    compression_time = time.perf_counter() - start_time
-    print(f"Compression Time: {compression_time:.2f} sec.")
-
-    if save_model:
-        ov.save_model(compressed_model, log_dir / log_dir_suffix / "openvino_model.xml")
-        for filepath in model_path.parent.glob("*.json"):
-            shutil.copy(str(filepath), str(log_dir / log_dir_suffix / filepath.name))
-
-    del core
-    del model
-    del compressed_model
-    gc.collect()
-    time.sleep(0.5)
-
-    before_cache_deletion = memory_monitors[2].get_data(True)[1][-1]
-    if OV_COMPRESSION_PRIMITIVE_CACHE._compress_weight_model_cache or \
-            OV_COMPRESSION_PRIMITIVE_CACHE._compress_weight_end_to_end_model_cache:
-        OV_COMPRESSION_PRIMITIVE_CACHE._compress_weight_model_cache.clear()
-        OV_COMPRESSION_PRIMITIVE_CACHE._compress_weight_end_to_end_model_cache.clear()
-        gc.collect()
-        time.sleep(memory_monitors[0].interval * 10)
-        after_cache_deletion = memory_monitors[2].get_data(True)[1][-1]
-    else:
-        after_cache_deletion = before_cache_deletion
-    cache_size = before_cache_deletion - after_cache_deletion
-    print(f"Cache size: {cache_size:.2f} MiB")
-
-    time.sleep(memory_monitors[0].interval * 10)
-
-    leftover_memory = memory_monitors[2].get_data(True)[1][-1]
-    peak_memory = max(memory_monitors[2].get_data(True)[1])
-    print(f"Peak memory: {peak_memory:.2f} MiB")
-    print(f"Leftover memory: {leftover_memory:.2f} MiB")
-    print("Done")
-
-    csv_path = log_dir / "results.csv"
-    csv_exists = csv_path.exists()
-    csv_path.parent.mkdir(exist_ok=True, parents=True)
-    with open(csv_path, "a") as f:
-        if not csv_exists:
-            f.write(
-                "Model Path,"
-                "Model dtype,"
-                "Backend,"
-                "End to end,"
-                "Recompile,"
-                "Release memory,"
-                "Share outputs,"
-                "Input Shapes,"
-                "Input,"
-                "Output,"
-                "Compression Time,"
-                "Peak Memory,"
-                "Cache Size,"
-                "Leftover Memory"
-                "\n"
-            )
-        f.write(
-            f"{model_path},"
-            f"{model_dtype.upper()},"
-            f"{'NumPy' if numpy_compression else 'OV'},"
-            f"{'-' if numpy_compression else end_to_end_compression},"
-            f"{'-' if numpy_compression else recompile},"
-            f"{'-' if numpy_compression else release_memory},"
-            f"{'-' if numpy_compression else share_outputs},"
-            f"{'-' if numpy_compression else 'Dynamic' if dynamic_compression else 'Static'},"
-            f"{'-' if numpy_compression else input_dtype.upper()},"
-            f"{'-' if numpy_compression else 'FP32' if fp32_output else 'INT8'},"
-            f"{compression_time:.2f},"
-            f"{peak_memory:.2f},"
-            f"{cache_size:.2f},"
-            f"{leftover_memory:.2f}"
-            f"\n"
-        )
-
-
-if __name__ == "__main__":
-    args = parse_arguments()
-    main(args)