diff --git a/notebooks/openvino/requirements.txt b/notebooks/openvino/requirements.txt index bb7a517cff..64ccd6d8cc 100644 --- a/notebooks/openvino/requirements.txt +++ b/notebooks/openvino/requirements.txt @@ -4,4 +4,3 @@ evaluate[evaluator] ipywidgets pillow torchaudio - diff --git a/notebooks/openvino/sentence_transformer_quantization.ipynb b/notebooks/openvino/sentence_transformer_quantization.ipynb new file mode 100644 index 0000000000..714544aa9a --- /dev/null +++ b/notebooks/openvino/sentence_transformer_quantization.ipynb @@ -0,0 +1,625 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Quantization of a Text Embedding Model from the Sentence Transformers Library" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install optimum[openvino]\n", + "%pip install evaluate" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Statically quantize the model to 8-bit with NNCF via the Optimum-Intel API" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The code snippet below shows how to use the Optimum-Intel [Model Optimization API](https://huggingface.co/docs/optimum/en/intel/openvino/optimization#static-quantization) to statically quantize the model. It leverages [NNCF](https://github.com/openvinotoolkit/nncf) capabilities for static quantization of Transformer models, where a combination of a special quantization scheme, the SmoothQuant method, and the Bias Correction method is used to provide state-of-the-art accuracy.\n", + "\n", + "Static quantization requires some data to estimate the quantization parameters of activations, so a calibration dataset must be provided. The `OVQuantizer` class used for quantization provides an API to build such a dataset via its `.get_calibration_dataset()` method." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "No OpenVINO files were found for sentence-transformers/all-MiniLM-L6-v2, setting `export=True` to convert the model to the OpenVINO IR. Don't forget to save the resulting model with `.save_pretrained()`\n", + "Framework not specified. Using pt to export the model.\n", + "Using framework PyTorch: 2.4.1+cpu\n", + "Overriding 1 configuration item(s)\n", + "\t- use_cache -> False\n", + "Compiling the model to CPU ...\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "a9bd847756fd467e905a7ad7a243640c", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Output()" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n"
+      ],
+      "text/plain": []
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "9d8ad91623d642f48e85b60ac823aca4",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Output()"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "
\n"
+      ],
+      "text/plain": []
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "a2a7d09a573c4092a830bbaadc39f756",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Output()"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "
\n"
+      ],
+      "text/plain": []
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "b67c493aab36426090f8fafd25a17a00",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Output()"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "
\n"
+      ],
+      "text/plain": []
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Configuration saved in all-MiniLM-L6-v2_int8/openvino_config.json\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "('all-MiniLM-L6-v2_int8/tokenizer_config.json',\n",
+       " 'all-MiniLM-L6-v2_int8/special_tokens_map.json',\n",
+       " 'all-MiniLM-L6-v2_int8/vocab.txt',\n",
+       " 'all-MiniLM-L6-v2_int8/added_tokens.json',\n",
+       " 'all-MiniLM-L6-v2_int8/tokenizer.json')"
+      ]
+     },
+     "execution_count": 2,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from functools import partial\n",
+    "import datasets\n",
+    "from transformers import AutoTokenizer\n",
+    "from optimum.intel import OVModelForFeatureExtraction, OVQuantizer, OVQuantizationConfig, OVConfig\n",
+    "\n",
+    "MODEL_ID = \"sentence-transformers/all-MiniLM-L6-v2\"\n",
+    "base_model_path = \"all-MiniLM-L6-v2\"\n",
+    "int8_ptq_model_path = \"all-MiniLM-L6-v2_int8\"\n",
+    "\n",
+    "model = OVModelForFeatureExtraction.from_pretrained(MODEL_ID)\n",
+    "model.save_pretrained(base_model_path)\n",
+    "\n",
+    "tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)\n",
+    "tokenizer.save_pretrained(base_model_path)\n",
+    "\n",
+    "\n",
+    "quantizer = OVQuantizer.from_pretrained(model)\n",
+    "\n",
+    "def preprocess_function(examples, tokenizer, max_length=384):\n",
+    "    \"\"\"Tokenize the `sentence` column of a batch for calibration.\n",
+    "\n",
+    "    The tokenizer is called with `padding=\"max_length\"` and `truncation=True`,\n",
+    "    so every sequence is brought to the same `max_length`.\n",
+    "\n",
+    "    Args:\n",
+    "        examples: Mapping with a `sentence` entry holding the raw text(s).\n",
+    "        tokenizer: Callable tokenizer applied to the sentences.\n",
+    "        max_length: Target sequence length; the default of 384 keeps the\n",
+    "            previously hard-coded behavior.\n",
+    "\n",
+    "    Returns:\n",
+    "        Whatever `tokenizer` returns for the batch.\n",
+    "    \"\"\"\n",
+    "    return tokenizer(examples[\"sentence\"], padding=\"max_length\", max_length=max_length, truncation=True)\n",
+    "\n",
+    "\n",
+    "calibration_dataset = quantizer.get_calibration_dataset(\n",
+    "    \"glue\",\n",
+    "    dataset_config_name=\"sst2\",\n",
+    "    preprocess_function=partial(preprocess_function, tokenizer=tokenizer),\n",
+    "    num_samples=300,\n",
+    "    dataset_split=\"train\",\n",
+    ")\n",
+    "\n",
+    "ov_config = OVConfig(quantization_config=OVQuantizationConfig())\n",
+    "\n",
+    "quantizer.quantize(ov_config=ov_config, calibration_dataset=calibration_dataset, save_directory=int8_ptq_model_path)\n",
+    "tokenizer.save_pretrained(int8_ptq_model_path)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Benchmark model accuracy on GLUE STSB task"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Here we estimate the accuracy impact of quantization. We evaluate both the baseline and the quantized model on STS-B, a GLUE task different from the SST-2 data used for calibration."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from transformers import Pipeline\n",
+    "import torch.nn.functional as F\n",
+    "import torch\n",
+    "\n",
+    "\n",
+    "# adapted from the model card \"sentence-transformers/all-MiniLM-L6-v2\"\n",
+    "def mean_pooling(model_output, attention_mask):\n",
+    "    \"\"\"Average the token embeddings over dimension 1, weighted by the attention mask.\n",
+    "\n",
+    "    Args:\n",
+    "        model_output: Indexable model output whose first element contains all token embeddings.\n",
+    "        attention_mask: Mask tensor used as per-token weights\n",
+    "            (expected to be 1 for tokens to keep and 0 for padding).\n",
+    "\n",
+    "    Returns:\n",
+    "        The mask-weighted sum of the embeddings divided by the per-row mask total.\n",
+    "    \"\"\"\n",
+    "    embeddings = model_output[0]\n",
+    "    # Give the mask a trailing axis and expand it to the embeddings' size.\n",
+    "    mask = attention_mask.unsqueeze(-1).expand(embeddings.size()).float()\n",
+    "    masked_sum = torch.sum(embeddings * mask, 1)\n",
+    "    # Clamp the denominator so an all-zero mask cannot cause a division by zero.\n",
+    "    mask_total = torch.clamp(mask.sum(1), min=1e-9)\n",
+    "    return masked_sum / mask_total\n",
+    "\n",
+    "\n",
+    "class SentenceEmbeddingPipeline(Pipeline):\n",
+    "    \"\"\"Pipeline that turns text into mean-pooled, L2-normalized sentence embeddings.\"\"\"\n",
+    "\n",
+    "    def _sanitize_parameters(self, **kwargs):\n",
+    "        \"\"\"Return three empty kwargs dicts (the first one is for `preprocess`).\"\"\"\n",
+    "        # we don't have any hyperparameters to sanitize\n",
+    "        return {}, {}, {}\n",
+    "\n",
+    "    def preprocess(self, inputs):\n",
+    "        \"\"\"Tokenize `inputs` with padding and truncation, returning PyTorch tensors.\"\"\"\n",
+    "        return self.tokenizer(inputs, padding=True, truncation=True, return_tensors=\"pt\")\n",
+    "\n",
+    "    def _forward(self, model_inputs):\n",
+    "        \"\"\"Run the model and pass the attention mask along for the pooling step.\"\"\"\n",
+    "        model_result = self.model(**model_inputs)\n",
+    "        return {\"outputs\": model_result, \"attention_mask\": model_inputs[\"attention_mask\"]}\n",
+    "\n",
+    "    def postprocess(self, model_outputs):\n",
+    "        \"\"\"Mean-pool the token embeddings, then L2-normalize them along dim 1.\"\"\"\n",
+    "        pooled = mean_pooling(model_outputs[\"outputs\"], model_outputs[\"attention_mask\"])\n",
+    "        return F.normalize(pooled, p=2, dim=1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Compiling the model to CPU ...\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Compiling the model to CPU ...\n"
+     ]
+    }
+   ],
+   "source": [
+    "model = OVModelForFeatureExtraction.from_pretrained(base_model_path)\n",
+    "vanilla_emb = SentenceEmbeddingPipeline(model=model, tokenizer=tokenizer)\n",
+    "\n",
+    "q_model = OVModelForFeatureExtraction.from_pretrained(int8_ptq_model_path)\n",
+    "q8_emb = SentenceEmbeddingPipeline(model=q_model, tokenizer=tokenizer)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from datasets import load_dataset\n",
+    "from evaluate import load\n",
+    "\n",
+    "eval_dataset = load_dataset(\"glue\", \"stsb\", split=\"validation\")\n",
+    "metric = load(\"glue\", \"stsb\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Parameter 'function'= of the transform datasets.arrow_dataset.Dataset._map_single couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "5cab9e8fc58245a4b395a9575017633b",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Map:   0%|          | 0/1500 [00:00\n",
+      "[ INFO ]   PERFORMANCE_HINT: LATENCY\n",
+      "[ INFO ]   EXECUTION_MODE_HINT: ExecutionMode.PERFORMANCE\n",
+      "[ INFO ]   PERFORMANCE_HINT_NUM_REQUESTS: 0\n",
+      "[ INFO ]   ENABLE_CPU_PINNING: True\n",
+      "[ INFO ]   SCHEDULING_CORE_TYPE: SchedulingCoreType.ANY_CORE\n",
+      "[ INFO ]   MODEL_DISTRIBUTION_POLICY: set()\n",
+      "[ INFO ]   ENABLE_HYPER_THREADING: False\n",
+      "[ INFO ]   EXECUTION_DEVICES: ['CPU']\n",
+      "[ INFO ]   CPU_DENORMALS_OPTIMIZATION: False\n",
+      "[ INFO ]   LOG_LEVEL: Level.NO\n",
+      "[ INFO ]   CPU_SPARSE_WEIGHTS_DECOMPRESSION_RATE: 1.0\n",
+      "[ INFO ]   DYNAMIC_QUANTIZATION_GROUP_SIZE: 32\n",
+      "[ INFO ]   KV_CACHE_PRECISION: \n",
+      "[ INFO ]   AFFINITY: Affinity.CORE\n",
+      "[Step 9/11] Creating infer requests and preparing input tensors\n",
+      "[ WARNING ] No input files were given for input 'input_ids'!. This input will be filled with random values!\n",
+      "[ WARNING ] No input files were given for input 'attention_mask'!. This input will be filled with random values!\n",
+      "[ WARNING ] No input files were given for input 'token_type_ids'!. This input will be filled with random values!\n",
+      "[ INFO ] Fill input 'input_ids' with random values \n",
+      "[ INFO ] Fill input 'attention_mask' with random values \n",
+      "[ INFO ] Fill input 'token_type_ids' with random values \n",
+      "[Step 10/11] Measuring performance (Start inference synchronously, limits: 200 iterations)\n",
+      "[ INFO ] Benchmarking in inference only mode (inputs filling are not included in measurement loop).\n",
+      "[ INFO ] First inference took 12.27 ms\n",
+      "[Step 11/11] Dumping statistics report\n",
+      "[ INFO ] Execution Devices:['CPU']\n",
+      "[ INFO ] Count:            200 iterations\n",
+      "[ INFO ] Duration:         1988.84 ms\n",
+      "[ INFO ] Latency:\n",
+      "[ INFO ]    Median:        9.74 ms\n",
+      "[ INFO ]    Average:       9.77 ms\n",
+      "[ INFO ]    Min:           9.59 ms\n",
+      "[ INFO ]    Max:           11.12 ms\n",
+      "[ INFO ] Throughput:   100.56 FPS\n"
+     ]
+    }
+   ],
+   "source": [
+    "# FP32 baseline model\n",
+    "!benchmark_app -m all-MiniLM-L6-v2/openvino_model.xml -shape \"input_ids[1,384],attention_mask[1,384],token_type_ids[1,384]\" -api sync -niter 200"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
+      "To disable this warning, you can either:\n",
+      "\t- Avoid using `tokenizers` before the fork if possible\n",
+      "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[Step 1/11] Parsing and validating input arguments\n",
+      "[ INFO ] Parsing input parameters\n",
+      "[Step 2/11] Loading OpenVINO Runtime\n",
+      "[ INFO ] OpenVINO:\n",
+      "[ INFO ] Build ................................. 2024.4.1-16618-643f23d1318-releases/2024/4\n",
+      "[ INFO ] \n",
+      "[ INFO ] Device info:\n",
+      "[ INFO ] CPU\n",
+      "[ INFO ] Build ................................. 2024.4.1-16618-643f23d1318-releases/2024/4\n",
+      "[ INFO ] \n",
+      "[ INFO ] \n",
+      "[Step 3/11] Setting device configuration\n",
+      "[ WARNING ] Performance hint was not explicitly specified in command line. Device(CPU) performance hint will be set to PerformanceMode.LATENCY.\n",
+      "[Step 4/11] Reading model files\n",
+      "[ INFO ] Loading model files\n",
+      "[ INFO ] Read model took 20.87 ms\n",
+      "[ INFO ] Original model I/O parameters:\n",
+      "[ INFO ] Model inputs:\n",
+      "[ INFO ]     input_ids (node: input_ids) : i64 / [...] / [?,?]\n",
+      "[ INFO ]     attention_mask (node: attention_mask) : i64 / [...] / [?,?]\n",
+      "[ INFO ]     token_type_ids (node: token_type_ids) : i64 / [...] / [?,?]\n",
+      "[ INFO ] Model outputs:\n",
+      "[ INFO ]     last_hidden_state (node: __module.encoder.layer.5.output.LayerNorm/aten::layer_norm/Add) : f32 / [...] / [?,?,384]\n",
+      "[Step 5/11] Resizing model to match image sizes and given batch\n",
+      "[ INFO ] Model batch size: 1\n",
+      "[ INFO ] Reshaping model: 'input_ids': [1,384], 'attention_mask': [1,384], 'token_type_ids': [1,384]\n",
+      "[ INFO ] Reshape model took 3.42 ms\n",
+      "[Step 6/11] Configuring input of the model\n",
+      "[ INFO ] Model inputs:\n",
+      "[ INFO ]     input_ids (node: input_ids) : i64 / [...] / [1,384]\n",
+      "[ INFO ]     attention_mask (node: attention_mask) : i64 / [...] / [1,384]\n",
+      "[ INFO ]     token_type_ids (node: token_type_ids) : i64 / [...] / [1,384]\n",
+      "[ INFO ] Model outputs:\n",
+      "[ INFO ]     last_hidden_state (node: __module.encoder.layer.5.output.LayerNorm/aten::layer_norm/Add) : f32 / [...] / [1,384,384]\n",
+      "[Step 7/11] Loading the model to the device\n",
+      "[ INFO ] Compile model took 323.91 ms\n",
+      "[Step 8/11] Querying optimal runtime parameters\n",
+      "[ INFO ] Model:\n",
+      "[ INFO ]   NETWORK_NAME: Model0\n",
+      "[ INFO ]   OPTIMAL_NUMBER_OF_INFER_REQUESTS: 1\n",
+      "[ INFO ]   NUM_STREAMS: 1\n",
+      "[ INFO ]   INFERENCE_NUM_THREADS: 18\n",
+      "[ INFO ]   PERF_COUNT: NO\n",
+      "[ INFO ]   INFERENCE_PRECISION_HINT: \n",
+      "[ INFO ]   PERFORMANCE_HINT: LATENCY\n",
+      "[ INFO ]   EXECUTION_MODE_HINT: ExecutionMode.PERFORMANCE\n",
+      "[ INFO ]   PERFORMANCE_HINT_NUM_REQUESTS: 0\n",
+      "[ INFO ]   ENABLE_CPU_PINNING: True\n",
+      "[ INFO ]   SCHEDULING_CORE_TYPE: SchedulingCoreType.ANY_CORE\n",
+      "[ INFO ]   MODEL_DISTRIBUTION_POLICY: set()\n",
+      "[ INFO ]   ENABLE_HYPER_THREADING: False\n",
+      "[ INFO ]   EXECUTION_DEVICES: ['CPU']\n",
+      "[ INFO ]   CPU_DENORMALS_OPTIMIZATION: False\n",
+      "[ INFO ]   LOG_LEVEL: Level.NO\n",
+      "[ INFO ]   CPU_SPARSE_WEIGHTS_DECOMPRESSION_RATE: 1.0\n",
+      "[ INFO ]   DYNAMIC_QUANTIZATION_GROUP_SIZE: 32\n",
+      "[ INFO ]   KV_CACHE_PRECISION: \n",
+      "[ INFO ]   AFFINITY: Affinity.CORE\n",
+      "[Step 9/11] Creating infer requests and preparing input tensors\n",
+      "[ WARNING ] No input files were given for input 'input_ids'!. This input will be filled with random values!\n",
+      "[ WARNING ] No input files were given for input 'attention_mask'!. This input will be filled with random values!\n",
+      "[ WARNING ] No input files were given for input 'token_type_ids'!. This input will be filled with random values!\n",
+      "[ INFO ] Fill input 'input_ids' with random values \n",
+      "[ INFO ] Fill input 'attention_mask' with random values \n",
+      "[ INFO ] Fill input 'token_type_ids' with random values \n",
+      "[Step 10/11] Measuring performance (Start inference synchronously, limits: 200 iterations)\n",
+      "[ INFO ] Benchmarking in inference only mode (inputs filling are not included in measurement loop).\n",
+      "[ INFO ] First inference took 6.72 ms\n",
+      "[Step 11/11] Dumping statistics report\n",
+      "[ INFO ] Execution Devices:['CPU']\n",
+      "[ INFO ] Count:            200 iterations\n",
+      "[ INFO ] Duration:         853.85 ms\n",
+      "[ INFO ] Latency:\n",
+      "[ INFO ]    Median:        4.13 ms\n",
+      "[ INFO ]    Average:       4.15 ms\n",
+      "[ INFO ]    Min:           4.05 ms\n",
+      "[ INFO ]    Max:           5.13 ms\n",
+      "[ INFO ] Throughput:   234.23 FPS\n"
+     ]
+    }
+   ],
+   "source": [
+    "# INT8 counterpart\n",
+    "!benchmark_app -m all-MiniLM-L6-v2_int8/openvino_model.xml -shape \"input_ids[1,384],attention_mask[1,384],token_type_ids[1,384]\" -api sync -niter 200"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "test3.11_cpu",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}