diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py
index ace5c150df..0a195773d9 100644
--- a/optimum/exporters/openvino/model_configs.py
+++ b/optimum/exporters/openvino/model_configs.py
@@ -17,7 +17,7 @@
 from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
 
 from packaging import version
-from transformers import PretrainedConfig, PreTrainedModel, TFPreTrainedModel
+from transformers import AutoConfig, PretrainedConfig, PreTrainedModel, TFPreTrainedModel
 from transformers.utils import is_tf_available
 
 from optimum.exporters.onnx.config import OnnxConfig, TextDecoderOnnxConfig, TextDecoderWithPositionIdsOnnxConfig
@@ -75,6 +75,7 @@
     JaisModelPatcher,
     LlamaModelPatcher,
     LlavaImageEmbeddingModelPatcher,
+    LlavaQwen2ImageEmbeddingsModelPatcher,
     MistralModelPatcher,
     MixtralModelPatcher,
     MPTModelPatcher,
@@ -1577,6 +1578,165 @@ def patch_model_for_export(
         return InternVLChatImageEmbeddingModelPatcher(self, model, model_kwargs)
 
 
+@register_in_tasks_manager(
+    "llava-qwen2", *["image-text-to-text", "text-generation", "text-generation-with-past"], library_name="transformers"
+)
+class LlavaQwen2OpenVINOConfig(OnnxConfig):
+    SUPPORTS_PAST = True
+    MIN_TRANSFORMERS_VERSION = version.parse("4.40.0")
+    SUPPORTED_BEHAVIORS = [model_type.value for model_type in LlavaConfigBehavior]
+    NORMALIZED_CONFIG_CLASS = NormalizedVisionConfig
+    DUMMY_INPUT_GENERATOR_CLASSES = (DummyVisionInputGenerator,)
+
+    def __init__(
+        self,
+        config: "PretrainedConfig",
+        task: str = "feature-extraction",
+        int_dtype: str = "int64",
+        float_dtype: str = "fp32",
+        behavior: LlavaConfigBehavior = LlavaConfigBehavior.VISION_EMBEDDINGS,
+        preprocessors: Optional[List[Any]] = None,
+        use_past: bool = False,
+    ):
+        self._behavior = behavior
+        self._orig_config = config
+        if self._behavior == LlavaConfigBehavior.VISION_EMBEDDINGS:
+            config = AutoConfig.from_pretrained(config.mm_vision_tower, trust_remote_code=True)
+            if hasattr(config, "vision_config"):
+                config = config.vision_config
+        super().__init__(
+            config=config,
+            task=task,
+            int_dtype=int_dtype,
+            float_dtype=float_dtype,
+            preprocessors=preprocessors,
+        )
+
+    @property
+    def inputs(self) -> Dict[str, Dict[int, str]]:
+        if not self._behavior == LlavaConfigBehavior.VISION_EMBEDDINGS:
+            return {}
+        return {"pixel_values": {0: "batch_size", 2: "height", 3: "width"}}
+
+    @property
+    def outputs(self) -> Dict[str, Dict[int, str]]:
+        if not self._behavior == LlavaConfigBehavior.VISION_EMBEDDINGS:
+            return {}
+        return {"last_hidden_state": {0: "batch_size"}}
+
+    def get_model_for_behavior(self, model, behavior: Union[str, LlavaConfigBehavior]):
+        if isinstance(behavior, str) and not isinstance(behavior, LlavaConfigBehavior):
+            behavior = LlavaConfigBehavior(behavior)
+
+        if behavior == LlavaConfigBehavior.LANGUAGE:
+            model.forward = super(type(model), model).forward
+            return model
+
+        if behavior == LlavaConfigBehavior.VISION_EMBEDDINGS:
+            return model
+
+        if behavior == LlavaConfigBehavior.TEXT_EMBEDDINGS:
+            text_embedding = model.model.embed_tokens
+            text_embedding.config = model.model.config
+            return text_embedding
+
+    def with_behavior(
+        self,
+        behavior: Union[str, LlavaConfigBehavior],
+    ):
+        """
+        Creates a config for different behaviour.
+
+        Args:
+            behavior ([`ConfigBehavior`]):
+                The behavior to use for the new instance.
+        """
+        if isinstance(behavior, str) and not isinstance(behavior, LlavaConfigBehavior):
+            behavior = LlavaConfigBehavior(behavior)
+
+        if behavior == LlavaConfigBehavior.TEXT_EMBEDDINGS:
+            model_type = self._orig_config.model_type.replace("llava-", "")
+            model_type = model_type.replace("_", "-")
+            if model_type not in TasksManager._SUPPORTED_MODEL_TYPE:
+                raise ValueError(
+                    f"Unsupported language model type provided `{model_type}`. Please define custom export config"
+                )
+
+            if "text-generation-with-past" not in TasksManager._SUPPORTED_MODEL_TYPE[model_type]["openvino"]:
+                raise ValueError(
+                    f"Export config for text generation for `{model_type}` is not available. Please define custom export config"
+                )
+            internal_export_config_class = TasksManager._SUPPORTED_MODEL_TYPE[model_type]["openvino"][
+                "text-generation-with-past"
+            ]
+            internal_export_config = internal_export_config_class(
+                self._orig_config,
+                use_past=True,
+                use_past_in_inputs=True,
+                int_dtype=self.int_dtype,
+                float_dtype=self.float_dtype,
+            )
+            InputEmbedOpenvVINOConfig.NORMALIZED_CONFIG_CLASS = internal_export_config.NORMALIZED_CONFIG_CLASS
+            export_config = InputEmbedOpenvVINOConfig(
+                self._orig_config,
+                task="feature-extraction",
+                int_dtype=self.int_dtype,
+                float_dtype=self.float_dtype,
+            )
+            return export_config
+
+        if behavior == LlavaConfigBehavior.LANGUAGE:
+            model_type = self._orig_config.model_type.replace("llava-", "")
+            model_type = model_type.replace("_", "-")
+
+            if model_type not in TasksManager._SUPPORTED_MODEL_TYPE:
+                raise ValueError(
+                    f"Unsupported language model type provided `{model_type}`. Please define custom export config"
+                )
+
+            if "text-generation-with-past" not in TasksManager._SUPPORTED_MODEL_TYPE[model_type]["openvino"]:
+                raise ValueError(
+                    f"Export config for text generation for `{model_type}` is not available. Please define custom export config"
+                )
+            internal_export_config_class = TasksManager._SUPPORTED_MODEL_TYPE[model_type]["openvino"][
+                "text-generation-with-past"
+            ]
+            internal_export_config = internal_export_config_class(
+                self._orig_config,
+                use_past=True,
+                use_past_in_inputs=True,
+                int_dtype=self.int_dtype,
+                float_dtype=self.float_dtype,
+            )
+            export_config = LMInputEmbedsConfigHelper(internal_export_config)
+            export_config._normalized_config = internal_export_config._normalized_config
+            return export_config
+
+        if behavior == LlavaConfigBehavior.VISION_EMBEDDINGS:
+            return self.__class__(
+                self._orig_config,
+                task=self.task,
+                int_dtype=self.int_dtype,
+                float_dtype=self.float_dtype,
+                behavior=behavior,
+                preprocessors=self._preprocessors,
+            )
+
+    def patch_model_for_export(
+        self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None
+    ):
+        model_kwargs = model_kwargs or {}
+        if self._behavior != LlavaConfigBehavior.VISION_EMBEDDINGS:
+            return super().patch_model_for_export(model, model_kwargs)
+        return LlavaQwen2ImageEmbeddingsModelPatcher(self, model, model_kwargs)
+
+    def rename_ambiguous_inputs(self, inputs):
+        if self._behavior == LlavaConfigBehavior.VISION_EMBEDDINGS:
+            model_inputs = {}
+            model_inputs["images"] = inputs["pixel_values"]
+            return model_inputs
+        return super().rename_ambiguous_inputs(inputs)
+
+
 class PooledProjectionsDummyInputGenerator(DummyInputGenerator):
     SUPPORTED_INPUT_NAMES = ["pooled_projections"]
 
diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py
index 3bc9452ff9..4bc5bda27b 100644
--- a/optimum/exporters/openvino/model_patcher.py
+++ b/optimum/exporters/openvino/model_patcher.py
@@ -2743,3 +2743,21 @@ def __exit__(self, exc_type, exc_value, traceback):
         super().__exit__(exc_type, exc_value, traceback)
         if hasattr(self._model.pos_embed, "_orig_forward"):
             self._model.pos_embed.forward = self._model.pos_embed._orig_forward
+
+
+class LlavaQwen2ImageEmbeddingsModelPatcher(ModelPatcher):
+    def __init__(
+        self,
+        config: "OnnxConfig",
+        model: Union["PreTrainedModel", "TFPreTrainedModel"],
+        model_kwargs: Dict[str, Any],
+    ):
+        model.__orig_forward = model.forward
+        model.forward = model.encode_images
+        super().__init__(config, model, model_kwargs)
+        if not self._model.get_vision_tower().is_loaded:
+            self._model.get_vision_tower().load_model()
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        super().__exit__(exc_type, exc_value, traceback)
+        self._model.forward = self._model.__orig_forward
diff --git a/optimum/exporters/openvino/utils.py b/optimum/exporters/openvino/utils.py
index 75106fc2b5..2440a0de92 100644
--- a/optimum/exporters/openvino/utils.py
+++ b/optimum/exporters/openvino/utils.py
@@ -208,4 +208,4 @@ def get_submodels(model):
     return custom_export, fn_get_submodels
 
 
-MULTI_MODAL_TEXT_GENERATION_MODELS = ["llava", "llava-next", "internvl-chat"]
+MULTI_MODAL_TEXT_GENERATION_MODELS = ["llava", "llava-next", "llava-qwen2", "internvl-chat"]
diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py
index 141abeb87f..21c392ab62 100644
--- a/optimum/intel/openvino/modeling_visual_language.py
+++ b/optimum/intel/openvino/modeling_visual_language.py
@@ -14,7 +14,7 @@
 from transformers.modeling_outputs import BaseModelOutputWithPooling
 
 from ...exporters.openvino import main_export
-from ...exporters.openvino.stateful import ensure_stateful_is_available
+from ...exporters.openvino.stateful import ensure_stateful_is_available, model_has_input_output_name
 from .configuration import OVConfig, OVWeightQuantizationConfig
 from .modeling_base import OVBaseModel, OVModelPart
 from .modeling_decoder import CausalLMOutputWithPast, OVModelForCausalLM
@@ -122,8 +122,8 @@ def prepare_inputs(
             else:
                 position_ids = np.cumsum(attention_mask, axis=1) - 1
                 position_ids[attention_mask == 0] = 1
-                if past_key_values:
-                    position_ids = position_ids[:, -input_ids.shape[1] :]
+                if past_len:
+                    position_ids = position_ids[:, -inputs_embeds.shape[1] :]
 
             inputs["position_ids"] = position_ids
 
@@ -176,9 +176,16 @@ def __init__(self, model: ov.Model, parent_model: OVBaseModel) -> None:
         self.hidden_states_output_names = [
             key.get_any_name() for key in self.model.outputs[2:] if "hidden_states" in key.get_any_name()
         ]
+        self.input_names = {key.get_any_name(): idx for idx, key in enumerate(self.model.inputs)}
+        self._main_input = "images" if model_has_input_output_name(self.model, "images") else "pixel_values"
 
     def forward(self, pixel_values, **kwargs):
-        result = self.request({"pixel_values": pixel_values})
+        inputs = {self._main_input: pixel_values}
+        if len(self.input_names) > 1:
+            for name in self.input_names:
+                if name in kwargs:
+                    inputs[name] = kwargs[name]
+        result = self.request(inputs)
         last_hidden_state = result[0]
         hidden_states = None
         pooler_out = None
@@ -547,14 +554,17 @@ def half(self):
     def forward(
         self,
         input_ids,
-        pixel_values,
+        pixel_values=None,
         past_key_values=None,
         inputs_embeds=None,
         image_sizes=None,
         attention_mask=None,
         position_ids=None,
+        images=None,
         **kwargs,
     ):
+        if pixel_values is None and images is not None:
+            pixel_values = images
         inputs_embeds, attention_mask, position_ids = self.get_multimodal_embeddings(
             input_ids,
             pixel_values,
@@ -604,6 +614,7 @@ def get_multimodal_embeddings(
         )
         return inputs_embeds, attention_mask, position_ids
 
+    # Adopted from https://github.com/huggingface/transformers/blob/v4.44.2/src/transformers/models/llava/modeling_llava.py#L521
     def prepare_inputs_for_generation(
         self,
         input_ids,
@@ -621,21 +632,22 @@ def prepare_inputs_for_generation(
             # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
             # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as
             # input)
-            if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]:
-                input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :]
+            if attention_mask is not None and past_length + 1 > input_ids.shape[1]:
+                input_discount = max(attention_mask.shape[1] - past_length, 1)
+                input_ids = input_ids[:, -input_discount:]
             # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard
             # input_ids based on the past_length.llava
             elif past_length < input_ids.shape[1]:
                 input_ids = input_ids[:, past_length:]
             # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens.
-            elif self.config.image_token_index in input_ids:
+            elif getattr(self.config, "image_token_index", -1) in input_ids:
                 input_ids = input_ids[:, input_ids.shape[1] - 1 :]
 
         position_ids = kwargs.get("position_ids", None)
         if attention_mask is not None and position_ids is None:
             position_ids = attention_mask.long().cumsum(-1) - 1
             position_ids.masked_fill_(attention_mask == 0, 1)
-            if past_key_values:
+            if past_key_values is not None:
                 position_ids = position_ids[:, -input_ids.shape[1] :]
 
         # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
@@ -652,6 +664,7 @@ def prepare_inputs_for_generation(
                 "attention_mask": attention_mask,
                 "pixel_values": pixel_values,
                 "image_sizes": image_sizes,
+                "images": kwargs.get("images"),
             }
         )
         return model_inputs
@@ -1123,8 +1136,181 @@ def merge_vision_text_embeddings(
         return input_embeds, attention_mask, position_ids
 
 
+class _OVNanoLlavaForCausalLM(OVModelForVisualCausalLM):
+    def get_vision_embeddings(self, pixel_values, input_ids=None, **kwargs):
+        if input_ids is not None and input_ids.shape[1] == 1:
+            return None
+        if isinstance(pixel_values, list) or pixel_values.ndim == 5:
+            concat_images = torch.cat(pixel_values, dim=0) if isinstance(pixel_values, list) else pixel_values
+            image_features = torch.from_numpy(self.vision_embeddings(concat_images).last_hidden_state)
+            split_sizes = [image.shape[0] for image in pixel_values]
+            image_features = torch.split(image_features, split_sizes, dim=0)
+            image_features = [x.flatten(0, 1).to(self.device) for x in image_features]
+        else:
+            image_features = self.vision_embeddings(pixel_values).last_hidden_state
+
+        return image_features
+
+    def get_multimodal_embeddings(
+        self, input_ids, pixel_values=None, attention_mask=None, position_ids=None, **kwargs
+    ):
+        vision_embeds = None
+        IGNORE_INDEX = -100
+        IMAGE_TOKEN_INDEX = -200
+        if pixel_values is not None:
+            vision_embeds = self.get_vision_embeddings(pixel_values, input_ids=input_ids, **kwargs)
+        if vision_embeds is None:
+            inputs_embeds = torch.from_numpy(self.get_text_embeddings(input_ids))
+            past_len = self.language_model._get_past_length(kwargs.get("past_key_values"))
+            if attention_mask is not None and attention_mask.shape[1] < past_len + input_ids.shape[1]:
+                attention_mask = torch.cat(
+                    [
+                        attention_mask,
+                        torch.ones(attention_mask.shape[0], past_len + input_ids.shape[1] - attention_mask.shape[1]),
+                    ],
+                    dim=1,
+                )
+                position_ids = None
+            return inputs_embeds, attention_mask, position_ids
+
+        vision_embeds = torch.from_numpy(vision_embeds) if isinstance(vision_embeds, np.ndarray) else vision_embeds
+
+        if attention_mask is None:
+            attention_mask = torch.ones_like(input_ids, dtype=torch.long)
+        if position_ids is None:
+            position_ids = torch.arange(0, input_ids.shape[1], dtype=torch.long, device=input_ids.device)
+        labels = torch.full_like(input_ids, IGNORE_INDEX)
+
+        # remove the padding using attention_mask -- TODO: double check
+        input_ids = [
+            cur_input_ids[cur_attention_mask]
+            for cur_input_ids, cur_attention_mask in zip(input_ids, attention_mask.bool())
+        ]
+        labels = [
+            cur_labels[cur_attention_mask] for cur_labels, cur_attention_mask in zip(labels, attention_mask.bool())
+        ]
+
+        new_input_embeds = []
+        new_labels = []
+        cur_image_idx = 0
+        for batch_idx, cur_input_ids in enumerate(input_ids):
+            num_images = (cur_input_ids == IMAGE_TOKEN_INDEX).sum()
+            if num_images == 0:
+                cur_image_features = vision_embeds[cur_image_idx]
+                cur_input_embeds_1 = torch.from_numpy(self.get_text_embeddings(cur_input_ids.unsqueeze(0))[0])
+                cur_input_embeds = torch.cat([cur_input_embeds_1, cur_image_features[0:0]], dim=0)
+                new_input_embeds.append(cur_input_embeds)
+                new_labels.append(labels[batch_idx])
+                cur_image_idx += 1
+                continue
+
+            image_token_indices = (
+                [-1] + torch.where(cur_input_ids == IMAGE_TOKEN_INDEX)[0].tolist() + [cur_input_ids.shape[0]]
+            )
+            cur_input_ids_noim = []
+            cur_labels = labels[batch_idx]
+            cur_labels_noim = []
+            for i in range(len(image_token_indices) - 1):
+                cur_input_ids_noim.append(cur_input_ids[image_token_indices[i] + 1 : image_token_indices[i + 1]])
+                cur_labels_noim.append(cur_labels[image_token_indices[i] + 1 : image_token_indices[i + 1]])
+            split_sizes = [x.shape[0] for x in cur_labels_noim]
+            cur_input_embeds = torch.from_numpy(
+                self.get_text_embeddings(torch.cat(cur_input_ids_noim).unsqueeze(0))[0]
+            )
+            cur_input_embeds_no_im = torch.split(cur_input_embeds, split_sizes, dim=0)
+            cur_new_input_embeds = []
+            cur_new_labels = []
+
+            for i in range(num_images + 1):
+                cur_new_input_embeds.append(cur_input_embeds_no_im[i])
+                cur_new_labels.append(cur_labels_noim[i])
+                if i < num_images:
+                    cur_image_features = vision_embeds[cur_image_idx]
+                    cur_image_idx += 1
+                    cur_new_input_embeds.append(cur_image_features)
+                    cur_new_labels.append(
+                        torch.full(
+                            (cur_image_features.shape[0],),
+                            IGNORE_INDEX,
+                            device=cur_labels.device,
+                            dtype=cur_labels.dtype,
+                        )
+                    )
+
+            cur_new_input_embeds = torch.cat(cur_new_input_embeds)
+            cur_new_labels = torch.cat(cur_new_labels)
+
+            new_input_embeds.append(cur_new_input_embeds)
+            new_labels.append(cur_new_labels)
+
+        # Truncate sequences to max length as image embeddings can make the sequence longer
+        tokenizer_model_max_length = getattr(self.config, "tokenizer_model_max_length", None)
+        if tokenizer_model_max_length is not None:
+            new_input_embeds = [x[:tokenizer_model_max_length] for x in new_input_embeds]
+            new_labels = [x[:tokenizer_model_max_length] for x in new_labels]
+
+        # Combine them
+        max_len = max(x.shape[0] for x in new_input_embeds)
+        batch_size = len(new_input_embeds)
+
+        new_input_embeds_padded = []
+        new_labels_padded = torch.full(
+            (batch_size, max_len), IGNORE_INDEX, dtype=new_labels[0].dtype, device=new_labels[0].device
+        )
+        attention_mask = torch.zeros((batch_size, max_len), dtype=attention_mask.dtype, device=attention_mask.device)
+        position_ids = torch.zeros((batch_size, max_len), dtype=position_ids.dtype, device=position_ids.device)
+
+        for i, (cur_new_embed, cur_new_labels) in enumerate(zip(new_input_embeds, new_labels)):
+            cur_len = cur_new_embed.shape[0]
+            if getattr(self.config, "tokenizer_padding_side", "right") == "left":
+                new_input_embeds_padded.append(
+                    torch.cat(
+                        (
+                            torch.zeros(
+                                (max_len - cur_len, cur_new_embed.shape[1]),
+                                dtype=cur_new_embed.dtype,
+                                device=cur_new_embed.device,
+                            ),
+                            cur_new_embed,
+                        ),
+                        dim=0,
+                    )
+                )
+                if cur_len > 0:
+                    new_labels_padded[i, -cur_len:] = cur_new_labels
+                    attention_mask[i, -cur_len:] = True
+                    position_ids[i, -cur_len:] = torch.arange(
+                        0, cur_len, dtype=position_ids.dtype, device=position_ids.device
+                    )
+            else:
+                new_input_embeds_padded.append(
+                    torch.cat(
+                        (
+                            cur_new_embed,
+                            torch.zeros(
+                                (max_len - cur_len, cur_new_embed.shape[1]),
+                                dtype=cur_new_embed.dtype,
+                                device=cur_new_embed.device,
+                            ),
+                        ),
+                        dim=0,
+                    )
+                )
+                if cur_len > 0:
+                    new_labels_padded[i, :cur_len] = cur_new_labels
+                    attention_mask[i, :cur_len] = True
+                    position_ids[i, :cur_len] = torch.arange(
+                        0, cur_len, dtype=position_ids.dtype, device=position_ids.device
+                    )
+
+        new_input_embeds = torch.stack(new_input_embeds_padded, dim=0)
+
+        return new_input_embeds, attention_mask, position_ids
+
+
 MODEL_TYPE_TO_CLS_MAPPING = {
     "llava": _OVLlavaForCausalLM,
     "llava_next": _OVLlavaNextForCausalLM,
     "internvl_chat": _OvInternVLForCausalLM,
+    "llava-qwen2": _OVNanoLlavaForCausalLM,
 }
diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py
index 119e004035..8f23705301 100644
--- a/tests/openvino/test_modeling.py
+++ b/tests/openvino/test_modeling.py
@@ -50,6 +50,7 @@
     AutoModelForSpeechSeq2Seq,
     AutoModelForTokenClassification,
     AutoModelForVision2Seq,
+    AutoProcessor,
     AutoTokenizer,
     GenerationConfig,
     Pix2StructForConditionalGeneration,
@@ -1867,12 +1868,12 @@ def test_compare_with_and_without_past_key_values(self):
 
 
 class OVModelForVisualCausalLMIntegrationTest(unittest.TestCase):
-    SUPPORTED_ARCHITECTURES = [
-        "llava",
-    ]
+    SUPPORTED_ARCHITECTURES = ["llava"]
+
+    REMOTE_CODE_MODELS = ["minicpmv", "nanollava"]
 
     if is_transformers_version(">=", "4.40.0"):
-        SUPPORTED_ARCHITECTURES += ["llava_next"]
+        SUPPORTED_ARCHITECTURES += ["llava_next", "nanollava"]
     TASK = "image-text-to-text"
 
     IMAGE = Image.open(
@@ -1891,19 +1892,47 @@ def get_transformer_model_class(self, model_arch):
             from transformers import LlavaNextForConditionalGeneration
 
             return LlavaNextForConditionalGeneration
-        return None
+        return AutoModelForCausalLM
+
+    def gen_inputs(self, model_arch, base_text_prompt, image=None):
+        model_id = MODEL_NAMES[model_arch]
+        prompt = f"<image>\n {base_text_prompt}"
+        if model_arch != "nanollava":
+            processor = AutoProcessor.from_pretrained(
+                model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS
+            )
+            inputs = processor(images=[self.IMAGE.resize((600, 600))], text=[prompt], return_tensors="pt")
+        else:
+            config = AutoConfig.from_pretrained(model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS)
+            processor = AutoProcessor.from_pretrained(
+                config.mm_vision_tower, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS
+            )
+            tokenizer = AutoTokenizer.from_pretrained(
+                model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS
+            )
+            image_input = None
+            if image is not None:
+                image_input = processor(images=image, return_tensors="pt")["pixel_values"]
+            text_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split("<image>")]
+
+            input_ids = torch.tensor(text_chunks[0] + [-200] + text_chunks[1], dtype=torch.long).unsqueeze(0)
+            attention_mask = torch.ones_like(input_ids, dtype=torch.int64)
+            inputs = {"input_ids": input_ids, "attention_mask": attention_mask, "images": image_input}
+        return inputs
 
     @parameterized.expand(SUPPORTED_ARCHITECTURES)
     def test_compare_to_transformers(self, model_arch):
-        prompt = "<image>\n What is shown in this image?"
         model_id = MODEL_NAMES[model_arch]
-        processor = get_preprocessor(model_id)
-        transformers_model = self.get_transformer_model_class(model_arch).from_pretrained(model_id)
-        inputs = processor(images=self.IMAGE, text=prompt, return_tensors="pt")
-        set_seed(SEED)
-        with torch.no_grad():
-            transformers_outputs = transformers_model(**inputs)
-        ov_model = OVModelForVisualCausalLM.from_pretrained(model_id, export=True)
+        transformers_model = self.get_transformer_model_class(model_arch).from_pretrained(
+            model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS
+        )
+        if "nanollava" in model_arch:
+            transformers_model.get_vision_tower().load_model()
+        inputs = self.gen_inputs(model_arch, "What is shown on this image?", self.IMAGE)
+
+        ov_model = OVModelForVisualCausalLM.from_pretrained(
+            model_id, export=True, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS
+        )
         self.assertIsInstance(ov_model, MODEL_TYPE_TO_CLS_MAPPING[ov_model.config.model_type])
         self.assertIsInstance(ov_model.vision_embeddings, OVVisionEmbedding)
         self.assertIsInstance(ov_model.language_model, OVModelWithEmbedForCausalLM)
@@ -1911,6 +1940,9 @@ def test_compare_to_transformers(self, model_arch):
             self.assertTrue(hasattr(ov_model, additional_part))
             self.assertIsInstance(getattr(ov_model, additional_part), MODEL_PARTS_CLS_MAPPING[additional_part])
         self.assertIsInstance(ov_model.config, PretrainedConfig)
+        set_seed(SEED)
+        with torch.no_grad():
+            transformers_outputs = transformers_model(**inputs)
         ov_outputs = ov_model(**inputs)
         self.assertTrue(torch.allclose(ov_outputs.logits, transformers_outputs.logits, atol=1e-4))
 
@@ -1921,7 +1953,6 @@ def test_compare_to_transformers(self, model_arch):
         gen_config = GenerationConfig(
             max_new_tokens=30,
             min_new_tokens=30,
-            num_beams=3,
             do_sample=False,
             eos_token_id=None,
         )
@@ -1942,20 +1973,25 @@ def test_compare_to_transformers(self, model_arch):
     @parameterized.expand(SUPPORTED_ARCHITECTURES)
     def test_generate_utils(self, model_arch):
         model_id = MODEL_NAMES[model_arch]
-        model = OVModelForVisualCausalLM.from_pretrained(model_id, export=True)
-        preprocessor = get_preprocessor(model_id)
-        question = "<image>\nDescribe image"
-        inputs = preprocessor(images=self.IMAGE, text=question, return_tensors="pt")
-
+        model = OVModelForVisualCausalLM.from_pretrained(
+            model_id, export=True, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS
+        )
+        tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS)
+        inputs = self.gen_inputs(model_arch, "What is shown on this image?", self.IMAGE)
         # General case
         outputs = model.generate(**inputs, max_new_tokens=10)
-        outputs = preprocessor.batch_decode(outputs, skip_special_tokens=True)
+        # filter out the original prompt because it may contain out-of-tokenizer tokens, e.g. in nanollava the text separator is -200
+        outputs = outputs[:, inputs["input_ids"].shape[1] :]
+        outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)
         self.assertIsInstance(outputs[0], str)
+
         # No input image case
         question = "Hi, how are you?"
-        inputs = preprocessor(images=None, text=question, return_tensors="pt")
+        inputs = self.gen_inputs(model_arch, question, None)
         outputs = model.generate(**inputs, max_new_tokens=10)
-        outputs = preprocessor.batch_decode(outputs, skip_special_tokens=True)
+        # filter out the original prompt because it may contain out-of-tokenizer tokens, e.g. in nanollava the text separator is -200
+        outputs = outputs[:, inputs["input_ids"].shape[1] :]
+        outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)
         self.assertIsInstance(outputs[0], str)
 
         del model
diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py
index e5a9f73a64..8d51620068 100644
--- a/tests/openvino/utils_tests.py
+++ b/tests/openvino/utils_tests.py
@@ -94,6 +94,7 @@
     "mpt": "hf-internal-testing/tiny-random-MptForCausalLM",
     "mpnet": "hf-internal-testing/tiny-random-MPNetModel",
     "mt5": "stas/mt5-tiny-random",
+    "nanollava": "katuni4ka/tiny-random-nanollava",
     "nystromformer": "hf-internal-testing/tiny-random-NystromformerModel",
     "olmo": "katuni4ka/tiny-random-olmo-hf",
     "orion": "katuni4ka/tiny-random-orion",