diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py
index 20e2d7ca3..695c22e98 100644
--- a/optimum/commands/export/openvino.py
+++ b/optimum/commands/export/openvino.py
@@ -105,6 +105,12 @@ def parse_args_openvino(parser: "ArgumentParser"):
             "This is needed by some models, for some tasks. If not provided, will attempt to use the tokenizer to guess it."
         ),
     )
+    optional_group.add_argument(
+        "--variant",
+        type=str,
+        default=None,
+        help="Select a variant of the model weights to export, e.g. 'fp16' to load *.fp16.safetensors checkpoint files.",
+    )
     optional_group.add_argument(
         "--ratio",
         type=float,
@@ -410,6 +416,10 @@ def run(self):
                 from optimum.intel import OVFluxPipeline

                 model_cls = OVFluxPipeline
+            elif class_name == "SanaPipeline":
+                from optimum.intel import OVSanaPipeline
+
+                model_cls = OVSanaPipeline
             else:
                 raise NotImplementedError(f"Quantization in hybrid mode isn't supported for class {class_name}.")

@@ -442,6 +452,8 @@ def run(self):
                 quantization_config=quantization_config,
                 stateful=not self.args.disable_stateful,
                 trust_remote_code=self.args.trust_remote_code,
+                variant=self.args.variant,
+                cache_dir=self.args.cache_dir,
             )
             model.save_pretrained(self.args.output)
@@ -463,5 +475,6 @@ def run(self):
                 stateful=not self.args.disable_stateful,
                 convert_tokenizer=not self.args.disable_convert_tokenizer,
                 library_name=library_name,
+                model_variant=self.args.variant,
                 # **input_shapes,
             )
diff --git a/optimum/exporters/openvino/__main__.py b/optimum/exporters/openvino/__main__.py
index 4047ab64a..520a28559 100644
--- a/optimum/exporters/openvino/__main__.py
+++ b/optimum/exporters/openvino/__main__.py
@@ -122,6 +122,7 @@ def main_export(
     convert_tokenizer: bool = False,
     library_name: Optional[str] = None,
     model_loading_kwargs: Optional[Dict[str, Any]] = None,
+    model_variant: Optional[str] = None,
     **kwargs_shapes,
 ):
     """
@@ -237,6 +238,8 @@ def main_export(
     custom_architecture = False
     patch_16bit = False
     loading_kwargs = model_loading_kwargs or {}
+    if model_variant is not None:
+        loading_kwargs["variant"] = model_variant
     if library_name == "transformers":
         config = AutoConfig.from_pretrained(
             model_name_or_path,
@@ -347,6 +350,7 @@ class StoreAttr(object):
             GPTQQuantizer.post_init_model = post_init_model
     elif library_name == "diffusers" and is_openvino_version(">=", "2024.6"):
+        _loading_kwargs = {} if model_variant is None else {"variant": model_variant}
         dtype = deduce_diffusers_dtype(
             model_name_or_path,
             revision=revision,
@@ -355,6 +359,7 @@ class StoreAttr(object):
             local_files_only=local_files_only,
             force_download=force_download,
             trust_remote_code=trust_remote_code,
+            **_loading_kwargs,
         )
         if dtype in [torch.float16, torch.bfloat16]:
             loading_kwargs["torch_dtype"] = dtype
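For context, a sketch of how the new option is intended to be used end to end. The checkpoint id and variant name below are illustrative assumptions, not taken from this patch:

```python
# Shell equivalent (hypothetical checkpoint and variant):
#   optimum-cli export openvino -m Efficient-Large-Model/Sana_600M_512px_diffusers \
#       --variant fp16 --task text-to-image sana_ov

from optimum.intel import OVSanaPipeline

# `variant` is popped from kwargs in `_from_transformers` and forwarded to
# `main_export(model_variant=...)`, which maps it onto the diffusers `variant`
# loading kwarg when exporting on the fly.
pipe = OVSanaPipeline.from_pretrained(
    "Efficient-Large-Model/Sana_600M_512px_diffusers", export=True, variant="fp16"
)
```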
diff --git a/optimum/exporters/openvino/convert.py b/optimum/exporters/openvino/convert.py
index 22a3ca884..0d6c2b436 100644
--- a/optimum/exporters/openvino/convert.py
+++ b/optimum/exporters/openvino/convert.py
@@ -1016,6 +1016,7 @@ def get_diffusion_models_for_export_ext(
     is_sdxl = pipeline.__class__.__name__.startswith("StableDiffusionXL")
     is_sd3 = pipeline.__class__.__name__.startswith("StableDiffusion3")
     is_flux = pipeline.__class__.__name__.startswith("Flux")
+    is_sana = pipeline.__class__.__name__.startswith("Sana")
     is_sd = pipeline.__class__.__name__.startswith("StableDiffusion") and not is_sd3
     is_lcm = pipeline.__class__.__name__.startswith("LatentConsistencyModel")

@@ -1034,11 +1035,78 @@ def get_diffusion_models_for_export_ext(
         models_for_export = get_sd3_models_for_export(pipeline, exporter, int_dtype, float_dtype)
     elif is_flux:
         models_for_export = get_flux_models_for_export(pipeline, exporter, int_dtype, float_dtype)
+    elif is_sana:
+        models_for_export = get_sana_models_for_export(pipeline, exporter, int_dtype, float_dtype)
     else:
         raise ValueError(f"Unsupported pipeline type `{pipeline.__class__.__name__}` provided")

     return None, models_for_export


+def get_sana_models_for_export(pipeline, exporter, int_dtype, float_dtype):
+    models_for_export = {}
+    text_encoder = pipeline.text_encoder
+    text_encoder_config_constructor = TasksManager.get_exporter_config_constructor(
+        model=text_encoder,
+        exporter=exporter,
+        library_name="diffusers",
+        task="feature-extraction",
+        model_type="gemma2-text-encoder",
+    )
+    text_encoder_export_config = text_encoder_config_constructor(
+        pipeline.text_encoder.config, int_dtype=int_dtype, float_dtype=float_dtype
+    )
+    text_encoder_export_config.runtime_options = {"ACTIVATIONS_SCALE_FACTOR": "8.0"}
+    models_for_export["text_encoder"] = (text_encoder, text_encoder_export_config)
+    transformer = pipeline.transformer
+    transformer.config.text_encoder_projection_dim = transformer.config.caption_channels
+    transformer.config.requires_aesthetics_score = False
+    transformer.config.time_cond_proj_dim = None
+    export_config_constructor = TasksManager.get_exporter_config_constructor(
+        model=transformer,
+        exporter=exporter,
+        library_name="diffusers",
+        task="semantic-segmentation",
+        model_type="sana-transformer",
+    )
+    transformer_export_config = export_config_constructor(
+        pipeline.transformer.config, int_dtype=int_dtype, float_dtype=float_dtype
+    )
+    models_for_export["transformer"] = (transformer, transformer_export_config)
+    # VAE Encoder https://github.com/huggingface/diffusers/blob/v0.11.1/src/diffusers/models/vae.py#L565
+    vae_encoder = copy.deepcopy(pipeline.vae)
+    vae_encoder.forward = lambda sample: {"latent": vae_encoder.encode(x=sample)["latent"]}
+    vae_config_constructor = TasksManager.get_exporter_config_constructor(
+        model=vae_encoder,
+        exporter=exporter,
+        library_name="diffusers",
+        task="semantic-segmentation",
+        model_type="dcae-encoder",
+    )
+    vae_encoder_export_config = vae_config_constructor(
+        vae_encoder.config, int_dtype=int_dtype, float_dtype=float_dtype
+    )
+    vae_encoder_export_config.runtime_options = {"ACTIVATIONS_SCALE_FACTOR": "8.0"}
+    models_for_export["vae_encoder"] = (vae_encoder, vae_encoder_export_config)
+
+    # VAE Decoder https://github.com/huggingface/diffusers/blob/v0.11.1/src/diffusers/models/vae.py#L600
+    vae_decoder = copy.deepcopy(pipeline.vae)
+    vae_decoder.forward = lambda latent_sample: vae_decoder.decode(z=latent_sample)
+    vae_config_constructor = TasksManager.get_exporter_config_constructor(
+        model=vae_decoder,
+        exporter=exporter,
+        library_name="diffusers",
+        task="semantic-segmentation",
+        model_type="vae-decoder",
+    )
+    vae_decoder_export_config = vae_config_constructor(
+        vae_decoder.config, int_dtype=int_dtype, float_dtype=float_dtype
+    )
+    vae_decoder_export_config.runtime_options = {"ACTIVATIONS_SCALE_FACTOR": "8.0"}
+    models_for_export["vae_decoder"] = (vae_decoder, vae_decoder_export_config)
+
+    return models_for_export
+
+
 def get_sd3_models_for_export(pipeline, exporter, int_dtype, float_dtype):
     models_for_export = {}
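A quick illustration (not part of the patch) of what the new helper hands back to the generic export machinery; the keys double as the output subfolder names of the converted pipeline:

```python
# Hypothetical check, assuming `pipeline` is a loaded diffusers SanaPipeline.
models_for_export = get_sana_models_for_export(
    pipeline, exporter="openvino", int_dtype="int64", float_dtype="fp32"
)

# One (sub-model, export config) pair per component of the OpenVINO pipeline.
assert set(models_for_export) == {"text_encoder", "transformer", "vae_encoder", "vae_decoder"}
```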
diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py
index 4b1dbb50b..e73039159 100644
--- a/optimum/exporters/openvino/model_configs.py
+++ b/optimum/exporters/openvino/model_configs.py
@@ -41,6 +41,7 @@
     PhiOnnxConfig,
     T5OnnxConfig,
     UNetOnnxConfig,
+    VaeEncoderOnnxConfig,
     VisionOnnxConfig,
     WhisperOnnxConfig,
 )
@@ -105,6 +106,7 @@
     Qwen2VLVisionEmbMergerPatcher,
     QwenModelPatcher,
     RotaryEmbPatcher,
+    SanaTextEncoderModelPatcher,
     StatefulSeq2SeqDecoderPatcher,
     UpdateCausalMaskModelPatcher,
     XverseModelPatcher,
@@ -133,6 +135,8 @@ def init_model_configs():
     if is_diffusers_available() and "fill" not in TasksManager._DIFFUSERS_TASKS_TO_MODEL_LOADERS:
         TasksManager._DIFFUSERS_TASKS_TO_MODEL_LOADERS["fill"] = "FluxFillPipeline"
         TasksManager._DIFFUSERS_TASKS_TO_MODEL_MAPPINGS["fill"] = {"flux": "FluxFillPipeline"}
+        TasksManager._DIFFUSERS_TASKS_TO_MODEL_LOADERS["text-to-image"] = ("AutoPipelineForText2Image", "SanaPipeline")
+        TasksManager._DIFFUSERS_TASKS_TO_MODEL_MAPPINGS["text-to-image"]["sana"] = "SanaPipeline"

     supported_model_types = [
         "_SUPPORTED_MODEL_TYPE",
@@ -1891,6 +1895,83 @@ class T5EncoderOpenVINOConfig(CLIPTextOpenVINOConfig):
     pass


+@register_in_tasks_manager("gemma2-text-encoder", *["feature-extraction"], library_name="diffusers")
+class Gemma2TextEncoderOpenVINOConfig(CLIPTextOpenVINOConfig):
+    @property
+    def inputs(self) -> Dict[str, Dict[int, str]]:
+        return {
+            "input_ids": {0: "batch_size", 1: "sequence_length"},
+            "attention_mask": {0: "batch_size", 1: "sequence_length"},
+        }
+
+    def patch_model_for_export(
+        self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None
+    ) -> ModelPatcher:
+        return SanaTextEncoderModelPatcher(self, model, model_kwargs)
+
+
+class DummySanaSeq2SeqDecoderTextWithEncMaskInputGenerator(DummySeq2SeqDecoderTextInputGenerator):
+    SUPPORTED_INPUT_NAMES = (
+        "decoder_input_ids",
+        "decoder_attention_mask",
+        "encoder_outputs",
+        "encoder_hidden_states",
+        "encoder_attention_mask",
+    )
+
+
+class DummySanaTransformerVisionInputGenerator(DummyUnetVisionInputGenerator):
+    def __init__(
+        self,
+        task: str,
+        normalized_config: NormalizedVisionConfig,
+        batch_size: int = DEFAULT_DUMMY_SHAPES["batch_size"],
+        num_channels: int = DEFAULT_DUMMY_SHAPES["num_channels"],
+        # Reduce the image shape by 8 for Sana to reduce memory usage during conversion
+        width: int = DEFAULT_DUMMY_SHAPES["width"] // 8,
+        height: int = DEFAULT_DUMMY_SHAPES["height"] // 8,
+        **kwargs,
+    ):
+        super().__init__(task, normalized_config, batch_size, num_channels, width=width, height=height, **kwargs)
+
+
+@register_in_tasks_manager("sana-transformer", *["semantic-segmentation"], library_name="diffusers")
+class SanaTransformerOpenVINOConfig(UNetOpenVINOConfig):
+    NORMALIZED_CONFIG_CLASS = NormalizedConfig.with_args(
+        image_size="sample_size",
+        num_channels="in_channels",
+        hidden_size="caption_channels",
+        vocab_size="attention_head_dim",
+        allow_new=True,
+    )
+    DUMMY_INPUT_GENERATOR_CLASSES = (
+        DummySanaTransformerVisionInputGenerator,
+        DummySanaSeq2SeqDecoderTextWithEncMaskInputGenerator,
+    ) + UNetOpenVINOConfig.DUMMY_INPUT_GENERATOR_CLASSES[1:-1]
+
+    @property
+    def inputs(self):
+        common_inputs = super().inputs
+        common_inputs["encoder_attention_mask"] = {0: "batch_size", 1: "sequence_length"}
+        return common_inputs
+
+    def rename_ambiguous_inputs(self, inputs):
+        # The input is named `hidden_states` in the model signature, hence the default `sample` export input is renamed.
+        hidden_states = inputs.pop("sample", None)
+        if hidden_states is not None:
+            inputs["hidden_states"] = hidden_states
+        return inputs
+
+
+@register_in_tasks_manager("dcae-encoder", *["semantic-segmentation"], library_name="diffusers")
+class DcaeEncoderOpenVINOConfig(VaeEncoderOnnxConfig):
+    @property
+    def outputs(self) -> Dict[str, Dict[int, str]]:
+        return {
+            "latent": {0: "batch_size", 2: "height_latent", 3: "width_latent"},
+        }
+
+
 class DummyFluxTransformerInputGenerator(DummyVisionInputGenerator):
     SUPPORTED_INPUT_NAMES = (
         "pixel_values",
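For clarity, an illustrative view (not from the patch) of what `rename_ambiguous_inputs` does before the Sana transformer is traced; `export_config` is assumed to be a `SanaTransformerOpenVINOConfig` instance, and the tensor shapes are placeholder assumptions:

```python
import torch

latents = torch.randn(1, 32, 8, 8)    # placeholder DC-AE latent
t = torch.tensor([0.5])               # placeholder timestep
text_emb = torch.randn(1, 300, 2304)  # placeholder Gemma hidden states

# The generic UNet-style export path names the image-latent input `sample`,
# but SanaTransformer2DModel.forward expects `hidden_states`.
inputs = {"sample": latents, "timestep": t, "encoder_hidden_states": text_emb}
inputs = export_config.rename_ambiguous_inputs(inputs)
assert "hidden_states" in inputs and "sample" not in inputs
```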
diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py
index e7a777938..08bc14988 100644
--- a/optimum/exporters/openvino/model_patcher.py
+++ b/optimum/exporters/openvino/model_patcher.py
@@ -21,9 +21,11 @@
 import torch
 import torch.nn.functional as F
+from transformers import PreTrainedModel, TFPreTrainedModel
 from transformers.modeling_outputs import BaseModelOutputWithPast, BaseModelOutputWithPooling
 from transformers.utils import is_tf_available

+from optimum.exporters.onnx.base import OnnxConfig
 from optimum.exporters.onnx.model_patcher import (
     DecoderModelPatcher,
     ModelPatcher,
@@ -114,9 +116,11 @@ def patch_model_with_bettertransformer(model):
     return model


-def patch_update_causal_mask(model, transformers_version, inner_model_name="model", patch_fn=None):
+def patch_update_causal_mask(
+    model, transformers_version, inner_model_name="model", patch_fn=None, patch_external_model=False
+):
     if is_transformers_version(">=", transformers_version):
-        inner_model = getattr(model, inner_model_name, None)
+        inner_model = getattr(model, inner_model_name, None) if not patch_external_model else model
         if inner_model is not None:
             if hasattr(inner_model, "_update_causal_mask"):
                 inner_model._orig_update_causal_mask = inner_model._update_causal_mask
@@ -124,8 +128,8 @@ def patch_update_causal_mask(model, transformers_version, inner_model_name="mode
             inner_model._update_causal_mask = types.MethodType(patch_fn, inner_model)


-def unpatch_update_causal_mask(model, inner_model_name="model"):
-    inner_model = getattr(model, inner_model_name, None)
+def unpatch_update_causal_mask(model, inner_model_name="model", patch_external_model=False):
+    inner_model = getattr(model, inner_model_name, None) if not patch_external_model else model
     if inner_model is not None and hasattr(inner_model, "_orig_update_causal_mask"):
         inner_model._update_causal_mask = inner_model._orig_update_causal_mask
@@ -3791,3 +3795,29 @@ def patched_forward(*args, **kwargs):
         model.forward = patched_forward

         super().__init__(config, model, model_kwargs)
+
+
+class SanaTextEncoderModelPatcher(ModelPatcher):
+    def __enter__(self):
+        super().__enter__()
+        patch_update_causal_mask(self._model, "4.39.0", None, patch_external_model=True)
+
+        if self._model.config._attn_implementation != "sdpa":
+            self._model.config._orig_attn_implementation = self._model.config._attn_implementation
+            self._model.config._attn_implementation = "sdpa"
+            if is_transformers_version("<", "4.47.0"):
+                from transformers.models.gemma2.modeling_gemma2 import GEMMA2_ATTENTION_CLASSES
+
+                sdpa_attn = GEMMA2_ATTENTION_CLASSES["sdpa"]
+                for layer in self._model.layers:
+                    layer.self_attn._orig_forward = layer.self_attn.forward
+                    layer.self_attn.forward = types.MethodType(sdpa_attn.forward, layer.self_attn)
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        super().__exit__(exc_type, exc_value, traceback)
+        unpatch_update_causal_mask(self._model, None, True)
+        if hasattr(self._model.config, "_orig_attn_implementation"):
+            self._model.config._attn_implementation = self._model.config._orig_attn_implementation
+            for layer in self._model.layers:
+                if hasattr(layer.self_attn, "_orig_forward"):
+                    layer.self_attn.forward = layer.self_attn._orig_forward
diff --git a/optimum/exporters/openvino/utils.py b/optimum/exporters/openvino/utils.py
index 46b151e7d..1743dc59b 100644
--- a/optimum/exporters/openvino/utils.py
+++ b/optimum/exporters/openvino/utils.py
@@ -257,9 +257,15 @@ def deduce_diffusers_dtype(model_name_or_path, **loading_kwargs):
                 model_part_name = "unet"
             if model_part_name:
                 directory = path / model_part_name
-                safetensors_files = [
-                    filename for filename in directory.glob("*.safetensors") if len(filename.suffixes) == 1
-                ]
+
+                pattern = "*.safetensors"
+                if "variant" in loading_kwargs:
+                    variant = loading_kwargs["variant"]
+                    pattern = f"*.{variant}.safetensors"
+                    safetensors_files = list(directory.glob(pattern))
+                else:
+                    # filter out variant files
+                    safetensors_files = [filename for filename in directory.glob(pattern) if len(filename.suffixes) == 1]
                 safetensors_file = None
                 if len(safetensors_files) > 0:
                     safetensors_file = safetensors_files.pop(0)
diff --git a/optimum/intel/__init__.py b/optimum/intel/__init__.py
index 91aaf57ae..2c85dcc98 100644
--- a/optimum/intel/__init__.py
+++ b/optimum/intel/__init__.py
@@ -127,6 +127,7 @@
             "OVFluxImg2ImgPipeline",
             "OVFluxInpaintPipeline",
             "OVFluxFillPipeline",
+            "OVSanaPipeline",
             "OVPipelineForImage2Image",
             "OVPipelineForText2Image",
             "OVPipelineForInpainting",
@@ -150,6 +151,7 @@
         "OVFluxImg2ImgPipeline",
         "OVFluxInpaintPipeline",
         "OVFluxFillPipeline",
+        "OVSanaPipeline",
         "OVPipelineForImage2Image",
         "OVPipelineForText2Image",
         "OVPipelineForInpainting",
diff --git a/optimum/intel/openvino/__init__.py b/optimum/intel/openvino/__init__.py
index 8945dc638..d3142ad80 100644
--- a/optimum/intel/openvino/__init__.py
+++ b/optimum/intel/openvino/__init__.py
@@ -91,6 +91,7 @@
     OVPipelineForImage2Image,
     OVPipelineForInpainting,
     OVPipelineForText2Image,
+    OVSanaPipeline,
     OVStableDiffusion3Img2ImgPipeline,
     OVStableDiffusion3InpaintPipeline,
     OVStableDiffusion3Pipeline,
diff --git a/optimum/intel/openvino/modeling_base.py b/optimum/intel/openvino/modeling_base.py
index 99422f1a5..3fd26a6e0 100644
--- a/optimum/intel/openvino/modeling_base.py
+++ b/optimum/intel/openvino/modeling_base.py
@@ -594,6 +594,8 @@ def _from_transformers(
         else:
             ov_config = OVConfig(dtype="fp32")

+        variant = kwargs.pop("variant", None)
+
         main_export(
             model_name_or_path=model_id,
             output=save_dir_path,
@@ -607,6 +609,7 @@ def _from_transformers(
             trust_remote_code=trust_remote_code,
             ov_config=ov_config,
             library_name=cls._library_name,
+            model_variant=variant,
         )

         return cls._from_pretrained(
diff --git a/optimum/intel/openvino/modeling_base_seq2seq.py b/optimum/intel/openvino/modeling_base_seq2seq.py
index 11ee8f89a..c60c0ec70 100644
--- a/optimum/intel/openvino/modeling_base_seq2seq.py
+++ b/optimum/intel/openvino/modeling_base_seq2seq.py
@@ -408,6 +408,7 @@ def _from_transformers(
         else:
             ov_config = OVConfig(dtype="fp32")
         stateful = kwargs.get("stateful", True)
+        variant = kwargs.pop("variant", None)

         main_export(
             model_name_or_path=model_id,
@@ -422,6 +423,7 @@ def _from_transformers(
             trust_remote_code=trust_remote_code,
             ov_config=ov_config,
             stateful=stateful,
+            model_variant=variant,
         )

         return cls._from_pretrained(
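To make the `deduce_diffusers_dtype` change above concrete, here is a standalone restatement of the filename convention it relies on; the helper name is mine, not the patch's:

```python
from pathlib import Path
from typing import List, Optional

def candidate_weight_files(directory: Path, variant: Optional[str]) -> List[Path]:
    # Mirror of the glob logic above: diffusers stores variant weights as
    # `<name>.<variant>.safetensors` (two suffixes) next to the plain
    # `<name>.safetensors` (one suffix).
    if variant is not None:
        return list(directory.glob(f"*.{variant}.safetensors"))
    # keep only non-variant files
    return [f for f in directory.glob("*.safetensors") if len(f.suffixes) == 1]
```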
--- a/optimum/intel/openvino/modeling_decoder.py
+++ b/optimum/intel/openvino/modeling_decoder.py
@@ -310,6 +310,8 @@ def _from_transformers(
         if torch_dtype is not None:
             model_loading_kwargs["torch_dtype"] = torch_dtype

+        variant = kwargs.pop("variant", None)
+
         main_export(
             model_name_or_path=model_id,
             output=save_dir_path,
@@ -325,6 +327,7 @@ def _from_transformers(
             stateful=stateful,
             model_loading_kwargs=model_loading_kwargs,
             library_name=cls._library_name,
+            model_variant=variant,
         )

         if config.model_type == "phi3" and config.max_position_embeddings != getattr(
diff --git a/optimum/intel/openvino/modeling_diffusion.py b/optimum/intel/openvino/modeling_diffusion.py
index bc2f75e0c..c2e245c5e 100644
--- a/optimum/intel/openvino/modeling_diffusion.py
+++ b/optimum/intel/openvino/modeling_diffusion.py
@@ -103,9 +103,10 @@
     FluxInpaintPipeline = object

 if is_diffusers_version(">=", "0.32.0"):
-    from diffusers import FluxFillPipeline
+    from diffusers import FluxFillPipeline, SanaPipeline
 else:
     FluxFillPipeline = object
+    SanaPipeline = object


 DIFFUSION_MODEL_TRANSFORMER_SUBFOLDER = "transformer"
@@ -574,6 +575,7 @@ def _from_transformers(
             model_save_dir = TemporaryDirectory()
             model_save_path = Path(model_save_dir.name)
+        variant = kwargs.pop("variant", None)

         main_export(
             model_name_or_path=model_id,
@@ -588,6 +590,7 @@ def _from_transformers(
             force_download=force_download,
             ov_config=ov_config,
             library_name=cls._library_name,
+            model_variant=variant,
         )

         return cls._from_pretrained(
@@ -817,9 +820,14 @@ def reshape(
         if self.tokenizer is None and self.tokenizer_2 is None:
             tokenizer_max_len = -1
         else:
-            tokenizer_max_len = (
-                self.tokenizer.model_max_length if self.tokenizer is not None else self.tokenizer_2.model_max_length
-            )
+            if self.tokenizer is not None and "Gemma" in self.tokenizer.__class__.__name__:
+                tokenizer_max_len = -1
+            else:
+                tokenizer_max_len = (
+                    self.tokenizer.model_max_length
+                    if self.tokenizer is not None
+                    else self.tokenizer_2.model_max_length
+                )

         if self.unet is not None:
             self.unet.model = self._reshape_unet(
@@ -838,17 +846,23 @@ def reshape(

         if self.text_encoder is not None:
             self.text_encoder.model = self._reshape_text_encoder(
-                self.text_encoder.model, batch_size, self.tokenizer.model_max_length
+                self.text_encoder.model,
+                batch_size,
+                self.tokenizer.model_max_length if "Gemma" not in self.tokenizer.__class__.__name__ else -1,
             )

         if self.text_encoder_2 is not None:
             self.text_encoder_2.model = self._reshape_text_encoder(
-                self.text_encoder_2.model, batch_size, self.tokenizer_2.model_max_length
+                self.text_encoder_2.model,
+                batch_size,
+                self.tokenizer_2.model_max_length if "Gemma" not in self.tokenizer_2.__class__.__name__ else -1,
             )

         if self.text_encoder_3 is not None:
             self.text_encoder_3.model = self._reshape_text_encoder(
-                self.text_encoder_3.model, batch_size, self.tokenizer_3.model_max_length
+                self.text_encoder_3.model,
+                batch_size,
+                self.tokenizer_3.model_max_length if "Gemma" not in self.tokenizer_3.__class__.__name__ else -1,
             )

         self.clear_requests()
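A brief note on why the Gemma-based Sana text encoder is special-cased above (my reading of the change, not stated in the patch): Gemma tokenizers report a very large sentinel for `model_max_length`, and Sana pads prompts to a pipeline-specific length instead, so the sequence axis is left dynamic (`-1`) when the rest of the pipeline is reshaped statically. Given a loaded `OVSanaPipeline` named `pipe`, the target resolution here being an illustrative choice:

```python
# Image-sized inputs become static; the text encoder's sequence length stays dynamic.
pipe.reshape(batch_size=1, height=512, width=512)
```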
@@ -1041,6 +1055,7 @@ def __init__(self, model: openvino.runtime.Model, parent_pipeline: OVDiffusionPi
         self.hidden_states_output_names = [
             name for out in self.model.outputs for name in out.names if name.startswith("hidden_states")
         ]
+        self.input_names = [inp.get_any_name() for inp in self.model.inputs]

     def forward(
         self,
@@ -1052,6 +1067,11 @@ def forward(
         self._compile()

         model_inputs = {"input_ids": input_ids}
+        if "attention_mask" in self.input_names:
+            model_inputs["attention_mask"] = (
+                attention_mask if attention_mask is not None else torch.ones(input_ids.shape, dtype=torch.long)
+            )
+
         ov_outputs = self.request(model_inputs, share_inputs=True)
         main_out = ov_outputs[0]
         model_outputs = {}
@@ -1139,6 +1159,8 @@ def forward(
         guidance: torch.Tensor = None,
         block_controlnet_hidden_states: List = None,
         joint_attention_kwargs: Optional[Dict[str, Any]] = None,
+        encoder_attention_mask: torch.LongTensor = None,
+        attention_kwargs: Optional[Dict[str, Any]] = None,
         return_dict: bool = True,
     ):
         self._compile()
@@ -1147,9 +1169,10 @@ def forward(
             "hidden_states": hidden_states,
             "timestep": timestep,
             "encoder_hidden_states": encoder_hidden_states,
-            "pooled_projections": pooled_projections,
         }

+        if pooled_projections is not None:
+            model_inputs["pooled_projections"] = pooled_projections
         if img_ids is not None:
             model_inputs["img_ids"] = img_ids
         if txt_ids is not None:
@@ -1157,6 +1180,9 @@ def forward(
         if guidance is not None:
             model_inputs["guidance"] = guidance

+        if encoder_attention_mask is not None:
+            model_inputs["encoder_attention_mask"] = encoder_attention_mask
+
         ov_outputs = self.request(model_inputs, share_inputs=True).to_dict()

         model_outputs = {}
@@ -1498,6 +1524,12 @@ class OVFluxFillPipeline(OVDiffusionPipeline, OVTextualInversionLoaderMixin, Flu
     auto_model_class = FluxFillPipeline


+class OVSanaPipeline(OVDiffusionPipeline, OVTextualInversionLoaderMixin, SanaPipeline):
+    main_input_name = "prompt"
+    export_feature = "text-to-image"
+    auto_model_class = SanaPipeline
+
+
 SUPPORTED_OV_PIPELINES = [
     OVStableDiffusionPipeline,
     OVStableDiffusionImg2ImgPipeline,
@@ -1569,6 +1601,8 @@ def _get_ov_class(pipeline_class_name: str, throw_error_if_not_exist: bool = Tru
 if is_diffusers_version(">=", "0.32.0"):
     OV_INPAINT_PIPELINES_MAPPING["flux-fill"] = OVFluxFillPipeline
     SUPPORTED_OV_PIPELINES.append(OVFluxFillPipeline)
+    OV_TEXT2IMAGE_PIPELINES_MAPPING["sana"] = OVSanaPipeline
+    SUPPORTED_OV_PIPELINES.append(OVSanaPipeline)

 SUPPORTED_OV_PIPELINES_MAPPINGS = [
     OV_TEXT2IMAGE_PIPELINES_MAPPING,
diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py
index 1c0e35cca..c7cd7227f 100644
--- a/optimum/intel/openvino/modeling_visual_language.py
+++ b/optimum/intel/openvino/modeling_visual_language.py
@@ -615,6 +615,7 @@ def _from_transformers(
             ov_config = OVConfig(dtype="fp32" if load_in_8bit is False else "auto")

         stateful = kwargs.pop("stateful", ensure_stateful_is_available(warn=False) and use_cache)
+        variant = kwargs.pop("variant", None)

         main_export(
             model_name_or_path=model_id,
@@ -629,6 +630,7 @@ def _from_transformers(
             trust_remote_code=trust_remote_code,
             ov_config=ov_config,
             stateful=stateful,
+            model_variant=variant,
         )
         config = AutoConfig.from_pretrained(save_dir_path, trust_remote_code=trust_remote_code)
         return cls._from_pretrained(
diff --git a/optimum/intel/openvino/utils.py b/optimum/intel/openvino/utils.py
index fbb108c7d..d9df9419a 100644
--- a/optimum/intel/openvino/utils.py
+++ b/optimum/intel/openvino/utils.py
@@ -125,6 +125,7 @@
     "stable-diffusion": "OVStableDiffusionPipeline",
     "stable-diffusion-xl": "OVStableDiffusionXLPipeline",
     "stable-diffusion-3": "OVStableDiffusion3Pipeline",
+    "sana": "OVSanaPipeline",
     "flux": "OVFluxPipeline",
     "flux-fill": "OVFluxFillPipeline",
     "pix2struct": "OVModelForPix2Struct",
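With the pipeline class and task mappings in place, basic usage looks like the sketch below; the checkpoint is the tiny test model registered further down in `utils_tests.py`, and the generation arguments are illustrative:

```python
from optimum.intel import OVSanaPipeline

# `export=True` runs the conversion path added in this patch on the fly.
pipe = OVSanaPipeline.from_pretrained("katuni4ka/tiny-random-sana", export=True)
image = pipe("a cat wearing a hat", num_inference_steps=2, height=64, width=64).images[0]
```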
+++ b/optimum/intel/utils/dummy_openvino_and_diffusers_objects.py
@@ -222,3 +222,14 @@ def __init__(self, *args, **kwargs):
     @classmethod
     def from_pretrained(cls, *args, **kwargs):
         requires_backends(cls, ["openvino", "diffusers"])
+
+
+class OVSanaPipeline(metaclass=DummyObject):
+    _backends = ["openvino", "diffusers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["openvino", "diffusers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["openvino", "diffusers"])
diff --git a/tests/openvino/test_diffusion.py b/tests/openvino/test_diffusion.py
index 477799345..bff187934 100644
--- a/tests/openvino/test_diffusion.py
+++ b/tests/openvino/test_diffusion.py
@@ -77,8 +77,8 @@ class OVPipelineForText2ImageTest(unittest.TestCase):
     SUPPORTED_ARCHITECTURES = ["stable-diffusion", "stable-diffusion-xl", "latent-consistency"]
     NEGATIVE_PROMPT_SUPPORT_ARCHITECTURES = ["stable-diffusion", "stable-diffusion-xl", "latent-consistency"]
     if is_transformers_version(">=", "4.40.0"):
-        SUPPORTED_ARCHITECTURES.extend(["stable-diffusion-3", "flux"])
+        SUPPORTED_ARCHITECTURES.extend(["stable-diffusion-3", "flux", "sana"])
         NEGATIVE_PROMPT_SUPPORT_ARCHITECTURES.append("stable-diffusion-3")

     CALLBACK_SUPPORT_ARCHITECTURES = ["stable-diffusion", "stable-diffusion-xl", "latent-consistency"]
     OVMODEL_CLASS = OVPipelineForText2Image
@@ -94,6 +94,13 @@ def generate_inputs(self, height=128, width=128, batch_size=1):

         return inputs

+    def get_auto_cls(self, model_arch):
+        if model_arch == "sana":
+            from diffusers import SanaPipeline
+
+            return SanaPipeline
+        return self.AUTOMODEL_CLASS
+
     @require_diffusers
     def test_load_vanilla_model_which_is_not_supported(self):
         with self.assertRaises(Exception) as context:
@@ -104,12 +111,14 @@ def test_load_vanilla_model_which_is_not_supported(self):
     @parameterized.expand(SUPPORTED_ARCHITECTURES)
     @require_diffusers
     def test_ov_pipeline_class_dispatch(self, model_arch: str):
-        auto_pipeline = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch])
+        auto_cls = self.get_auto_cls(model_arch)
+        auto_pipeline_cls = DiffusionPipeline if model_arch != "sana" else auto_cls
+        auto_pipeline = auto_cls.from_pretrained(MODEL_NAMES[model_arch])
         ov_pipeline = self.OVMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch])

         self.assertEqual(ov_pipeline.auto_model_class, auto_pipeline.__class__)

-        auto_pipeline = DiffusionPipeline.from_pretrained(MODEL_NAMES[model_arch])
+        auto_pipeline = auto_pipeline_cls.from_pretrained(MODEL_NAMES[model_arch])
         ov_pipeline = OVDiffusionPipeline.from_pretrained(MODEL_NAMES[model_arch])

         self.assertEqual(ov_pipeline.auto_model_class, auto_pipeline.__class__)
@@ -130,30 +139,38 @@ def test_num_images_per_prompt(self, model_arch: str):
     @parameterized.expand(SUPPORTED_ARCHITECTURES)
     @require_diffusers
     def test_compare_to_diffusers_pipeline(self, model_arch: str):
-        height, width, batch_size = 128, 128, 1
+        height, width, batch_size = 64, 64, 1
         inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size)
-
+        auto_cls = self.get_auto_cls(model_arch)
         ov_pipeline = self.OVMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch])
-        diffusers_pipeline = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch])
-
-        for output_type in ["latent", "np", "pt"]:
-            inputs["output_type"] = output_type
+        diffusers_pipeline = auto_cls.from_pretrained(MODEL_NAMES[model_arch])

-            ov_output = ov_pipeline(**inputs, generator=get_generator("pt", SEED)).images
-            diffusers_output = diffusers_pipeline(**inputs, generator=get_generator("pt", SEED)).images
+        with torch.no_grad():
+            for output_type in ["latent", "np", "pt"]:
+                inputs["output_type"] = output_type
+                if model_arch == "sana":
+                    # resolution binning resizes the output to a standard resolution and back,
+                    # which can introduce floating-point deviations
+                    inputs["use_resolution_binning"] = False
+                atol = 1e-4

-            np.testing.assert_allclose(ov_output, diffusers_output, atol=6e-3, rtol=1e-2)
+                ov_output = ov_pipeline(**inputs, generator=get_generator("pt", SEED)).images
+                diffusers_output = diffusers_pipeline(**inputs, generator=get_generator("pt", SEED)).images
+                np.testing.assert_allclose(ov_output, diffusers_output, atol=atol, rtol=1e-2)

         # test on inputs nondivisible on 64
         height, width, batch_size = 96, 96, 1

         for output_type in ["latent", "np", "pt"]:
             inputs["output_type"] = output_type
+            if model_arch == "sana":
+                # resolution binning resizes the output to a standard resolution and back,
+                # which can introduce floating-point deviations
+                inputs["use_resolution_binning"] = False
+            atol = 6e-3

             ov_output = ov_pipeline(**inputs, generator=get_generator("pt", SEED)).images
             diffusers_output = diffusers_pipeline(**inputs, generator=get_generator("pt", SEED)).images

-            np.testing.assert_allclose(ov_output, diffusers_output, atol=6e-3, rtol=1e-2)
+            np.testing.assert_allclose(ov_output, diffusers_output, atol=atol, rtol=1e-2)

@@ -174,7 +191,8 @@ def __call__(self, *args, **kwargs) -> None:
         auto_callback = Callback()

         ov_pipe = self.OVMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch])
-        auto_pipe = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch])
+        auto_cls = self.get_auto_cls(model_arch)
+        auto_pipe = auto_cls.from_pretrained(MODEL_NAMES[model_arch])

         # callback_steps=1 to trigger callback every step
         ov_pipe(**inputs, callback=ov_callback, callback_steps=1)
@@ -191,6 +209,8 @@ def test_shape(self, model_arch: str):
         height, width, batch_size = 128, 64, 1
         inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size)

+        if model_arch == "sana":
+            inputs["use_resolution_binning"] = False
         for output_type in ["pil", "np", "pt", "latent"]:
             inputs["output_type"] = output_type
diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py
index f4b96ec99..7fbeb2e9e 100644
--- a/tests/openvino/test_exporters_cli.py
+++ b/tests/openvino/test_exporters_cli.py
@@ -44,6 +44,7 @@
     OVModelOpenCLIPForZeroShotImageClassification,
     OVModelOpenCLIPText,
     OVModelOpenCLIPVisual,
+    OVSanaPipeline,
     OVSentenceTransformer,
     OVStableDiffusion3Pipeline,
     OVStableDiffusionPipeline,
@@ -84,7 +85,12 @@ class OVCLIExportTestCase(unittest.TestCase):

     if is_transformers_version(">=", "4.45"):
         SUPPORTED_ARCHITECTURES.extend(
-            [("text-to-image", "stable-diffusion-3"), ("text-to-image", "flux"), ("inpainting", "flux-fill")]
+            [
+                ("text-to-image", "stable-diffusion-3"),
+                ("text-to-image", "flux"),
+                ("inpainting", "flux-fill"),
+                ("text-to-image", "sana"),
+            ]
         )
     EXPECTED_NUMBER_OF_TOKENIZER_MODELS = {
         "gpt2": 2 if is_tokenizers_version("<", "0.20") or is_openvino_version(">=", "2024.5") else 0,
@@ -102,6 +108,7 @@ class OVCLIExportTestCase(unittest.TestCase):
         "flux": 4 if is_tokenizers_version("<", "0.20") or is_openvino_version(">=", "2024.5") else 0,
         "flux-fill": 4 if is_tokenizers_version("<", "0.20") or is_openvino_version(">=", "2024.5") else 0,
        "llava": 2 if is_tokenizers_version("<", "0.20") or is_openvino_version(">=", "2024.5") else 0,
+        "sana": 2 if is_tokenizers_version("<", "0.20") or is_openvino_version(">=", "2024.5") else 0,
     }

     SUPPORTED_SD_HYBRID_ARCHITECTURES = [
@@ -113,6 +120,7 @@ class OVCLIExportTestCase(unittest.TestCase):
     if is_transformers_version(">=", "4.45"):
         SUPPORTED_SD_HYBRID_ARCHITECTURES.append(("stable-diffusion-3", 9, 65))
         SUPPORTED_SD_HYBRID_ARCHITECTURES.append(("flux", 7, 56))
+        SUPPORTED_SD_HYBRID_ARCHITECTURES.append(("sana", 19, 53))

     SUPPORTED_QUANTIZATION_ARCHITECTURES = [
         (
@@ -351,9 +359,15 @@ def test_exporters_cli_int8(self, task: str, model_type: str):
             models = [model.encoder, model.decoder]
             if task.endswith("with-past") and not model.decoder.stateful:
                 models.append(model.decoder_with_past)
-        elif model_type.startswith("stable-diffusion") or model_type.startswith("flux"):
+        elif (
+            model_type.startswith("stable-diffusion")
+            or model_type.startswith("flux")
+            or model_type.startswith("sana")
+        ):
             models = [model.unet or model.transformer, model.vae_encoder, model.vae_decoder]
-            models.append(model.text_encoder if model_type == "stable-diffusion" else model.text_encoder_2)
+            models.append(
+                model.text_encoder if model_type in ["stable-diffusion", "sana"] else model.text_encoder_2
+            )
         elif task.startswith("image-text-to-text"):
             models = [model.language_model, model.vision_embeddings]
         else:
diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py
index 26ad44401..4da88418b 100644
--- a/tests/openvino/test_quantization.py
+++ b/tests/openvino/test_quantization.py
@@ -59,6 +59,7 @@
     OVStableDiffusionXLPipeline,
     OVStableDiffusion3Pipeline,
     OVQuantizer,
+    OVSanaPipeline,
     OVTrainer,
     OVQuantizationConfig,
     OVWeightQuantizationConfig,
@@ -543,6 +544,7 @@ class OVWeightCompressionTest(unittest.TestCase):
             [
                 (OVStableDiffusion3Pipeline, "stable-diffusion-3", 9, 65),
                 (OVFluxPipeline, "flux", 7, 56),
+                (OVSanaPipeline, "sana", 19, 53),
             ]
         )
diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py
index 3100df615..83ea3751d 100644
--- a/tests/openvino/utils_tests.py
+++ b/tests/openvino/utils_tests.py
@@ -168,6 +168,7 @@
     "open-clip-ov": "zofinka/tiny-open-clip-model",
     "st-bert": "sentence-transformers/all-MiniLM-L6-v2",
     "st-mpnet": "sentence-transformers/all-mpnet-base-v2",
+    "sana": "katuni4ka/tiny-random-sana",
 }


@@ -200,6 +201,7 @@
     "minicpmv": (30, 26, 1, 6),
     "nanollava": (30, 15, 1),
     "qwen2_vl": (30, 1, 1, 10),
+    "sana": (58, 28, 28, 18),
 }

 TEST_IMAGE_URL = "http://images.cocodataset.org/val2017/000000039769.jpg"
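Finally, an end-to-end sketch of the hybrid-quantization path that the new test entries above exercise. This mirrors how the existing Flux/SD3 entries are tested rather than documenting a new API, and the dataset name and sample count are illustrative:

```python
from optimum.intel import OVSanaPipeline, OVWeightQuantizationConfig

# Supplying a calibration dataset together with 8-bit weight compression is
# expected to select hybrid quantization for diffusion pipelines (int8 weights
# plus quantization of transformer activations on calibration samples).
quantization_config = OVWeightQuantizationConfig(bits=8, dataset="conceptual_captions", num_samples=2)
pipe = OVSanaPipeline.from_pretrained(
    "katuni4ka/tiny-random-sana", export=True, quantization_config=quantization_config
)
pipe.save_pretrained("sana_ov_int8")
```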