
Add doughnut http endpoint #230

Draft: wants to merge 46 commits into base: main
3fbcf68
Add doughnut http endpoint
edknv Nov 14, 2024
e5a281d
fix table and chart extraction
edknv Nov 14, 2024
d933a33
handle 202 responses by repolling status
edknv Nov 18, 2024
af9ca0e
add table format in unit tests
edknv Nov 18, 2024
b5c992d
Merge branch 'main' into feat/doughnut-http-endpoint-1
edknv Nov 18, 2024
6ad99c0
fix table and image max dimensions
edknv Nov 19, 2024
e511dbf
Merge branch 'feat/doughnut-http-endpoint-1' of github.com:edknv/nv-i…
edknv Nov 19, 2024
b3c632b
add unit tests for the helper
edknv Nov 19, 2024
0deb5c9
add placeholder for url in docker compose
edknv Nov 19, 2024
2e7db6d
add check for empty dataframe in table/chart extraction
edknv Nov 19, 2024
c5b5d3e
clean up doughnut specific conditions in inference func
edknv Nov 20, 2024
bcb0037
clean up doughnut specific conditions in inference func
edknv Nov 20, 2024
187d13d
fix unit tests
edknv Nov 20, 2024
51b5bca
Merge branch 'main' into feat/doughnut-http-endpoint-1
edknv Nov 22, 2024
321052f
Add support for text bounding boxes
edknv Nov 22, 2024
a9a74b7
also add table and image bounding boxes to metadata
edknv Nov 23, 2024
1d532a4
Use separate environment variable for private endpoint
edknv Dec 2, 2024
e141cad
Merge branch 'main' into feat/doughnut-http-endpoint-1
edknv Dec 3, 2024
ea057e6
Use separate environment variable for private endpoint
edknv Dec 4, 2024
be8cc68
Merge branch 'feat/doughnut-http-endpoint-1' of github.com:edknv/nv-i…
edknv Dec 4, 2024
67ff768
Merge branch 'main' into feat/doughnut-http-endpoint-1
edknv Dec 4, 2024
001a3b5
Merge branch 'main' into feat/doughnut-http-endpoint-1
edknv Dec 6, 2024
ccb9f8f
Merge branch 'main' into feat/doughnut-http-endpoint-1
edknv Dec 9, 2024
8088eed
migrate to using NimClient and ModelInterface
edknv Dec 9, 2024
0163880
Merge branch 'feat/doughnut-http-endpoint-1' of github.com:edknv/nv-i…
edknv Dec 9, 2024
792b724
update unit tests
edknv Dec 9, 2024
8f77199
Merge branch 'main' into feat/doughnut-http-endpoint-1
edknv Dec 9, 2024
f96d406
Merge branch 'main' into feat/doughnut-http-endpoint-1
jdye64 Dec 10, 2024
9a17068
Merge branch 'main' into feat/doughnut-http-endpoint-1
edknv Dec 10, 2024
71d7212
lint
edknv Dec 10, 2024
e9f021f
fix unit test
edknv Dec 10, 2024
e456d9e
Merge branch 'main' into feat/doughnut-http-endpoint-1
edknv Dec 10, 2024
a7e0f36
Merge branch 'main' into feat/doughnut-http-endpoint-1
edknv Dec 11, 2024
7e2e518
Merge branch 'main' into feat/doughnut-http-endpoint-1
edknv Dec 13, 2024
b833850
Merge branch 'main' into feat/doughnut-http-endpoint-1
edknv Dec 13, 2024
247c431
Merge branch 'feat/doughnut-http-endpoint-1' of github.com:edknv/nv-i…
edknv Dec 13, 2024
1fc1b67
Merge branch 'main' into feat/doughnut-http-endpoint-1
edknv Dec 13, 2024
a705846
Merge branch 'main' into feat/doughnut-http-endpoint-1
edknv Dec 16, 2024
e4e3406
Merge branch 'main' into feat/doughnut-http-endpoint-1
edknv Dec 17, 2024
0ef5b86
Merge branch 'main' into feat/doughnut-http-endpoint-1
edknv Dec 20, 2024
a0c584c
Merge branch 'main' into feat/doughnut-http-endpoint-1
edknv Jan 6, 2025
a0befbf
Merge branch 'main' into feat/doughnut-http-endpoint-1
edknv Jan 13, 2025
c6b0a35
migrate doughnut config to pydantic 2
edknv Jan 13, 2025
410610e
Merge branch 'feat/doughnut-http-endpoint-1' of github.com:edknv/nv-i…
edknv Jan 13, 2025
23eae8f
Merge branch 'main' into feat/doughnut-http-endpoint-1
edknv Jan 13, 2025
26b5e50
Merge branch 'main' into feat/doughnut-http-endpoint-1
edknv Jan 14, 2025
7 changes: 5 additions & 2 deletions docker-compose.yaml
@@ -133,14 +133,17 @@ services:
- CACHED_HTTP_ENDPOINT=http://cached:8000/v1/infer
- CACHED_INFER_PROTOCOL=grpc
- CUDA_VISIBLE_DEVICES=0
- DEPLOT_GRPC_ENDPOINT=""
- DEPLOT_GRPC_ENDPOINT=
# self hosted deplot
- DEPLOT_HEALTH_ENDPOINT=deplot:8000
- DEPLOT_HTTP_ENDPOINT=http://deplot:8000/v1/chat/completions
# build.nvidia.com hosted deplot
- DEPLOT_INFER_PROTOCOL=http
#- DEPLOT_HTTP_ENDPOINT=https://ai.api.nvidia.com/v1/vlm/google/deplot
- DOUGHNUT_GRPC_TRITON=triton-doughnut:8001
- DOUGHNUT_GRPC_ENDPOINT=
# build.nvidia.com hosted doughnut
- DOUGHNUT_HTTP_ENDPOINT=https://placeholder
- DOUGHNUT_INFER_PROTOCOL=http
- INGEST_LOG_LEVEL=DEFAULT
- MESSAGE_CLIENT_HOST=redis
- MESSAGE_CLIENT_PORT=6379
75 changes: 41 additions & 34 deletions src/nv_ingest/extraction_workflows/pdf/doughnut_helper.py
@@ -18,7 +18,6 @@
# limitations under the License.

import logging
import os
import uuid
from typing import Dict
from typing import List
@@ -39,6 +38,8 @@
from nv_ingest.util.image_processing.transforms import crop_image
from nv_ingest.util.image_processing.transforms import numpy_to_base64
from nv_ingest.util.nim import doughnut as doughnut_utils
from nv_ingest.util.nim.helpers import call_image_inference_model
from nv_ingest.util.nim.helpers import create_inference_client
from nv_ingest.util.pdf.metadata_aggregators import Base64Image
from nv_ingest.util.pdf.metadata_aggregators import LatexTable
from nv_ingest.util.pdf.metadata_aggregators import construct_image_metadata_from_pdf_image
@@ -48,8 +49,6 @@

logger = logging.getLogger(__name__)

DOUGHNUT_GRPC_TRITON = os.environ.get("DOUGHNUT_GRPC_TRITON", "triton:8001")
DEFAULT_BATCH_SIZE = 16
DEFAULT_RENDER_DPI = 300
DEFAULT_MAX_WIDTH = 1024
DEFAULT_MAX_HEIGHT = 1280
@@ -80,9 +79,10 @@ def doughnut(pdf_stream, extract_text: bool, extract_images: bool, extract_table
"""
logger.debug("Extracting PDF with doughnut backend.")

doughnut_triton_url = kwargs.get("doughnut_grpc_triton", DOUGHNUT_GRPC_TRITON)
doughnut_config = kwargs.get("doughnut_config", {})
doughnut_config = doughnut_config if doughnut_config is not None else {}

batch_size = int(kwargs.get("doughnut_batch_size", DEFAULT_BATCH_SIZE))
batch_size = doughnut_config.doughnut_batch_size

row_data = kwargs.get("row_data")
# get source_id
@@ -146,10 +146,12 @@ def doughnut(pdf_stream, extract_text: bool, extract_images: bool, extract_table
accumulated_tables = []
accumulated_images = []

triton_client = grpcclient.InferenceServerClient(url=doughnut_triton_url)
doughnut_client = create_inference_client(
doughnut_config.doughnut_endpoints, doughnut_config.auth_token, doughnut_config.doughnut_infer_protocol
)

for batch, batch_page_offset in zip(batches, batch_page_offsets):
responses = preprocess_and_send_requests(triton_client, batch, batch_page_offset)
responses = preprocess_and_send_requests(doughnut_client, batch, batch_page_offset)

for page_idx, raw_text, bbox_offset in responses:
page_image = None
@@ -164,10 +166,11 @@ def doughnut(pdf_stream, extract_text: bool, extract_images: bool, extract_table
}

for cls, bbox, txt in zip(classes, bboxes, texts):
if extract_text:

if extract_text and (cls in doughnut_utils.ACCEPTED_TEXT_CLASSES):
txt = doughnut_utils.postprocess_text(txt, cls)

if extract_images and identify_nearby_objects:
if identify_nearby_objects:
bbox = doughnut_utils.reverse_transform_bbox(
bbox=bbox,
bbox_offset=bbox_offset,
@@ -179,16 +182,21 @@

accumulated_text.append(txt)

elif extract_tables and (cls == "Table"):
if extract_tables and (cls == "Table"):
try:
txt = txt.encode().decode("unicode_escape") # remove double backslashes
except UnicodeDecodeError:
pass
bbox = doughnut_utils.reverse_transform_bbox(bbox, bbox_offset)
table = LatexTable(latex=txt, bbox=bbox, max_width=page_width, max_height=page_height)
bbox = doughnut_utils.reverse_transform_bbox(
bbox=bbox,
bbox_offset=bbox_offset,
original_width=DEFAULT_MAX_WIDTH,
original_height=DEFAULT_MAX_HEIGHT,
)
table = LatexTable(latex=txt, bbox=bbox, max_width=DEFAULT_MAX_WIDTH, max_height=DEFAULT_MAX_HEIGHT)
accumulated_tables.append(table)

elif extract_images and (cls == "Picture"):
if extract_images and (cls == "Picture"):
if page_image is None:
scale_tuple = (DEFAULT_MAX_WIDTH, DEFAULT_MAX_HEIGHT)
padding_tuple = (DEFAULT_MAX_WIDTH, DEFAULT_MAX_HEIGHT)
@@ -200,14 +208,19 @@
img_numpy = crop_image(page_image, bbox)
if img_numpy is not None:
base64_img = numpy_to_base64(img_numpy)
bbox = doughnut_utils.reverse_transform_bbox(bbox, bbox_offset)
bbox = doughnut_utils.reverse_transform_bbox(
bbox=bbox,
bbox_offset=bbox_offset,
original_width=DEFAULT_MAX_WIDTH,
original_height=DEFAULT_MAX_HEIGHT,
)
image = Base64Image(
image=base64_img,
bbox=bbox,
width=img_numpy.shape[1],
height=img_numpy.shape[0],
max_width=page_width,
max_height=page_height,
max_width=DEFAULT_MAX_WIDTH,
max_height=DEFAULT_MAX_HEIGHT,
)
accumulated_images.append(image)

@@ -275,13 +288,14 @@ def doughnut(pdf_stream, extract_text: bool, extract_images: bool, extract_table
if len(text_extraction) > 0:
extracted_data.append(text_extraction)

triton_client.close()
if isinstance(doughnut_client, grpcclient.InferenceServerClient):
doughnut_client.close()

return extracted_data


def preprocess_and_send_requests(
triton_client,
doughnut_client,
batch: List[pdfium.PdfPage],
batch_offset: int,
) -> List[Tuple[int, str]]:
@@ -299,24 +313,15 @@ def preprocess_and_send_requests(

batch = np.array(page_images)

input_tensors = [grpcclient.InferInput("image", batch.shape, datatype="UINT8")]
input_tensors[0].set_data_from_numpy(batch)

outputs = [grpcclient.InferRequestedOutput("text")]

query_response = triton_client.infer(
model_name="doughnut",
inputs=input_tensors,
outputs=outputs,
)
output = call_image_inference_model(doughnut_client, "doughnut", batch)

text = query_response.as_numpy("text").tolist()
text = [t.decode() for t in text]

if len(text) != len(batch):
return []
if len(output) != len(batch):
raise RuntimeError(
f"Dimensions mismatch: there are {len(batch)} pages in the input but there are "
f"{len(output)} pages in the response."
)

return list(zip(page_numbers, text, bbox_offsets))
return list(zip(page_numbers, output, bbox_offsets))


@pdfium_exception_handler(descriptor="doughnut")
@@ -346,8 +351,10 @@ def _construct_table_metadata(
}
table_metadata = {
"caption": "",
"table_content": content,
"table_format": table_format,
"table_location": table.bbox,
"table_location_max_dimensions": (table.max_width, table.max_height),
}
ext_unified_metadata = base_unified_metadata.copy()

102 changes: 94 additions & 8 deletions src/nv_ingest/schemas/pdf_extractor_schema.py
@@ -68,17 +68,91 @@ def validate_endpoints(cls, values):
If both gRPC and HTTP services are empty for any endpoint.
"""

def clean_service(service):
"""Set service to None if it's an empty string or contains only spaces or quotes."""
if service is None or not service.strip() or service.strip(" \"'") == "":
return None
return service

for model_name in ["yolox"]:
endpoint_name = f"{model_name}_endpoints"
grpc_service, http_service = values.get(endpoint_name)
grpc_service = clean_service(grpc_service)
http_service = clean_service(http_service)
grpc_service = _clean_service(grpc_service)
http_service = _clean_service(http_service)

if not grpc_service and not http_service:
raise ValueError(f"Both gRPC and HTTP services cannot be empty for {endpoint_name}.")

values[endpoint_name] = (grpc_service, http_service)

protocol_name = f"{model_name}_infer_protocol"
protocol_value = values.get(protocol_name)
if not protocol_value:
protocol_value = "http" if http_service else "grpc" if grpc_service else ""
protocol_value = protocol_value.lower()
values[protocol_name] = protocol_value

return values

class Config:
extra = "forbid"


class DoughnutConfigSchema(BaseModel):
"""
Configuration schema for Doughnut endpoints and options.

Parameters
----------
auth_token : Optional[str], default=None
Authentication token required for secure services.

doughnut_endpoints : Tuple[str, str]
A tuple containing the gRPC and HTTP services for the doughnut endpoint.
Either the gRPC or HTTP service can be empty, but not both.

Methods
-------
validate_endpoints(values)
Validates that at least one of the gRPC or HTTP services is provided for each endpoint.

Raises
------
ValueError
If both gRPC and HTTP services are empty for any endpoint.

Config
------
extra : str
Pydantic config option to forbid extra fields.
"""

auth_token: Optional[str] = None

doughnut_endpoints: Tuple[Optional[str], Optional[str]] = (None, None)
doughnut_infer_protocol: str = ""
doughnut_batch_size: int = 32

@root_validator(pre=True)
def validate_endpoints(cls, values):
"""
Validates the gRPC and HTTP services for all endpoints.

Parameters
----------
values : dict
Dictionary containing the values of the attributes for the class.

Returns
-------
dict
The validated dictionary of values.

Raises
------
ValueError
If both gRPC and HTTP services are empty for any endpoint.
"""

for model_name in ["doughnut"]:
endpoint_name = f"{model_name}_endpoints"
grpc_service, http_service = values.get(endpoint_name)
grpc_service = _clean_service(grpc_service)
http_service = _clean_service(http_service)

if not grpc_service and not http_service:
raise ValueError(f"Both gRPC and HTTP services cannot be empty for {endpoint_name}.")
@@ -92,6 +166,10 @@ def clean_service(service):
protocol_value = protocol_value.lower()
values[protocol_name] = protocol_value

# Currently both build.nvidia.com and NIM do not support batch size > 1.
if values.get("doughnut_infer_protocol") == "http":
values["doughnut_batch_size"] = 1

return values

class Config:
@@ -122,6 +200,14 @@ class PDFExtractorSchema(BaseModel):
raise_on_failure: bool = False

pdfium_config: Optional[PDFiumConfigSchema] = None
doughnut_config: Optional[DoughnutConfigSchema] = None

class Config:
extra = "forbid"


def _clean_service(service):
"""Set service to None if it's an empty string or contains only spaces or quotes."""
if service is None or not service.strip() or service.strip(" \"'") == "":
return None
return service
22 changes: 14 additions & 8 deletions src/nv_ingest/stages/nim/chart_extraction.py
@@ -2,22 +2,24 @@
# All rights reserved.
# SPDX-License-Identifier: Apache-2.0

import logging
import functools
import pandas as pd
import logging
from typing import Any
from typing import Dict
from typing import Optional
from typing import Tuple

import pandas as pd
import tritonclient.grpc as grpcclient
from morpheus.config import Config

from nv_ingest.schemas.chart_extractor_schema import ChartExtractorSchema
from nv_ingest.schemas.metadata_schema import TableFormatEnum
from nv_ingest.stages.multiprocessing_stage import MultiProcessingBaseStage
from nv_ingest.util.image_processing.table_and_chart import join_cached_and_deplot_output
from nv_ingest.util.image_processing.transforms import base64_to_numpy
from nv_ingest.util.nim.helpers import call_image_inference_model, create_inference_client
from nv_ingest.util.nim.helpers import call_image_inference_model
from nv_ingest.util.nim.helpers import create_inference_client

logger = logging.getLogger(f"morpheus.{__name__}")

@@ -62,7 +64,8 @@ def _update_metadata(row: pd.Series, cached_client: Any, deplot_client: Any, tra
# Only modify if content type is structured and subtype is 'chart' and chart_metadata exists
if ((content_metadata.get("type") != "structured") or
(content_metadata.get("subtype") != "chart") or
(chart_metadata is None)):
(chart_metadata is None) or
(chart_metadata.get("table_format") != TableFormatEnum.IMAGE)):
Collaborator:

I don't think this is correct. Even if the table_format is image, we can still extract the content in the chart extractor. Am I thinking about this wrong?

@edknv (Collaborator, Author) Nov 19, 2024:

I added this so that the tables extracted from the Doughnut model don't go through the table/chart extraction stages. Doughnut tables already have text (as LaTeX), so they don't need these stages. YOLOX tables need text extraction in the table/chart extraction stages, and they are tagged as IMAGE tables, so they do get processed there. I'm not sure if that made sense, but I needed a way to skip table/chart extraction for tables identified by Doughnut, and thought TableFormat could be useful here to distinguish between YOLOX (== TableFormatEnum.IMAGE) and Doughnut (== TableFormatEnum.LATEX).

return metadata

# Modify chart metadata with the result from the inference model
@@ -113,6 +116,13 @@ def _extract_chart_data(df: pd.DataFrame, task_props: Dict[str, Any],

_ = task_props # unused

if trace_info is None:
trace_info = {}
logger.debug("No trace_info provided. Initialized empty trace_info dictionary.")

if df.empty:
return df, trace_info

deplot_client = create_inference_client(
validated_config.stage_config.deplot_endpoints,
validated_config.stage_config.auth_token,
@@ -125,10 +135,6 @@
validated_config.stage_config.cached_infer_protocol
)

if trace_info is None:
trace_info = {}
logger.debug("No trace_info provided. Initialized empty trace_info dictionary.")

try:
# Apply the _update_metadata function to each row in the DataFrame
df["metadata"] = df.apply(_update_metadata, axis=1, args=(cached_client, deplot_client, trace_info))
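The review thread in the chart-extraction diff hinges on using `table_format` as a routing key: Doughnut emits tables as LaTeX text that should bypass the chart/table extraction stages, while YOLOX-detected tables are images that still need content extraction. A self-contained sketch of that guard — the metadata key names and enum values here are assumptions inferred from the diff, not the exact `nv_ingest` schema:

```python
from enum import Enum

class TableFormatEnum(str, Enum):
    # Assumed values; the real enum lives in nv_ingest.schemas.metadata_schema.
    IMAGE = "image"
    LATEX = "latex"

def needs_chart_extraction(metadata):
    """Mirror of the guard added to _update_metadata in this diff:
    only structured 'chart' content stored as an image is processed;
    everything else (including Doughnut's LaTeX tables) is passed through."""
    content = metadata.get("content_metadata", {})
    chart = metadata.get("chart_metadata")
    if (content.get("type") != "structured"
            or content.get("subtype") != "chart"
            or chart is None
            or chart.get("table_format") != TableFormatEnum.IMAGE):
        return False
    return True

# A YOLOX-detected chart image goes through the extraction stage...
yolox_row = {"content_metadata": {"type": "structured", "subtype": "chart"},
             "chart_metadata": {"table_format": TableFormatEnum.IMAGE}}

# ...while a Doughnut LaTeX table is skipped, as edknv describes above.
doughnut_row = {"content_metadata": {"type": "structured", "subtype": "chart"},
                "chart_metadata": {"table_format": TableFormatEnum.LATEX}}
```

This is the design choice under discussion in the thread: rather than adding a new flag, the PR reuses the existing format tag to distinguish the two detector backends.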