Introduce hard_delete and dedup_sort column hints for merge #960

Merged
32 commits merged on Feb 24, 2024

Changes shown below are from 3 of the 32 commits.

Commits
82c3634
black formatting
Feb 12, 2024
97c5512
remove unused exception
Feb 12, 2024
400d84b
add initial support for replicate write disposition
Feb 12, 2024
24f362e
add hard_delete hint and sorted deduplication for merge
Feb 14, 2024
f3a4878
undo config change
Feb 14, 2024
deb816f
undo unintentional changes
Feb 14, 2024
4a38d56
refactor hard_delete handling and introduce dedup_sort hint
Feb 15, 2024
0d1c977
update docstring
Feb 15, 2024
474d8bc
replace dialect-specific SQL
Feb 16, 2024
568ef26
add parentheses to ensure proper clause evaluation order
Feb 16, 2024
81ea426
add escape defaults and temp tables for non-primary key case
Feb 16, 2024
a04a238
exclude destinations that don't support merge from test
Feb 17, 2024
8ac0f9c
correct typo
Feb 20, 2024
ec115e9
extend docstring
Feb 20, 2024
a1afeb8
remove redundant copies for immutable strings
Feb 20, 2024
f07205d
simplify boolean logic
Feb 20, 2024
a64580d
add more test cases for hard_delete and dedup_sort hints
Feb 20, 2024
3308549
refactor table chain resolution
Feb 21, 2024
189c2fb
marks tables that have seen data in normalizer, skips empty jobs if never …
rudolfix Feb 22, 2024
a649b0e
ignores tables that didn't see data when loading, tests edge cases
rudolfix Feb 22, 2024
9778f0e
Merge branch 'devel' into 947-core-extensions-to-support-database-rep…
rudolfix Feb 22, 2024
4b3c59b
add sort order configuration option
Feb 22, 2024
c984c4e
bumps schema engine to v9, adds migrations
rudolfix Feb 22, 2024
935748a
filters tables without data properly in load
rudolfix Feb 22, 2024
d125556
converts seen-data to boolean, fixes tests
rudolfix Feb 22, 2024
ecaf6ef
Merge branch '947-core-extensions-to-support-database-replication' of…
rudolfix Feb 22, 2024
af0b344
disables filesystem tests config due to merge present
rudolfix Feb 22, 2024
262018b
add docs for hard_delete and dedup_sort column hints
Feb 22, 2024
0814bb0
Merge branch '947-core-extensions-to-support-database-replication' of…
Feb 22, 2024
44a9ff2
fixes extending table chains in load
rudolfix Feb 23, 2024
9384148
Merge branch '947-core-extensions-to-support-database-replication' of…
rudolfix Feb 23, 2024
9921b89
refactors load and adds unit tests with dummy
rudolfix Feb 24, 2024
35 changes: 34 additions & 1 deletion dlt/common/schema/typing.py
@@ -61,7 +61,7 @@
"merge_key",
]
"""Known hints of a column used to declare hint regexes."""
TWriteDisposition = Literal["skip", "append", "replace", "merge"]
TWriteDisposition = Literal["skip", "append", "replace", "merge", "replicate"]
TTableFormat = Literal["iceberg"]
TTypeDetections = Literal[
"timestamp", "iso_timestamp", "iso_date", "large_integer", "hexbytes_to_text", "wei_to_double"
@@ -150,6 +150,38 @@ class NormalizerInfo(TypedDict, total=True):
new_table: bool


class TCdcOperationMapperStr(TypedDict, total=True):
"""
Dictionary that informs dlt which string literals are used
in the change data to identify inserts, updates, and deletes.
"""

insert: str
update: str
delete: str


class TCdcOperationMapperInt(TypedDict, total=True):
"""
Dictionary that informs dlt which integer literals are used
in the change data to identify inserts, updates, and deletes.
"""

insert: int
update: int
delete: int


class TCdcConfig(TypedDict, total=True):
"""Dictionary that informs dlt how change data is organized."""

operation_column: str
"""Name of the column containing the operation type ("insert", "update", or "delete") for the change record."""
operation_mapper: Union[TCdcOperationMapperStr, TCdcOperationMapperInt]
sequence_column: str
"""Name of the column containing a sequence identifier that can be used to order the change records."""


# TypedDict that defines properties of a table


@@ -166,6 +198,7 @@ class TTableSchema(TypedDict, total=False):
columns: TTableSchemaColumns
resource: Optional[str]
table_format: Optional[TTableFormat]
cdc_config: Optional[TCdcConfig]
Collaborator review comment:
This is one way to go, but IMO a better way would be to define a column-level hint:
cdc_op, which could be an integer or a single char (u/d/i).

Do we really need a sequence? If so, we could reuse sort or add a new hint, e.g. cdc_seq. There are helper methods to find column(s) with hints.

It looks simpler to me.
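
To make the suggestion concrete, here is a hypothetical sketch of what such column-level hints could look like. cdc_op and cdc_seq do not exist in dlt; they only illustrate the reviewer's idea of marking the operation and sequence columns directly on the column schema instead of using a table-level cdc_config dict.

# Hypothetical column-level hints (cdc_op / cdc_seq are illustrative only, not real dlt hints)
columns = {
    "op": {"name": "op", "data_type": "text", "cdc_op": True},
    "lsn": {"name": "lsn", "data_type": "bigint", "cdc_seq": True},
}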



class TPartialTableSchema(TTableSchema):
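As a quick illustration of how the new typed dicts compose, here is a minimal sketch of a cdc_config value matching the definitions added above. The column names ("op", "lsn") and the mapper literals are hypothetical examples, not values prescribed by dlt.

from dlt.common.schema.typing import TCdcConfig

cdc_config: TCdcConfig = {
    "operation_column": "op",  # column that carries the change type of each record
    "operation_mapper": {"insert": "I", "update": "U", "delete": "D"},
    "sequence_column": "lsn",  # column used to order change records
}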
17 changes: 17 additions & 0 deletions dlt/common/schema/utils.py
@@ -37,6 +37,7 @@
TTypeDetections,
TWriteDisposition,
TSchemaContract,
TCdcConfig,
)
from dlt.common.schema.exceptions import (
CannotCoerceColumnException,
@@ -317,6 +318,19 @@ def validate_stored_schema(stored_schema: TStoredSchema) -> None:
if parent_table_name not in stored_schema["tables"]:
raise ParentTableNotFoundException(table_name, parent_table_name)

# check for "replicate" tables that miss a primary key or "cdc_config"
Collaborator review comment:
This makes sense, but we should move it to:

  1. the end of the normalize stage, OR
  2. the beginning of the load stage

At this moment the schema may still be partial; not all columns may be present (100% after the extract stage).

We should also check the merge disposition.

Also take a look at _verify_schema in JobClientBase; it looks like our place :)

if table.get("write_disposition") == "replicate":
if len(get_columns_names_with_prop(table, "primary_key", True)) == 0:
raise SchemaException(
f'Primary key missing for table "{table_name}" with "replicate" write'
" disposition."
)
if "cdc_config" not in table:
raise SchemaException(
f'"cdc_config" missing for table "{table_name}" with "replicate" write'
" disposition."
)

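The new check relies on get_columns_names_with_prop, which returns the names of columns carrying a given hint. A minimal sketch of how it is used here (the table dict below is a hypothetical example, not taken from the PR):

from dlt.common.schema.utils import get_columns_names_with_prop

table = {
    "name": "customers",
    "write_disposition": "replicate",
    "columns": {"id": {"name": "id", "data_type": "bigint", "primary_key": True}},
}
# returns ["id"], so the primary-key check above passes for this table
get_columns_names_with_prop(table, "primary_key", True)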

def migrate_schema(schema_dict: DictStrAny, from_engine: int, to_engine: int) -> TStoredSchema:
if from_engine == to_engine:
@@ -724,6 +738,7 @@ def new_table(
resource: str = None,
schema_contract: TSchemaContract = None,
table_format: TTableFormat = None,
cdc_config: TCdcConfig = None,
) -> TTableSchema:
table: TTableSchema = {
"name": table_name,
@@ -742,6 +757,8 @@ def new_table(
table["schema_contract"] = schema_contract
if table_format:
table["table_format"] = table_format
if cdc_config is not None:
table["cdc_config"] = cdc_config
if validate_schema:
validate_dict_ignoring_xkeys(
spec=TColumnSchema,
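Putting the pieces together, here is a minimal sketch of creating a table schema with the new parameter. The table name, columns, and cdc_config values are hypothetical; only the new_table signature shown in this diff is assumed.

from dlt.common.schema.utils import new_table

table = new_table(
    "customers",
    write_disposition="replicate",
    columns=[{"name": "id", "data_type": "bigint", "primary_key": True}],
    cdc_config={
        "operation_column": "op",
        "operation_mapper": {"insert": "I", "update": "U", "delete": "D"},
        "sequence_column": "lsn",
    },
)
# table["write_disposition"] == "replicate" and table["cdc_config"] is set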
16 changes: 7 additions & 9 deletions dlt/destinations/impl/athena/athena.py
@@ -351,7 +351,9 @@ def _from_db_type(
return self.type_mapper.from_db_type(hive_t, precision, scale)

def _get_column_def_sql(self, c: TColumnSchema, table_format: TTableFormat = None) -> str:
return f"{self.sql_client.escape_ddl_identifier(c['name'])} {self.type_mapper.to_db_type(c, table_format)}"
return (
f"{self.sql_client.escape_ddl_identifier(c['name'])} {self.type_mapper.to_db_type(c, table_format)}"
)

def _get_table_update_sql(
self, table_name: str, new_columns: Sequence[TColumnSchema], generate_alter: bool
@@ -376,19 +378,15 @@ def _get_table_update_sql(
# use qualified table names
qualified_table_name = self.sql_client.make_qualified_ddl_table_name(table_name)
if is_iceberg and not generate_alter:
sql.append(
f"""CREATE TABLE {qualified_table_name}
sql.append(f"""CREATE TABLE {qualified_table_name}
({columns})
LOCATION '{location}'
TBLPROPERTIES ('table_type'='ICEBERG', 'format'='parquet');"""
)
TBLPROPERTIES ('table_type'='ICEBERG', 'format'='parquet');""")
elif not generate_alter:
sql.append(
f"""CREATE EXTERNAL TABLE {qualified_table_name}
sql.append(f"""CREATE EXTERNAL TABLE {qualified_table_name}
({columns})
STORED AS PARQUET
LOCATION '{location}';"""
)
LOCATION '{location}';""")
# alter table to add new columns at the end
else:
sql.append(f"""ALTER TABLE {qualified_table_name} ADD COLUMNS ({columns});""")
10 changes: 6 additions & 4 deletions dlt/destinations/impl/bigquery/bigquery.py
@@ -252,9 +252,9 @@ def _get_table_update_sql(
elif (c := partition_list[0])["data_type"] == "date":
sql[0] = f"{sql[0]}\nPARTITION BY {self.capabilities.escape_identifier(c['name'])}"
elif (c := partition_list[0])["data_type"] == "timestamp":
sql[
0
] = f"{sql[0]}\nPARTITION BY DATE({self.capabilities.escape_identifier(c['name'])})"
sql[0] = (
f"{sql[0]}\nPARTITION BY DATE({self.capabilities.escape_identifier(c['name'])})"
)
# Automatic partitioning of an INT64 type requires us to be prescriptive - we treat the column as a UNIX timestamp.
# This is due to the bounds requirement of GENERATE_ARRAY function for partitioning.
# The 10,000 partitions limit makes it infeasible to cover the entire `bigint` range.
@@ -272,7 +272,9 @@

def _get_column_def_sql(self, c: TColumnSchema, table_format: TTableFormat = None) -> str:
name = self.capabilities.escape_identifier(c["name"])
return f"{name} {self.type_mapper.to_db_type(c, table_format)} {self._gen_not_null(c.get('nullable', True))}"
return (
f"{name} {self.type_mapper.to_db_type(c, table_format)} {self._gen_not_null(c.get('nullable', True))}"
)

def get_storage_table(self, table_name: str) -> Tuple[bool, TTableSchemaColumns]:
schema_table: TTableSchemaColumns = {}
28 changes: 18 additions & 10 deletions dlt/destinations/impl/databricks/databricks.py
@@ -166,12 +166,14 @@ def __init__(
else:
raise LoadJobTerminalException(
file_path,
f"Databricks cannot load data from staging bucket {bucket_path}. Only s3 and azure buckets are supported",
f"Databricks cannot load data from staging bucket {bucket_path}. Only s3 and"
" azure buckets are supported",
)
else:
raise LoadJobTerminalException(
file_path,
"Cannot load from local file. Databricks does not support loading from local files. Configure staging with an s3 or azure storage bucket.",
"Cannot load from local file. Databricks does not support loading from local files."
" Configure staging with an s3 or azure storage bucket.",
)

# decide on source format, stage_file_path will either be a local file or a bucket path
@@ -181,27 +183,33 @@
if not config.get("data_writer.disable_compression"):
raise LoadJobTerminalException(
file_path,
"Databricks loader does not support gzip compressed JSON files. Please disable compression in the data writer configuration: https://dlthub.com/docs/reference/performance#disabling-and-enabling-file-compression",
"Databricks loader does not support gzip compressed JSON files. Please disable"
" compression in the data writer configuration:"
" https://dlthub.com/docs/reference/performance#disabling-and-enabling-file-compression",
)
if table_schema_has_type(table, "decimal"):
raise LoadJobTerminalException(
file_path,
"Databricks loader cannot load DECIMAL type columns from json files. Switch to parquet format to load decimals.",
"Databricks loader cannot load DECIMAL type columns from json files. Switch to"
" parquet format to load decimals.",
)
if table_schema_has_type(table, "binary"):
raise LoadJobTerminalException(
file_path,
"Databricks loader cannot load BINARY type columns from json files. Switch to parquet format to load byte values.",
"Databricks loader cannot load BINARY type columns from json files. Switch to"
" parquet format to load byte values.",
)
if table_schema_has_type(table, "complex"):
raise LoadJobTerminalException(
file_path,
"Databricks loader cannot load complex columns (lists and dicts) from json files. Switch to parquet format to load complex types.",
"Databricks loader cannot load complex columns (lists and dicts) from json"
" files. Switch to parquet format to load complex types.",
)
if table_schema_has_type(table, "date"):
raise LoadJobTerminalException(
file_path,
"Databricks loader cannot load DATE type columns from json files. Switch to parquet format to load dates.",
"Databricks loader cannot load DATE type columns from json files. Switch to"
" parquet format to load dates.",
)

source_format = "JSON"
@@ -311,7 +319,7 @@ def _get_column_def_sql(self, c: TColumnSchema, table_format: TTableFormat = None

def _get_storage_table_query_columns(self) -> List[str]:
fields = super()._get_storage_table_query_columns()
fields[
1
] = "full_data_type" # Override because this is the only way to get data type with precision
fields[1] = ( # Override because this is the only way to get data type with precision
"full_data_type"
)
return fields
6 changes: 2 additions & 4 deletions dlt/destinations/impl/snowflake/snowflake.py
@@ -175,15 +175,13 @@ def __init__(
f'PUT file://{file_path} @{stage_name}/"{load_id}" OVERWRITE = TRUE,'
" AUTO_COMPRESS = FALSE"
)
client.execute_sql(
f"""COPY INTO {qualified_table_name}
client.execute_sql(f"""COPY INTO {qualified_table_name}
{from_clause}
{files_clause}
{credentials_clause}
FILE_FORMAT = {source_format}
MATCH_BY_COLUMN_NAME='CASE_INSENSITIVE'
"""
)
""")
if stage_file_path and not keep_staged_files:
client.execute_sql(f"REMOVE {stage_file_path}")

18 changes: 16 additions & 2 deletions dlt/destinations/job_client_impl.py
@@ -250,7 +250,7 @@ def create_table_chain_completed_followup_jobs(
write_disposition = table_chain[0]["write_disposition"]
if write_disposition == "append":
jobs.extend(self._create_append_followup_jobs(table_chain))
elif write_disposition == "merge":
elif write_disposition in ("merge", "replicate"):
jobs.extend(self._create_merge_followup_jobs(table_chain))
elif write_disposition == "replace":
jobs.extend(self._create_replace_followup_jobs(table_chain))
@@ -581,10 +581,24 @@ def with_staging_dataset(self) -> Iterator["SqlJobClientBase"]:
self.in_staging_mode = False

def should_load_data_to_staging_dataset(self, table: TTableSchema) -> bool:
if table["write_disposition"] == "merge":
if table["write_disposition"] in ("merge", "replicate"):
return True
elif table["write_disposition"] == "replace" and (
self.config.replace_strategy in ["insert-from-staging", "staging-optimized"]
):
return True
return False

def _create_table_update(
self, table_name: str, storage_columns: TTableSchemaColumns
) -> Sequence[TColumnSchema]:
updates = super()._create_table_update(table_name, storage_columns)
table = self.schema.get_table(table_name)
if "write_disposition" in table and table["write_disposition"] == "replicate":
# operation and sequence columns should only be present in staging table
# not in final table
if not self.in_staging_mode:
op_col = table["cdc_config"]["operation_column"]
seq_col = table["cdc_config"]["sequence_column"]
updates = [d for d in updates if d["name"] not in (op_col, seq_col)]
return updates
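
A small hypothetical illustration of the filtering above: with a cdc_config naming "op" and "lsn", those two columns are dropped from the column updates for the final table, while the staging table (in_staging_mode) keeps them.

# Illustrative only; the column names come from a hypothetical cdc_config.
op_col, seq_col = "op", "lsn"
updates = [{"name": "id"}, {"name": "value"}, {"name": "op"}, {"name": "lsn"}]
final_updates = [d for d in updates if d["name"] not in (op_col, seq_col)]
# final_updates == [{"name": "id"}, {"name": "value"}]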