data pond: expose readable datasets as dataframes and arrow tables #1507

Merged: 119 commits from exp/1095-expose-readable-datasets into devel, Oct 8, 2024. The file changes shown below are from a single commit.
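For orientation, this is roughly what the merged feature enables. The sketch below is illustrative only: `_dataset()` and `fetchall()` appear in the tests in this PR, while `df()` and `arrow()` are assumed names for the dataframe and arrow-table accessors the PR title describes.

```python
# Hypothetical usage sketch of the readable-dataset API introduced by this PR.
# Accessor names follow the tests and commit messages here; treat them as
# illustrative rather than a documented, stable interface.
import dlt

pipeline = dlt.pipeline("docs_pipeline", destination="duckdb", dataset_name="docs_data")
pipeline.run([{"id": i} for i in range(3)], table_name="items")

dataset = pipeline._dataset()    # readable view over the destination dataset
rows = dataset.items.fetchall()  # plain DB-API rows
df = dataset.items.df()          # pandas DataFrame (assumed name)
table = dataset.items.arrow()    # pyarrow Table (assumed name)
```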
Commits
af6a40e
add simple ibis helper
sh-rp Jun 19, 2024
3a69ece
start working on dataframe reading interface
sh-rp Jun 20, 2024
4324650
a bit more work
sh-rp Jun 20, 2024
7c960df
first simple implementation
sh-rp Jun 21, 2024
86b89ac
small change
sh-rp Jun 21, 2024
5a8ea54
more work on dataset
sh-rp Jun 21, 2024
36e94af
some work on filesystem destination
sh-rp Jun 24, 2024
20bf9ce
add support for parquet files and compression on jsonl files in files…
sh-rp Jun 26, 2024
6dce626
Merge branch 'devel' into exp/1095-expose-readable-datasets
sh-rp Jul 17, 2024
a0ff55f
fix test after devel merge
sh-rp Jul 17, 2024
c297e96
add nice composable pipeline example
sh-rp Jul 17, 2024
d020403
small updates to demo
sh-rp Jul 18, 2024
5c3db47
Merge branch 'devel' into exp/1095-expose-readable-datasets
sh-rp Aug 6, 2024
79ef7dd
enable tests for all bucket providers
sh-rp Aug 6, 2024
ff40079
fix tests
sh-rp Aug 6, 2024
ac415b9
create views in duckdb filesystem accessor
sh-rp Aug 6, 2024
c92a527
move to relations based interface
sh-rp Aug 6, 2024
13ec73b
add generic duckdb interface to filesystem
sh-rp Aug 6, 2024
46e0226
move code for accessing frames and tables to the cursor and use duckd…
sh-rp Aug 6, 2024
7cf69a7
add native db api cursor fetching to exposed dataset
sh-rp Aug 7, 2024
6ffe302
some small changes
sh-rp Aug 7, 2024
c200262
switch dataaccess pandas to pyarrow
sh-rp Aug 7, 2024
226454f
add native bigquery support for df and arrow tables
sh-rp Aug 7, 2024
3296e63
change iter functions to always expect chunk size (None will default …
sh-rp Aug 7, 2024
6f6500f
add native implementation for databricks
sh-rp Aug 7, 2024
152b788
add dremio native implementation for full frames and tables
sh-rp Aug 7, 2024
6d73bc5
fix filesystem test
sh-rp Aug 7, 2024
bdb39ba
add test for evolving filesystem
sh-rp Aug 7, 2024
3ead92b
fix empty dataframe retrieval
sh-rp Aug 7, 2024
9fcbd00
remove old df test
sh-rp Aug 7, 2024
28ee1c6
clean up interfaces a bit (more to come?)
sh-rp Aug 8, 2024
28cb282
move dataset creation into destination client and clean up interfaces…
sh-rp Aug 8, 2024
77926fa
renames some interfaces and adds brief docstrings
sh-rp Aug 8, 2024
6ef04bc
add filesystem cached duckdb and remove the need to declare needed vi…
sh-rp Aug 8, 2024
ec13b49
fix tests for snowflake
sh-rp Aug 8, 2024
b222d1d
make data set a function
sh-rp Aug 8, 2024
9f0a6a5
fix db-types dependency for bigquery
sh-rp Aug 8, 2024
289b63c
create duckdb based sql client for filesystem
sh-rp Aug 13, 2024
779bca6
fix example pipeline
sh-rp Aug 13, 2024
584ab47
enable filesystem sql client to work on streamlit
sh-rp Aug 13, 2024
6594053
add comments
sh-rp Aug 13, 2024
9e0a61d
rename sql to query
sh-rp Aug 13, 2024
dd47326
fix tests that rely on sql client
sh-rp Aug 13, 2024
9f8f79b
Merge branch 'devel' into exp/1095-expose-readable-datasets
sh-rp Sep 18, 2024
fda1cb5
post merge cleanups
sh-rp Sep 18, 2024
c7a0e05
move imports around a bit
sh-rp Sep 18, 2024
8497036
exclude abfss buckets from test
sh-rp Sep 19, 2024
3dc2c90
add support for arrow schema creation from known dlt schema
sh-rp Aug 13, 2024
d6bec38
re-use sqldatabase code for cursors
sh-rp Sep 19, 2024
62ea3ba
fix bug
sh-rp Sep 19, 2024
3fd4d61
add default columns where needed
sh-rp Sep 19, 2024
eeca4ac
Merge branch 'devel' into exp/1095-expose-readable-datasets
sh-rp Sep 20, 2024
52f8523
add sql glot to filesystem deps
sh-rp Sep 20, 2024
90c669a
store filesystem tables in correct dataset
sh-rp Sep 20, 2024
7657fb1
move cursor columns location
sh-rp Sep 20, 2024
352b238
fix snowflake and mssql
sh-rp Sep 20, 2024
5fadeeb
Merge branch 'devel' into exp/1095-expose-readable-datasets
sh-rp Sep 20, 2024
9a1752d
clean up compose files a bit
sh-rp Sep 20, 2024
a77192f
fix sqlalchemy
sh-rp Sep 20, 2024
420eaf1
add mysql docker compose file
sh-rp Sep 20, 2024
97e2757
fix linting
sh-rp Sep 20, 2024
df4f6d0
prepare hint checking
sh-rp Sep 20, 2024
6b27b98
disable part of state test
sh-rp Sep 22, 2024
ffba901
enable hint check
sh-rp Sep 23, 2024
fab5232
add column type support for filesystem json
sh-rp Sep 23, 2024
0de4a6c
rename dataset implementation to DBAPI
sh-rp Sep 23, 2024
077a25a
wrap functions in dbapi readable dataset
sh-rp Sep 23, 2024
13a759b
remove example pipeline
sh-rp Sep 23, 2024
10e04d6
rename test_decimal_name
sh-rp Sep 23, 2024
5077ce1
make column code a bit clearer and fix mssql again
sh-rp Sep 23, 2024
1025560
rename df methods to pandas
sh-rp Sep 23, 2024
f8927d3
fix bug in default columns
sh-rp Sep 23, 2024
7fd3c62
fix hints test and columns bug
sh-rp Sep 23, 2024
3a76178
catch mysql error if no rows returned
sh-rp Sep 23, 2024
27104e3
add exceptions for not implemented bucket and filetypes
sh-rp Sep 23, 2024
1c06d11
fix docs
sh-rp Sep 23, 2024
e5b3688
Merge branch 'devel' into exp/1095-expose-readable-datasets
sh-rp Sep 23, 2024
7d09bdb
add config section for getting pipeline clients
sh-rp Sep 26, 2024
dbe4baa
set default dataset in filesystem sqlclient
sh-rp Sep 26, 2024
f4e0099
add config section for sync_destination
sh-rp Sep 26, 2024
80fe898
Merge branch 'devel' into exp/1095-expose-readable-datasets
sh-rp Sep 26, 2024
d698cd5
Merge branch 'devel' into exp/1095-expose-readable-datasets
sh-rp Sep 27, 2024
857803c
rename readablerelation methods
sh-rp Sep 30, 2024
8055529
use more functions of the duckdb sql client in filesystem version
sh-rp Sep 30, 2024
24c7308
update dependencies
sh-rp Sep 30, 2024
76759cf
use active pipeline capabilities if available for arrow table
sh-rp Sep 30, 2024
d3d8381
update types
sh-rp Sep 30, 2024
f9a766d
rename dataset accessor function
sh-rp Sep 30, 2024
b6c7fbc
add test for accessing tables with unqualified tablename
sh-rp Sep 30, 2024
86fc914
fix sql client
sh-rp Sep 30, 2024
58380ec
add duckdb native support for azure, s3 and gcs (via s3)
sh-rp Sep 30, 2024
0a24b3a
some typing
sh-rp Sep 30, 2024
bef50d7
add dataframes tests back in
sh-rp Sep 30, 2024
b13e492
add join table and update view tests for filesystem
sh-rp Sep 30, 2024
92ea515
start adding tests for creating views on remote duckdb
sh-rp Sep 30, 2024
e1fa308
fix snippets
sh-rp Sep 30, 2024
a7958d5
fix some dependencies and mssql/synapse tests
sh-rp Sep 30, 2024
ed197ea
fix bigquery dependencies and abfss tests
sh-rp Oct 1, 2024
0ec1656
add tests for adding view to external dbs and persistent secrets
sh-rp Oct 1, 2024
9cd4173
add support for delta tables
sh-rp Oct 1, 2024
7dba771
add duckdb to read interface tests
sh-rp Oct 1, 2024
3e96a6c
fix delta tests
sh-rp Oct 1, 2024
355f5b6
make default secret name derived from bucket url
sh-rp Oct 1, 2024
9002f02
try fix azure tests again
sh-rp Oct 1, 2024
c3050d4
fix df access tests
sh-rp Oct 2, 2024
bbc0525
PR fixes
sh-rp Oct 2, 2024
ef148c3
Merge branch 'devel' into exp/1095-expose-readable-datasets
sh-rp Oct 2, 2024
a99e987
Merge branch 'devel' into exp/1095-expose-readable-datasets
sh-rp Oct 2, 2024
eaf1cd8
correct internal table access
sh-rp Oct 4, 2024
6bb7117
allow datasets without schema
sh-rp Oct 4, 2024
6648b86
skips parametrized queries, skips tables from non-dataset schemas
rudolfix Oct 6, 2024
89a9861
move filesystem specific sql_client tests to correct location and tes…
sh-rp Oct 7, 2024
631d50b
fix sql client tests
sh-rp Oct 7, 2024
8e2e37c
make secret name when dropping optional
sh-rp Oct 7, 2024
dc383fc
fix gs test
sh-rp Oct 7, 2024
41926ae
remove moved filesystem tests from test_read_interfaces
sh-rp Oct 7, 2024
9b8437a
fix sql client tests again... :)
sh-rp Oct 7, 2024
5d14045
clear duckdb secrets
sh-rp Oct 8, 2024
fb9a445
disable secrets deleting for delta tests
sh-rp Oct 8, 2024
74 changes: 50 additions & 24 deletions dlt/destinations/impl/filesystem/sql_client.py
@@ -46,31 +46,26 @@ def __init__(
         )
         self.fs_client = fs_client
         self.using_external_database = duckdb_connection is not None
-        self.create_persistent_secrets = False
-        self.autocreate_required_views = False
+        self.autocreate_required_views = True

         if self.fs_client.config.protocol not in SUPPORTED_PROTOCOLS:
             raise NotImplementedError(
                 f"Protocol {self.fs_client.config.protocol} currently not supported for"
                 f" FilesystemSqlClient. Supported protocols are {SUPPORTED_PROTOCOLS}."
             )

-    def open_connection(self) -> duckdb.DuckDBPyConnection:
-        # we keep the in memory instance around, so if this prop is set, return it
-        if self._conn:
-            return self._conn
-        super().open_connection()
+    def create_authentication(self, persistent: bool = False, secret_name: str = None) -> None:
+        if not secret_name:
+            secret_name = f"secret_{self.fs_client.config.protocol}"

-        # set up connection and dataset
-        self._existing_views: List[str] = []  # remember which views were already created
-        if not self.has_dataset():
-            self.create_dataset()
-        self._conn.sql(f"USE {self.dataset_name}")
-        self.autocreate_required_views = True
+        persistent_stmt = ""
+        if persistent:
+            persistent_stmt = " PERSISTENT "

-        persistent = ""
-        if self.create_persistent_secrets:
-            persistent = " PERSISTENT "
+        # abfss buckets have an @ component
+        scope = self.fs_client.config.bucket_url
+        if "@" in scope:
+            scope = scope.split("@")[0]

         # add secrets required for creating views
         if self.fs_client.config.protocol == "s3":
@@ -81,12 +76,13 @@ def open_connection(self) -> duckdb.DuckDBPyConnection:
else "s3.amazonaws.com"
)
self._conn.sql(f"""
CREATE {persistent} SECRET secret_aws (
CREATE OR REPLACE {persistent_stmt} SECRET {secret_name} (
TYPE S3,
KEY_ID '{aws_creds.aws_access_key_id}',
SECRET '{aws_creds.aws_secret_access_key}',
REGION '{aws_creds.region_name}',
ENDPOINT '{endpoint}'
ENDPOINT '{endpoint}',
SCOPE '{scope}'
);""")

# azure with storage account creds
@@ -95,9 +91,10 @@
         ):
             azsa_creds = self.fs_client.config.credentials
             self._conn.sql(f"""
-            CREATE {persistent} SECRET secret_az (
+            CREATE OR REPLACE {persistent_stmt} SECRET {secret_name} (
                 TYPE AZURE,
-                CONNECTION_STRING 'AccountName={azsa_creds.azure_storage_account_name};AccountKey={azsa_creds.azure_storage_account_key}'
+                CONNECTION_STRING 'AccountName={azsa_creds.azure_storage_account_name};AccountKey={azsa_creds.azure_storage_account_key}',
+                SCOPE '{scope}'
             );""")

         # azure with service principal creds
@@ -106,14 +103,21 @@
         ):
             azsp_creds = self.fs_client.config.credentials
             self._conn.sql(f"""
-            CREATE SECRET secret_az (
+            CREATE OR REPLACE {persistent_stmt} SECRET {secret_name} (
                 TYPE AZURE,
                 PROVIDER SERVICE_PRINCIPAL,
                 TENANT_ID '{azsp_creds.azure_tenant_id}',
                 CLIENT_ID '{azsp_creds.azure_client_id}',
                 CLIENT_SECRET '{azsp_creds.azure_client_secret}',
-                ACCOUNT_NAME '{azsp_creds.azure_storage_account_name}'
+                ACCOUNT_NAME '{azsp_creds.azure_storage_account_name}',
+                SCOPE '{scope}'
             );""")
+        elif persistent:
+            raise Exception(
+                "Cannot create persistent secret for filesystem protocol"
+                f" {self.fs_client.config.protocol}. If you are trying to use persistent secrets"
+                " with gs/gcs, please use the s3 compatibility layer."
+            )

         # native google storage implementation is not supported..
         elif self.fs_client.config.protocol in ["gs", "gcs"]:
@@ -127,6 +131,28 @@
         elif self.fs_client.config.protocol == "memory":
             self._conn.register_filesystem(self.fs_client.fs_client)

+    def open_connection(self) -> duckdb.DuckDBPyConnection:
+        # we keep the in memory instance around, so if this prop is set, return it
+        if self._conn:
+            return self._conn
+        super().open_connection()
+
+        # set up connection and dataset
+        self._existing_views: List[str] = []  # remember which views were already created
+
+        self.autocreate_required_views = False
+        if not self.has_dataset():
+            self.create_dataset()
+        self.autocreate_required_views = True
+        self._conn.sql(f"USE {self.dataset_name}")
+
+        # the line below solves problems with certificate path lookup on linux
+        # see duckdb docs
+        self._conn.sql("SET azure_transport_option_type = 'curl';")
+
+        # create authentication to data provider
+        self.create_authentication()
+
+        return self._conn

     def close_connection(self) -> None:
Collaborator: you won't need the override if you pass the connection instance in credentials.

Collaborator (Author): I removed it.

@@ -135,7 +161,7 @@ def close_connection(self) -> None:
         return super().close_connection()

     @raise_database_error
-    def create_view_for_tables(self, tables: Dict[str, str]) -> None:
+    def create_views_for_tables(self, tables: Dict[str, str]) -> None:
         """Add the required tables as views to the duckdb in memory instance"""

         # create all tables in duck instance
@@ -208,7 +234,7 @@ def execute_query(self, query: AnyStr, *args: Any, **kwargs: Any) -> Iterator[DB
         if self.autocreate_required_views:  # skip this step when operating on the schema..
             expression = sqlglot.parse_one(query, read="duckdb")  # type: ignore
             load_tables = {t.name: t.name for t in expression.find_all(exp.Table)}
-            self.create_view_for_tables(load_tables)
+            self.create_views_for_tables(load_tables)

         # TODO: raise on non-select queries here, they do not make sense in this context
         with super().execute_query(query, *args, **kwargs) as cursor:
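The view autocreation above hinges on sqlglot's table extraction. A minimal standalone sketch of that lookup, using a query shape borrowed from the tests below:

```python
# Standalone sketch of the table lookup execute_query performs above:
# sqlglot parses the incoming query and yields every referenced table name.
import sqlglot
from sqlglot import expressions as exp

query = "SELECT i.id, di.double_id FROM items AS i JOIN double_items AS di ON i.id = di.id"
expression = sqlglot.parse_one(query, read="duckdb")
load_tables = {t.name: t.name for t in expression.find_all(exp.Table)}
print(load_tables)  # {'items': 'items', 'double_items': 'double_items'}
```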
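For reference, a minimal sketch of the scoped secret DDL that `create_authentication` renders for s3. The credential values and the secret name are placeholders; `duckdb_secrets()` is DuckDB's built-in listing function.

```python
# Minimal sketch of the scoped DuckDB secret the client above generates for
# s3; every credential value here is a placeholder.
import duckdb

conn = duckdb.connect()
scope = "s3://my-bucket"  # abfss-style URLs are cut at the "@" before use as scope

conn.sql(f"""
    CREATE OR REPLACE SECRET secret_s3 (
        TYPE S3,
        KEY_ID 'AKIA_PLACEHOLDER',
        SECRET 'PLACEHOLDER',
        REGION 'eu-central-1',
        ENDPOINT 's3.amazonaws.com',
        SCOPE '{scope}'
    );""")

# SCOPE restricts the secret to paths under the bucket, so secrets for several
# buckets can coexist on one connection; duckdb_secrets() lists what is defined
print(conn.sql("SELECT name, type, scope FROM duckdb_secrets()").fetchall())
```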
59 changes: 40 additions & 19 deletions tests/load/test_read_interfaces.py
@@ -1,9 +1,12 @@
+from typing import Any
+
 import pytest
 import dlt
 import os

 from dlt import Pipeline
 from dlt.common import Decimal
+from dlt.common.utils import uniq_id

 from typing import List
 from functools import reduce
@@ -200,7 +203,7 @@ def double_items():

     # check we can create new tables from the views
     with pipeline.sql_client() as c:
-        c.create_view_for_tables({"items": "items", "double_items": "double_items"})
+        c.create_views_for_tables({"items": "items", "double_items": "double_items"})
         c.execute_sql(
             "CREATE TABLE items_joined AS (SELECT i.id, di.double_id FROM items as i JOIN"
             " double_items as di ON (i.id = di.id));"
@@ -218,30 +221,48 @@
     except Exception as exc:
         assert "double_items is not an table" in str(exc)

-    # we create a second duckdb pipeline and will see if we can make our filesystem views available there
-    other_pipeline = dlt.pipeline("other_pipeline", dev_mode=True, destination="duckdb")
-    other_db_location = (
-        other_pipeline.destination_client().config.credentials.database  # type: ignore
-    )
-    other_pipeline.run([1, 2, 3], table_name="items")
-    assert len(other_pipeline._dataset().items.fetchall()) == 3
-
-    # TODO: implement these tests
-    return
+    # we create a duckdb with a table and see whether we can add more views
+    duck_db_location = "_storage/" + uniq_id()
+    external_db = duckdb.connect(duck_db_location)
+    external_db.execute("CREATE SCHEMA first;")
+    external_db.execute("CREATE SCHEMA second;")
+    external_db.execute("CREATE TABLE first.items AS SELECT i FROM range(0, 3) t(i)")
+    assert len(external_db.sql("SELECT * FROM first.items").fetchall()) == 3

+    # now we can use the FilesystemSqlClient to create the needed views
+    fs_client: Any = pipeline.destination_client()
Collaborator (Author): this interface is a bit clunky at the moment. we should probably have a unified interface that enables us to mount views into duckdb from any destination that supports it, if you have an idea of where it could live and what the signature should be you can let me know. for the first little while it should be ok, since we are using inner classes here. the secrets are actually also not stored in the database file but in a global directory for duckdb, so I've built it in a way that whoever controls the duckdb connection from the outside can also create secrets with a given key and is then able to delete them again.

Collaborator: the interface is already there. it is called update_stored_schema on the job client. in this case filesystem is a staging destination for duckdb and you create all tables as views, pass the credentials etc. maybe we need some convenience method that creates a dataset instance out of such a structure (so we take not just destination but also staging as input to dataset). the nice thing is that all this duckdb related code that creates views and does permission handover could go to the duckdb client. this is a good followup ticket: #1692

     fs_sql_client = FilesystemSqlClient(
-        pipeline.destination_client(),
-        dataset_name=other_pipeline.dataset_name,
-        duckdb_connection=duckdb.connect(other_db_location),
+        dataset_name="second",
+        fs_client=fs_client,
+        duckdb_connection=external_db,
     )
-    fs_sql_client.create_persistent_secrets = True
     with fs_sql_client as sql_client:
-        sql_client.create_view_for_tables({"items": "referenced_items"})
+        sql_client.create_views_for_tables({"items": "referenced_items"})
+    assert len(external_db.sql("SELECT * FROM second.referenced_items").fetchall()) == 3000
+    assert len(external_db.sql("SELECT * FROM first.items").fetchall()) == 3

+    # test creating persistent secrets
+    # NOTE: there is some kind of duckdb cache that makes testing persistent secrets impossible
+    # because somehow the non-persistent secrets are around as long as the python process runs, even
+    # when closing the db connection, renaming the db file and reconnecting
+    secret_name = f"secret_{uniq_id()}_secret"
+
+    supports_persistent_secrets = (
+        destination_config.bucket_url.startswith("s3")
+        or destination_config.bucket_url.startswith("az")
+        or destination_config.bucket_url.startswith("abfss")
+    )

-    # we now have access to this view on the original dataset
-    assert len(other_pipeline._dataset().items.fetchall()) == 3
-    assert len(other_pipeline._dataset().referenced_items.fetchall()) == 3000
+    try:
+        with fs_sql_client as sql_client:
+            fs_sql_client.create_authentication(persistent=True, secret_name=secret_name)
+            # the line below would error if there were no persistent secrets of the given name
+            external_db.execute(f"DROP PERSISTENT SECRET {secret_name}")
+    except Exception as exc:
+        assert (
+            not supports_persistent_secrets
+        ), f"{destination_config.bucket_url} is expected to support persistent secrets"
+        assert "Cannot create persistent secret" in str(exc)


@pytest.mark.essential
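Putting the pieces together, here is a condensed sketch of the external-database flow the test above exercises. It assumes an existing `pipeline` already loaded to a filesystem destination; everything else mirrors the test code.

```python
# Condensed sketch of the external-duckdb flow tested above; assumes an
# existing `pipeline` object loaded to a filesystem destination.
import duckdb
from dlt.destinations.impl.filesystem.sql_client import FilesystemSqlClient

external_db = duckdb.connect("analytics.duckdb")
external_db.execute("CREATE SCHEMA IF NOT EXISTS second;")

fs_sql_client = FilesystemSqlClient(
    dataset_name="second",
    fs_client=pipeline.destination_client(),  # filesystem destination client
    duckdb_connection=external_db,
)

# mount loaded tables as views in the external database
with fs_sql_client as sql_client:
    sql_client.create_views_for_tables({"items": "referenced_items"})

# persist credentials under a known name so other processes can query the
# views, then drop them again when done (s3/az/abfss only, per the client above)
with fs_sql_client as sql_client:
    sql_client.create_authentication(persistent=True, secret_name="my_named_secret")
external_db.execute("DROP PERSISTENT SECRET my_named_secret")
```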