diff --git a/.github/workflows/deploy_docs.yml b/.github/workflows/deploy_docs.yml index 00e7462311..4dbfd3fb9a 100644 --- a/.github/workflows/deploy_docs.yml +++ b/.github/workflows/deploy_docs.yml @@ -11,6 +11,7 @@ env: jobs: deploy: runs-on: ubuntu-latest + if: ${{ !github.event.pull_request.head.repo.fork }} steps: - name: Trigger deploy hook run: curl ${{ env.NETLIFY_DOCS_PRODUCTION_DEPLOY_HOOK }} -X POST diff --git a/dlt/common/__init__.py b/dlt/common/__init__.py index 466fd7c546..0c8a09ec3e 100644 --- a/dlt/common/__init__.py +++ b/dlt/common/__init__.py @@ -1,8 +1,8 @@ +from dlt.common import logger from dlt.common.arithmetics import Decimal from dlt.common.wei import Wei from dlt.common.pendulum import pendulum from dlt.common.json import json from dlt.common.runtime.signals import sleep -from dlt.common.runtime import logger __all__ = ["Decimal", "Wei", "pendulum", "json", "sleep", "logger"] diff --git a/dlt/common/configuration/resolve.py b/dlt/common/configuration/resolve.py index b398f0463a..ebfa7b6b89 100644 --- a/dlt/common/configuration/resolve.py +++ b/dlt/common/configuration/resolve.py @@ -76,7 +76,7 @@ def initialize_credentials(hint: Any, initial_value: Any) -> CredentialsConfigur first_credentials: CredentialsConfiguration = None for idx, spec in enumerate(specs_in_union): try: - credentials = spec(initial_value) + credentials = spec.from_init_value(initial_value) if credentials.is_resolved(): return credentials # keep first credentials in the union to return in case all of the match but not resolve @@ -88,7 +88,7 @@ def initialize_credentials(hint: Any, initial_value: Any) -> CredentialsConfigur return first_credentials else: assert issubclass(hint, CredentialsConfiguration) - return hint(initial_value) # type: ignore + return hint.from_init_value(initial_value) # type: ignore def inject_section( diff --git a/dlt/common/configuration/specs/api_credentials.py b/dlt/common/configuration/specs/api_credentials.py index fd7ae8cb09..918cd4ee45 
100644 --- a/dlt/common/configuration/specs/api_credentials.py +++ b/dlt/common/configuration/specs/api_credentials.py @@ -6,9 +6,9 @@ @configspec class OAuth2Credentials(CredentialsConfiguration): - client_id: str - client_secret: TSecretValue - refresh_token: Optional[TSecretValue] + client_id: str = None + client_secret: TSecretValue = None + refresh_token: Optional[TSecretValue] = None scopes: Optional[List[str]] = None token: Optional[TSecretValue] = None diff --git a/dlt/common/configuration/specs/aws_credentials.py b/dlt/common/configuration/specs/aws_credentials.py index ee7360e2cb..ee49e79e40 100644 --- a/dlt/common/configuration/specs/aws_credentials.py +++ b/dlt/common/configuration/specs/aws_credentials.py @@ -121,3 +121,9 @@ def parse_native_representation(self, native_value: Any) -> None: self.__is_resolved__ = True except Exception: raise InvalidBoto3Session(self.__class__, native_value) + + @classmethod + def from_session(cls, botocore_session: Any) -> "AwsCredentials": + self = cls() + self.parse_native_representation(botocore_session) + return self diff --git a/dlt/common/configuration/specs/base_configuration.py b/dlt/common/configuration/specs/base_configuration.py index 62abf42f27..06fb97fcdd 100644 --- a/dlt/common/configuration/specs/base_configuration.py +++ b/dlt/common/configuration/specs/base_configuration.py @@ -2,6 +2,7 @@ import inspect import contextlib import dataclasses +import warnings from collections.abc import Mapping as C_Mapping from typing import ( @@ -19,7 +20,7 @@ ClassVar, TypeVar, ) -from typing_extensions import get_args, get_origin +from typing_extensions import get_args, get_origin, dataclass_transform from functools import wraps if TYPE_CHECKING: @@ -44,6 +45,7 @@ _F_BaseConfiguration: Any = type(object) _F_ContainerInjectableContext: Any = type(object) _T = TypeVar("_T", bound="BaseConfiguration") +_C = TypeVar("_C", bound="CredentialsConfiguration") def is_base_configuration_inner_hint(inner_hint: Type[Any]) -> 
bool: @@ -106,18 +108,26 @@ def is_secret_hint(hint: Type[Any]) -> bool: @overload -def configspec(cls: Type[TAnyClass]) -> Type[TAnyClass]: ... +def configspec(cls: Type[TAnyClass], init: bool = True) -> Type[TAnyClass]: ... @overload -def configspec(cls: None = ...) -> Callable[[Type[TAnyClass]], Type[TAnyClass]]: ... +def configspec( + cls: None = ..., init: bool = True +) -> Callable[[Type[TAnyClass]], Type[TAnyClass]]: ... +@dataclass_transform(eq_default=False, field_specifiers=(dataclasses.Field, dataclasses.field)) def configspec( - cls: Optional[Type[Any]] = None, + cls: Optional[Type[Any]] = None, init: bool = True ) -> Union[Type[TAnyClass], Callable[[Type[TAnyClass]], Type[TAnyClass]]]: """Converts (via derivation) any decorated class to a Python dataclass that may be used as a spec to resolve configurations + __init__ method is synthesized by default. `init` flag is ignored if the decorated class implements custom __init__ as well as + when any of the base classes has no synthesized __init__ + + All fields must have default values. This decorator will assign a `None` default to any field that misses one. + In comparison the Python dataclass, a spec implements full dictionary interface for its attributes, allows instance creation from ie. strings or other types (parsing, deserialization) and control over configuration resolution process. See `BaseConfiguration` and CredentialsConfiguration` for more information. @@ -142,6 +152,10 @@ def wrap(cls: Type[TAnyClass]) -> Type[TAnyClass]: # get all annotations without corresponding attributes and set them to None for ann in cls.__annotations__: if not hasattr(cls, ann) and not ann.startswith(("__", "_abc_")): + warnings.warn( + f"Missing default value for field {ann} on {cls.__name__}. None assumed. All" + " fields in configspec must have default." 
+ ) setattr(cls, ann, None) # get all attributes without corresponding annotations for att_name, att_value in list(cls.__dict__.items()): @@ -177,17 +191,18 @@ def default_factory(att_value=att_value): # type: ignore[no-untyped-def] # We don't want to overwrite user's __init__ method # Create dataclass init only when not defined in the class - # (never put init on BaseConfiguration itself) - try: - is_base = cls is BaseConfiguration - except NameError: - is_base = True - init = False - base_params = getattr(cls, "__dataclass_params__", None) - if not is_base and (base_params and base_params.init or cls.__init__ is object.__init__): - init = True + # NOTE: any class without synthesized __init__ breaks the creation chain + has_default_init = super(cls, cls).__init__ == cls.__init__ # type: ignore[misc] + base_params = getattr(cls, "__dataclass_params__", None) # cls.__init__ is object.__init__ + synth_init = init and ((not base_params or base_params.init) and has_default_init) + if synth_init != init and has_default_init: + warnings.warn( + f"__init__ method will not be generated on {cls.__name__} because base class didn't" + " synthesize __init__. Please correct `init` flag in configspec decorator. 
You are" + " probably receiving incorrect __init__ signature for type checking" + ) # do not generate repr as it may contain secret values - return dataclasses.dataclass(cls, init=init, eq=False, repr=False) # type: ignore + return dataclasses.dataclass(cls, init=synth_init, eq=False, repr=False) # type: ignore # called with parenthesis if cls is None: @@ -198,12 +213,14 @@ def default_factory(att_value=att_value): # type: ignore[no-untyped-def] @configspec class BaseConfiguration(MutableMapping[str, Any]): - __is_resolved__: bool = dataclasses.field(default=False, init=False, repr=False) + __is_resolved__: bool = dataclasses.field(default=False, init=False, repr=False, compare=False) """True when all config fields were resolved and have a specified value type""" - __section__: str = dataclasses.field(default=None, init=False, repr=False) - """Obligatory section used by config providers when searching for keys, always present in the search path""" - __exception__: Exception = dataclasses.field(default=None, init=False, repr=False) + __exception__: Exception = dataclasses.field( + default=None, init=False, repr=False, compare=False + ) """Holds the exception that prevented the full resolution""" + __section__: ClassVar[str] = None + """Obligatory section used by config providers when searching for keys, always present in the search path""" __config_gen_annotations__: ClassVar[List[str]] = [] """Additional annotations for config generator, currently holds a list of fields of interest that have defaults""" __dataclass_fields__: ClassVar[Dict[str, TDtcField]] @@ -342,9 +359,10 @@ def call_method_in_mro(config, method_name: str) -> None: class CredentialsConfiguration(BaseConfiguration): """Base class for all credentials. 
Credentials are configurations that may be stored only by providers supporting secrets.""" - __section__: str = "credentials" + __section__: ClassVar[str] = "credentials" - def __init__(self, init_value: Any = None) -> None: + @classmethod + def from_init_value(cls: Type[_C], init_value: Any = None) -> _C: """Initializes credentials from `init_value` Init value may be a native representation of the credentials or a dict. In case of native representation (for example a connection string or JSON with service account credentials) @@ -353,14 +371,10 @@ def __init__(self, init_value: Any = None) -> None: Credentials will be marked as resolved if all required fields are set. """ - if init_value is None: - return - elif isinstance(init_value, C_Mapping): - self.update(init_value) - else: - self.parse_native_representation(init_value) - if not self.is_partial(): - self.resolve() + # create an instance + self = cls() + self._apply_init_value(init_value) + return self def to_native_credentials(self) -> Any: """Returns native credentials object. 
@@ -369,6 +383,16 @@ def to_native_credentials(self) -> Any: """ return self.to_native_representation() + def _apply_init_value(self, init_value: Any = None) -> None: + if isinstance(init_value, C_Mapping): + self.update(init_value) + elif init_value is not None: + self.parse_native_representation(init_value) + else: + return + if not self.is_partial(): + self.resolve() + def __str__(self) -> str: """Get string representation of credentials to be displayed, with all secret parts removed""" return super().__str__() diff --git a/dlt/common/configuration/specs/config_providers_context.py b/dlt/common/configuration/specs/config_providers_context.py index 860e7414de..642634fb0a 100644 --- a/dlt/common/configuration/specs/config_providers_context.py +++ b/dlt/common/configuration/specs/config_providers_context.py @@ -1,4 +1,5 @@ import contextlib +import dataclasses import io from typing import ClassVar, List @@ -28,7 +29,7 @@ class ConfigProvidersConfiguration(BaseConfiguration): only_toml_fragments: bool = True # always look in providers - __section__ = known_sections.PROVIDERS + __section__: ClassVar[str] = known_sections.PROVIDERS @configspec @@ -37,8 +38,12 @@ class ConfigProvidersContext(ContainerInjectableContext): global_affinity: ClassVar[bool] = True - providers: List[ConfigProvider] - context_provider: ConfigProvider + providers: List[ConfigProvider] = dataclasses.field( + default=None, init=False, repr=False, compare=False + ) + context_provider: ConfigProvider = dataclasses.field( + default=None, init=False, repr=False, compare=False + ) def __init__(self) -> None: super().__init__() diff --git a/dlt/common/configuration/specs/config_section_context.py b/dlt/common/configuration/specs/config_section_context.py index a656a2b0fe..1e6cd56155 100644 --- a/dlt/common/configuration/specs/config_section_context.py +++ b/dlt/common/configuration/specs/config_section_context.py @@ -8,7 +8,7 @@ class ConfigSectionContext(ContainerInjectableContext): TMergeFunc = 
Callable[["ConfigSectionContext", "ConfigSectionContext"], None] - pipeline_name: Optional[str] + pipeline_name: Optional[str] = None sections: Tuple[str, ...] = () merge_style: TMergeFunc = None source_state_key: str = None @@ -70,13 +70,3 @@ def __str__(self) -> str: super().__str__() + f": {self.pipeline_name} {self.sections}@{self.merge_style} state['{self.source_state_key}']" ) - - if TYPE_CHECKING: - # provide __init__ signature when type checking - def __init__( - self, - pipeline_name: str = None, - sections: Tuple[str, ...] = (), - merge_style: TMergeFunc = None, - source_state_key: str = None, - ) -> None: ... diff --git a/dlt/common/configuration/specs/connection_string_credentials.py b/dlt/common/configuration/specs/connection_string_credentials.py index 54007bb127..21e635a07c 100644 --- a/dlt/common/configuration/specs/connection_string_credentials.py +++ b/dlt/common/configuration/specs/connection_string_credentials.py @@ -1,14 +1,15 @@ -from typing import Any, ClassVar, Dict, List, Optional +import dataclasses +from typing import Any, ClassVar, Dict, List, Optional, Union + from dlt.common.libs.sql_alchemy import URL, make_url from dlt.common.configuration.specs.exceptions import InvalidConnectionString - from dlt.common.typing import TSecretValue from dlt.common.configuration.specs.base_configuration import CredentialsConfiguration, configspec @configspec class ConnectionStringCredentials(CredentialsConfiguration): - drivername: str = None + drivername: str = dataclasses.field(default=None, init=False, repr=False, compare=False) database: str = None password: Optional[TSecretValue] = None username: str = None @@ -18,6 +19,11 @@ class ConnectionStringCredentials(CredentialsConfiguration): __config_gen_annotations__: ClassVar[List[str]] = ["port", "password", "host"] + def __init__(self, connection_string: Union[str, Dict[str, Any]] = None) -> None: + """Initializes the credentials from SQLAlchemy like connection string or from dict holding connection 
string elements""" + super().__init__() + self._apply_init_value(connection_string) + def parse_native_representation(self, native_value: Any) -> None: if not isinstance(native_value, str): raise InvalidConnectionString(self.__class__, native_value, self.drivername) diff --git a/dlt/common/configuration/specs/gcp_credentials.py b/dlt/common/configuration/specs/gcp_credentials.py index 431f35c8d0..4d81a493a3 100644 --- a/dlt/common/configuration/specs/gcp_credentials.py +++ b/dlt/common/configuration/specs/gcp_credentials.py @@ -1,5 +1,6 @@ +import dataclasses import sys -from typing import Any, Final, List, Tuple, Union, Dict +from typing import Any, ClassVar, Final, List, Tuple, Union, Dict from dlt.common import json, pendulum from dlt.common.configuration.specs.api_credentials import OAuth2Credentials @@ -22,8 +23,12 @@ @configspec class GcpCredentials(CredentialsConfiguration): - token_uri: Final[str] = "https://oauth2.googleapis.com/token" - auth_uri: Final[str] = "https://accounts.google.com/o/oauth2/auth" + token_uri: Final[str] = dataclasses.field( + default="https://oauth2.googleapis.com/token", init=False, repr=False, compare=False + ) + auth_uri: Final[str] = dataclasses.field( + default="https://accounts.google.com/o/oauth2/auth", init=False, repr=False, compare=False + ) project_id: str = None @@ -69,7 +74,9 @@ def to_gcs_credentials(self) -> Dict[str, Any]: class GcpServiceAccountCredentialsWithoutDefaults(GcpCredentials): private_key: TSecretValue = None client_email: str = None - type: Final[str] = "service_account" # noqa: A003 + type: Final[str] = dataclasses.field( # noqa: A003 + default="service_account", init=False, repr=False, compare=False + ) def parse_native_representation(self, native_value: Any) -> None: """Accepts ServiceAccountCredentials as native value. 
In other case reverts to serialized services.json""" @@ -121,8 +128,10 @@ def __str__(self) -> str: @configspec class GcpOAuthCredentialsWithoutDefaults(GcpCredentials, OAuth2Credentials): # only desktop app supported - refresh_token: TSecretValue - client_type: Final[str] = "installed" + refresh_token: TSecretValue = None + client_type: Final[str] = dataclasses.field( + default="installed", init=False, repr=False, compare=False + ) def parse_native_representation(self, native_value: Any) -> None: """Accepts Google OAuth2 credentials as native value. In other case reverts to serialized oauth client secret json""" @@ -237,7 +246,7 @@ def __str__(self) -> str: @configspec class GcpDefaultCredentials(CredentialsWithDefault, GcpCredentials): - _LAST_FAILED_DEFAULT: float = 0.0 + _LAST_FAILED_DEFAULT: ClassVar[float] = 0.0 def parse_native_representation(self, native_value: Any) -> None: """Accepts google credentials as native value""" diff --git a/dlt/common/configuration/specs/known_sections.py b/dlt/common/configuration/specs/known_sections.py index 97ba85ffd6..8bd754ddd5 100644 --- a/dlt/common/configuration/specs/known_sections.py +++ b/dlt/common/configuration/specs/known_sections.py @@ -13,6 +13,9 @@ EXTRACT = "extract" """extract stage of the pipeline""" +SCHEMA = "schema" +"""schema configuration, ie. 
normalizers""" + PROVIDERS = "providers" """secrets and config providers""" diff --git a/dlt/common/configuration/specs/run_configuration.py b/dlt/common/configuration/specs/run_configuration.py index 54ce46ceba..b57b4abbdd 100644 --- a/dlt/common/configuration/specs/run_configuration.py +++ b/dlt/common/configuration/specs/run_configuration.py @@ -1,7 +1,7 @@ import binascii from os.path import isfile, join from pathlib import Path -from typing import Any, Optional, Tuple, IO +from typing import Any, ClassVar, Optional, IO from dlt.common.typing import TSecretStrValue from dlt.common.utils import encoding_for_mode, main_module_file_path, reveal_pseudo_secret @@ -30,7 +30,7 @@ class RunConfiguration(BaseConfiguration): """Platform connection""" dlthub_dsn: Optional[TSecretStrValue] = None - __section__ = "runtime" + __section__: ClassVar[str] = "runtime" def on_resolved(self) -> None: # generate pipeline name from the entry point script name diff --git a/dlt/common/data_writers/buffered.py b/dlt/common/data_writers/buffered.py index 24935d73ac..b10b1d14b9 100644 --- a/dlt/common/data_writers/buffered.py +++ b/dlt/common/data_writers/buffered.py @@ -1,6 +1,6 @@ import gzip import time -from typing import List, IO, Any, Optional, Type, TypeVar, Generic +from typing import ClassVar, List, IO, Any, Optional, Type, TypeVar, Generic from dlt.common.typing import TDataItem, TDataItems from dlt.common.data_writers import TLoaderFileFormat @@ -33,7 +33,7 @@ class BufferedDataWriterConfiguration(BaseConfiguration): disable_compression: bool = False _caps: Optional[DestinationCapabilitiesContext] = None - __section__ = known_sections.DATA_WRITER + __section__: ClassVar[str] = known_sections.DATA_WRITER @with_config(spec=BufferedDataWriterConfiguration) def __init__( diff --git a/dlt/common/data_writers/writers.py b/dlt/common/data_writers/writers.py index 0f3640da1e..2aadb010e0 100644 --- a/dlt/common/data_writers/writers.py +++ b/dlt/common/data_writers/writers.py @@ -4,6 
+4,7 @@ IO, TYPE_CHECKING, Any, + ClassVar, Dict, List, Optional, @@ -236,7 +237,7 @@ class ParquetDataWriterConfiguration(BaseConfiguration): timestamp_timezone: str = "UTC" row_group_size: Optional[int] = None - __section__: str = known_sections.DATA_WRITER + __section__: ClassVar[str] = known_sections.DATA_WRITER class ParquetDataWriter(DataWriter): diff --git a/dlt/common/destination/capabilities.py b/dlt/common/destination/capabilities.py index 36a9cc3b6e..7a64f32ea3 100644 --- a/dlt/common/destination/capabilities.py +++ b/dlt/common/destination/capabilities.py @@ -30,22 +30,22 @@ class DestinationCapabilitiesContext(ContainerInjectableContext): """Injectable destination capabilities required for many Pipeline stages ie. normalize""" - preferred_loader_file_format: TLoaderFileFormat - supported_loader_file_formats: List[TLoaderFileFormat] - preferred_staging_file_format: Optional[TLoaderFileFormat] - supported_staging_file_formats: List[TLoaderFileFormat] - escape_identifier: Callable[[str], str] - escape_literal: Callable[[Any], Any] - decimal_precision: Tuple[int, int] - wei_precision: Tuple[int, int] - max_identifier_length: int - max_column_identifier_length: int - max_query_length: int - is_max_query_length_in_bytes: bool - max_text_data_type_length: int - is_max_text_data_type_length_in_bytes: bool - supports_transactions: bool - supports_ddl_transactions: bool + preferred_loader_file_format: TLoaderFileFormat = None + supported_loader_file_formats: List[TLoaderFileFormat] = None + preferred_staging_file_format: Optional[TLoaderFileFormat] = None + supported_staging_file_formats: List[TLoaderFileFormat] = None + escape_identifier: Callable[[str], str] = None + escape_literal: Callable[[Any], Any] = None + decimal_precision: Tuple[int, int] = None + wei_precision: Tuple[int, int] = None + max_identifier_length: int = None + max_column_identifier_length: int = None + max_query_length: int = None + is_max_query_length_in_bytes: bool = None + 
max_text_data_type_length: int = None + is_max_text_data_type_length_in_bytes: bool = None + supports_transactions: bool = None + supports_ddl_transactions: bool = None naming_convention: str = "snake_case" alter_add_multi_column: bool = True supports_truncate_command: bool = True diff --git a/dlt/common/destination/exceptions.py b/dlt/common/destination/exceptions.py new file mode 100644 index 0000000000..1b5423ff02 --- /dev/null +++ b/dlt/common/destination/exceptions.py @@ -0,0 +1,126 @@ +from typing import Any, Iterable, List + +from dlt.common.exceptions import DltException, TerminalException, TransientException + + +class DestinationException(DltException): + pass + + +class UnknownDestinationModule(DestinationException): + def __init__(self, destination_module: str) -> None: + self.destination_module = destination_module + if "." in destination_module: + msg = f"Destination module {destination_module} could not be found and imported" + else: + msg = f"Destination {destination_module} is not one of the standard dlt destinations" + super().__init__(msg) + + +class InvalidDestinationReference(DestinationException): + def __init__(self, destination_module: Any) -> None: + self.destination_module = destination_module + msg = f"Destination {destination_module} is not a valid destination module." 
+ super().__init__(msg) + + +class DestinationTerminalException(DestinationException, TerminalException): + pass + + +class DestinationUndefinedEntity(DestinationTerminalException): + pass + + +class DestinationTransientException(DestinationException, TransientException): + pass + + +class DestinationLoadingViaStagingNotSupported(DestinationTerminalException): + def __init__(self, destination: str) -> None: + self.destination = destination + super().__init__(f"Destination {destination} does not support loading via staging.") + + +class DestinationLoadingWithoutStagingNotSupported(DestinationTerminalException): + def __init__(self, destination: str) -> None: + self.destination = destination + super().__init__(f"Destination {destination} does not support loading without staging.") + + +class DestinationNoStagingMode(DestinationTerminalException): + def __init__(self, destination: str) -> None: + self.destination = destination + super().__init__(f"Destination {destination} cannot be used as a staging") + + +class DestinationIncompatibleLoaderFileFormatException(DestinationTerminalException): + def __init__( + self, destination: str, staging: str, file_format: str, supported_formats: Iterable[str] + ) -> None: + self.destination = destination + self.staging = staging + self.file_format = file_format + self.supported_formats = supported_formats + supported_formats_str = ", ".join(supported_formats) + if self.staging: + if not supported_formats: + msg = ( + f"Staging {staging} cannot be used with destination {destination} because they" + " have no file formats in common." + ) + else: + msg = ( + f"Unsupported file format {file_format} for destination {destination} in" + f" combination with staging destination {staging}. Supported formats:" + f" {supported_formats_str}" + ) + else: + msg = ( + f"Unsupported file format {file_format} for destination {destination}. Supported" + f" formats: {supported_formats_str}. 
Check the staging option in the dlt.pipeline" + " for additional formats." + ) + super().__init__(msg) + + +class IdentifierTooLongException(DestinationTerminalException): + def __init__( + self, + destination_name: str, + identifier_type: str, + identifier_name: str, + max_identifier_length: int, + ) -> None: + self.destination_name = destination_name + self.identifier_type = identifier_type + self.identifier_name = identifier_name + self.max_identifier_length = max_identifier_length + super().__init__( + f"The length of {identifier_type} {identifier_name} exceeds" + f" {max_identifier_length} allowed for {destination_name}" + ) + + +class DestinationHasFailedJobs(DestinationTerminalException): + def __init__(self, destination_name: str, load_id: str, failed_jobs: List[Any]) -> None: + self.destination_name = destination_name + self.load_id = load_id + self.failed_jobs = failed_jobs + super().__init__( + f"Destination {destination_name} has failed jobs in load package {load_id}" + ) + + +class DestinationSchemaTampered(DestinationTerminalException): + def __init__(self, schema_name: str, version_hash: str, stored_version_hash: str) -> None: + self.version_hash = version_hash + self.stored_version_hash = stored_version_hash + super().__init__( + f"Schema {schema_name} content was changed - by a loader or by destination code - from" + " the moment it was retrieved by load package. Such schema cannot reliably be updated" + f" nor saved. Current version hash: {version_hash} != stored version hash" + f" {stored_version_hash}. If you are using destination client directly, without storing" + " schema in load package, you should first save it into schema storage. You can also" + " use schema._bump_version() in test code to remove modified flag." 
+ ) diff --git a/dlt/common/destination/reference.py b/dlt/common/destination/reference.py index 6248efa1d6..ddcc5d1146 100644 --- a/dlt/common/destination/reference.py +++ b/dlt/common/destination/reference.py @@ -1,7 +1,5 @@ -import datetime # noqa: 251 -import inspect from abc import ABC, abstractmethod -from copy import deepcopy +import dataclasses from importlib import import_module from types import TracebackType from typing import ( @@ -14,7 +12,6 @@ Iterable, Type, Union, - TYPE_CHECKING, List, ContextManager, Dict, @@ -23,21 +20,13 @@ Generic, Final, ) +import datetime # noqa: 251 +from copy import deepcopy +import inspect from dlt.common import logger -from dlt.common.configuration import configspec, resolve_configuration, known_sections -from dlt.common.configuration.accessors import config -from dlt.common.configuration.specs import BaseConfiguration, CredentialsConfiguration -from dlt.common.configuration.specs import GcpCredentials, AwsCredentialsWithoutDefaults -from dlt.common.destination.capabilities import DestinationCapabilitiesContext -from dlt.common.exceptions import ( - IdentifierTooLongException, - InvalidDestinationReference, - UnknownDestinationModule, -) from dlt.common.schema import Schema, TTableSchema, TSchemaTables from dlt.common.schema.exceptions import SchemaException -from dlt.common.schema.exceptions import UnknownTableException from dlt.common.schema.utils import ( get_write_disposition, get_table_format, @@ -45,14 +34,25 @@ has_column_with_prop, get_first_column_name_with_prop, ) +from dlt.common.configuration import configspec, resolve_configuration, known_sections +from dlt.common.configuration.specs import BaseConfiguration, CredentialsConfiguration +from dlt.common.configuration.accessors import config +from dlt.common.destination.capabilities import DestinationCapabilitiesContext +from dlt.common.destination.exceptions import ( + IdentifierTooLongException, + InvalidDestinationReference, + UnknownDestinationModule, + 
DestinationSchemaTampered, +) from dlt.common.schema.utils import is_complete_column +from dlt.common.schema.exceptions import UnknownTableException from dlt.common.storages import FileStorage from dlt.common.storages.load_storage import ParsedLoadJobFileName - TLoaderReplaceStrategy = Literal["truncate-and-insert", "insert-from-staging", "staging-optimized"] TDestinationConfig = TypeVar("TDestinationConfig", bound="DestinationClientConfiguration") TDestinationClient = TypeVar("TDestinationClient", bound="JobClientBase") +TDestinationDwhClient = TypeVar("TDestinationDwhClient", bound="DestinationClientDwhConfiguration") class StorageSchemaInfo(NamedTuple): @@ -75,8 +75,10 @@ class StateInfo(NamedTuple): @configspec class DestinationClientConfiguration(BaseConfiguration): - destination_type: Final[str] = None # which destination to load data to - credentials: Optional[CredentialsConfiguration] + destination_type: Final[str] = dataclasses.field( + default=None, init=False, repr=False, compare=False + ) # which destination to load data to + credentials: Optional[CredentialsConfiguration] = None destination_name: Optional[str] = ( None # name of the destination, if not set, destination_type is used ) @@ -93,28 +95,33 @@ def __str__(self) -> str: def on_resolved(self) -> None: self.destination_name = self.destination_name or self.destination_type - if TYPE_CHECKING: - - def __init__( - self, - *, - credentials: Optional[CredentialsConfiguration] = None, - destination_name: str = None, - environment: str = None, - ) -> None: ... 
- @configspec class DestinationClientDwhConfiguration(DestinationClientConfiguration): """Configuration of a destination that supports datasets/schemas""" - dataset_name: Final[str] = None # dataset must be final so it is not configurable + dataset_name: Final[str] = dataclasses.field( + default=None, init=False, repr=False, compare=False + ) # dataset must be final so it is not configurable """dataset name in the destination to load data to, for schemas that are not default schema, it is used as dataset prefix""" - default_schema_name: Optional[str] = None + default_schema_name: Final[Optional[str]] = dataclasses.field( + default=None, init=False, repr=False, compare=False + ) """name of default schema to be used to name effective dataset to load data to""" replace_strategy: TLoaderReplaceStrategy = "truncate-and-insert" """How to handle replace disposition for this destination, can be classic or staging""" + def _bind_dataset_name( + self: TDestinationDwhClient, dataset_name: str, default_schema_name: str = None + ) -> TDestinationDwhClient: + """Binds the dataset and default schema name to the configuration + + This method is intended to be used internally. + """ + self.dataset_name = dataset_name # type: ignore[misc] + self.default_schema_name = default_schema_name # type: ignore[misc] + return self + def normalize_dataset_name(self, schema: Schema) -> str: """Builds full db dataset (schema) name out of configured dataset name and schema name: {dataset_name}_{schema.name}. The resulting name is normalized. 
@@ -131,23 +138,11 @@ def normalize_dataset_name(self, schema: Schema) -> str: ) return ( - schema.naming.normalize_table_identifier(self.dataset_name) - if self.dataset_name - else self.dataset_name + self.dataset_name + if not self.dataset_name + else schema.naming.normalize_table_identifier(self.dataset_name) ) - if TYPE_CHECKING: - - def __init__( - self, - *, - credentials: Optional[CredentialsConfiguration] = None, - dataset_name: str = None, - default_schema_name: Optional[str] = None, - destination_name: str = None, - environment: str = None, - ) -> None: ... - @configspec class DestinationClientStagingConfiguration(DestinationClientDwhConfiguration): @@ -161,62 +156,35 @@ class DestinationClientStagingConfiguration(DestinationClientDwhConfiguration): # layout of the destination files layout: str = "{table_name}/{load_id}.{file_id}.{ext}" - if TYPE_CHECKING: - - def __init__( - self, - *, - credentials: Union[AwsCredentialsWithoutDefaults, GcpCredentials] = None, - dataset_name: str = None, - default_schema_name: Optional[str] = None, - as_staging: bool = False, - bucket_url: str = None, - layout: str = None, - destination_name: str = None, - environment: str = None, - ) -> None: ... - @configspec class DestinationClientDwhWithStagingConfiguration(DestinationClientDwhConfiguration): - """Configuration of a destination that can take data from a staging destination.""" + """Configuration of a destination that can take data from staging destination""" staging_config: Optional[DestinationClientStagingConfiguration] = None - """Configuration of the staging, if present, injected at runtime.""" - if TYPE_CHECKING: - - def __init__( - self, - *, - credentials: Optional[CredentialsConfiguration] = None, - dataset_name: str = None, - default_schema_name: Optional[str] = None, - staging_config: Optional[DestinationClientStagingConfiguration] = None, - destination_name: str = None, - environment: str = None, - ) -> None: ... 
+ """configuration of the staging, if present, injected at runtime""" TLoadJobState = Literal["running", "failed", "retry", "completed"] class LoadJob: - """Represents a job that loads a single file. + """Represents a job that loads a single file - Each job starts in "running" state and ends in one of the terminal states: "retry", "failed" or "completed". - A filename uniquely identifies each job. The file is guaranteed to exist in "running" state. In terminal state, the file may not be present. - In "running" state, the loader component periodically gets the state via `status()` method. When terminal state is reached, a load job is discarded and not called again. + Each job starts in "running" state and ends in one of terminal states: "retry", "failed" or "completed". + Each job is uniquely identified by a file name. The file is guaranteed to exist in "running" state. In terminal state, the file may not be present. + In "running" state, the loader component periodically gets the state via `status()` method. When terminal state is reached, load job is discarded and not called again. `exception` method is called to get error information in "failed" and "retry" states. The `__init__` method is responsible to put the Job in "running" state. It may raise `LoadClientTerminalException` and `LoadClientTransientException` to - immediately transition a job into "failed" or "retry" state respectively. + immediately transition job into "failed" or "retry" state respectively. """ def __init__(self, file_name: str) -> None: """ - Filename is a job ID (or job ID is deterministically derived), so it must be globally unique. + File name is also a job id (or job id is deterministically derived) so it must be globally unique """ - # Ensure filename. 
+ # ensure file name assert file_name == FileStorage.get_file_name_from_file_path(file_name) self._file_name = file_name self._parsed_file_name = ParsedLoadJobFileName.parse(file_name) @@ -231,7 +199,7 @@ def file_name(self) -> str: return self._file_name def job_id(self) -> str: - """The job ID that is derived from the filename and does not change during job lifecycle.""" + """The job id that is derived from the file name and does not changes during job lifecycle""" return self._parsed_file_name.job_id() def job_file_info(self) -> ParsedLoadJobFileName: @@ -239,7 +207,7 @@ def job_file_info(self) -> ParsedLoadJobFileName: @abstractmethod def exception(self) -> str: - """The exception associated with failed or retry states.""" + """The exception associated with failed or retry states""" pass @@ -248,15 +216,15 @@ class NewLoadJob(LoadJob): @abstractmethod def new_file_path(self) -> str: - """Path to a newly created temporary job file. If empty, no followup job should be created.""" + """Path to a newly created temporary job file. If empty, no followup job should be created""" pass class FollowupJob: - """Adds a trait that allows to create a followup job.""" + """Adds a trait that allows to create a followup job""" def create_followup_jobs(self, final_state: TLoadJobState) -> List[NewLoadJob]: - """Return list of new jobs. `final_state` is state to which this job transits.""" + """Return list of new jobs. `final_state` is state to which this job transits""" return [] @@ -318,6 +286,12 @@ def update_stored_schema( Optional[TSchemaTables]: Returns an update that was applied at the destination. 
""" self._verify_schema() + # make sure that schema being saved was not modified from the moment it was loaded from storage + version_hash = self.schema.version_hash + if self.schema.is_modified: + raise DestinationSchemaTampered( + self.schema.name, version_hash, self.schema.stored_version_hash + ) return expected_update @abstractmethod @@ -441,8 +415,8 @@ def prepare_load_table( if "table_format" not in table: table["table_format"] = get_table_format(self.schema.tables, table_name) return table - except KeyError as e: - raise UnknownTableException(table_name) from e + except KeyError: + raise UnknownTableException(table_name) class WithStateSync(ABC): @@ -525,7 +499,7 @@ def destination_name(self) -> str: @property def destination_type(self) -> str: - full_path = f"{self.__class__.__module__}.{self.__class__.__qualname__}" + full_path = self.__class__.__module__ + "." + self.__class__.__qualname__ return Destination.normalize_type(full_path) @property @@ -540,12 +514,13 @@ def client_class(self) -> Type[TDestinationClient]: def configuration(self, initial_config: TDestinationConfig) -> TDestinationConfig: """Get a fully resolved destination config from the initial config""" - return resolve_configuration( + config = resolve_configuration( initial_config, sections=(known_sections.DESTINATION, self.destination_name), # Already populated values will supersede resolved env config explicit_value=self.config_params, ) + return config @staticmethod def to_name(ref: TDestinationReferenceArg) -> str: @@ -561,7 +536,7 @@ def to_name(ref: TDestinationReferenceArg) -> str: def normalize_type(destination_type: str) -> str: """Normalizes destination type string into a canonical form. Assumes that type names without dots correspond to build in destinations.""" if "." not in destination_type: - destination_type = f"dlt.destinations.{destination_type}" + destination_type = "dlt.destinations." 
+ destination_type # the next two lines shorten the dlt internal destination paths to dlt.destinations. name = Destination.to_name(destination_type) destination_type = destination_type.replace( diff --git a/dlt/common/exceptions.py b/dlt/common/exceptions.py index c14a743f33..fe526c53dc 100644 --- a/dlt/common/exceptions.py +++ b/dlt/common/exceptions.py @@ -133,115 +133,6 @@ class SystemConfigurationException(DltException): pass -class DestinationException(DltException): - pass - - -class UnknownDestinationModule(DestinationException): - def __init__(self, destination_module: str) -> None: - self.destination_module = destination_module - if "." in destination_module: - msg = f"Destination module {destination_module} could not be found and imported" - else: - msg = f"Destination {destination_module} is not one of the standard dlt destinations" - super().__init__(msg) - - -class InvalidDestinationReference(DestinationException): - def __init__(self, destination_module: Any) -> None: - self.destination_module = destination_module - msg = f"Destination {destination_module} is not a valid destination module." 
- super().__init__(msg) - - -class DestinationTerminalException(DestinationException, TerminalException): - pass - - -class DestinationUndefinedEntity(DestinationTerminalException): - pass - - -class DestinationTransientException(DestinationException, TransientException): - pass - - -class DestinationLoadingViaStagingNotSupported(DestinationTerminalException): - def __init__(self, destination: str) -> None: - self.destination = destination - super().__init__(f"Destination {destination} does not support loading via staging.") - - -class DestinationLoadingWithoutStagingNotSupported(DestinationTerminalException): - def __init__(self, destination: str) -> None: - self.destination = destination - super().__init__(f"Destination {destination} does not support loading without staging.") - - -class DestinationNoStagingMode(DestinationTerminalException): - def __init__(self, destination: str) -> None: - self.destination = destination - super().__init__(f"Destination {destination} cannot be used as a staging") - - -class DestinationIncompatibleLoaderFileFormatException(DestinationTerminalException): - def __init__( - self, destination: str, staging: str, file_format: str, supported_formats: Iterable[str] - ) -> None: - self.destination = destination - self.staging = staging - self.file_format = file_format - self.supported_formats = supported_formats - supported_formats_str = ", ".join(supported_formats) - if self.staging: - if not supported_formats: - msg = ( - f"Staging {staging} cannot be used with destination {destination} because they" - " have no file formats in common." - ) - else: - msg = ( - f"Unsupported file format {file_format} for destination {destination} in" - f" combination with staging destination {staging}. Supported formats:" - f" {supported_formats_str}" - ) - else: - msg = ( - f"Unsupported file format {file_format} destination {destination}. Supported" - f" formats: {supported_formats_str}. 
Check the staging option in the dlt.pipeline" - " for additional formats." - ) - super().__init__(msg) - - -class IdentifierTooLongException(DestinationTerminalException): - def __init__( - self, - destination_name: str, - identifier_type: str, - identifier_name: str, - max_identifier_length: int, - ) -> None: - self.destination_name = destination_name - self.identifier_type = identifier_type - self.identifier_name = identifier_name - self.max_identifier_length = max_identifier_length - super().__init__( - f"The length of {identifier_type} {identifier_name} exceeds" - f" {max_identifier_length} allowed for {destination_name}" - ) - - -class DestinationHasFailedJobs(DestinationTerminalException): - def __init__(self, destination_name: str, load_id: str, failed_jobs: List[Any]) -> None: - self.destination_name = destination_name - self.load_id = load_id - self.failed_jobs = failed_jobs - super().__init__( - f"Destination {destination_name} has failed jobs in load package {load_id}" - ) - - class PipelineException(DltException): def __init__(self, pipeline_name: str, msg: str) -> None: """Base class for all pipeline exceptions. 
Should not be raised.""" diff --git a/dlt/common/runtime/logger.py b/dlt/common/logger.py similarity index 84% rename from dlt/common/runtime/logger.py rename to dlt/common/logger.py index 9dd8ce4e3a..02412248c3 100644 --- a/dlt/common/runtime/logger.py +++ b/dlt/common/logger.py @@ -2,12 +2,7 @@ import logging import traceback from logging import LogRecord, Logger -from typing import Any, Iterator, Protocol - -from dlt.common.json import json -from dlt.common.runtime.exec_info import dlt_version_info -from dlt.common.typing import StrAny, StrStr -from dlt.common.configuration.specs import RunConfiguration +from typing import Any, Mapping, Iterator, Protocol DLT_LOGGER_NAME = "dlt" LOGGER: Logger = None @@ -32,7 +27,7 @@ def wrapper(msg: str, *args: Any, **kwargs: Any) -> None: return wrapper -def metrics(name: str, extra: StrAny, stacklevel: int = 1) -> None: +def metrics(name: str, extra: Mapping[str, Any], stacklevel: int = 1) -> None: """Forwards metrics call to LOGGER""" if LOGGER: LOGGER.info(name, extra=extra, stacklevel=stacklevel) @@ -46,15 +41,6 @@ def suppress_and_warn() -> Iterator[None]: LOGGER.warning("Suppressed exception", exc_info=True) -def init_logging(config: RunConfiguration) -> None: - global LOGGER - - version = dlt_version_info(config.pipeline_name) - LOGGER = _init_logging( - DLT_LOGGER_NAME, config.log_level, config.log_format, config.pipeline_name, version - ) - - def is_logging() -> bool: return LOGGER is not None @@ -75,6 +61,8 @@ def pretty_format_exception() -> str: class _MetricsFormatter(logging.Formatter): def format(self, record: LogRecord) -> str: # noqa: A003 + from dlt.common.json import json + s = super(_MetricsFormatter, self).format(record) # dump metrics dictionary nicely if "metrics" in record.__dict__: @@ -83,7 +71,7 @@ def format(self, record: LogRecord) -> str: # noqa: A003 def _init_logging( - logger_name: str, level: str, fmt: str, component: str, version: StrStr + logger_name: str, level: str, fmt: str, component: 
str, version: Mapping[str, str] ) -> Logger: if logger_name == "root": logging.basicConfig(level=level) @@ -102,7 +90,7 @@ def _init_logging( from dlt.common.runtime import json_logging class _CustomJsonFormatter(json_logging.JSONLogFormatter): - version: StrStr = None + version: Mapping[str, str] = None def _format_log_object(self, record: LogRecord) -> Any: json_log_object = super(_CustomJsonFormatter, self)._format_log_object(record) diff --git a/dlt/common/normalizers/configuration.py b/dlt/common/normalizers/configuration.py index adeefe2237..54b725db1f 100644 --- a/dlt/common/normalizers/configuration.py +++ b/dlt/common/normalizers/configuration.py @@ -1,8 +1,7 @@ -import dataclasses -from typing import Optional, TYPE_CHECKING +from typing import ClassVar, Optional, TYPE_CHECKING from dlt.common.configuration import configspec -from dlt.common.configuration.specs import BaseConfiguration +from dlt.common.configuration.specs import BaseConfiguration, known_sections from dlt.common.destination import DestinationCapabilitiesContext from dlt.common.normalizers.typing import TJSONNormalizer from dlt.common.typing import DictStrAny @@ -11,7 +10,7 @@ @configspec class NormalizersConfiguration(BaseConfiguration): # always in section - __section__: str = "schema" + __section__: ClassVar[str] = known_sections.SCHEMA naming: Optional[str] = None json_normalizer: Optional[DictStrAny] = None @@ -32,7 +31,3 @@ def on_resolved(self) -> None: self.json_normalizer["config"][ "max_nesting" ] = self.destination_capabilities.max_table_nesting - - if TYPE_CHECKING: - - def __init__(self, naming: str = None, json_normalizer: TJSONNormalizer = None) -> None: ... 
diff --git a/dlt/common/pipeline.py b/dlt/common/pipeline.py index 3cbaafefbe..7c117d4612 100644 --- a/dlt/common/pipeline.py +++ b/dlt/common/pipeline.py @@ -1,4 +1,5 @@ from abc import ABC, abstractmethod +import dataclasses import os import datetime # noqa: 251 import humanize @@ -32,24 +33,18 @@ from dlt.common.configuration.paths import get_dlt_data_dir from dlt.common.configuration.specs import RunConfiguration from dlt.common.destination import TDestinationReferenceArg, TDestination -from dlt.common.exceptions import ( - DestinationHasFailedJobs, - PipelineStateNotAvailable, - SourceSectionNotAvailable, -) +from dlt.common.destination.exceptions import DestinationHasFailedJobs +from dlt.common.exceptions import PipelineStateNotAvailable, SourceSectionNotAvailable from dlt.common.schema import Schema from dlt.common.schema.typing import TColumnNames, TColumnSchema, TWriteDisposition, TSchemaContract from dlt.common.source import get_current_pipe_name from dlt.common.storages.load_storage import LoadPackageInfo -from dlt.common.storages.load_package import PackageStorage - from dlt.common.time import ensure_pendulum_datetime, precise_time from dlt.common.typing import DictStrAny, REPattern, StrAny, SupportsHumanize from dlt.common.jsonpath import delete_matches, TAnyJsonPath from dlt.common.data_writers.writers import DataWriterMetrics, TLoaderFileFormat from dlt.common.utils import RowCounts, merge_row_counts from dlt.common.versioned_state import TVersionedState -from dlt.common.storages.load_package import TLoadPackageState class _StepInfo(NamedTuple): @@ -559,8 +554,12 @@ def __call__( @configspec class PipelineContext(ContainerInjectableContext): - _deferred_pipeline: Callable[[], SupportsPipeline] - _pipeline: SupportsPipeline + _deferred_pipeline: Callable[[], SupportsPipeline] = dataclasses.field( + default=None, init=False, repr=False, compare=False + ) + _pipeline: SupportsPipeline = dataclasses.field( + default=None, init=False, repr=False, 
compare=False + ) can_create_default: ClassVar[bool] = True @@ -598,14 +597,10 @@ def __init__(self, deferred_pipeline: Callable[..., SupportsPipeline] = None) -> @configspec class StateInjectableContext(ContainerInjectableContext): - state: TPipelineState + state: TPipelineState = None can_create_default: ClassVar[bool] = False - if TYPE_CHECKING: - - def __init__(self, state: TPipelineState = None) -> None: ... - def pipeline_state( container: Container, initial_default: TPipelineState = None diff --git a/dlt/common/runners/configuration.py b/dlt/common/runners/configuration.py index c5de2353f4..5857e1799f 100644 --- a/dlt/common/runners/configuration.py +++ b/dlt/common/runners/configuration.py @@ -16,13 +16,3 @@ class PoolRunnerConfiguration(BaseConfiguration): """# how many threads/processes in the pool""" run_sleep: float = 0.1 """how long to sleep between runs with workload, seconds""" - - if TYPE_CHECKING: - - def __init__( - self, - pool_type: TPoolType = None, - start_method: str = None, - workers: int = None, - run_sleep: float = 0.1, - ) -> None: ... 
diff --git a/dlt/common/runtime/init.py b/dlt/common/runtime/init.py index dc1430a527..5354dee4ff 100644 --- a/dlt/common/runtime/init.py +++ b/dlt/common/runtime/init.py @@ -5,8 +5,17 @@ _RUN_CONFIGURATION: RunConfiguration = None +def init_logging(config: RunConfiguration) -> None: + from dlt.common import logger + from dlt.common.runtime.exec_info import dlt_version_info + + version = dlt_version_info(config.pipeline_name) + logger.LOGGER = logger._init_logging( + logger.DLT_LOGGER_NAME, config.log_level, config.log_format, config.pipeline_name, version + ) + + def initialize_runtime(config: RunConfiguration) -> None: - from dlt.common.runtime.logger import init_logging from dlt.common.runtime.telemetry import start_telemetry from dlt.sources.helpers import requests diff --git a/dlt/common/runtime/prometheus.py b/dlt/common/runtime/prometheus.py index 1b233ffa9b..07c960efe7 100644 --- a/dlt/common/runtime/prometheus.py +++ b/dlt/common/runtime/prometheus.py @@ -3,7 +3,7 @@ from prometheus_client.metrics import MetricWrapperBase from dlt.common.configuration.specs import RunConfiguration -from dlt.common.runtime import logger +from dlt.common import logger from dlt.common.runtime.exec_info import dlt_version_info from dlt.common.typing import DictStrAny, StrAny diff --git a/dlt/common/runtime/segment.py b/dlt/common/runtime/segment.py index e302767fcc..70b81fb4f4 100644 --- a/dlt/common/runtime/segment.py +++ b/dlt/common/runtime/segment.py @@ -10,7 +10,7 @@ from typing import Literal, Optional from dlt.common.configuration.paths import get_dlt_data_dir -from dlt.common.runtime import logger +from dlt.common import logger from dlt.common.managed_thread_pool import ManagedThreadPool from dlt.common.configuration.specs import RunConfiguration diff --git a/dlt/common/runtime/signals.py b/dlt/common/runtime/signals.py index 8e64c8ba64..8d1cb3803e 100644 --- a/dlt/common/runtime/signals.py +++ b/dlt/common/runtime/signals.py @@ -2,8 +2,9 @@ import signal from 
contextlib import contextmanager from threading import Event -from typing import Any, TYPE_CHECKING, Iterator +from typing import Any, Iterator +from dlt.common import logger from dlt.common.exceptions import SignalReceivedException _received_signal: int = 0 @@ -11,11 +12,6 @@ def signal_receiver(sig: int, frame: Any) -> None: - if not TYPE_CHECKING: - from dlt.common.runtime import logger - else: - logger: Any = None - global _received_signal logger.info(f"Signal {sig} received") @@ -64,9 +60,5 @@ def delayed_signals() -> Iterator[None]: signal.signal(signal.SIGINT, original_sigint_handler) signal.signal(signal.SIGTERM, original_sigterm_handler) else: - if not TYPE_CHECKING: - from dlt.common.runtime import logger - else: - logger: Any = None logger.info("Running in daemon thread, signals not enabled") yield diff --git a/dlt/common/runtime/slack.py b/dlt/common/runtime/slack.py index 15da89f333..b1e090098d 100644 --- a/dlt/common/runtime/slack.py +++ b/dlt/common/runtime/slack.py @@ -1,8 +1,9 @@ import requests -from dlt.common import json, logger def send_slack_message(incoming_hook: str, message: str, is_markdown: bool = True) -> None: + from dlt.common import json, logger + """Sends a `message` to Slack `incoming_hook`, by default formatted as markdown.""" r = requests.post( incoming_hook, diff --git a/dlt/common/schema/schema.py b/dlt/common/schema/schema.py index 92598fff44..c738f1753e 100644 --- a/dlt/common/schema/schema.py +++ b/dlt/common/schema/schema.py @@ -1,4 +1,3 @@ -import yaml from copy import copy, deepcopy from typing import ClassVar, Dict, List, Mapping, Optional, Sequence, Tuple, Any, cast, Literal from dlt.common import json @@ -76,8 +75,8 @@ class Schema: _schema_name: str _dlt_tables_prefix: str - _stored_version: int # version at load/creation time - _stored_version_hash: str # version hash at load/creation time + _stored_version: int # version at load time + _stored_version_hash: str # version hash at load time _stored_previous_hashes: 
Optional[List[str]] # list of ancestor hashes of the schema _imported_version_hash: str # version hash of recently imported schema _schema_description: str # optional schema description @@ -136,12 +135,10 @@ def replace_schema_content( # do not bump version so hash from `schema` is preserved stored_schema = schema.to_dict(bump_version=False) if link_to_replaced_schema: - replaced_version_hash = self.stored_version_hash - assert replaced_version_hash is not None + replaced_version_hash = self.version_hash # do not store hash if the replaced schema is identical - if stored_schema["version_hash"] != replaced_version_hash: + if schema.version_hash != replaced_version_hash: utils.store_prev_hash(stored_schema, replaced_version_hash) - stored_schema["version_hash"] = replaced_version_hash self._reset_schema(schema.name, schema._normalizers_config) self._from_stored_schema(stored_schema) @@ -426,7 +423,7 @@ def update_table(self, partial_table: TPartialTableSchema) -> TPartialTableSchem self._schema_tables[table_name] = partial_table else: # merge tables performing additional checks - partial_table = utils.merge_tables(table, partial_table) + partial_table = utils.merge_table(table, partial_table) self.data_item_normalizer.extend_table(table_name) return partial_table @@ -442,19 +439,6 @@ def update_schema(self, schema: "Schema") -> None: self._settings = deepcopy(schema.settings) self._compile_settings() - def bump_version(self) -> Tuple[int, str]: - """Computes schema hash in order to check if schema content was modified. In such case the schema ``stored_version`` and ``stored_version_hash`` are updated. - - Should not be used in production code. The method ``to_dict`` will generate TStoredSchema with correct value, only once before persisting schema to storage. 
- - Returns: - Tuple[int, str]: Current (``stored_version``, ``stored_version_hash``) tuple - """ - self._stored_version, self._stored_version_hash, _, _ = utils.bump_version_if_modified( - self.to_dict(bump_version=False) - ) - return self._stored_version, self._stored_version_hash - def filter_row_with_hint(self, table_name: str, hint_type: TColumnHint, row: StrAny) -> StrAny: rv_row: DictStrAny = {} column_prop: TColumnProp = utils.hint_to_column_prop(hint_type) @@ -515,7 +499,7 @@ def normalize_table_identifiers(self, table: TTableSchema) -> TTableSchema: # re-index columns as the name changed, if name space was reduced then # some columns now clash with each other. so make sure that we merge columns that are already there if new_col_name in new_columns: - new_columns[new_col_name] = utils.merge_columns( + new_columns[new_col_name] = utils.merge_column( new_columns[new_col_name], c, merge_defaults=False ) else: @@ -631,6 +615,19 @@ def stored_version_hash(self) -> str: """Version hash of the schema content form the time of schema loading/creation.""" return self._stored_version_hash + @property + def is_modified(self) -> bool: + """Checks if schema was modified from the time it was saved or if this is a new schema + + A current version hash is computed and compared with stored version hash + """ + return self.version_hash != self._stored_version_hash + + @property + def is_new(self) -> bool: + """Checks if schema was ever saved""" + return self._stored_version_hash is None + @property def name(self) -> str: return self._schema_name @@ -646,22 +643,24 @@ def settings(self) -> TSchemaSettings: def to_pretty_json(self, remove_defaults: bool = True) -> str: d = self.to_dict(remove_defaults=remove_defaults) - return json.dumps(d, pretty=True) + return utils.to_pretty_json(d) def to_pretty_yaml(self, remove_defaults: bool = True) -> str: d = self.to_dict(remove_defaults=remove_defaults) - return yaml.dump(d, allow_unicode=True, default_flow_style=False, 
sort_keys=False) + return utils.to_pretty_yaml(d) def clone(self, with_name: str = None, update_normalizers: bool = False) -> "Schema": """Make a deep copy of the schema, optionally changing the name, and updating normalizers and identifiers in the schema if `update_normalizers` is True - Note that changing of name will break the previous version chain + Note that changing of name will set the schema as new """ - d = deepcopy(self.to_dict()) + d = deepcopy(self.to_dict(bump_version=False)) if with_name is not None: + d["version"] = d["version_hash"] = None + d.pop("imported_version_hash", None) d["name"] = with_name d["previous_hashes"] = [] - schema = Schema.from_dict(d) # type: ignore + schema = Schema.from_stored_schema(d) # update normalizers and possibly all schema identifiers if update_normalizers: schema.update_normalizers() @@ -782,7 +781,7 @@ def _coerce_non_null_value( # if there's incomplete new_column then merge it with inferred column if new_column: # use all values present in incomplete column to override inferred column - also the defaults - new_column = utils.merge_columns(inferred_column, new_column) + new_column = utils.merge_column(inferred_column, new_column) else: new_column = inferred_column @@ -807,6 +806,28 @@ def _infer_hint(self, hint_type: TColumnHint, _: Any, col_name: str) -> bool: else: return False + def _bump_version(self) -> Tuple[int, str]: + """Computes schema hash in order to check if schema content was modified. In such case the schema ``stored_version`` and ``stored_version_hash`` are updated. + + Should not be used directly. The method ``to_dict`` will generate TStoredSchema with correct value, only once before persisting schema to storage. 
+ + Returns: + Tuple[int, str]: Current (``stored_version``, ``stored_version_hash``) tuple + """ + self._stored_version, self._stored_version_hash, _, _ = utils.bump_version_if_modified( + self.to_dict(bump_version=False) + ) + return self._stored_version, self._stored_version_hash + + def _drop_version(self) -> None: + """Stores first prev hash as stored hash and decreases numeric version""" + if len(self.previous_hashes) == 0 or self._stored_version is None: + self._stored_version = None + self._stored_version_hash = None + else: + self._stored_version -= 1 + self._stored_version_hash = self._stored_previous_hashes.pop(0) + def _add_standard_tables(self) -> None: self._schema_tables[self.version_table_name] = self.normalize_table_identifiers( utils.version_table() @@ -849,7 +870,7 @@ def _configure_normalizers(self, normalizers: TNormalizersConfig) -> None: def _reset_schema(self, name: str, normalizers: TNormalizersConfig = None) -> None: self._schema_tables: TSchemaTables = {} self._schema_name: str = None - self._stored_version = 1 + self._stored_version = None self._stored_version_hash: str = None self._imported_version_hash: str = None self._schema_description: str = None @@ -878,8 +899,6 @@ def _reset_schema(self, name: str, normalizers: TNormalizersConfig = None) -> No self._add_standard_tables() # compile all known regexes self._compile_settings() - # set initial version hash - self._stored_version_hash = self.version_hash def _from_stored_schema(self, stored_schema: TStoredSchema) -> None: self._schema_tables = stored_schema.get("tables") or {} diff --git a/dlt/common/schema/utils.py b/dlt/common/schema/utils.py index 4f2a4aa22d..0a4e00759d 100644 --- a/dlt/common/schema/utils.py +++ b/dlt/common/schema/utils.py @@ -1,7 +1,7 @@ import re import base64 import hashlib - +import yaml from copy import deepcopy, copy from typing import Dict, List, Sequence, Tuple, Type, Any, cast, Iterable, Optional, Union @@ -164,9 +164,11 @@ def 
bump_version_if_modified(stored_schema: TStoredSchema) -> Tuple[int, str, st """Bumps the `stored_schema` version and version hash if content modified, returns (new version, new hash, old hash, 10 last hashes) tuple""" hash_ = generate_version_hash(stored_schema) previous_hash = stored_schema.get("version_hash") + previous_version = stored_schema.get("version") if not previous_hash: # if hash was not set, set it without bumping the version, that's initial schema - pass + # previous_version may not be None for migrating schemas + stored_schema["version"] = previous_version or 1 elif hash_ != previous_hash: stored_schema["version"] += 1 store_prev_hash(stored_schema, previous_hash) @@ -340,7 +342,7 @@ def compare_complete_columns(a: TColumnSchema, b: TColumnSchema) -> bool: return a["data_type"] == b["data_type"] and a["name"] == b["name"] -def merge_columns( +def merge_column( col_a: TColumnSchema, col_b: TColumnSchema, merge_defaults: bool = True ) -> TColumnSchema: """Merges `col_b` into `col_a`. if `merge_defaults` is True, only hints from `col_b` that are not default in `col_a` will be set. @@ -354,7 +356,7 @@ def merge_columns( return col_a -def diff_tables(tab_a: TTableSchema, tab_b: TPartialTableSchema) -> TPartialTableSchema: +def diff_table(tab_a: TTableSchema, tab_b: TPartialTableSchema) -> TPartialTableSchema: """Creates a partial table that contains properties found in `tab_b` that are not present or different in `tab_a`. The name is always present in returned partial. It returns new columns (not present in tab_a) and merges columns from tab_b into tab_a (overriding non-default hint values). 
@@ -389,7 +391,7 @@ def diff_tables(tab_a: TTableSchema, tab_b: TPartialTableSchema) -> TPartialTabl None, ) # all other properties can change - merged_column = merge_columns(copy(col_a), col_b) + merged_column = merge_column(copy(col_a), col_b) if merged_column != col_a: new_columns.append(merged_column) else: @@ -428,11 +430,12 @@ def diff_tables(tab_a: TTableSchema, tab_b: TPartialTableSchema) -> TPartialTabl # return False -def merge_tables(table: TTableSchema, partial_table: TPartialTableSchema) -> TPartialTableSchema: +def merge_table(table: TTableSchema, partial_table: TPartialTableSchema) -> TPartialTableSchema: """Merges "partial_table" into "table". `table` is merged in place. Returns the diff partial table. `table` and `partial_table` names must be identical. A table diff is generated and applied to `table`: * new columns are added, updated columns are replaced from diff + * incomplete columns in `table` that got completed in `partial_table` are removed to preserve order * table hints are added or replaced from diff * nothing gets deleted """ @@ -441,14 +444,20 @@ def merge_tables(table: TTableSchema, partial_table: TPartialTableSchema) -> TPa raise TablePropertiesConflictException( table["name"], "name", table["name"], partial_table["name"] ) - diff_table = diff_tables(table, partial_table) + diff = diff_table(table, partial_table) + # remove incomplete columns in table that are complete in diff table + for col_name, column in diff["columns"].items(): + if is_complete_column(column): + table_column = table["columns"].get(col_name) + if table_column and not is_complete_column(table_column): + table["columns"].pop(col_name) # add new columns when all checks passed - table["columns"].update(diff_table["columns"]) + table["columns"].update(diff["columns"]) updated_columns = table["columns"] - table.update(diff_table) + table.update(diff) table["columns"] = updated_columns - return diff_table + return diff def has_table_seen_data(table: TTableSchema) -> 
bool: @@ -725,3 +734,11 @@ def standard_hints() -> Dict[TColumnHint, List[TSimpleRegex]]: def standard_type_detections() -> List[TTypeDetections]: return ["iso_timestamp"] + + +def to_pretty_json(stored_schema: TStoredSchema) -> str: + return json.dumps(stored_schema, pretty=True) + + +def to_pretty_yaml(stored_schema: TStoredSchema) -> str: + return yaml.dump(stored_schema, allow_unicode=True, default_flow_style=False, sort_keys=False) diff --git a/dlt/common/storages/configuration.py b/dlt/common/storages/configuration.py index 2cbe7c78d5..d0100c335d 100644 --- a/dlt/common/storages/configuration.py +++ b/dlt/common/storages/configuration.py @@ -31,24 +31,11 @@ class SchemaStorageConfiguration(BaseConfiguration): True # remove default values when exporting schema ) - if TYPE_CHECKING: - - def __init__( - self, - schema_volume_path: str = None, - import_schema_path: str = None, - export_schema_path: str = None, - ) -> None: ... - @configspec class NormalizeStorageConfiguration(BaseConfiguration): normalize_volume_path: str = None # path to volume where normalized loader files will be stored - if TYPE_CHECKING: - - def __init__(self, normalize_volume_path: str = None) -> None: ... - @configspec class LoadStorageConfiguration(BaseConfiguration): @@ -59,12 +46,6 @@ class LoadStorageConfiguration(BaseConfiguration): False # if set to true the folder with completed jobs will be deleted ) - if TYPE_CHECKING: - - def __init__( - self, load_volume_path: str = None, delete_completed_jobs: bool = None - ) -> None: ... - FileSystemCredentials = Union[ AwsCredentials, GcpServiceAccountCredentials, AzureCredentials, GcpOAuthCredentials @@ -96,7 +77,7 @@ class FilesystemConfiguration(BaseConfiguration): bucket_url: str = None # should be a union of all possible credentials as found in PROTOCOL_CREDENTIALS - credentials: FileSystemCredentials + credentials: FileSystemCredentials = None read_only: bool = False """Indicates read only filesystem access. 
Will enable caching""" @@ -144,14 +125,3 @@ def __str__(self) -> str: new_netloc += f":{url.port}" return url._replace(netloc=new_netloc).geturl() return self.bucket_url - - if TYPE_CHECKING: - - def __init__( - self, - bucket_url: str, - credentials: FileSystemCredentials = None, - read_only: bool = False, - kwargs: Optional[DictStrAny] = None, - client_kwargs: Optional[DictStrAny] = None, - ) -> None: ... diff --git a/dlt/common/storages/live_schema_storage.py b/dlt/common/storages/live_schema_storage.py index d3d5f14fe5..fb94a21b7a 100644 --- a/dlt/common/storages/live_schema_storage.py +++ b/dlt/common/storages/live_schema_storage.py @@ -17,22 +17,17 @@ def __init__( def __getitem__(self, name: str) -> Schema: if name in self.live_schemas: schema = self.live_schemas[name] - else: - # return new schema instance - schema = super().load_schema(name) - self.update_live_schema(schema) - + if not self.is_live_schema_committed(name): + return schema + # return new schema instance + schema = self.load_schema(name) + schema = self.set_live_schema(schema) return schema - # def load_schema(self, name: str) -> Schema: - # self.commit_live_schema(name) - # # now live schema is saved so we can load it with the changes - # return super().load_schema(name) - def save_schema(self, schema: Schema) -> str: - rv = super().save_schema(schema) # update the live schema with schema being saved, if no live schema exist, create one to be available for a getter - self.update_live_schema(schema) + schema = self.set_live_schema(schema) + rv = super().save_schema(schema) return rv def remove_schema(self, name: str) -> None: @@ -40,44 +35,47 @@ def remove_schema(self, name: str) -> None: # also remove the live schema self.live_schemas.pop(name, None) - def save_import_schema_if_not_exists(self, schema: Schema) -> None: + def save_import_schema_if_not_exists(self, schema: Schema) -> bool: + """Saves import schema, if not exists. 
If schema was saved, link itself as imported from""" if self.config.import_schema_path: try: self._load_import_schema(schema.name) except FileNotFoundError: # save import schema only if it not exist self._export_schema(schema, self.config.import_schema_path) + # if import schema got saved then add own version hash as import version hash + schema._imported_version_hash = schema.version_hash + return True - def commit_live_schema(self, name: str) -> Schema: - # if live schema exists and is modified then it must be used as an import schema - live_schema = self.live_schemas.get(name) - if live_schema and live_schema.stored_version_hash != live_schema.version_hash: - live_schema.bump_version() - self._save_schema(live_schema) - return live_schema + return False + + def commit_live_schema(self, name: str) -> str: + """Saves live schema in storage if it was modified""" + if not self.is_live_schema_committed(name): + live_schema = self.live_schemas[name] + return self._save_schema(live_schema) + # not saved + return None def is_live_schema_committed(self, name: str) -> bool: """Checks if live schema is present in storage and have same hash""" live_schema = self.live_schemas.get(name) if live_schema is None: raise SchemaNotFoundError(name, f"live-schema://{name}") - try: - stored_schema_json = self._load_schema_json(name) - return live_schema.version_hash == cast(str, stored_schema_json.get("version_hash")) - except FileNotFoundError: - return False + return not live_schema.is_modified - def update_live_schema(self, schema: Schema, can_create_new: bool = True) -> None: - """Will update live schema content without writing to storage. 
Optionally allows to create a new live schema""" + def set_live_schema(self, schema: Schema) -> Schema: + """Will add or update live schema content without writing to storage.""" live_schema = self.live_schemas.get(schema.name) if live_schema: if id(live_schema) != id(schema): # replace content without replacing instance # print(f"live schema {live_schema} updated in place") live_schema.replace_schema_content(schema, link_to_replaced_schema=True) - elif can_create_new: + else: # print(f"live schema {schema.name} created from schema") - self.live_schemas[schema.name] = schema + live_schema = self.live_schemas[schema.name] = schema + return live_schema def list_schemas(self) -> List[str]: names = list(set(super().list_schemas()) | set(self.live_schemas.keys())) diff --git a/dlt/common/storages/load_package.py b/dlt/common/storages/load_package.py index bb66e28671..3b8af424ee 100644 --- a/dlt/common/storages/load_package.py +++ b/dlt/common/storages/load_package.py @@ -210,7 +210,7 @@ def schema_name(self) -> str: @property def schema_hash(self) -> str: - return self.schema.stored_version_hash + return self.schema.version_hash def asdict(self) -> DictStrAny: d = self._asdict() @@ -627,8 +627,8 @@ def filter_jobs_for_table( @configspec class LoadPackageStateInjectableContext(ContainerInjectableContext): - storage: PackageStorage - load_id: str + storage: PackageStorage = None + load_id: str = None can_create_default: ClassVar[bool] = False global_affinity: ClassVar[bool] = False @@ -640,10 +640,6 @@ def on_resolved(self) -> None: self.state_save_lock = threading.Lock() self.state = self.storage.get_load_package_state(self.load_id) - if TYPE_CHECKING: - - def __init__(self, load_id: str, storage: PackageStorage) -> None: ... - def load_package() -> TLoadPackage: """Get full load package state present in current context. 
Across all threads this will be the same in memory dict.""" diff --git a/dlt/common/storages/schema_storage.py b/dlt/common/storages/schema_storage.py index 4745d50dcc..23b695b839 100644 --- a/dlt/common/storages/schema_storage.py +++ b/dlt/common/storages/schema_storage.py @@ -4,6 +4,7 @@ from dlt.common import json, logger from dlt.common.configuration import with_config from dlt.common.configuration.accessors import config +from dlt.common.schema.utils import to_pretty_json, to_pretty_yaml from dlt.common.storages.configuration import ( SchemaStorageConfiguration, TSchemaFileFormat, @@ -106,32 +107,33 @@ def _maybe_import_schema(self, name: str, storage_schema: DictStrAny = None) -> if storage_schema is None: # import schema when no schema in storage rv_schema = Schema.from_dict(imported_schema) - # if schema was imported, overwrite storage schema + # store import hash to self to track changes rv_schema._imported_version_hash = rv_schema.version_hash - self._save_schema(rv_schema) logger.info( f"Schema {name} not present in {self.storage.storage_path} and got imported" f" with version {rv_schema.stored_version} and imported hash" f" {rv_schema._imported_version_hash}" ) + # if schema was imported, overwrite storage schema + self._save_schema(rv_schema) + if self.config.export_schema_path: + self._export_schema(rv_schema, self.config.export_schema_path) else: # import schema when imported schema was modified from the last import - sc = Schema.from_dict(storage_schema) - rv_schema = Schema.from_dict(imported_schema) - if rv_schema.version_hash != sc._imported_version_hash: - # use imported schema but version must be bumped and imported hash set - rv_schema._stored_version = sc.stored_version + 1 - rv_schema._imported_version_hash = rv_schema.version_hash - # if schema was imported, overwrite storage schema - self._save_schema(rv_schema) + rv_schema = Schema.from_dict(storage_schema) + i_s = Schema.from_dict(imported_schema) + if i_s.version_hash != 
rv_schema._imported_version_hash: + rv_schema.replace_schema_content(i_s, link_to_replaced_schema=True) + rv_schema._imported_version_hash = i_s.version_hash logger.info( f"Schema {name} was present in {self.storage.storage_path} but is" - f" overwritten with imported schema version {rv_schema.stored_version} and" - f" imported hash {rv_schema._imported_version_hash}" + f" overwritten with imported schema version {i_s.version} and" + f" imported hash {i_s.version_hash}" ) - else: - # use storage schema as nothing changed - rv_schema = sc + # if schema was imported, overwrite storage schema + self._save_schema(rv_schema) + if self.config.export_schema_path: + self._export_schema(rv_schema, self.config.export_schema_path) except FileNotFoundError: # no schema to import -> skip silently and return the original if storage_schema is None: @@ -154,14 +156,11 @@ def _load_import_schema(self, name: str) -> DictStrAny: ) def _export_schema(self, schema: Schema, export_path: str) -> None: + stored_schema = schema.to_dict(remove_defaults=True) if self.config.external_schema_format == "json": - exported_schema_s = schema.to_pretty_json( - remove_defaults=self.config.external_schema_format_remove_defaults - ) + exported_schema_s = to_pretty_json(stored_schema) elif self.config.external_schema_format == "yaml": - exported_schema_s = schema.to_pretty_yaml( - remove_defaults=self.config.external_schema_format_remove_defaults - ) + exported_schema_s = to_pretty_yaml(stored_schema) else: raise ValueError(self.config.external_schema_format) @@ -170,13 +169,19 @@ def _export_schema(self, schema: Schema, export_path: str) -> None: export_storage.save(schema_file, exported_schema_s) logger.info( f"Schema {schema.name} exported to {export_path} with version" - f" {schema.stored_version} as {self.config.external_schema_format}" + f" {stored_schema['version']}:{stored_schema['version_hash']} as" + f" {self.config.external_schema_format}" ) def _save_schema(self, schema: Schema) -> str: # 
save a schema to schema store schema_file = self._file_name_in_store(schema.name, "json") - return self.storage.save(schema_file, schema.to_pretty_json(remove_defaults=False)) + stored_schema = schema.to_dict() + saved_path = self.storage.save(schema_file, to_pretty_json(stored_schema)) + # this should be the only place where this function is called. we bump a version and + # clean modified status + schema._bump_version() + return saved_path @staticmethod def load_schema_file( diff --git a/dlt/common/typing.py b/dlt/common/typing.py index 05720fe7d9..99c2604cdf 100644 --- a/dlt/common/typing.py +++ b/dlt/common/typing.py @@ -243,7 +243,7 @@ def get_all_types_of_class_in_union(hint: Type[Any], cls: Type[TAny]) -> List[Ty return [ t for t in get_args(hint) - if inspect.isclass(t) and (issubclass(t, cls) or issubclass(cls, t)) + if not is_typeddict(t) and inspect.isclass(t) and (issubclass(t, cls) or issubclass(cls, t)) ] diff --git a/dlt/destinations/decorators.py b/dlt/destinations/decorators.py index 62d059c4a6..a920d336a2 100644 --- a/dlt/destinations/decorators.py +++ b/dlt/destinations/decorators.py @@ -1,6 +1,6 @@ import functools -from typing import Any, Type, Optional, Callable, Union, cast +from typing import Any, Type, Optional, Callable, Union from typing_extensions import Concatenate from dlt.common.typing import AnyFun @@ -13,7 +13,6 @@ CustomDestinationClientConfiguration, ) from dlt.common.destination import TLoaderFileFormat -from dlt.common.destination.reference import Destination from dlt.common.typing import TDataItems from dlt.common.schema import TTableSchema diff --git a/dlt/destinations/exceptions.py b/dlt/destinations/exceptions.py index cc4d4fd836..5e6adb007d 100644 --- a/dlt/destinations/exceptions.py +++ b/dlt/destinations/exceptions.py @@ -1,5 +1,6 @@ from typing import Sequence -from dlt.common.exceptions import ( + +from dlt.common.destination.exceptions import ( DestinationTerminalException, DestinationTransientException, 
DestinationUndefinedEntity, @@ -63,18 +64,6 @@ def __init__(self, table_name: str, columns: Sequence[str], msg: str) -> None: ) -class DestinationSchemaTampered(DestinationTerminalException): - def __init__(self, schema_name: str, version_hash: str, stored_version_hash: str) -> None: - self.version_hash = version_hash - self.stored_version_hash = stored_version_hash - super().__init__( - f"Schema {schema_name} content was changed - by a loader or by destination code - from" - " the moment it was retrieved by load package. Such schema cannot reliably be updated" - f" or saved. Current version hash: {version_hash} != stored version hash" - f" {stored_version_hash}" - ) - - class LoadJobNotExistsException(DestinationTerminalException): def __init__(self, job_id: str) -> None: super().__init__(f"Job with id/file name {job_id} not found") diff --git a/dlt/destinations/impl/athena/configuration.py b/dlt/destinations/impl/athena/configuration.py index 6b985f284a..59dfeee4ec 100644 --- a/dlt/destinations/impl/athena/configuration.py +++ b/dlt/destinations/impl/athena/configuration.py @@ -1,4 +1,5 @@ -from typing import ClassVar, Final, List, Optional, TYPE_CHECKING +import dataclasses +from typing import ClassVar, Final, List, Optional from dlt.common.configuration import configspec from dlt.common.destination.reference import DestinationClientDwhWithStagingConfiguration @@ -7,7 +8,7 @@ @configspec class AthenaClientConfiguration(DestinationClientDwhWithStagingConfiguration): - destination_type: Final[str] = "athena" # type: ignore[misc] + destination_type: Final[str] = dataclasses.field(default="athena", init=False, repr=False, compare=False) # type: ignore[misc] query_result_bucket: str = None credentials: AwsCredentials = None athena_work_group: Optional[str] = None @@ -23,19 +24,3 @@ def __str__(self) -> str: return str(self.staging_config.credentials) else: return "[no staging set]" - - if TYPE_CHECKING: - - def __init__( - self, - *, - credentials: 
Optional[AwsCredentials] = None, - dataset_name: str = None, - default_schema_name: Optional[str] = None, - athena_work_group: Optional[str] = None, - aws_data_catalog: Optional[str] = None, - supports_truncate_command: bool = False, - force_iceberg: Optional[bool] = False, - destination_name: str = None, - environment: str = None, - ) -> None: ... diff --git a/dlt/destinations/impl/bigquery/configuration.py b/dlt/destinations/impl/bigquery/configuration.py index 3c4a71c0df..a6686c3f2d 100644 --- a/dlt/destinations/impl/bigquery/configuration.py +++ b/dlt/destinations/impl/bigquery/configuration.py @@ -1,5 +1,6 @@ +import dataclasses import warnings -from typing import TYPE_CHECKING, ClassVar, List, Optional, Final +from typing import ClassVar, List, Final from dlt.common.configuration import configspec from dlt.common.configuration.specs import GcpServiceAccountCredentials @@ -10,7 +11,7 @@ @configspec class BigQueryClientConfiguration(DestinationClientDwhWithStagingConfiguration): - destination_type: Final[str] = "bigquery" # type: ignore + destination_type: Final[str] = dataclasses.field(default="bigquery", init=False, repr=False, compare=False) # type: ignore credentials: GcpServiceAccountCredentials = None location: str = "US" @@ -38,31 +39,3 @@ def fingerprint(self) -> str: if self.credentials and self.credentials.project_id: return digest128(self.credentials.project_id) return "" - - if TYPE_CHECKING: - - def __init__( - self, - *, - credentials: Optional[GcpServiceAccountCredentials] = None, - dataset_name: str = None, - default_schema_name: Optional[str] = None, - location: str = "US", - http_timeout: float = 15.0, - file_upload_timeout: float = 30 * 60.0, - retry_deadline: float = 60.0, - destination_name: str = None, - environment: str = None - ) -> None: - super().__init__( - credentials=credentials, - dataset_name=dataset_name, - default_schema_name=default_schema_name, - destination_name=destination_name, - environment=environment, - ) - 
self.retry_deadline = retry_deadline - self.file_upload_timeout = file_upload_timeout - self.http_timeout = http_timeout - self.location = location - ... diff --git a/dlt/destinations/impl/clickhouse/configuration.py b/dlt/destinations/impl/clickhouse/configuration.py index 1be23d757f..534ea7acfd 100644 --- a/dlt/destinations/impl/clickhouse/configuration.py +++ b/dlt/destinations/impl/clickhouse/configuration.py @@ -1,3 +1,4 @@ +import dataclasses from typing import ClassVar, List, Any, Final, TYPE_CHECKING, Literal, cast from dlt.common.configuration import configspec @@ -15,7 +16,7 @@ @configspec class ClickhouseCredentials(ConnectionStringCredentials): drivername: str = "clickhouse" - host: str + host: str # type: ignore """Host with running ClickHouse server.""" port: int = 9440 """Port ClickHouse server is bound to. Defaults to 9000.""" @@ -65,7 +66,9 @@ def to_url(self) -> URL: @configspec class ClickhouseClientConfiguration(DestinationClientDwhWithStagingConfiguration): destination_type: Final[str] = "clickhouse" # type: ignore[misc] - credentials: ClickhouseCredentials + credentials: ClickhouseCredentials # type: ignore + dataset_name: Final[str] = "" # type: ignore + """dataset name in the destination to load data to, for schemas that are not default schema, it is used as dataset prefix""" # Primary key columns are used to build a sparse primary index which allows for efficient data retrieval, # but they do not enforce uniqueness constraints. 
It permits duplicate values even for the primary key @@ -90,7 +93,6 @@ def __init__( ) -> None: super().__init__( credentials=credentials, - dataset_name=dataset_name, destination_name=destination_name, environment=environment, ) diff --git a/dlt/destinations/impl/databricks/configuration.py b/dlt/destinations/impl/databricks/configuration.py index 924047e30f..3bd2d12a5a 100644 --- a/dlt/destinations/impl/databricks/configuration.py +++ b/dlt/destinations/impl/databricks/configuration.py @@ -1,3 +1,4 @@ +import dataclasses from typing import ClassVar, Final, Optional, Any, Dict, List from dlt.common.typing import TSecretStrValue @@ -40,8 +41,8 @@ def to_connector_params(self) -> Dict[str, Any]: @configspec class DatabricksClientConfiguration(DestinationClientDwhWithStagingConfiguration): - destination_type: Final[str] = "databricks" # type: ignore[misc] - credentials: DatabricksCredentials + destination_type: Final[str] = dataclasses.field(default="databricks", init=False, repr=False, compare=False) # type: ignore[misc] + credentials: DatabricksCredentials = None def __str__(self) -> str: """Return displayable destination location""" diff --git a/dlt/destinations/impl/destination/configuration.py b/dlt/destinations/impl/destination/configuration.py index f123ba69b3..30e54a8313 100644 --- a/dlt/destinations/impl/destination/configuration.py +++ b/dlt/destinations/impl/destination/configuration.py @@ -1,4 +1,5 @@ -from typing import TYPE_CHECKING, Optional, Final, Callable, Union, Any +import dataclasses +from typing import Optional, Final, Callable, Union from typing_extensions import ParamSpec from dlt.common.configuration import configspec @@ -16,19 +17,9 @@ @configspec class CustomDestinationClientConfiguration(DestinationClientConfiguration): - destination_type: Final[str] = "destination" # type: ignore + destination_type: Final[str] = dataclasses.field(default="destination", init=False, repr=False, compare=False) # type: ignore destination_callable: 
Optional[Union[str, TDestinationCallable]] = None # noqa: A003 loader_file_format: TLoaderFileFormat = "puae-jsonl" batch_size: int = 10 skip_dlt_columns_and_tables: bool = True max_table_nesting: int = 0 - - if TYPE_CHECKING: - - def __init__( - self, - *, - loader_file_format: TLoaderFileFormat = "puae-jsonl", - batch_size: int = 10, - destination_callable: Union[TDestinationCallable, str] = None, - ) -> None: ... diff --git a/dlt/destinations/impl/duckdb/configuration.py b/dlt/destinations/impl/duckdb/configuration.py index 8cb88c43b5..70d91dcb56 100644 --- a/dlt/destinations/impl/duckdb/configuration.py +++ b/dlt/destinations/impl/duckdb/configuration.py @@ -1,7 +1,8 @@ import os +import dataclasses import threading from pathvalidate import is_valid_filepath -from typing import Any, ClassVar, Final, List, Optional, Tuple, TYPE_CHECKING, Union +from typing import Any, ClassVar, Final, List, Optional, Tuple, TYPE_CHECKING, Type, Union from dlt.common import logger from dlt.common.configuration import configspec @@ -13,12 +14,17 @@ ) from dlt.common.typing import TSecretValue +try: + from duckdb import DuckDBPyConnection +except ModuleNotFoundError: + DuckDBPyConnection = Type[Any] # type: ignore[assignment,misc] + DUCK_DB_NAME = "%s.duckdb" DEFAULT_DUCK_DB_NAME = DUCK_DB_NAME % "quack" LOCAL_STATE_KEY = "duckdb_database" -@configspec +@configspec(init=False) class DuckDbBaseCredentials(ConnectionStringCredentials): password: Optional[TSecretValue] = None host: Optional[str] = None @@ -95,7 +101,7 @@ def __del__(self) -> None: @configspec class DuckDbCredentials(DuckDbBaseCredentials): - drivername: Final[str] = "duckdb" # type: ignore + drivername: Final[str] = dataclasses.field(default="duckdb", init=False, repr=False, compare=False) # type: ignore username: Optional[str] = None __config_gen_annotations__: ClassVar[List[str]] = [] @@ -193,30 +199,31 @@ def _path_from_pipeline(self, default_path: str) -> Tuple[str, bool]: def _conn_str(self) -> str: return 
self.database + def __init__(self, conn_or_path: Union[str, DuckDBPyConnection] = None) -> None: + """Access to duckdb database at a given path or from duckdb connection""" + self._apply_init_value(conn_or_path) + @configspec class DuckDbClientConfiguration(DestinationClientDwhWithStagingConfiguration): - destination_type: Final[str] = "duckdb" # type: ignore - credentials: DuckDbCredentials + destination_type: Final[str] = dataclasses.field(default="duckdb", init=False, repr=False, compare=False) # type: ignore + credentials: DuckDbCredentials = None create_indexes: bool = ( False # should unique indexes be created, this slows loading down massively ) - if TYPE_CHECKING: - try: - from duckdb import DuckDBPyConnection - except ModuleNotFoundError: - DuckDBPyConnection = Any # type: ignore[assignment,misc] - - def __init__( - self, - *, - credentials: Union[DuckDbCredentials, str, DuckDBPyConnection] = None, - dataset_name: str = None, - default_schema_name: Optional[str] = None, - create_indexes: bool = False, - staging_config: Optional[DestinationClientStagingConfiguration] = None, - destination_name: str = None, - environment: str = None, - ) -> None: ... 
+ def __init__( + self, + *, + credentials: Union[DuckDbCredentials, str, DuckDBPyConnection] = None, + create_indexes: bool = False, + destination_name: str = None, + environment: str = None, + ) -> None: + super().__init__( + credentials=credentials, # type: ignore[arg-type] + destination_name=destination_name, + environment=environment, + ) + self.create_indexes = create_indexes diff --git a/dlt/destinations/impl/dummy/configuration.py b/dlt/destinations/impl/dummy/configuration.py index cce0dfa8ed..a9fdb1f47d 100644 --- a/dlt/destinations/impl/dummy/configuration.py +++ b/dlt/destinations/impl/dummy/configuration.py @@ -1,4 +1,5 @@ -from typing import TYPE_CHECKING, Optional, Final +import dataclasses +from typing import Final from dlt.common.configuration import configspec from dlt.common.destination import TLoaderFileFormat @@ -16,7 +17,7 @@ def __str__(self) -> str: @configspec class DummyClientConfiguration(DestinationClientConfiguration): - destination_type: Final[str] = "dummy" # type: ignore + destination_type: Final[str] = dataclasses.field(default="dummy", init=False, repr=False, compare=False) # type: ignore loader_file_format: TLoaderFileFormat = "jsonl" fail_schema_update: bool = False fail_prob: float = 0.0 @@ -30,22 +31,3 @@ class DummyClientConfiguration(DestinationClientConfiguration): create_followup_jobs: bool = False credentials: DummyClientCredentials = None - - if TYPE_CHECKING: - - def __init__( - self, - *, - credentials: Optional[CredentialsConfiguration] = None, - loader_file_format: TLoaderFileFormat = None, - fail_schema_update: bool = None, - fail_prob: float = None, - retry_prob: float = None, - completed_prob: float = None, - exception_prob: float = None, - timeout: float = None, - fail_in_init: bool = None, - create_followup_jobs: bool = None, - destination_name: str = None, - environment: str = None, - ) -> None: ... 
diff --git a/dlt/destinations/impl/dummy/dummy.py b/dlt/destinations/impl/dummy/dummy.py index c46e329819..0d91220d88 100644 --- a/dlt/destinations/impl/dummy/dummy.py +++ b/dlt/destinations/impl/dummy/dummy.py @@ -16,9 +16,12 @@ from dlt.common import pendulum from dlt.common.schema import Schema, TTableSchema, TSchemaTables -from dlt.common.schema.typing import TWriteDisposition from dlt.common.storages import FileStorage from dlt.common.destination import DestinationCapabilitiesContext +from dlt.common.destination.exceptions import ( + DestinationTerminalException, + DestinationTransientException, +) from dlt.common.destination.reference import ( FollowupJob, NewLoadJob, @@ -32,10 +35,7 @@ from dlt.destinations.exceptions import ( LoadJobNotExistsException, LoadJobInvalidStateTransitionException, - DestinationTerminalException, - DestinationTransientException, ) - from dlt.destinations.impl.dummy import capabilities from dlt.destinations.impl.dummy.configuration import DummyClientConfiguration from dlt.destinations.job_impl import NewReferenceJob diff --git a/dlt/destinations/impl/filesystem/configuration.py b/dlt/destinations/impl/filesystem/configuration.py index 93e5537aab..1521222180 100644 --- a/dlt/destinations/impl/filesystem/configuration.py +++ b/dlt/destinations/impl/filesystem/configuration.py @@ -1,6 +1,5 @@ -from urllib.parse import urlparse - -from typing import Final, Type, Optional, Any, TYPE_CHECKING +import dataclasses +from typing import Final, Type, Optional from dlt.common.configuration import configspec, resolve_type from dlt.common.destination.reference import ( @@ -12,22 +11,9 @@ @configspec class FilesystemDestinationClientConfiguration(FilesystemConfiguration, DestinationClientStagingConfiguration): # type: ignore[misc] - destination_type: Final[str] = "filesystem" # type: ignore + destination_type: Final[str] = dataclasses.field(default="filesystem", init=False, repr=False, compare=False) # type: ignore @resolve_type("credentials") def 
resolve_credentials_type(self) -> Type[CredentialsConfiguration]: # use known credentials or empty credentials for unknown protocol return self.PROTOCOL_CREDENTIALS.get(self.protocol) or Optional[CredentialsConfiguration] # type: ignore[return-value] - - if TYPE_CHECKING: - - def __init__( - self, - *, - credentials: Optional[Any] = None, - dataset_name: str = None, - default_schema_name: Optional[str] = None, - bucket_url: str = None, - destination_name: str = None, - environment: str = None, - ) -> None: ... diff --git a/dlt/destinations/impl/motherduck/configuration.py b/dlt/destinations/impl/motherduck/configuration.py index f4ab571e5c..3179295c54 100644 --- a/dlt/destinations/impl/motherduck/configuration.py +++ b/dlt/destinations/impl/motherduck/configuration.py @@ -1,8 +1,9 @@ -from typing import Any, ClassVar, Final, List, TYPE_CHECKING, Optional +import dataclasses +from typing import Any, ClassVar, Final, List from dlt.common.configuration import configspec from dlt.common.destination.reference import DestinationClientDwhWithStagingConfiguration -from dlt.common.exceptions import DestinationTerminalException +from dlt.common.destination.exceptions import DestinationTerminalException from dlt.common.typing import TSecretValue from dlt.common.utils import digest128 from dlt.common.configuration.exceptions import ConfigurationValueError @@ -12,9 +13,9 @@ MOTHERDUCK_DRIVERNAME = "md" -@configspec +@configspec(init=False) class MotherDuckCredentials(DuckDbBaseCredentials): - drivername: Final[str] = "md" # type: ignore + drivername: Final[str] = dataclasses.field(default="md", init=False, repr=False, compare=False) # type: ignore username: str = "motherduck" read_only: bool = False # open database read/write @@ -57,8 +58,8 @@ def on_resolved(self) -> None: @configspec class MotherDuckClientConfiguration(DestinationClientDwhWithStagingConfiguration): - destination_type: Final[str] = "motherduck" # type: ignore - credentials: MotherDuckCredentials + 
destination_type: Final[str] = dataclasses.field(default="motherduck", init=False, repr=False, compare=False) # type: ignore + credentials: MotherDuckCredentials = None create_indexes: bool = ( False # should unique indexes be created, this slows loading down massively @@ -70,19 +71,6 @@ def fingerprint(self) -> str: return digest128(self.credentials.password) return "" - if TYPE_CHECKING: - - def __init__( - self, - *, - credentials: Optional[MotherDuckCredentials] = None, - dataset_name: str = None, - default_schema_name: Optional[str] = None, - create_indexes: Optional[bool] = None, - destination_name: str = None, - environment: str = None, - ) -> None: ... - class MotherduckLocalVersionNotSupported(DestinationTerminalException): def __init__(self, duckdb_version: str) -> None: diff --git a/dlt/destinations/impl/mssql/configuration.py b/dlt/destinations/impl/mssql/configuration.py index 45c448fab7..1d085f40c1 100644 --- a/dlt/destinations/impl/mssql/configuration.py +++ b/dlt/destinations/impl/mssql/configuration.py @@ -1,4 +1,5 @@ -from typing import Final, ClassVar, Any, List, Dict, Optional, TYPE_CHECKING +import dataclasses +from typing import Final, ClassVar, Any, List, Dict from dlt.common.libs.sql_alchemy import URL from dlt.common.configuration import configspec @@ -10,11 +11,11 @@ from dlt.common.destination.reference import DestinationClientDwhWithStagingConfiguration -@configspec +@configspec(init=False) class MsSqlCredentials(ConnectionStringCredentials): - drivername: Final[str] = "mssql" # type: ignore - password: TSecretValue - host: str + drivername: Final[str] = dataclasses.field(default="mssql", init=False, repr=False, compare=False) # type: ignore + password: TSecretValue = None + host: str = None port: int = 1433 connect_timeout: int = 15 driver: str = None @@ -90,8 +91,8 @@ def to_odbc_dsn(self) -> str: @configspec class MsSqlClientConfiguration(DestinationClientDwhWithStagingConfiguration): - destination_type: Final[str] = "mssql" # type: 
ignore - credentials: MsSqlCredentials + destination_type: Final[str] = dataclasses.field(default="mssql", init=False, repr=False, compare=False) # type: ignore + credentials: MsSqlCredentials = None create_indexes: bool = False @@ -100,16 +101,3 @@ def fingerprint(self) -> str: if self.credentials and self.credentials.host: return digest128(self.credentials.host) return "" - - if TYPE_CHECKING: - - def __init__( - self, - *, - credentials: Optional[MsSqlCredentials] = None, - dataset_name: str = None, - default_schema_name: Optional[str] = None, - create_indexes: Optional[bool] = None, - destination_name: str = None, - environment: str = None, - ) -> None: ... diff --git a/dlt/destinations/impl/postgres/configuration.py b/dlt/destinations/impl/postgres/configuration.py index 109d422650..0d12abbac7 100644 --- a/dlt/destinations/impl/postgres/configuration.py +++ b/dlt/destinations/impl/postgres/configuration.py @@ -1,6 +1,7 @@ -from typing import Final, ClassVar, Any, List, TYPE_CHECKING -from dlt.common.libs.sql_alchemy import URL +import dataclasses +from typing import Final, ClassVar, Any, List, TYPE_CHECKING, Union +from dlt.common.libs.sql_alchemy import URL from dlt.common.configuration import configspec from dlt.common.configuration.specs import ConnectionStringCredentials from dlt.common.utils import digest128 @@ -9,11 +10,11 @@ from dlt.common.destination.reference import DestinationClientDwhWithStagingConfiguration -@configspec +@configspec(init=False) class PostgresCredentials(ConnectionStringCredentials): - drivername: Final[str] = "postgresql" # type: ignore - password: TSecretValue - host: str + drivername: Final[str] = dataclasses.field(default="postgresql", init=False, repr=False, compare=False) # type: ignore + password: TSecretValue = None + host: str = None port: int = 5432 connect_timeout: int = 15 @@ -33,8 +34,8 @@ def to_url(self) -> URL: @configspec class PostgresClientConfiguration(DestinationClientDwhWithStagingConfiguration): - 
destination_type: Final[str] = "postgres" # type: ignore - credentials: PostgresCredentials + destination_type: Final[str] = dataclasses.field(default="postgres", init=False, repr=False, compare=False) # type: ignore + credentials: PostgresCredentials = None create_indexes: bool = True @@ -43,16 +44,3 @@ def fingerprint(self) -> str: if self.credentials and self.credentials.host: return digest128(self.credentials.host) return "" - - if TYPE_CHECKING: - - def __init__( - self, - *, - credentials: PostgresCredentials = None, - dataset_name: str = None, - default_schema_name: str = None, - create_indexes: bool = True, - destination_name: str = None, - environment: str = None, - ) -> None: ... diff --git a/dlt/destinations/impl/qdrant/configuration.py b/dlt/destinations/impl/qdrant/configuration.py index 23637dee33..d589537742 100644 --- a/dlt/destinations/impl/qdrant/configuration.py +++ b/dlt/destinations/impl/qdrant/configuration.py @@ -1,3 +1,4 @@ +import dataclasses from typing import Optional, Final from dlt.common.configuration import configspec @@ -15,7 +16,7 @@ class QdrantCredentials(CredentialsConfiguration): # If `None` - use default values for `host` and `port` location: Optional[str] = None # API key for authentication in Qdrant Cloud. 
Default: `None` - api_key: Optional[str] + api_key: Optional[str] = None def __str__(self) -> str: return self.location or "localhost" @@ -47,12 +48,14 @@ class QdrantClientOptions(BaseConfiguration): @configspec class QdrantClientConfiguration(DestinationClientDwhConfiguration): - destination_type: Final[str] = "qdrant" # type: ignore + destination_type: Final[str] = dataclasses.field(default="qdrant", init=False, repr=False, compare=False) # type: ignore + # Qdrant connection credentials + credentials: QdrantCredentials = None # character for the dataset separator dataset_separator: str = "_" # make it optional so empty dataset is allowed - dataset_name: Final[Optional[str]] = None # type: ignore[misc] + dataset_name: Final[Optional[str]] = dataclasses.field(default=None, init=False, repr=False, compare=False) # type: ignore[misc] # Batch size for generating embeddings embedding_batch_size: int = 32 @@ -67,10 +70,7 @@ class QdrantClientConfiguration(DestinationClientDwhConfiguration): upload_max_retries: int = 3 # Qdrant client options - options: QdrantClientOptions - - # Qdrant connection credentials - credentials: QdrantCredentials + options: QdrantClientOptions = None # FlagEmbedding model to use # Find the list here. https://qdrant.github.io/fastembed/examples/Supported_Models/. 
diff --git a/dlt/destinations/impl/qdrant/qdrant_client.py b/dlt/destinations/impl/qdrant/qdrant_client.py index 2df3023d86..febfe38ec9 100644 --- a/dlt/destinations/impl/qdrant/qdrant_client.py +++ b/dlt/destinations/impl/qdrant/qdrant_client.py @@ -283,6 +283,7 @@ def _delete_sentinel_collection(self) -> None: def update_stored_schema( self, only_tables: Iterable[str] = None, expected_update: TSchemaTables = None ) -> Optional[TSchemaTables]: + super().update_stored_schema(only_tables, expected_update) applied_update: TSchemaTables = {} schema_info = self.get_stored_schema_by_hash(self.schema.stored_version_hash) if schema_info is None: diff --git a/dlt/destinations/impl/redshift/configuration.py b/dlt/destinations/impl/redshift/configuration.py index 2a6ade4a4f..72d7f70a9f 100644 --- a/dlt/destinations/impl/redshift/configuration.py +++ b/dlt/destinations/impl/redshift/configuration.py @@ -1,4 +1,5 @@ -from typing import Final, Optional, TYPE_CHECKING +import dataclasses +from typing import Final, Optional from dlt.common.typing import TSecretValue from dlt.common.configuration import configspec @@ -10,7 +11,7 @@ ) -@configspec +@configspec(init=False) class RedshiftCredentials(PostgresCredentials): port: int = 5439 password: TSecretValue = None @@ -20,8 +21,8 @@ class RedshiftCredentials(PostgresCredentials): @configspec class RedshiftClientConfiguration(PostgresClientConfiguration): - destination_type: Final[str] = "redshift" # type: ignore - credentials: RedshiftCredentials + destination_type: Final[str] = dataclasses.field(default="redshift", init=False, repr=False, compare=False) # type: ignore + credentials: RedshiftCredentials = None staging_iam_role: Optional[str] = None def fingerprint(self) -> str: @@ -29,17 +30,3 @@ def fingerprint(self) -> str: if self.credentials and self.credentials.host: return digest128(self.credentials.host) return "" - - if TYPE_CHECKING: - - def __init__( - self, - *, - destination_type: str = None, - credentials: 
PostgresCredentials = None, - dataset_name: str = None, - default_schema_name: str = None, - staging_iam_role: str = None, - destination_name: str = None, - environment: str = None, - ) -> None: ... diff --git a/dlt/destinations/impl/snowflake/configuration.py b/dlt/destinations/impl/snowflake/configuration.py index 4f97f08700..5a1f7a65a9 100644 --- a/dlt/destinations/impl/snowflake/configuration.py +++ b/dlt/destinations/impl/snowflake/configuration.py @@ -1,11 +1,9 @@ +import dataclasses import base64 -import binascii - -from typing import Final, Optional, Any, Dict, ClassVar, List, TYPE_CHECKING - -from dlt.common.libs.sql_alchemy import URL +from typing import Final, Optional, Any, Dict, ClassVar, List, TYPE_CHECKING, Union from dlt import version +from dlt.common.libs.sql_alchemy import URL from dlt.common.exceptions import MissingDependencyException from dlt.common.typing import TSecretStrValue from dlt.common.configuration.specs import ConnectionStringCredentials @@ -51,9 +49,9 @@ def _read_private_key(private_key: str, password: Optional[str] = None) -> bytes ) -@configspec +@configspec(init=False) class SnowflakeCredentials(ConnectionStringCredentials): - drivername: Final[str] = "snowflake" # type: ignore[misc] + drivername: Final[str] = dataclasses.field(default="snowflake", init=False, repr=False, compare=False) # type: ignore[misc] password: Optional[TSecretStrValue] = None host: str = None database: str = None @@ -118,8 +116,8 @@ def to_connector_params(self) -> Dict[str, Any]: @configspec class SnowflakeClientConfiguration(DestinationClientDwhWithStagingConfiguration): - destination_type: Final[str] = "snowflake" # type: ignore[misc] - credentials: SnowflakeCredentials + destination_type: Final[str] = dataclasses.field(default="snowflake", init=False, repr=False, compare=False) # type: ignore[misc] + credentials: SnowflakeCredentials = None stage_name: Optional[str] = None """Use an existing named stage instead of the default. 
Default uses the implicit table stage per table""" @@ -131,18 +129,3 @@ def fingerprint(self) -> str: if self.credentials and self.credentials.host: return digest128(self.credentials.host) return "" - - if TYPE_CHECKING: - - def __init__( - self, - *, - destination_type: str = None, - credentials: SnowflakeCredentials = None, - dataset_name: str = None, - default_schema_name: str = None, - stage_name: str = None, - keep_staged_files: bool = True, - destination_name: str = None, - environment: str = None, - ) -> None: ... diff --git a/dlt/destinations/impl/synapse/configuration.py b/dlt/destinations/impl/synapse/configuration.py index bb1ba632dc..37b932cd67 100644 --- a/dlt/destinations/impl/synapse/configuration.py +++ b/dlt/destinations/impl/synapse/configuration.py @@ -1,9 +1,8 @@ +import dataclasses +from dlt import version from typing import Final, Any, List, Dict, Optional, ClassVar -from dlt.common import logger from dlt.common.configuration import configspec -from dlt.common.schema.typing import TSchemaTables -from dlt.common.schema.utils import get_write_disposition from dlt.destinations.impl.mssql.configuration import ( MsSqlCredentials, @@ -14,9 +13,9 @@ from dlt.destinations.impl.synapse.synapse_adapter import TTableIndexType -@configspec +@configspec(init=False) class SynapseCredentials(MsSqlCredentials): - drivername: Final[str] = "synapse" # type: ignore + drivername: Final[str] = dataclasses.field(default="synapse", init=False, repr=False, compare=False) # type: ignore # LongAsMax keyword got introduced in ODBC Driver 18 for SQL Server. 
SUPPORTED_DRIVERS: ClassVar[List[str]] = ["ODBC Driver 18 for SQL Server"] @@ -32,8 +31,8 @@ def _get_odbc_dsn_dict(self) -> Dict[str, Any]: @configspec class SynapseClientConfiguration(MsSqlClientConfiguration): - destination_type: Final[str] = "synapse" # type: ignore - credentials: SynapseCredentials + destination_type: Final[str] = dataclasses.field(default="synapse", init=False, repr=False, compare=False) # type: ignore + credentials: SynapseCredentials = None # While Synapse uses CLUSTERED COLUMNSTORE INDEX tables by default, we use # HEAP tables (no indexing) by default. HEAP is a more robust choice, because diff --git a/dlt/destinations/impl/synapse/factory.py b/dlt/destinations/impl/synapse/factory.py index b7eddd6ef7..100878ae05 100644 --- a/dlt/destinations/impl/synapse/factory.py +++ b/dlt/destinations/impl/synapse/factory.py @@ -16,6 +16,11 @@ class synapse(Destination[SynapseClientConfiguration, "SynapseClient"]): spec = SynapseClientConfiguration + # TODO: implement as property everywhere and makes sure not accessed as class property + # @property + # def spec(self) -> t.Type[SynapseClientConfiguration]: + # return SynapseClientConfiguration + def capabilities(self) -> DestinationCapabilitiesContext: return capabilities() diff --git a/dlt/destinations/impl/weaviate/configuration.py b/dlt/destinations/impl/weaviate/configuration.py index 5014e69163..90fb7ce5ce 100644 --- a/dlt/destinations/impl/weaviate/configuration.py +++ b/dlt/destinations/impl/weaviate/configuration.py @@ -1,5 +1,5 @@ -from typing import Dict, Literal, Optional, Final, TYPE_CHECKING -from dataclasses import field +import dataclasses +from typing import Dict, Literal, Optional, Final from urllib.parse import urlparse from dlt.common.configuration import configspec @@ -13,7 +13,7 @@ @configspec class WeaviateCredentials(CredentialsConfiguration): url: str = "http://localhost:8080" - api_key: Optional[str] + api_key: Optional[str] = None additional_headers: Optional[Dict[str, str]] = 
None def __str__(self) -> str: @@ -24,7 +24,7 @@ def __str__(self) -> str: @configspec class WeaviateClientConfiguration(DestinationClientDwhConfiguration): - destination_type: Final[str] = "weaviate" # type: ignore + destination_type: Final[str] = dataclasses.field(default="weaviate", init=False, repr=False, compare=False) # type: ignore # make it optional so empty dataset is allowed dataset_name: Optional[str] = None # type: ignore[misc] @@ -39,9 +39,9 @@ class WeaviateClientConfiguration(DestinationClientDwhConfiguration): dataset_separator: str = "_" - credentials: WeaviateCredentials + credentials: WeaviateCredentials = None vectorizer: str = "text2vec-openai" - module_config: Dict[str, Dict[str, str]] = field( + module_config: Dict[str, Dict[str, str]] = dataclasses.field( default_factory=lambda: { "text2vec-openai": { "model": "ada", @@ -58,26 +58,3 @@ def fingerprint(self) -> str: hostname = urlparse(self.credentials.url).hostname return digest128(hostname) return "" - - if TYPE_CHECKING: - - def __init__( - self, - *, - destination_type: str = None, - credentials: WeaviateCredentials = None, - name: str = None, - environment: str = None, - dataset_name: str = None, - default_schema_name: str = None, - batch_size: int = None, - batch_workers: int = None, - batch_consistency: TWeaviateBatchConsistency = None, - batch_retries: int = None, - conn_timeout: float = None, - read_timeout: float = None, - startup_period: int = None, - dataset_separator: str = None, - vectorizer: str = None, - module_config: Dict[str, Dict[str, str]] = None, - ) -> None: ... 
diff --git a/dlt/destinations/impl/weaviate/exceptions.py b/dlt/destinations/impl/weaviate/exceptions.py index bff1b4cacc..ee798e4e76 100644 --- a/dlt/destinations/impl/weaviate/exceptions.py +++ b/dlt/destinations/impl/weaviate/exceptions.py @@ -1,4 +1,4 @@ -from dlt.common.exceptions import DestinationException, DestinationTerminalException +from dlt.common.destination.exceptions import DestinationException, DestinationTerminalException class WeaviateBatchError(DestinationException): diff --git a/dlt/destinations/impl/weaviate/weaviate_client.py b/dlt/destinations/impl/weaviate/weaviate_client.py index 2d23dc38f7..6486a75e6e 100644 --- a/dlt/destinations/impl/weaviate/weaviate_client.py +++ b/dlt/destinations/impl/weaviate/weaviate_client.py @@ -14,7 +14,7 @@ cast, ) -from dlt.common.exceptions import ( +from dlt.common.destination.exceptions import ( DestinationUndefinedEntity, DestinationTransientException, DestinationTerminalException, @@ -424,6 +424,7 @@ def _delete_sentinel_class(self) -> None: def update_stored_schema( self, only_tables: Iterable[str] = None, expected_update: TSchemaTables = None ) -> Optional[TSchemaTables]: + super().update_stored_schema(only_tables, expected_update) # Retrieve the schema from Weaviate applied_update: TSchemaTables = {} try: diff --git a/dlt/destinations/job_client_impl.py b/dlt/destinations/job_client_impl.py index a7de0eb38b..ea0d10d11d 100644 --- a/dlt/destinations/job_client_impl.py +++ b/dlt/destinations/job_client_impl.py @@ -35,13 +35,13 @@ ) from dlt.common.storages import FileStorage from dlt.common.schema import TColumnSchema, Schema, TTableSchemaColumns, TSchemaTables +from dlt.common.schema.typing import LOADS_TABLE_NAME, VERSION_TABLE_NAME from dlt.common.destination.reference import ( StateInfo, StorageSchemaInfo, WithStateSync, DestinationClientConfiguration, DestinationClientDwhConfiguration, - DestinationClientDwhWithStagingConfiguration, NewLoadJob, WithStagingDataset, TLoadJobState, @@ -50,15 +50,10 @@ 
FollowupJob, CredentialsConfiguration, ) -from dlt.destinations.exceptions import ( - DatabaseUndefinedRelation, - DestinationSchemaTampered, - DestinationSchemaWillNotUpdate, -) + +from dlt.destinations.exceptions import DatabaseUndefinedRelation from dlt.destinations.job_impl import EmptyLoadJobWithoutFollowup, NewReferenceJob from dlt.destinations.sql_jobs import SqlMergeJob, SqlStagingCopyJob -from dlt.common.schema.typing import LOADS_TABLE_NAME, VERSION_TABLE_NAME - from dlt.destinations.typing import TNativeConn from dlt.destinations.sql_client import SqlClientBase @@ -67,7 +62,7 @@ class SqlLoadJob(LoadJob): - """A job executing sql statement, without followup trait.""" + """A job executing sql statement, without followup trait""" def __init__(self, file_path: str, sql_client: SqlClientBase[Any]) -> None: super().__init__(FileStorage.get_file_name_from_file_path(file_path)) @@ -98,10 +93,13 @@ def exception(self) -> str: raise NotImplementedError() def _string_containts_ddl_queries(self, sql: str) -> bool: - return any(re.search(cmd, sql, re.IGNORECASE) for cmd in DDL_COMMANDS) + for cmd in DDL_COMMANDS: + if re.search(cmd, sql, re.IGNORECASE): + return True + return False def _split_fragments(self, sql: str) -> List[str]: - return [s + ("" if s.endswith(";") else ";") for s in sql.split(";") if s.strip()] + return [s + (";" if not s.endswith(";") else "") for s in sql.split(";") if s.strip()] @staticmethod def is_sql_job(file_path: str) -> bool: @@ -493,7 +491,7 @@ def _get_column_def_sql(self, c: TColumnSchema, table_format: TTableFormat = Non @staticmethod def _gen_not_null(v: bool) -> str: - return "" if v else "NOT NULL" + return "NOT NULL" if not v else "" def _create_table_update( self, table_name: str, storage_columns: TTableSchemaColumns @@ -515,9 +513,13 @@ def _row_to_schema_info(self, query: str, *args: Any) -> StorageSchemaInfo: # get schema as string # TODO: Re-use decompress/compress_state() implementation from dlt.pipeline.state_sync 
schema_str: str = row[5] - with contextlib.suppress(ValueError): + try: schema_bytes = base64.b64decode(schema_str, validate=True) schema_str = zlib.decompress(schema_bytes).decode("utf-8") + except ValueError: + # not a base64 string + pass + # make utc datetime inserted_at = pendulum.instance(row[4]) @@ -532,13 +534,9 @@ def _replace_schema_in_storage(self, schema: Schema) -> None: self._update_schema_in_storage(schema) def _update_schema_in_storage(self, schema: Schema) -> None: - # Make sure the schema being saved wasn't modified from the moment it was loaded from storage. - version_hash = schema.version_hash - if version_hash != schema.stored_version_hash: - raise DestinationSchemaTampered(schema.name, version_hash, schema.stored_version_hash) # get schema string or zip schema_str = json.dumps(schema.to_dict()) - # TODO: not all databases store data as utf-8 but this exception is mostly for redshift. + # TODO: not all databases store data as utf-8 but this exception is mostly for redshift schema_bytes = schema_str.encode("utf-8") if len(schema_bytes) > self.capabilities.max_text_data_type_length: # compress and to base64 diff --git a/dlt/destinations/sql_jobs.py b/dlt/destinations/sql_jobs.py index 215bcf9fe5..91be3a60c9 100644 --- a/dlt/destinations/sql_jobs.py +++ b/dlt/destinations/sql_jobs.py @@ -1,7 +1,7 @@ from typing import Any, Callable, List, Sequence, Tuple, cast, TypedDict, Optional import yaml -from dlt.common.runtime.logger import pretty_format_exception +from dlt.common.logger import pretty_format_exception from dlt.common.schema.typing import TTableSchema, TSortOrder from dlt.common.schema.utils import ( diff --git a/dlt/destinations/utils.py b/dlt/destinations/utils.py index a2ffa490aa..c02460fe58 100644 --- a/dlt/destinations/utils.py +++ b/dlt/destinations/utils.py @@ -46,5 +46,3 @@ def _convert_to_old_pyformat( if count != len(args): raise DatabaseTransientException(operational_error_cls()) return old_style_string, mapping - - diff --git 
a/dlt/extract/decorators.py b/dlt/extract/decorators.py index 6e916ff6e1..28a2aca633 100644 --- a/dlt/extract/decorators.py +++ b/dlt/extract/decorators.py @@ -28,6 +28,7 @@ from dlt.common.configuration.specs.config_section_context import ConfigSectionContext from dlt.common.exceptions import ArgumentsOverloadException from dlt.common.pipeline import PipelineContext +from dlt.common.schema.utils import DEFAULT_WRITE_DISPOSITION from dlt.common.source import _SOURCES, SourceInfo from dlt.common.schema.schema import Schema from dlt.common.schema.typing import ( @@ -71,27 +72,19 @@ class SourceSchemaInjectableContext(ContainerInjectableContext): """A context containing the source schema, present when dlt.source/resource decorated function is executed""" - schema: Schema + schema: Schema = None can_create_default: ClassVar[bool] = False - if TYPE_CHECKING: - - def __init__(self, schema: Schema = None) -> None: ... - @configspec class SourceInjectableContext(ContainerInjectableContext): """A context containing the source schema, present when dlt.resource decorated function is executed""" - source: DltSource + source: DltSource = None can_create_default: ClassVar[bool] = False - if TYPE_CHECKING: - - def __init__(self, source: DltSource = None) -> None: ... 
- TSourceFunParams = ParamSpec("TSourceFunParams") TResourceFunParams = ParamSpec("TResourceFunParams") @@ -447,7 +440,7 @@ def make_resource( ) -> DltResource: table_template = make_hints( table_name, - write_disposition=write_disposition, + write_disposition=write_disposition or DEFAULT_WRITE_DISPOSITION, columns=columns, primary_key=primary_key, merge_key=merge_key, diff --git a/dlt/extract/extract.py b/dlt/extract/extract.py index 3b3d0704ea..2fc4fd77aa 100644 --- a/dlt/extract/extract.py +++ b/dlt/extract/extract.py @@ -76,8 +76,7 @@ def choose_schema() -> Schema: """Except of explicitly passed schema, use a clone that will get discarded if extraction fails""" if schema: schema_ = schema - # TODO: We should start with a new schema of the same name here ideally, but many tests fail - # because of this. So some investigation is needed. + # take pipeline schema to make newest version visible to the resources elif pipeline.default_schema_name: schema_ = pipeline.schemas[pipeline.default_schema_name].clone() else: diff --git a/dlt/extract/extractors.py b/dlt/extract/extractors.py index 52ecd66920..b8e615aae4 100644 --- a/dlt/extract/extractors.py +++ b/dlt/extract/extractors.py @@ -105,12 +105,16 @@ def write_items(self, resource: DltResource, items: TDataItems, meta: Any) -> No if isinstance(meta, HintsMeta): # update the resource with new hints, remove all caches so schema is recomputed # and contracts re-applied - resource.merge_hints(meta.hints) + resource.merge_hints(meta.hints, meta.create_table_variant) + # convert to table meta if created table variant so item is assigned to this table + if meta.create_table_variant: + # name in hints meta must be a string, otherwise merge_hints would fail + meta = TableNameMeta(meta.hints["name"]) # type: ignore[arg-type] self._reset_contracts_cache() if table_name := self._get_static_table_name(resource, meta): # write item belonging to table with static name - self._write_to_static_table(resource, table_name, items) + 
self._write_to_static_table(resource, table_name, items, meta) else: # table has name or other hints depending on data items self._write_to_dynamic_table(resource, items) @@ -157,30 +161,32 @@ def _write_to_dynamic_table(self, resource: DltResource, items: TDataItems) -> N if table_name in self._filtered_tables: continue if table_name not in self._table_contracts or resource._table_has_other_dynamic_hints: - item = self._compute_and_update_table(resource, table_name, item) + item = self._compute_and_update_table( + resource, table_name, item, TableNameMeta(table_name) + ) # write to storage with inferred table name if table_name not in self._filtered_tables: self._write_item(table_name, resource.name, item) def _write_to_static_table( - self, resource: DltResource, table_name: str, items: TDataItems + self, resource: DltResource, table_name: str, items: TDataItems, meta: Any ) -> None: if table_name not in self._table_contracts: - items = self._compute_and_update_table(resource, table_name, items) + items = self._compute_and_update_table(resource, table_name, items, meta) if table_name not in self._filtered_tables: self._write_item(table_name, resource.name, items) - def _compute_table(self, resource: DltResource, items: TDataItems) -> TTableSchema: + def _compute_table(self, resource: DltResource, items: TDataItems, meta: Any) -> TTableSchema: """Computes a schema for a new or dynamic table and normalizes identifiers""" - return self.schema.normalize_table_identifiers(resource.compute_table_schema(items)) + return self.schema.normalize_table_identifiers(resource.compute_table_schema(items, meta)) def _compute_and_update_table( - self, resource: DltResource, table_name: str, items: TDataItems + self, resource: DltResource, table_name: str, items: TDataItems, meta: Any ) -> TDataItems: """ Computes new table and does contract checks, if false is returned, the table may not be created and no items should be written """ - computed_table = self._compute_table(resource, 
items) + computed_table = self._compute_table(resource, items, meta) # overwrite table name (if coming from meta) computed_table["name"] = table_name # get or compute contract @@ -193,7 +199,7 @@ def _compute_and_update_table( computed_table["x-normalizer"] = {"evolve-columns-once": True} # type: ignore[typeddict-unknown-key] existing_table = self.schema._schema_tables.get(table_name, None) if existing_table: - diff_table = utils.diff_tables(existing_table, computed_table) + diff_table = utils.diff_table(existing_table, computed_table) else: diff_table = computed_table @@ -300,9 +306,11 @@ def _write_item( ] super()._write_item(table_name, resource_name, items, columns) - def _compute_table(self, resource: DltResource, items: TDataItems) -> TPartialTableSchema: + def _compute_table( + self, resource: DltResource, items: TDataItems, meta: Any + ) -> TPartialTableSchema: items = items[0] - computed_table = super()._compute_table(resource, items) + computed_table = super()._compute_table(resource, items, Any) # Merge the columns to include primary_key and other hints that may be set on the resource arrow_table = copy(computed_table) @@ -329,9 +337,9 @@ def _compute_table(self, resource: DltResource, items: TDataItems) -> TPartialTa return arrow_table def _compute_and_update_table( - self, resource: DltResource, table_name: str, items: TDataItems + self, resource: DltResource, table_name: str, items: TDataItems, meta: Any ) -> TDataItems: - items = super()._compute_and_update_table(resource, table_name, items) + items = super()._compute_and_update_table(resource, table_name, items, meta) # filter data item as filters could be updated in compute table items = [self._apply_contract_filters(item, resource, table_name) for item in items] return items diff --git a/dlt/extract/hints.py b/dlt/extract/hints.py index 54ce00a806..01a99a23fe 100644 --- a/dlt/extract/hints.py +++ b/dlt/extract/hints.py @@ -12,7 +12,8 @@ TTableFormat, TSchemaContract, ) -from 
dlt.common.schema.utils import DEFAULT_WRITE_DISPOSITION, merge_columns, new_column, new_table +from dlt.common import logger +from dlt.common.schema.utils import DEFAULT_WRITE_DISPOSITION, merge_column, new_column, new_table from dlt.common.typing import TDataItem, DictStrAny, DictStrStr from dlt.common.utils import update_dict_nested from dlt.common.validation import validate_dict_ignoring_xkeys @@ -21,7 +22,7 @@ InconsistentTableTemplate, ) from dlt.extract.incremental import Incremental -from dlt.extract.items import TFunHintTemplate, TTableHintTemplate, ValidateItem +from dlt.extract.items import TFunHintTemplate, TTableHintTemplate, TableNameMeta, ValidateItem from dlt.extract.utils import ensure_table_schema_columns, ensure_table_schema_columns_hint from dlt.extract.validation import create_item_validator @@ -43,12 +44,14 @@ class TResourceHints(TypedDict, total=False): class HintsMeta: - __slots__ = "hints" + __slots__ = ("hints", "create_table_variant") - hints: TResourceHints - - def __init__(self, hints: TResourceHints) -> None: + def __init__(self, hints: TResourceHints, create_table_variant: bool) -> None: self.hints = hints + self.create_table_variant = create_table_variant + + +NATURAL_CALLABLES = ["incremental", "validator", "original_columns"] def make_hints( @@ -105,8 +108,11 @@ def __init__(self, table_schema_template: TResourceHints = None): self._table_name_hint_fun: TFunHintTemplate[str] = None self._table_has_other_dynamic_hints: bool = False self._hints: TResourceHints = None + """Hints for the resource""" + self._hints_variants: Dict[str, TResourceHints] = {} + """Hints for tables emitted from resources""" if table_schema_template: - self.set_hints(table_schema_template) + self._set_hints(table_schema_template) @property def name(self) -> str: @@ -143,16 +149,23 @@ def columns(self) -> TTableHintTemplate[TTableSchemaColumns]: def schema_contract(self) -> TTableHintTemplate[TSchemaContract]: return self._hints.get("schema_contract") - def 
compute_table_schema(self, item: TDataItem = None) -> TTableSchema: - """Computes the table schema based on hints and column definitions passed during resource creation. `item` parameter is used to resolve table hints based on data.""" - if not self._hints: + def compute_table_schema(self, item: TDataItem = None, meta: Any = None) -> TTableSchema: + """Computes the table schema based on hints and column definitions passed during resource creation. + `item` parameter is used to resolve table hints based on data. + `meta` parameter is taken from Pipe and may further specify table name if variant is to be used + """ + if isinstance(meta, TableNameMeta): + # look for variant + table_template = self._hints_variants.get(meta.table_name, self._hints) + else: + table_template = self._hints + if not table_template: return new_table(self.name, resource=self.name) # resolve a copy of a held template - table_template = copy(self._hints) + table_template = self._clone_hints(table_template) if "name" not in table_template: table_template["name"] = self.name - table_template["columns"] = copy(self._hints["columns"]) # if table template present and has dynamic hints, the data item must be provided. if self._table_name_hint_fun and item is None: @@ -161,7 +174,7 @@ def compute_table_schema(self, item: TDataItem = None) -> TTableSchema: resolved_template: TResourceHints = { k: self._resolve_hint(item, v) for k, v in table_template.items() - if k not in ["incremental", "validator", "original_columns"] + if k not in NATURAL_CALLABLES } # type: ignore table_schema = self._merge_keys(resolved_template) table_schema["resource"] = self.name @@ -184,9 +197,14 @@ def apply_hints( schema_contract: TTableHintTemplate[TSchemaContract] = None, additional_table_hints: Optional[Dict[str, TTableHintTemplate[Any]]] = None, table_format: TTableHintTemplate[TTableFormat] = None, + create_table_variant: bool = False, ) -> None: """Creates or modifies existing table schema by setting provided hints. 
Accepts both static and dynamic hints based on data. + If `create_table_variant` is specified, the `table_name` must be a string and hints will be used to create a separate set of hints + for a particular `table_name`. Such hints may be retrieved via compute_table_schema(meta=TableNameMeta(table_name)). + Table variant hints may not contain dynamic hints. + This method accepts the same table hints arguments as `dlt.resource` decorator with the following additions. Skip the argument or pass None to leave the existing hint. Pass empty value (for a particular type i.e. "" for a string) to remove a hint. @@ -197,7 +215,24 @@ def apply_hints( Please note that for efficient incremental loading, the resource must be aware of the Incremental by accepting it as one if its arguments and then using are to skip already loaded data. In non-aware resources, `dlt` will filter out the loaded values, however, the resource will yield all the values again. """ - if not self._hints: + if create_table_variant: + if not isinstance(table_name, str): + raise ValueError( + "Please provide string table name if you want to create a table variant of" + " hints" + ) + # select hints variant + t = self._hints_variants.get(table_name, None) + if t is None: + # use resource hints as starting point + if self._hints: + t = self._clone_hints(self._hints) + # but remove callables + t = {n: h for n, h in t.items() if not callable(h)} # type: ignore[assignment] + else: + t = self._hints + + if t is None: # if there is no template yet, create and set a new one. 
default_wd = None if parent_table_name else DEFAULT_WRITE_DISPOSITION t = make_hints( @@ -211,8 +246,7 @@ def apply_hints( table_format, ) else: - # set single hints - t = self._clone_hints(self._hints) + t = self._clone_hints(t) if table_name is not None: if table_name: t["name"] = table_name @@ -279,20 +313,46 @@ def apply_hints( if incremental is not None: t["incremental"] = None if incremental is Incremental.EMPTY else incremental - self.set_hints(t) + self._set_hints(t, create_table_variant) - def set_hints(self, hints_template: TResourceHints) -> None: + def _set_hints( + self, hints_template: TResourceHints, create_table_variant: bool = False + ) -> None: DltResourceHints.validate_dynamic_hints(hints_template) - # if "name" is callable in the template, then the table schema requires data item to be inferred. - name_hint = hints_template.get("name") - self._table_name_hint_fun = name_hint if callable(name_hint) else None - # check if any other hints in the table template should be inferred from data. - self._table_has_other_dynamic_hints = any( - callable(v) for k, v in hints_template.items() if k != "name" - ) - self._hints = hints_template + if create_table_variant: + table_name: str = hints_template["name"] # type: ignore[assignment] + # incremental cannot be specified in variant + if hints_template.get("incremental"): + raise InconsistentTableTemplate( + f"You can specify incremental only for the resource `{self.name}` hints, not in" + f" table `{table_name}` variant-" + ) + if hints_template.get("validator"): + logger.warning( + f"A data item validator was created from column schema in {self.name} for a" + f" table `{table_name}` variant. Currently such validator is ignored." + ) + # dynamic hints will be ignored + for name, hint in hints_template.items(): + if callable(hint) and name not in NATURAL_CALLABLES: + raise InconsistentTableTemplate( + f"Table `{table_name}` variant hint is resource {self.name} cannot have" + f" dynamic hint but {name} does." 
+ ) + self._hints_variants[table_name] = hints_template + else: + # if "name" is callable in the template, then the table schema requires data item to be inferred. + name_hint = hints_template.get("name") + self._table_name_hint_fun = name_hint if callable(name_hint) else None + # check if any other hints in the table template should be inferred from data. + self._table_has_other_dynamic_hints = any( + callable(v) for k, v in hints_template.items() if k != "name" + ) + self._hints = hints_template - def merge_hints(self, hints_template: TResourceHints) -> None: + def merge_hints( + self, hints_template: TResourceHints, create_table_variant: bool = False + ) -> None: self.apply_hints( table_name=hints_template.get("name"), parent_table_name=hints_template.get("parent"), @@ -303,6 +363,7 @@ def merge_hints(self, hints_template: TResourceHints) -> None: incremental=hints_template.get("incremental"), schema_contract=hints_template.get("schema_contract"), table_format=hints_template.get("table_format"), + create_table_variant=create_table_variant, ) @staticmethod @@ -324,7 +385,7 @@ def _merge_key(hint: TColumnProp, keys: TColumnNames, partial: TPartialTableSche keys = [keys] for key in keys: if key in partial["columns"]: - merge_columns(partial["columns"][key], {hint: True}) # type: ignore + merge_column(partial["columns"][key], {hint: True}) # type: ignore else: partial["columns"][key] = new_column(key, nullable=False) partial["columns"][key][hint] = True @@ -347,9 +408,7 @@ def validate_dynamic_hints(template: TResourceHints) -> None: table_name = template.get("name") # if any of the hints is a function, then name must be as well. 
if any( - callable(v) - for k, v in template.items() - if k not in ["name", "incremental", "validator", "original_columns"] + callable(v) for k, v in template.items() if k not in ["name", *NATURAL_CALLABLES] ) and not callable(table_name): raise InconsistentTableTemplate( f"Table name {table_name} must be a function if any other table hint is a function" diff --git a/dlt/extract/items.py b/dlt/extract/items.py index c6e1f0a4b8..fec31e2846 100644 --- a/dlt/extract/items.py +++ b/dlt/extract/items.py @@ -81,10 +81,7 @@ class SourcePipeItem(NamedTuple): class DataItemWithMeta: - __slots__ = "meta", "data" - - meta: Any - data: TDataItems + __slots__ = ("meta", "data") def __init__(self, meta: Any, data: TDataItems) -> None: self.meta = meta @@ -92,9 +89,7 @@ def __init__(self, meta: Any, data: TDataItems) -> None: class TableNameMeta: - __slots__ = "table_name" - - table_name: str + __slots__ = ("table_name",) def __init__(self, table_name: str) -> None: self.table_name = table_name diff --git a/dlt/extract/pipe_iterator.py b/dlt/extract/pipe_iterator.py index 145b517802..1edd9bd039 100644 --- a/dlt/extract/pipe_iterator.py +++ b/dlt/extract/pipe_iterator.py @@ -2,6 +2,7 @@ import types from typing import ( AsyncIterator, + ClassVar, Dict, Sequence, Union, @@ -16,7 +17,11 @@ from dlt.common.configuration import configspec from dlt.common.configuration.inject import with_config -from dlt.common.configuration.specs import BaseConfiguration, ContainerInjectableContext +from dlt.common.configuration.specs import ( + BaseConfiguration, + ContainerInjectableContext, + known_sections, +) from dlt.common.configuration.container import Container from dlt.common.exceptions import PipelineException from dlt.common.source import unset_current_pipe_name, set_current_pipe_name @@ -48,7 +53,7 @@ class PipeIteratorConfiguration(BaseConfiguration): copy_on_fork: bool = False next_item_mode: str = "fifo" - __section__ = "extract" + __section__: ClassVar[str] = known_sections.EXTRACT 
def __init__( self, diff --git a/dlt/extract/resource.py b/dlt/extract/resource.py index 0fef502112..4776158bbb 100644 --- a/dlt/extract/resource.py +++ b/dlt/extract/resource.py @@ -63,13 +63,17 @@ def with_table_name(item: TDataItems, table_name: str) -> DataItemWithMeta: return DataItemWithMeta(TableNameMeta(table_name), item) -def with_hints(item: TDataItems, hints: TResourceHints) -> DataItemWithMeta: +def with_hints( + item: TDataItems, hints: TResourceHints, create_table_variant: bool = False +) -> DataItemWithMeta: """Marks `item` to update the resource with specified `hints`. + Will create a separate variant of hints for a table if `name` is provided in `hints` and `create_table_variant` is set. + Create `TResourceHints` with `make_hints`. Setting `table_name` will dispatch the `item` to a specified table, like `with_table_name` """ - return DataItemWithMeta(HintsMeta(hints), item) + return DataItemWithMeta(HintsMeta(hints, create_table_variant), item) class DltResource(Iterable[TDataItem], DltResourceHints): @@ -388,25 +392,29 @@ def add_step( self._pipe.insert_step(item_transform, insert_at) return self - def set_hints(self, table_schema_template: TResourceHints) -> None: - super().set_hints(table_schema_template) - incremental = self.incremental - # try to late assign incremental - if table_schema_template.get("incremental") is not None: - if incremental: - incremental._incremental = table_schema_template["incremental"] - else: - # if there's no wrapper add incremental as a transform - incremental = table_schema_template["incremental"] # type: ignore - self.add_step(incremental) + def _set_hints( + self, table_schema_template: TResourceHints, create_table_variant: bool = False + ) -> None: + super()._set_hints(table_schema_template, create_table_variant) + # validators and incremental apply only to resource hints + if not create_table_variant: + incremental = self.incremental + # try to late assign incremental + if 
table_schema_template.get("incremental") is not None: + if incremental: + incremental._incremental = table_schema_template["incremental"] + else: + # if there's no wrapper add incremental as a transform + incremental = table_schema_template["incremental"] # type: ignore + self.add_step(incremental) - if incremental: - primary_key = table_schema_template.get("primary_key", incremental.primary_key) - if primary_key is not None: - incremental.primary_key = primary_key + if incremental: + primary_key = table_schema_template.get("primary_key", incremental.primary_key) + if primary_key is not None: + incremental.primary_key = primary_key - if table_schema_template.get("validator") is not None: - self.validator = table_schema_template["validator"] + if table_schema_template.get("validator") is not None: + self.validator = table_schema_template["validator"] def bind(self, *args: Any, **kwargs: Any) -> "DltResource": """Binds the parametrized resource to passed arguments. Modifies resource pipe in place. 
Does not evaluate generators or iterators.""" diff --git a/dlt/helpers/dbt/configuration.py b/dlt/helpers/dbt/configuration.py index 4cd3f3a0f4..70fa4d1ac5 100644 --- a/dlt/helpers/dbt/configuration.py +++ b/dlt/helpers/dbt/configuration.py @@ -19,7 +19,7 @@ class DBTRunnerConfiguration(BaseConfiguration): package_additional_vars: Optional[StrAny] = None - runtime: RunConfiguration + runtime: RunConfiguration = None def on_resolved(self) -> None: if not self.package_profiles_dir: diff --git a/dlt/helpers/dbt/runner.py b/dlt/helpers/dbt/runner.py index 388b81b2ee..7b1f79dc77 100644 --- a/dlt/helpers/dbt/runner.py +++ b/dlt/helpers/dbt/runner.py @@ -11,7 +11,7 @@ from dlt.common.runners import Venv from dlt.common.runners.stdout import iter_stdout_with_result from dlt.common.typing import StrAny, TSecretValue -from dlt.common.runtime.logger import is_json_logging +from dlt.common.logger import is_json_logging from dlt.common.storages import FileStorage from dlt.common.git import git_custom_key_command, ensure_remote_head, force_clone_repo from dlt.common.utils import with_custom_environ diff --git a/dlt/helpers/dbt_cloud/configuration.py b/dlt/helpers/dbt_cloud/configuration.py index aac94b2f4a..3c95d53431 100644 --- a/dlt/helpers/dbt_cloud/configuration.py +++ b/dlt/helpers/dbt_cloud/configuration.py @@ -9,13 +9,13 @@ class DBTCloudConfiguration(BaseConfiguration): api_token: TSecretValue = TSecretValue("") - account_id: Optional[str] - job_id: Optional[str] - project_id: Optional[str] - environment_id: Optional[str] - run_id: Optional[str] + account_id: Optional[str] = None + job_id: Optional[str] = None + project_id: Optional[str] = None + environment_id: Optional[str] = None + run_id: Optional[str] = None cause: str = "Triggered via API" - git_sha: Optional[str] - git_branch: Optional[str] - schema_override: Optional[str] + git_sha: Optional[str] = None + git_branch: Optional[str] = None + schema_override: Optional[str] = None diff --git 
a/dlt/load/configuration.py b/dlt/load/configuration.py index 0a84e3c331..97cf23fdfc 100644 --- a/dlt/load/configuration.py +++ b/dlt/load/configuration.py @@ -18,13 +18,3 @@ class LoaderConfiguration(PoolRunnerConfiguration): def on_resolved(self) -> None: self.pool_type = "none" if self.workers == 1 else "thread" - - if TYPE_CHECKING: - - def __init__( - self, - pool_type: TPoolType = "thread", - workers: int = None, - raise_on_failed_jobs: bool = False, - _load_storage_config: LoadStorageConfiguration = None, - ) -> None: ... diff --git a/dlt/load/exceptions.py b/dlt/load/exceptions.py index 8a704660ce..e85dffd2e9 100644 --- a/dlt/load/exceptions.py +++ b/dlt/load/exceptions.py @@ -1,10 +1,8 @@ from typing import Sequence -from dlt.destinations.exceptions import DestinationTerminalException, DestinationTransientException - - -# class LoadException(DltException): -# def __init__(self, msg: str) -> None: -# super().__init__(msg) +from dlt.common.destination.exceptions import ( + DestinationTerminalException, + DestinationTransientException, +) class LoadClientJobFailed(DestinationTerminalException): diff --git a/dlt/load/load.py b/dlt/load/load.py index 23c3dea820..f02a21f98e 100644 --- a/dlt/load/load.py +++ b/dlt/load/load.py @@ -15,21 +15,15 @@ SupportsPipeline, WithStepInfo, ) -from dlt.common.schema.utils import get_child_tables, get_top_level_table +from dlt.common.schema.utils import get_top_level_table from dlt.common.storages.load_storage import LoadPackageInfo, ParsedLoadJobFileName, TJobState from dlt.common.storages.load_package import LoadPackageStateInjectableContext from dlt.common.runners import TRunMetrics, Runnable, workermethod, NullExecutor from dlt.common.runtime.collector import Collector, NULL_COLLECTOR -from dlt.common.runtime.logger import pretty_format_exception -from dlt.common.exceptions import ( - TerminalValueError, - DestinationTerminalException, - DestinationTransientException, -) +from dlt.common.logger import 
pretty_format_exception +from dlt.common.exceptions import TerminalValueError from dlt.common.configuration.container import Container - -from dlt.common.schema import Schema, TSchemaTables - +from dlt.common.schema import Schema from dlt.common.storages import LoadStorage from dlt.common.destination.reference import ( DestinationClientDwhConfiguration, @@ -44,7 +38,10 @@ SupportsStagingDestination, TDestination, ) -from dlt.common.configuration.specs.config_section_context import ConfigSectionContext +from dlt.common.destination.exceptions import ( + DestinationTerminalException, + DestinationTransientException, +) from dlt.destinations.job_impl import EmptyLoadJob diff --git a/dlt/normalize/configuration.py b/dlt/normalize/configuration.py index 3949a07fa8..5676d23569 100644 --- a/dlt/normalize/configuration.py +++ b/dlt/normalize/configuration.py @@ -18,18 +18,14 @@ class ItemsNormalizerConfiguration(BaseConfiguration): add_dlt_load_id: bool = False """When true, items to be normalized will have `_dlt_load_id` column added with the current load ID.""" - if TYPE_CHECKING: - - def __init__(self, add_dlt_id: bool = None, add_dlt_load_id: bool = None) -> None: ... 
- @configspec class NormalizeConfiguration(PoolRunnerConfiguration): pool_type: TPoolType = "process" destination_capabilities: DestinationCapabilitiesContext = None # injectable - _schema_storage_config: SchemaStorageConfiguration - _normalize_storage_config: NormalizeStorageConfiguration - _load_storage_config: LoadStorageConfiguration + _schema_storage_config: SchemaStorageConfiguration = None + _normalize_storage_config: NormalizeStorageConfiguration = None + _load_storage_config: LoadStorageConfiguration = None json_normalizer: ItemsNormalizerConfiguration = ItemsNormalizerConfiguration( add_dlt_id=True, add_dlt_load_id=True @@ -41,14 +37,3 @@ class NormalizeConfiguration(PoolRunnerConfiguration): def on_resolved(self) -> None: self.pool_type = "none" if self.workers == 1 else "process" - - if TYPE_CHECKING: - - def __init__( - self, - pool_type: TPoolType = "process", - workers: int = None, - _schema_storage_config: SchemaStorageConfiguration = None, - _normalize_storage_config: NormalizeStorageConfiguration = None, - _load_storage_config: LoadStorageConfiguration = None, - ) -> None: ... diff --git a/dlt/normalize/items_normalizers.py b/dlt/normalize/items_normalizers.py index 8565a5d2b2..fc1e152ff2 100644 --- a/dlt/normalize/items_normalizers.py +++ b/dlt/normalize/items_normalizers.py @@ -199,7 +199,7 @@ def __call__( root_table_name, items, may_have_pua(line), skip_write=False ) schema_updates.append(partial_update) - logger.debug(f"Processed {line_no} lines from file {extracted_items_file}") + logger.debug(f"Processed {line_no+1} lines from file {extracted_items_file}") if line is None and root_table_name in self.schema.tables: # TODO: we should push the truncate jobs via package state # not as empty jobs. 
empty jobs should be reserved for diff --git a/dlt/normalize/normalize.py b/dlt/normalize/normalize.py index 18f8faaa25..4a17b9eef8 100644 --- a/dlt/normalize/normalize.py +++ b/dlt/normalize/normalize.py @@ -309,11 +309,16 @@ def spool_files( f"Table {table_name} has seen data for a first time with load id {load_id}" ) x_normalizer["seen-data"] = True - logger.info( - f"Saving schema {schema.name} with version {schema.stored_version}:{schema.version}" - ) # schema is updated, save it to schema volume - self.schema_storage.save_schema(schema) + if schema.is_modified: + logger.info( + f"Saving schema {schema.name} with version {schema.stored_version}:{schema.version}" + ) + self.schema_storage.save_schema(schema) + else: + logger.info( + f"Schema {schema.name} with version {schema.version} was not modified. Save skipped" + ) # save schema new package self.load_storage.new_packages.save_schema(load_id, schema) # save schema updates even if empty @@ -376,8 +381,9 @@ def run(self, pool: Optional[Executor]) -> TRunMetrics: schema = self.normalize_storage.extracted_packages.load_schema(load_id) # prefer schema from schema storage if it exists try: - # also import the schema - storage_schema = self.schema_storage.load_schema(schema.name) + # use live schema instance via getter if on live storage, it will also do import + # schema as live schemas are committed before calling normalize + storage_schema = self.schema_storage[schema.name] if schema.stored_version_hash != storage_schema.stored_version_hash: logger.warning( f"When normalizing package {load_id} with schema {schema.name}: the storage" diff --git a/dlt/pipeline/configuration.py b/dlt/pipeline/configuration.py index 7aa54541c0..d7ffca6e89 100644 --- a/dlt/pipeline/configuration.py +++ b/dlt/pipeline/configuration.py @@ -27,7 +27,7 @@ class PipelineConfiguration(BaseConfiguration): full_refresh: bool = False """When set to True, each instance of the pipeline with the `pipeline_name` starts from scratch when run and 
loads the data to a separate dataset.""" progress: Optional[str] = None - runtime: RunConfiguration + runtime: RunConfiguration = None def on_resolved(self) -> None: if not self.pipeline_name: diff --git a/dlt/pipeline/helpers.py b/dlt/pipeline/helpers.py index 7bba5f84e7..c242a26eaa 100644 --- a/dlt/pipeline/helpers.py +++ b/dlt/pipeline/helpers.py @@ -12,7 +12,6 @@ from dlt.common.schema.typing import TSimpleRegex from dlt.common.typing import REPattern from dlt.common.pipeline import ( - TSourceState, reset_resource_state, _sources_state, _delete_source_state_keys, @@ -26,6 +25,7 @@ PipelineStepFailed, PipelineHasPendingDataException, ) +from dlt.pipeline.state_sync import force_state_extract from dlt.pipeline.typing import TPipelineStep from dlt.pipeline import Pipeline @@ -122,7 +122,7 @@ def __init__( else: self.tables_to_drop = [] self.drop_tables = False # No tables to drop - self.drop_state = not not self.state_paths_to_drop + self.drop_state = not not self.state_paths_to_drop # obtain truth value self.drop_all = drop_all self.info: _DropInfo = dict( @@ -167,10 +167,11 @@ def _drop_destination_tables(self) -> None: with client.with_staging_dataset(): client.drop_tables(*table_names, replace_schema=True) - def _delete_pipeline_tables(self) -> None: + def _delete_schema_tables(self) -> None: for tbl in self.tables_to_drop: del self.schema_tables[tbl["name"]] - self.schema.bump_version() + # bump schema, we'll save later + self.schema._bump_version() def _list_state_paths(self, source_state: Dict[str, Any]) -> List[str]: return resolve_paths(self.state_paths_to_drop, source_state) @@ -197,7 +198,7 @@ def _create_modified_state(self) -> Dict[str, Any]: self.info["state_paths"].extend(f"{source_name}.{p}" for p in resolved_paths) return state # type: ignore[return-value] - def _drop_state_keys(self) -> None: + def _extract_state(self) -> None: state: Dict[str, Any] with self.pipeline.managed_state(extract_state=True) as state: # type: ignore[assignment] 
state.clear() @@ -216,12 +217,12 @@ def __call__(self) -> None: return # Nothing to drop if self.drop_tables: - self._delete_pipeline_tables() + self._delete_schema_tables() self._drop_destination_tables() if self.drop_tables: self.pipeline.schemas.save_schema(self.schema) if self.drop_state: - self._drop_state_keys() + self._extract_state() # Send updated state to destination self.pipeline.normalize() try: @@ -230,8 +231,7 @@ def __call__(self) -> None: # Clear extracted state on failure so command can run again self.pipeline.drop_pending_packages() with self.pipeline.managed_state() as state: - state["_local"].pop("_last_extracted_at", None) - state["_local"].pop("_last_extracted_hash", None) + force_state_extract(state) raise diff --git a/dlt/pipeline/pipeline.py b/dlt/pipeline/pipeline.py index d1d558b3b8..de1f7afced 100644 --- a/dlt/pipeline/pipeline.py +++ b/dlt/pipeline/pipeline.py @@ -12,6 +12,7 @@ Optional, Sequence, Tuple, + Type, cast, get_type_hints, ContextManager, @@ -28,14 +29,14 @@ ) from dlt.common.configuration.specs.config_section_context import ConfigSectionContext from dlt.common.configuration.resolve import initialize_credentials -from dlt.common.exceptions import ( +from dlt.common.destination.exceptions import ( DestinationLoadingViaStagingNotSupported, DestinationLoadingWithoutStagingNotSupported, DestinationNoStagingMode, - MissingDependencyException, DestinationUndefinedEntity, DestinationIncompatibleLoaderFileFormatException, ) +from dlt.common.exceptions import MissingDependencyException from dlt.common.normalizers import explicit_normalizers, import_normalizers from dlt.common.runtime import signals, initialize_runtime from dlt.common.schema.typing import ( @@ -129,6 +130,7 @@ PIPELINE_STATE_ENGINE_VERSION, bump_pipeline_state_version_if_modified, load_pipeline_state_from_destination, + mark_state_extracted, migrate_pipeline_state, state_resource, json_encode_state, @@ -172,7 +174,7 @@ def _wrap(self: "Pipeline", *args: Any, **kwargs: 
Any) -> Any: for name in list(self._schema_storage.live_schemas.keys()): try: schema = self._schema_storage.load_schema(name) - self._schema_storage.update_live_schema(schema, can_create_new=False) + schema.replace_schema_content(schema, link_to_replaced_schema=False) except FileNotFoundError: # no storage schema yet so pop live schema (created in call to f) self._schema_storage.live_schemas.pop(name, None) @@ -182,9 +184,10 @@ def _wrap(self: "Pipeline", *args: Any, **kwargs: Any) -> Any: else: # save modified live schemas for name, schema in self._schema_storage.live_schemas.items(): - self._schema_storage.commit_live_schema(name) # also save import schemas only here self._schema_storage.save_import_schema_if_not_exists(schema) + # only now save the schema, already linked to itself if saved as import schema + self._schema_storage.commit_live_schema(name) # refresh list of schemas if any new schemas are added self.schema_names = self._list_schemas_sorted() return rv @@ -488,7 +491,6 @@ def normalize( ) from n_ex @with_runtime_trace(send_state=True) - @with_schemas_sync @with_state_sync() @with_config_section((known_sections.LOAD,)) def load( @@ -725,8 +727,7 @@ def sync_destination( # set the pipeline props from merged state self._state_to_props(state) # add that the state is already extracted - state["_local"]["_last_extracted_hash"] = state["_version_hash"] - state["_local"]["_last_extracted_at"] = pendulum.now() + mark_state_extracted(state, state["_version_hash"]) # on merge schemas are replaced so we delete all old versions self._schema_storage.clear_storage() for schema in restored_schemas: @@ -1054,15 +1055,11 @@ def _extract_source( # discover the existing pipeline schema try: # all live schemas are initially committed and during the extract will accumulate changes in memory - # if schema is committed try to take schema from storage - if self._schema_storage.is_live_schema_committed(source.schema.name): - # this will (1) save live schema if modified (2) 
look for import schema if present - # (3) load import schema an overwrite pipeline schema if import schema modified - # (4) load pipeline schema if no import schema is present - pipeline_schema = self.schemas.load_schema(source.schema.name) - else: - # if schema is not committed we know we are in process of extraction - pipeline_schema = self.schemas[source.schema.name] + # line below may create another live schema if source schema is not a part of storage + # this will (1) look for import schema if present + # (2) load import schema an overwrite pipeline schema if import schema modified + # (3) load pipeline schema if no import schema is present + pipeline_schema = self.schemas[source.schema.name] pipeline_schema = pipeline_schema.clone() # use clone until extraction complete # apply all changes in the source schema to pipeline schema # NOTE: we do not apply contracts to changes done programmatically @@ -1080,7 +1077,7 @@ def _extract_source( # self._schema_storage.save_import_schema_if_not_exists(source.schema) # update live schema but not update the store yet - self._schema_storage.update_live_schema(source.schema) + source.schema = self._schema_storage.set_live_schema(source.schema) # set as default if this is first schema in pipeline if not self.default_schema_name: @@ -1126,17 +1123,16 @@ def _get_destination_client_initial_config( ) if issubclass(client_spec, DestinationClientStagingConfiguration): - return client_spec( - dataset_name=self.dataset_name, - default_schema_name=default_schema_name, + spec: DestinationClientDwhConfiguration = client_spec( credentials=credentials, as_staging=as_staging, ) - return client_spec( - dataset_name=self.dataset_name, - default_schema_name=default_schema_name, - credentials=credentials, - ) + else: + spec = client_spec( + credentials=credentials, + ) + spec._bind_dataset_name(self.dataset_name, default_schema_name) + return spec return client_spec(credentials=credentials) @@ -1560,9 +1556,11 @@ def 
_bump_version_and_extract_state( extract_ = extract or Extract( self._schema_storage, self._normalize_storage_config(), original_data=data ) - self._extract_source(extract_, data_to_sources(data, self)[0], 1, 1) - state["_local"]["_last_extracted_at"] = pendulum.now() - state["_local"]["_last_extracted_hash"] = hash_ + self._extract_source( + extract_, data_to_sources(data, self, self.default_schema)[0], 1, 1 + ) + # set state to be extracted + mark_state_extracted(state, hash_) # commit only if we created storage if not extract: extract_.commit_packages() diff --git a/dlt/pipeline/platform.py b/dlt/pipeline/platform.py index c8014d5ae7..0955e91b51 100644 --- a/dlt/pipeline/platform.py +++ b/dlt/pipeline/platform.py @@ -6,7 +6,7 @@ from dlt.pipeline.trace import PipelineTrace, PipelineStepTrace, TPipelineStep, SupportsPipeline from dlt.common import json -from dlt.common.runtime import logger +from dlt.common import logger from dlt.common.pipeline import LoadInfo from dlt.common.schema.typing import TStoredSchema diff --git a/dlt/pipeline/state_sync.py b/dlt/pipeline/state_sync.py index 8c72a218a4..5366b9c46d 100644 --- a/dlt/pipeline/state_sync.py +++ b/dlt/pipeline/state_sync.py @@ -68,6 +68,21 @@ def bump_pipeline_state_version_if_modified(state: TPipelineState) -> Tuple[int, return bump_state_version_if_modified(state, exclude_attrs=["_local"]) +def mark_state_extracted(state: TPipelineState, hash_: str) -> None: + """Marks state as extracted by setting last extracted hash to hash_ (which is current version_hash) + + `_last_extracted_hash` is kept locally and never synced with the destination + """ + state["_local"]["_last_extracted_at"] = pendulum.now() + state["_local"]["_last_extracted_hash"] = hash_ + + +def force_state_extract(state: TPipelineState) -> None: + """Forces `state` to be extracted by removing local information on the most recent extraction""" + state["_local"].pop("_last_extracted_at", None) + state["_local"].pop("_last_extracted_hash", None) 
+ + def migrate_pipeline_state( pipeline_name: str, state: DictStrAny, from_engine: int, to_engine: int ) -> TPipelineState: diff --git a/dlt/pipeline/trace.py b/dlt/pipeline/trace.py index 5679884b0b..b610d1751f 100644 --- a/dlt/pipeline/trace.py +++ b/dlt/pipeline/trace.py @@ -14,7 +14,7 @@ from dlt.common.configuration.utils import _RESOLVED_TRACES from dlt.common.configuration.container import Container from dlt.common.exceptions import ExceptionTrace, ResourceNameNotAvailable -from dlt.common.runtime.logger import suppress_and_warn +from dlt.common.logger import suppress_and_warn from dlt.common.runtime.exec_info import TExecutionContext, get_execution_context from dlt.common.pipeline import ( ExtractInfo, diff --git a/dlt/sources/helpers/rest_client/__init__.py b/dlt/sources/helpers/rest_client/__init__.py new file mode 100644 index 0000000000..b2fb0a2351 --- /dev/null +++ b/dlt/sources/helpers/rest_client/__init__.py @@ -0,0 +1,46 @@ +from typing import Optional, Dict, Iterator, Union, Any + +from dlt.common import jsonpath + +from .client import RESTClient # noqa: F401 +from .client import PageData +from .auth import AuthConfigBase +from .paginators import BasePaginator +from .typing import HTTPMethodBasic, Hooks + + +def paginate( + url: str, + method: HTTPMethodBasic = "GET", + headers: Optional[Dict[str, str]] = None, + params: Optional[Dict[str, Any]] = None, + json: Optional[Dict[str, Any]] = None, + auth: AuthConfigBase = None, + paginator: Optional[BasePaginator] = None, + data_selector: Optional[jsonpath.TJsonPath] = None, + hooks: Optional[Hooks] = None, +) -> Iterator[PageData[Any]]: + """ + Paginate over a REST API endpoint. + + Args: + url: URL to paginate over. + **kwargs: Keyword arguments to pass to `RESTClient.paginate`. + + Returns: + Iterator[Page]: Iterator over pages. 
+ """ + client = RESTClient( + base_url=url, + headers=headers, + ) + return client.paginate( + path="", + method=method, + params=params, + json=json, + auth=auth, + paginator=paginator, + data_selector=data_selector, + hooks=hooks, + ) diff --git a/dlt/sources/helpers/rest_client/auth.py b/dlt/sources/helpers/rest_client/auth.py new file mode 100644 index 0000000000..99421e2c60 --- /dev/null +++ b/dlt/sources/helpers/rest_client/auth.py @@ -0,0 +1,210 @@ +from base64 import b64encode +import math +from typing import ( + List, + Dict, + Final, + Literal, + Optional, + Union, + Any, + cast, + Iterable, + TYPE_CHECKING, +) +from dlt.sources.helpers import requests +from requests.auth import AuthBase +from requests import PreparedRequest # noqa: I251 +import pendulum + +from dlt.common.exceptions import MissingDependencyException + +from dlt.common import logger +from dlt.common.configuration.specs.base_configuration import configspec +from dlt.common.configuration.specs import CredentialsConfiguration +from dlt.common.configuration.specs.exceptions import NativeValueError +from dlt.common.typing import TSecretStrValue + +if TYPE_CHECKING: + from cryptography.hazmat.primitives.asymmetric.types import PrivateKeyTypes +else: + PrivateKeyTypes = Any + +TApiKeyLocation = Literal["header", "cookie", "query", "param"] # Alias for scheme "in" field + + +class AuthConfigBase(AuthBase, CredentialsConfiguration): + """Authenticator base which is both `requests` friendly AuthBase and dlt SPEC + configurable via env variables or toml files + """ + + pass + + +@configspec +class BearerTokenAuth(AuthConfigBase): + token: TSecretStrValue = None + + def parse_native_representation(self, value: Any) -> None: + if isinstance(value, str): + self.token = cast(TSecretStrValue, value) + else: + raise NativeValueError( + type(self), + value, + f"BearerTokenAuth token must be a string, got {type(value)}", + ) + + def __call__(self, request: PreparedRequest) -> PreparedRequest: + 
request.headers["Authorization"] = f"Bearer {self.token}" + return request + + +@configspec +class APIKeyAuth(AuthConfigBase): + name: str = "Authorization" + api_key: TSecretStrValue = None + location: TApiKeyLocation = "header" + + def parse_native_representation(self, value: Any) -> None: + if isinstance(value, str): + self.api_key = cast(TSecretStrValue, value) + else: + raise NativeValueError( + type(self), + value, + f"APIKeyAuth api_key must be a string, got {type(value)}", + ) + + def __call__(self, request: PreparedRequest) -> PreparedRequest: + if self.location == "header": + request.headers[self.name] = self.api_key + elif self.location in ["query", "param"]: + request.prepare_url(request.url, {self.name: self.api_key}) + elif self.location == "cookie": + raise NotImplementedError() + return request + + +@configspec +class HttpBasicAuth(AuthConfigBase): + username: str = None + password: TSecretStrValue = None + + def parse_native_representation(self, value: Any) -> None: + if isinstance(value, Iterable) and not isinstance(value, str): + value = list(value) + if len(value) == 2: + self.username, self.password = value + return + raise NativeValueError( + type(self), + value, + "HttpBasicAuth username and password must be a tuple of two strings, got" + f" {type(value)}", + ) + + def __call__(self, request: PreparedRequest) -> PreparedRequest: + encoded = b64encode(f"{self.username}:{self.password}".encode()).decode() + request.headers["Authorization"] = f"Basic {encoded}" + return request + + +@configspec +class OAuth2AuthBase(AuthConfigBase): + """Base class for oauth2 authenticators. 
@configspec
class OAuth2AuthBase(AuthConfigBase):
    """Base class for oauth2 authenticators. requires access_token"""

    # TODO: Separate class for flows (implicit, authorization_code, client_credentials, etc)
    access_token: TSecretStrValue = None

    def parse_native_representation(self, value: Any) -> None:
        """Accept a plain string as the access token; any other native value is rejected."""
        if isinstance(value, str):
            self.access_token = cast(TSecretStrValue, value)
        else:
            raise NativeValueError(
                type(self),
                value,
                f"OAuth2AuthBase access_token must be a string, got {type(value)}",
            )

    def __call__(self, request: PreparedRequest) -> PreparedRequest:
        """Set the `Authorization: Bearer` header from `access_token`."""
        request.headers["Authorization"] = f"Bearer {self.access_token}"
        return request


@configspec
class OAuthJWTAuth(BearerTokenAuth):
    """This is a form of Bearer auth, actually there's not standard way to declare it in openAPI

    A JWT assertion signed with `private_key` (RS256) is exchanged at `auth_endpoint`
    for an access token. The token is cached and obtained again once it expires.
    """

    format: Final[Literal["JWT"]] = "JWT"  # noqa: A003
    client_id: str = None
    private_key: TSecretStrValue = None
    auth_endpoint: str = None
    scopes: Optional[Union[str, List[str]]] = None
    headers: Optional[Dict[str, str]] = None
    private_key_passphrase: Optional[TSecretStrValue] = None
    default_token_expiration: int = 3600

    def __post_init__(self) -> None:
        # `scopes` is optional: only collapse an actual collection into the space
        # separated form. Joining the `None` default would raise TypeError.
        if self.scopes is not None and not isinstance(self.scopes, str):
            self.scopes = " ".join(self.scopes)
        # the token is always obtained from `auth_endpoint`, never taken from config
        self.token = None
        self.token_expiry: Optional[pendulum.DateTime] = None

    def __call__(self, r: PreparedRequest) -> PreparedRequest:
        """Obtain (or refresh) the token if needed and set the `Authorization` header."""
        if self.token is None or self.is_token_expired():
            self.obtain_token()
        r.headers["Authorization"] = f"Bearer {self.token}"
        return r

    def is_token_expired(self) -> bool:
        """Return True when no expiry is known or the expiry time has been reached."""
        return not self.token_expiry or pendulum.now() >= self.token_expiry

    def obtain_token(self) -> None:
        """Exchange a signed JWT assertion for an access token and remember its expiry.

        Raises:
            MissingDependencyException: If `PyJWT` is not installed.
            HTTPError: If the token endpoint answers with an error status
                (raised by `response.raise_for_status()`).
        """
        try:
            import jwt
        except ModuleNotFoundError:
            raise MissingDependencyException("dlt OAuth helpers", ["PyJWT"])

        payload = self.create_jwt_payload()
        data = {
            "grant_type": "urn:ietf:params:oauth:grant-type:jwt-bearer",
            "assertion": jwt.encode(payload, self.load_private_key(), algorithm="RS256"),
        }

        logger.debug(f"Obtaining token from {self.auth_endpoint}")

        response = requests.post(self.auth_endpoint, headers=self.headers, data=data)
        response.raise_for_status()

        token_response = response.json()
        self.token = token_response["access_token"]
        # fall back to the configured lifetime when the server does not report one
        self.token_expiry = pendulum.now().add(
            seconds=token_response.get("expires_in", self.default_token_expiration)
        )

    def create_jwt_payload(self) -> Dict[str, Union[str, int]]:
        """Build the claims of the JWT assertion; the assertion itself is valid for one hour."""
        now = pendulum.now()
        payload: Dict[str, Union[str, int]] = {
            "iss": self.client_id,
            "sub": self.client_id,
            "aud": self.auth_endpoint,
            "exp": math.floor((now.add(hours=1)).timestamp()),
            "iat": math.floor(now.timestamp()),
        }
        # do not send a null scope claim when no scopes were configured
        if self.scopes is not None:
            payload["scope"] = cast(str, self.scopes)
        return payload

    def load_private_key(self) -> "PrivateKeyTypes":
        """Load the PEM encoded private key, decrypting it with the passphrase if one is set.

        Raises:
            MissingDependencyException: If `cryptography` is not installed.
        """
        try:
            from cryptography.hazmat.backends import default_backend
            from cryptography.hazmat.primitives import serialization
        except ModuleNotFoundError:
            raise MissingDependencyException("dlt OAuth helpers", ["cryptography"])

        private_key_bytes = self.private_key.encode("utf-8")
        return serialization.load_pem_private_key(
            private_key_bytes,
            password=(
                self.private_key_passphrase.encode("utf-8") if self.private_key_passphrase else None
            ),
            backend=default_backend(),
        )
class PageData(List[_T]):
    """A list of elements in a single page of results with attached request context.

    The context allows to inspect the response, paginator and authenticator, modify the request
    """

    def __init__(
        self,
        __iterable: Iterable[_T],
        request: Request,
        response: Response,
        paginator: BasePaginator,
        auth: AuthConfigBase,
    ):
        super().__init__(__iterable)
        self.request = request
        self.response = response
        self.paginator = paginator
        self.auth = auth


class RESTClient:
    """A generic REST client for making requests to an API with support for
    pagination and authentication.

    Args:
        base_url (str): The base URL of the API to make requests to.
        headers (Optional[Dict[str, str]]): Default headers to include in all requests.
        auth (Optional[AuthConfigBase]): Authentication configuration for all requests.
        paginator (Optional[BasePaginator]): Default paginator for handling paginated responses.
        data_selector (Optional[jsonpath.TJsonPath]): JSONPath selector for extracting data from responses.
        session (BaseSession): HTTP session for making requests.
        paginator_factory (Optional[PaginatorFactory]): Factory for creating paginator instances,
            used for detecting paginators.
    """

    def __init__(
        self,
        base_url: str,
        headers: Optional[Dict[str, str]] = None,
        auth: Optional[AuthConfigBase] = None,
        paginator: Optional[BasePaginator] = None,
        data_selector: Optional[jsonpath.TJsonPath] = None,
        session: BaseSession = None,
        paginator_factory: Optional[PaginatorFactory] = None,
    ) -> None:
        self.base_url = base_url
        self.headers = headers
        self.auth = auth

        if session:
            self._validate_session_raise_for_status(session)
            self.session = session
        else:
            # status codes are handled via response hooks, not by the session itself
            self.session = Client(raise_for_status=False).session

        self.paginator = paginator
        self.pagination_factory = paginator_factory or PaginatorFactory()

        self.data_selector = data_selector

    def _validate_session_raise_for_status(self, session: BaseSession) -> None:
        """Warn when the session passed by the user raises on error status codes itself."""
        # dlt.sources.helpers.requests.session.Session
        # has raise_for_status=True by default
        # NOTE: inspect the `session` argument; `self.session` is not assigned yet
        # at the point this method is called from `__init__`.
        if getattr(session, "raise_for_status", False):
            logger.warning(
                "The session provided has raise_for_status enabled. "
                "This may cause unexpected behavior."
            )

    def _create_request(
        self,
        path: str,
        method: HTTPMethod,
        params: Optional[Dict[str, Any]] = None,
        json: Optional[Dict[str, Any]] = None,
        auth: Optional[AuthConfigBase] = None,
        hooks: Optional[Hooks] = None,
    ) -> Request:
        """Build a `Request` for `path`.

        An absolute http(s) `path` is used as is, anything else is joined with `base_url`.
        `params` is optional so that callers such as `post` may omit it.
        """
        parsed_url = urlparse(path)
        if parsed_url.scheme in ("http", "https"):
            url = path
        else:
            url = join_url(self.base_url, path)

        return Request(
            method=method,
            url=url,
            headers=self.headers,
            params=params,
            json=json,
            auth=auth or self.auth,
            hooks=hooks,
        )

    def _send_request(self, request: Request) -> Response:
        """Prepare `request` with the session and send it."""
        logger.info(
            f"Making {request.method.upper()} request to {request.url}"
            f" with params={request.params}, json={request.json}"
        )

        prepared_request = self.session.prepare_request(request)

        return self.session.send(prepared_request)

    def request(self, path: str = "", method: HTTPMethod = "GET", **kwargs: Any) -> Response:
        """Send a single request; `kwargs` are passed to `_create_request`."""
        prepared_request = self._create_request(
            path=path,
            method=method,
            **kwargs,
        )
        return self._send_request(prepared_request)

    def get(self, path: str, params: Optional[Dict[str, Any]] = None, **kwargs: Any) -> Response:
        """Send a single GET request."""
        return self.request(path, method="GET", params=params, **kwargs)

    def post(self, path: str, json: Optional[Dict[str, Any]] = None, **kwargs: Any) -> Response:
        """Send a single POST request."""
        return self.request(path, method="POST", json=json, **kwargs)

    def paginate(
        self,
        path: str = "",
        method: HTTPMethodBasic = "GET",
        params: Optional[Dict[str, Any]] = None,
        json: Optional[Dict[str, Any]] = None,
        auth: Optional[AuthConfigBase] = None,
        paginator: Optional[BasePaginator] = None,
        data_selector: Optional[jsonpath.TJsonPath] = None,
        hooks: Optional[Hooks] = None,
    ) -> Iterator[PageData[Any]]:
        """Iterates over paginated API responses, yielding pages of data.

        Args:
            path (str): Endpoint path for the request, relative to `base_url`.
            method (HTTPMethodBasic): HTTP method for the request, defaults to 'GET'.
            params (Optional[Dict[str, Any]]): URL parameters for the request.
            json (Optional[Dict[str, Any]]): JSON payload for the request.
            auth (Optional[AuthConfigBase]): Authentication configuration for the request.
            paginator (Optional[BasePaginator]): Paginator instance for handling
                pagination logic.
            data_selector (Optional[jsonpath.TJsonPath]): JSONPath selector for
                extracting data from the response.
            hooks (Optional[Hooks]): Hooks to modify request/response objects. Note that
                when hooks are not provided, the default behavior is to raise an exception
                on error status codes.

        Yields:
            PageData[Any]: A page of data from the paginated API response, along with request and response context.

        Raises:
            HTTPError: If the response status code is not a success code. This is raised
                by default when hooks are not provided.

        Example:
            >>> client = RESTClient(base_url="https://api.example.com")
            >>> for page in client.paginate("/search", method="POST", json={"query": "foo"}):
            >>>     print(page)
        """

        # work on a private copy of the default paginator so its state is not shared
        paginator = paginator if paginator else copy.deepcopy(self.paginator)
        auth = auth or self.auth
        data_selector = data_selector or self.data_selector
        # shallow copy: the default hook below must not leak into the caller's dict
        hooks = dict(hooks) if hooks else {}

        def raise_for_status(response: Response, *args: Any, **kwargs: Any) -> None:
            response.raise_for_status()

        if "response" not in hooks:
            hooks["response"] = [raise_for_status]

        request = self._create_request(
            path=path, method=method, params=params, json=json, auth=auth, hooks=hooks
        )

        while True:
            try:
                response = self._send_request(request)
            except IgnoreResponseException:
                # a response hook asked to ignore this response: stop paginating
                break

            if paginator is None:
                paginator = self.detect_paginator(response)

            data = self.extract_response(response, data_selector)
            paginator.update_state(response)
            paginator.update_request(request)

            # yield data with context
            yield PageData(data, request=request, response=response, paginator=paginator, auth=auth)

            if not paginator.has_next_page:
                break

    def extract_response(self, response: Response, data_selector: jsonpath.TJsonPath) -> List[Any]:
        """Extract the page records from the JSON body of `response`.

        With `data_selector` the selected values are used (a single selected item is
        unwrapped), otherwise the records are searched for with `find_records`.
        The result is always a list.
        """
        if data_selector:
            # we should compile data_selector
            data: Any = jsonpath.find_values(data_selector, response.json())
            # extract if single item selected
            data = data[0] if isinstance(data, list) and len(data) == 1 else data
        else:
            data = find_records(response.json())
        # wrap single pages into lists
        if not isinstance(data, list):
            data = [data]
        return cast(List[Any], data)

    def detect_paginator(self, response: Response) -> BasePaginator:
        """Detects a paginator for the response and returns it.

        Args:
            response (Response): The response to detect the paginator for.

        Returns:
            BasePaginator: The paginator instance that was detected.

        Raises:
            ValueError: If the paginator factory finds no paginator for the response.
        """
        paginator = self.pagination_factory.create_paginator(response)
        if paginator is None:
            raise ValueError(f"No suitable paginator found for the response at {response.url}")
        logger.info(f"Detected paginator: {paginator.__class__.__name__}")
        return paginator
RECORD_KEY_PATTERNS = frozenset(
    [
        "data",
        "items",
        "results",
        "entries",
        "records",
        "rows",
        "entities",
        "payload",
        "content",
        "objects",
    ]
)

NON_RECORD_KEY_PATTERNS = frozenset(
    [
        "meta",
        "metadata",
        "pagination",
        "links",
        "extras",
        "headers",
    ]
)

NEXT_PAGE_KEY_PATTERNS = frozenset(["next", "nextpage", "nexturl"])
NEXT_PAGE_DICT_KEY_PATTERNS = frozenset(["href", "url"])


def single_entity_path(path: str) -> bool:
    """Checks if path ends with path param indicating that single object is returned"""
    return re.search(r"\{([a-zA-Z_][a-zA-Z0-9_]*)\}$", path) is not None


def find_all_lists(
    dict_: Dict[str, Any],
    result: Optional[List[Tuple[int, str, List[Any]]]] = None,
    level: int = 0,
) -> List[Tuple[int, str, List[Any]]]:
    """Recursively looks for lists in dict_ and returns tuples
    in format (nesting level, dictionary key, list)

    Args:
        dict_: Dictionary to search.
        result: Accumulator the found tuples are appended to. A new list is created
            when it is not provided.
        level: Nesting level of `dict_`. Dictionaries nested deeper than level 2 are
            not searched.

    Returns:
        The accumulator, in depth-first (not nesting level) order.
    """
    if result is None:
        result = []
    if level > 2:
        return result

    for key, value in dict_.items():
        if isinstance(value, list):
            result.append((level, key, value))
        elif isinstance(value, dict):
            find_all_lists(value, result, level + 1)

    return result


def find_records(
    response: Union[Dict[str, Any], List[Any], Any],
) -> Union[Dict[str, Any], List[Any], Any]:
    """Finds the list that most likely holds the records in a decoded JSON response.

    Non-dict responses and dicts without any list are returned unchanged. Otherwise
    the least nested list stored under a well known record key is returned, falling
    back to the least nested list of all.
    """
    # when a list was returned (or in rare case a simple type or null)
    if not isinstance(response, dict):
        return response
    lists = find_all_lists(response, result=[])
    if len(lists) == 0:
        # could not detect anything
        return response
    # find_all_lists walks depth-first, so order by nesting level explicitly;
    # the sort is stable so keys on the same level keep their document order
    lists.sort(key=lambda list_info: list_info[0])
    # find the most suitable list
    try:
        return next(
            list_info[2]
            for list_info in lists
            if list_info[1] in RECORD_KEY_PATTERNS and list_info[1] not in NON_RECORD_KEY_PATTERNS
        )
    except StopIteration:
        # return the least nested element
        return lists[0][2]


def matches_any_pattern(key: str, patterns: Iterable[str]) -> bool:
    """Checks if the lowercased `key` contains any of `patterns` as a substring."""
    normalized_key = key.lower()
    return any(pattern in normalized_key for pattern in patterns)


def find_next_page_path(
    dictionary: Dict[str, Any], path: Optional[List[str]] = None
) -> Optional[List[str]]:
    """Searches `dictionary` depth-first for a key that looks like a next page link.

    Returns:
        The list of keys leading to the next page value, or None when nothing matches
        (or when `dictionary` is not a dict). When the matching value is itself a dict,
        the path is extended with its first key that looks like a url.
    """
    if not isinstance(dictionary, dict):
        return None

    if path is None:
        path = []

    for key, value in dictionary.items():
        if matches_any_pattern(key, NEXT_PAGE_KEY_PATTERNS):
            if isinstance(value, dict):
                for dict_key in value:
                    if matches_any_pattern(dict_key, NEXT_PAGE_DICT_KEY_PATTERNS):
                        return [*path, key, dict_key]
            return [*path, key]

        if isinstance(value, dict):
            result = find_next_page_path(value, [*path, key])
            if result:
                return result

    return None
def header_links_detector(response: Response) -> Optional[HeaderLinkPaginator]:
    """Detects pagination via a `next` relation in the response `Link` header."""
    links_next_key = "next"

    if response.links.get(links_next_key):
        return HeaderLinkPaginator()
    return None


def json_links_detector(response: Response) -> Optional[JSONResponsePaginator]:
    """Detects pagination via a next page link found in the JSON body of the response."""
    dictionary = response.json()
    next_path_parts = find_next_page_path(dictionary)

    if not next_path_parts:
        return None

    return JSONResponsePaginator(next_url_path=".".join(next_path_parts))


def single_page_detector(response: Response) -> Optional[SinglePagePaginator]:
    """This is our fallback paginator, also for results that are single entities"""
    return SinglePagePaginator()


class PaginatorFactory:
    """Creates a paginator for a response by trying a list of detectors in order."""

    def __init__(
        self, detectors: Optional[List[Callable[[Response], Optional[BasePaginator]]]] = None
    ):
        """
        Args:
            detectors: Detector functions to try, in order. Each returns a paginator or
                None. Defaults to header links, JSON links and the single page fallback.
        """
        if detectors is None:
            detectors = [
                header_links_detector,
                json_links_detector,
                single_page_detector,
            ]
        self.detectors = detectors

    def create_paginator(self, response: Response) -> Optional[BasePaginator]:
        """Returns the paginator of the first detector that produces one, or None."""
        for detector in self.detectors:
            paginator = detector(response)
            # compare with None: a paginator object must not be skipped just because
            # it happens to evaluate as falsy
            if paginator is not None:
                return paginator
        return None
class BasePaginator(ABC):
    """Base class of all paginators.

    A paginator keeps the pagination state: `update_state` reads it from a response
    and `update_request` applies it to the request that fetches the next page.
    """

    def __init__(self) -> None:
        self._has_next_page = True
        self._next_reference: Optional[str] = None

    @property
    def has_next_page(self) -> bool:
        """
        Check if there is a next page available.

        Returns:
            bool: True if there is a next page available, False otherwise.
        """
        return self._has_next_page

    @property
    def next_reference(self) -> Optional[str]:
        """Reference (for example a url or a cursor) to the next page, None if there is none."""
        return self._next_reference

    @next_reference.setter
    def next_reference(self, value: Optional[str]) -> None:
        # setting the reference also decides if there is a next page
        self._next_reference = value
        self._has_next_page = value is not None

    @abstractmethod
    def update_state(self, response: Response) -> None:
        """Update the paginator state based on the response.

        Args:
            response (Response): The response object from the API.
        """
        ...

    @abstractmethod
    def update_request(self, request: Request) -> None:
        """
        Update the request object with the next arguments for the API request.

        Args:
            request (Request): The request object to be updated.
        """
        ...


class SinglePagePaginator(BasePaginator):
    """A paginator for single-page API responses."""

    def update_state(self, response: Response) -> None:
        self._has_next_page = False

    def update_request(self, request: Request) -> None:
        return


class OffsetPaginator(BasePaginator):
    """A paginator that uses the 'offset' parameter for pagination."""

    def __init__(
        self,
        initial_limit: int,
        initial_offset: int = 0,
        offset_param: str = "offset",
        limit_param: str = "limit",
        total_path: jsonpath.TJsonPath = "total",
    ) -> None:
        """
        Args:
            initial_limit: Page size sent in `limit_param`.
            initial_offset: Offset the pagination starts from.
            offset_param: Name of the query parameter carrying the offset.
            limit_param: Name of the query parameter carrying the limit.
            total_path: JSON path to the total number of records in the response.
        """
        super().__init__()
        self.offset_param = offset_param
        self.limit_param = limit_param
        self.total_path = jsonpath.compile_path(total_path)

        self.offset = initial_offset
        self.limit = initial_limit

    def update_state(self, response: Response) -> None:
        """Advance the offset by one page and stop when the total is reached.

        Raises:
            ValueError: If the total count is missing in the response or is not a number.
        """
        values = jsonpath.find_values(self.total_path, response.json())
        total = values[0] if values else None

        if total is None:
            raise ValueError(f"Total count not found in response for {self.__class__.__name__}")

        # some APIs return the total as a string; comparing it with an int would fail
        try:
            total = int(total)
        except (TypeError, ValueError) as exc:
            raise ValueError(
                f"Total count is not an integer in response for {self.__class__.__name__}."
                f" Expected an integer, got {total!r}"
            ) from exc

        self.offset += self.limit

        if self.offset >= total:
            self._has_next_page = False

    def update_request(self, request: Request) -> None:
        if request.params is None:
            request.params = {}

        request.params[self.offset_param] = self.offset
        request.params[self.limit_param] = self.limit


class BaseNextUrlPaginator(BasePaginator):
    """Base class of paginators that receive the url of the next page in the response."""

    def update_request(self, request: Request) -> None:
        request.url = self.next_reference
        # The next page url is used as returned by the API. Clear the params of the
        # previous request, otherwise they are appended to the next url again when
        # the request is prepared.
        request.params = None


class HeaderLinkPaginator(BaseNextUrlPaginator):
    """A paginator that uses the 'Link' header in HTTP responses
    for pagination.

    A good example of this is the GitHub API:
        https://docs.github.com/en/rest/guides/traversing-with-pagination
    """

    def __init__(self, links_next_key: str = "next") -> None:
        """
        Args:
            links_next_key (str, optional): The key (rel ) in the 'Link' header
                that contains the next page URL. Defaults to 'next'.
        """
        super().__init__()
        self.links_next_key = links_next_key

    def update_state(self, response: Response) -> None:
        self.next_reference = response.links.get(self.links_next_key, {}).get("url")


class JSONResponsePaginator(BaseNextUrlPaginator):
    """A paginator that uses a specific key in the JSON response to find
    the next page URL.
    """

    def __init__(
        self,
        next_url_path: jsonpath.TJsonPath = "next",
    ):
        """
        Args:
            next_url_path: The JSON path to the key that contains the next page URL in the response.
                Defaults to 'next'.
        """
        super().__init__()
        self.next_url_path = jsonpath.compile_path(next_url_path)

    def update_state(self, response: Response) -> None:
        values = jsonpath.find_values(self.next_url_path, response.json())
        self.next_reference = values[0] if values else None


class JSONResponseCursorPaginator(BasePaginator):
    """A paginator that uses a cursor query param to paginate. The cursor for the
    next page is found in the JSON response.
    """

    def __init__(
        self,
        cursor_path: jsonpath.TJsonPath = "cursors.next",
        cursor_param: str = "after",
    ):
        """
        Args:
            cursor_path: The JSON path to the key that contains the cursor in the response.
            cursor_param: The name of the query parameter to be used in the request to get the next page.
        """
        super().__init__()
        self.cursor_path = jsonpath.compile_path(cursor_path)
        self.cursor_param = cursor_param

    def update_state(self, response: Response) -> None:
        values = jsonpath.find_values(self.cursor_path, response.json())
        self.next_reference = values[0] if values else None

    def update_request(self, request: Request) -> None:
        if request.params is None:
            request.params = {}

        request.params[self.cursor_param] = self._next_reference
def join_url(base_url: str, path: str) -> str:
    """Joins `base_url` and `path` with exactly one slash between them.

    Args:
        base_url: Base url. An empty string makes the function return `path` unchanged.
        path: Path to append. An empty string makes the function return `base_url` unchanged.

    Returns:
        The joined url. Trailing slashes of `base_url` and leading slashes of `path`
        are collapsed into a single separator.

    Raises:
        ValueError: If `base_url` is None.
    """
    if base_url is None:
        raise ValueError("Base URL must be provided or set to an empty string.")

    if base_url == "":
        return path

    if path == "":
        return base_url

    # after stripping, the base never ends with a slash, so always add the separator
    return base_url.rstrip("/") + "/" + path.lstrip("/")
"./lint_setup/template.py" LINT_FILE = "./lint_setup/lint_me.py" -ENABLE_MYPY = False +ENABLE_MYPY = True @dataclass @@ -225,6 +225,7 @@ def typecheck_snippets(snippets: List[Snippet], verbose: bool) -> None: failed_count += 1 fmt.warning(f"Failed to type check {str(snippet)}") fmt.echo(result.stdout.strip()) + fmt.echo(result.stderr.strip()) if failed_count: fmt.error(f"Failed to type check {failed_count} snippets") diff --git a/docs/tools/lint_setup/template.py b/docs/tools/lint_setup/template.py index dcfada63f6..c72c4dba62 100644 --- a/docs/tools/lint_setup/template.py +++ b/docs/tools/lint_setup/template.py @@ -8,8 +8,8 @@ import os import pendulum -from pendulum import DateTime from datetime import datetime # noqa: I251 +from pendulum import DateTime import dlt from dlt.common import json @@ -26,6 +26,7 @@ BaseConfiguration, ) from dlt.common.storages.configuration import FileSystemCredentials +from dlt.pipeline.exceptions import PipelineStepFailed # some universal variables pipeline: dlt.Pipeline = None # type: ignore[assignment] @@ -33,3 +34,4 @@ ex: Exception = None # type: ignore[assignment] load_info: LoadInfo = None # type: ignore[assignment] url: str = None # type: ignore[assignment] +my_resource: DltResource = None # type: ignore[assignment] diff --git a/docs/tools/utils.py b/docs/tools/utils.py index 074b19b8e1..f71d68bd86 100644 --- a/docs/tools/utils.py +++ b/docs/tools/utils.py @@ -1,27 +1,35 @@ from typing import List import os +import glob import dlt.cli.echo as fmt DOCS_DIR = "../website/docs" +BLOG_DIR = "../website/blog" def collect_markdown_files(verbose: bool) -> List[str]: """ Discovers all docs markdown files """ + + # collect docs pages markdown_files: List[str] = [] - for path, _, files in os.walk(DOCS_DIR): - if "api_reference" in path: + for filepath in glob.glob(f"{DOCS_DIR}/**/*.md", recursive=True): + if "api_reference" in filepath: continue - if "jaffle_shop" in path: + if "jaffle_shop" in filepath: continue - for file in files: - 
if file.endswith(".md"): - markdown_files.append(os.path.join(path, file)) - if verbose: - fmt.echo(f"Discovered {os.path.join(path, file)}") + markdown_files.append(filepath) + if verbose: + fmt.echo(f"Discovered {filepath}") + + # collect blog pages + for filepath in glob.glob(f"{BLOG_DIR}/**/*.md", recursive=True): + markdown_files.append(filepath) + if verbose: + fmt.echo(f"Discovered {filepath}") if len(markdown_files) < 50: # sanity check fmt.error("Found too few files. Something went wrong.") diff --git a/docs/website/blog/2023-06-14-dlthub-gpt-accelerated learning_01.md b/docs/website/blog/2023-06-14-dlthub-gpt-accelerated learning_01.md index 394504dc64..08180b379e 100644 --- a/docs/website/blog/2023-06-14-dlthub-gpt-accelerated learning_01.md +++ b/docs/website/blog/2023-06-14-dlthub-gpt-accelerated learning_01.md @@ -47,9 +47,11 @@ The code provided below demonstrates training a chat-oriented GPT model using th -```python -!python3 -m pip install --upgrade langchain deeplake openai tiktoken +```sh +python -m pip install --upgrade langchain deeplake openai tiktoken +``` +```py # Create accounts on platform.openai.com and deeplake.ai. After registering, retrieve the access tokens for both platforms and securely store them for use in the next step. Enter the access tokens grabbed in the last step and enter them when prompted import os @@ -65,7 +67,7 @@ embeddings = OpenAIEmbeddings(disallowed_special=()) #### 2. Create a directory to store the code for training the model. Clone the desired repositories into that. -```python +```sh # making a new directory named dlt-repo !mkdir dlt-repo # changing the directory to dlt-repo @@ -80,7 +82,7 @@ embeddings = OpenAIEmbeddings(disallowed_special=()) ``` #### 3. Load the files from the directory -```python +```py import os from langchain.document_loaders import TextLoader @@ -95,7 +97,7 @@ for dirpath, dirnames, filenames in os.walk(root_dir): pass ``` #### 4. 
Load the files from the directory -```python +```py import os from langchain.document_loaders import TextLoader @@ -111,7 +113,7 @@ for dirpath, dirnames, filenames in os.walk(root_dir): ``` #### 5. Splitting files to chunks -```python +```py # This code uses CharacterTextSplitter to split documents into smaller chunksbased on character count and store the resulting chunks in the texts variable. from langchain.text_splitter import CharacterTextSplitter @@ -119,7 +121,8 @@ text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0) texts = text_splitter.split_documents(docs) ``` #### 6. Create Deeplake dataset -```python + +```py # Set up your deeplake dataset by replacing the username with your Deeplake account and setting the dataset name. For example if the deeplakes username is “your_name” and the dataset is “dlt-hub-dataset” username = "your_deeplake_username" # replace with your username from app.activeloop.ai @@ -138,7 +141,7 @@ retriever.search_kwargs['maximal_marginal_relevance'] = True retriever.search_kwargs['k'] = 10 ``` #### 7. 
Initialize the GPT model -```python +```py from langchain.chat_models import ChatOpenAI from langchain.chains import ConversationalRetrievalChain diff --git a/docs/website/blog/2023-08-14-dlt-motherduck-blog.md b/docs/website/blog/2023-08-14-dlt-motherduck-blog.md index 9f48d808a5..21aa7139f3 100644 --- a/docs/website/blog/2023-08-14-dlt-motherduck-blog.md +++ b/docs/website/blog/2023-08-14-dlt-motherduck-blog.md @@ -70,7 +70,7 @@ This is a perfect problem to test out my new super simple and highly customizabl `dlt init bigquery duckdb` This creates a folder with the directory structure - ``` + ```text ├── .dlt │ ├── config.toml │ └── secrets.toml diff --git a/docs/website/blog/2023-08-21-dlt-lineage-support.md b/docs/website/blog/2023-08-21-dlt-lineage-support.md index a76f89ed6a..90f6eb58aa 100644 --- a/docs/website/blog/2023-08-21-dlt-lineage-support.md +++ b/docs/website/blog/2023-08-21-dlt-lineage-support.md @@ -63,7 +63,7 @@ By combining row and column level lineage, you can have an easy overview of wher After a pipeline run, the schema evolution info gets stored in the load info. 
Load it back to the database to persist the column lineage: -```python +```py load_info = pipeline.run(data, write_disposition="append", table_name="users") diff --git a/docs/website/blog/2023-08-24-dlt-etlt.md b/docs/website/blog/2023-08-24-dlt-etlt.md index 3e27a21338..a36b169a99 100644 --- a/docs/website/blog/2023-08-24-dlt-etlt.md +++ b/docs/website/blog/2023-08-24-dlt-etlt.md @@ -83,7 +83,7 @@ This engine is configurable in both how it works and what it does, you can read more here: [Normaliser, schema settings](https://dlthub.com/docs/general-usage/schema#data-normalizer) Here is a usage example (it's built into the pipeline): -```python +```py import dlt @@ -119,7 +119,7 @@ Besides your own customisations, `dlt` also supports injecting your transform co Here is a code example of pseudonymisation, a common case where data needs to be transformed before loading: -```python +```py import dlt import hashlib @@ -150,16 +150,16 @@ def pseudonymize_name(doc): # 1. Create an instance of the source so you can edit it. data_source = dummy_source() # 2. Modify this source instance's resource -data_source = data_source.dummy_data().add_map(pseudonymize_name) +data_resource = data_source.dummy_data().add_map(pseudonymize_name) # 3. Inspect your result -for row in data_source: +for row in data_resource: print(row) #{'id': 0, 'name': '96259edb2b28b48bebce8278c550e99fbdc4a3fac8189e6b90f183ecff01c442'} #{'id': 1, 'name': '92d3972b625cbd21f28782fb5c89552ce1aa09281892a2ab32aee8feeb3544a1'} #{'id': 2, 'name': '443679926a7cff506a3b5d5d094dc7734861352b9e0791af5d39db5a7356d11a'} pipeline = dlt.pipeline(pipeline_name='example', destination='bigquery', dataset_name='normalized_data') -load_info = pipeline.run(data_source) +load_info = pipeline.run(data_resource) ``` @@ -168,7 +168,7 @@ load_info = pipeline.run(data_source) Finally, once you have clean data loaded, you will probably prefer to use SQL and one of the standard tools. 
`dlt` offers a dbt runner to get you started easily with your transformation package. -```python +```py pipeline = dlt.pipeline( pipeline_name='pipedrive', destination='bigquery', diff --git a/docs/website/blog/2023-09-05-mongo-etl.md b/docs/website/blog/2023-09-05-mongo-etl.md index 19e1f18682..0e4a3d83f2 100644 --- a/docs/website/blog/2023-09-05-mongo-etl.md +++ b/docs/website/blog/2023-09-05-mongo-etl.md @@ -139,21 +139,21 @@ Here's a code explanation of how it works under the hood: example of how this nested data could look: ```json - data = { - 'id': 1, - 'name': 'Alice', - 'job': { + { + "id": 1, + "name": "Alice", + "job": { "company": "ScaleVector", - "title": "Data Scientist", + "title": "Data Scientist" }, - 'children': [ + "children": [ { - 'id': 1, - 'name': 'Eve' + "id": 1, + "name": "Eve" }, { - 'id': 2, - 'name': 'Wendy' + "id": 2, + "name": "Wendy" } ] } @@ -161,7 +161,7 @@ Here's a code explanation of how it works under the hood: 1. We can load the data to a supported destination declaratively: - ```python + ```py import dlt pipeline = dlt.pipeline( diff --git a/docs/website/blog/2023-09-26-verba-dlt-zendesk.md b/docs/website/blog/2023-09-26-verba-dlt-zendesk.md index 1990a5df7f..f3825b4427 100644 --- a/docs/website/blog/2023-09-26-verba-dlt-zendesk.md +++ b/docs/website/blog/2023-09-26-verba-dlt-zendesk.md @@ -40,7 +40,7 @@ In this blog post, we'll guide you through the process of building a RAG applica Create a new folder for your project and install Verba: -```bash +```sh mkdir verba-dlt-zendesk cd verba-dlt-zendesk python -m venv venv @@ -50,7 +50,7 @@ pip install goldenverba To configure Verba, we need to set the following environment variables: -```bash +```sh VERBA_URL=https://your-cluster.weaviate.network # your Weaviate instance URL VERBA_API_KEY=F8...i4WK # the API key of your Weaviate instance OPENAI_API_KEY=sk-...R # your OpenAI API key @@ -61,13 +61,13 @@ You can put them in a `.env` file in the root of your project or export them in 
Let's test that Verba is installed correctly: -```bash +```sh verba start ``` You should see the following output: -```bash +```sh INFO: Uvicorn running on (Press CTRL+C to quit) ℹ Setting up client ✔ Client connected to Weaviate Cluster @@ -88,7 +88,7 @@ If you try to ask a question now, you'll get an error in return. That's because We get our data from Zendesk using dlt. Let's install it along with the Weaviate extra: -```bash +```sh pip install "dlt[weaviate]" ``` @@ -96,7 +96,7 @@ This also installs a handy CLI tool called `dlt`. It will help us initialize the Let's initialize the verified source: -```bash +```sh dlt init zendesk weaviate ``` @@ -104,7 +104,7 @@ dlt init zendesk weaviate To make things easier, we'll use the email address and password authentication method for Zendesk API. Let's add our credentials to `secrets.toml`: -```yaml +```toml [sources.zendesk.credentials] password = "your-password" subdomain = "your-subdomain" @@ -113,14 +113,13 @@ email = "your-email@example.com" We also need to specify the URL and the API key of our Weaviate instance. Copy the credentials for the Weaviate instance you created earlier and add them to `secrets.toml`: -```yaml +```toml [destination.weaviate.credentials] url = "https://your-cluster.weaviate.network" api_key = "F8.....i4WK" [destination.weaviate.credentials.additional_headers] X-OpenAI-Api-Key = "sk-....." - ``` All the components are now in place and configured. Let's set up a pipeline to import data from Zendesk. @@ -129,7 +128,7 @@ All the components are now in place and configured. Let's set up a pipeline to i Open your favorite text editor and create a file called `zendesk_verba.py`. Add the following code to it: -```python +```py import itertools import dlt @@ -217,13 +216,13 @@ Finally, we run the pipeline and print the load info. 
Let's run the pipeline: -```bash +```sh python zendesk_verba.py ``` You should see the following output: -```bash +```sh Pipeline zendesk_verba completed in 8.27 seconds 1 load package(s) were loaded to destination weaviate and into dataset None The weaviate destination used location to store data @@ -235,13 +234,13 @@ Verba is now populated with data from Zendesk Support. However there are a coupl Run the following command: -```bash +```sh verba init ``` You should see the following output: -```bash +```sh ===================== Creating Document and Chunk class ===================== ℹ Setting up client ✔ Client connected to Weaviate Cluster @@ -264,7 +263,7 @@ Document class already exists, do you want to overwrite it? (y/n): n We're almost there! Let's start Verba: -```bash +```sh verba start ``` diff --git a/docs/website/blog/2023-10-06-dlt-holistics.md b/docs/website/blog/2023-10-06-dlt-holistics.md index b2791bd2ec..c5e9b2ca46 100644 --- a/docs/website/blog/2023-10-06-dlt-holistics.md +++ b/docs/website/blog/2023-10-06-dlt-holistics.md @@ -92,7 +92,7 @@ In this section, we walk through how to set up a MongoDB data pipeline using `dl Use the command below to install `dlt`. -```bash +```sh pip3 install -U dlt ``` @@ -100,13 +100,13 @@ Consider setting up a virtual environment for your projects and installing the p Once we have `dlt` installed, we can go ahead and initialize a verified MongoDB pipeline with the destination set to Google BigQuery. First, create a project directory and then execute the command below: -```python +```sh dlt init mongodb bigquery ``` The above command will create a local ready-made pipeline that we can customize to our needs. After executing the command your project directory will look as follows: -```bash +```text . ├── .dlt │ ├── config.toml @@ -127,7 +127,7 @@ We also need to set up the GCP service account credentials to get permissions to Once all the credentials are set add them to the `secrets.toml` file. 
Your file should look something like this: -```bash +```toml # put your secret values and credentials here. do not share this file and do not push it to github [sources.mongodb] connection_url = "mongodb+srv://:@.cvanypn.mongodb.net" # please set me up! @@ -143,7 +143,7 @@ client_email = "@analytics.iam.gserviceaccount.com" # please set me up The `mongodb_pipeline.py` at the root of your project directory is the script that runs the pipeline. It contains many functions that provide different ways of loading the data. The selection of the function depends on your specific use case, but for this demo, we try to keep it simple and use the `load_entire_database` function. -```python +```py def load_entire_database(pipeline: Pipeline = None) -> LoadInfo: """Use the mongo source to completely load all collection in a database""" if pipeline is None: @@ -165,13 +165,13 @@ def load_entire_database(pipeline: Pipeline = None) -> LoadInfo: Before we execute the pipeline script let's install the dependencies for the pipeline by executing the `requirements.txt` file. -```bash +```sh pip install -r requirements.txt ``` Finally, we are ready to execute the script. In the main function uncomment the `load_entire_database` function call and run the script. -```bash +```sh python mongodb_pipeline.py ``` @@ -290,7 +290,7 @@ This is a typical way data is structured in a NoSQL database. The data is in a J The ddl (data definition language) for the movies table in BigQuery can be seen below: -```json +```sql CREATE TABLE `dlthub-analytics.mongo_database.movies` ( _id STRING NOT NULL, @@ -354,7 +354,7 @@ In Holistics, add a new data source click on the plus sign (+) on the top menu, Once the BigQuery source is added we are ready to import the schemas from BigQuery into Holistics. The schema(`dataset_name`) name under which dlt loaded the MongoDB data is defined in the `load_entire_database` function when we create the MongoDB pipeline. 
-```bash +```py # Create a pipeline pipeline = dlt.pipeline( pipeline_name="local_mongo", @@ -399,13 +399,13 @@ The resulting relationship can seen As Code using the Holistics 4.0 Analytics as Previously, we created the relationship between the `cast` and the `movies` tables using GUI, now let’s add the relationship between the `directors` and `movies` tables using the Analytics as Code feature. In the `dataset.aml` file append the relationships block with the following line of code: -```python +```py relationship(model__mongo_database_movies_directors.dlt_parent_id > model__mongo_database_movies.dlt_id, true) ``` After the change, the `dataset.aml` file should look like this: -```python +```text import '../Models/mongo_database_movies.model.aml' { mongo_database_movies as model__mongo_database_movies } diff --git a/docs/website/blog/2023-10-09-dlt-ops-startups.md b/docs/website/blog/2023-10-09-dlt-ops-startups.md index 94c1ff662b..dd21725f90 100644 --- a/docs/website/blog/2023-10-09-dlt-ops-startups.md +++ b/docs/website/blog/2023-10-09-dlt-ops-startups.md @@ -61,14 +61,14 @@ The `dlt` [init command](https://dlthub.com/docs/reference/command-line-interfac - Open `.dlt/secrets.toml` file on your laptop. - Enter the OpenAI secrets: - ``` + ```toml [sources.unstructured_data] openai_api_key = "openai_api_key" ``` - Enter your email account secrets in the same section `[sources.unstructured_data]`: - ``` + ```toml host = 'imap.example.com' email_account = "example@example.com" password = 'set me up!' @@ -78,7 +78,7 @@ The `dlt` [init command](https://dlthub.com/docs/reference/command-line-interfac - Enter the BigQuery secrets: - ``` + ```toml [destination.bigquery] location = "US" [destination.bigquery.credentials] @@ -96,7 +96,7 @@ This is the part where you can define what you’d like to see as an outcome. Queries example: -```python +```py INVOICE_QUERIES = { "recipient_company_name": "Who is the recipient of the invoice? Just return the name.
If you don't know, then return None", "invoice_amount": "What is the total amount of the invoice? Just return the amount as decimal number, no currency or text. If you don't know, then return None", diff --git a/docs/website/blog/2023-10-16-first-data-warehouse.md b/docs/website/blog/2023-10-16-first-data-warehouse.md index 79186fd267..641751eb1d 100644 --- a/docs/website/blog/2023-10-16-first-data-warehouse.md +++ b/docs/website/blog/2023-10-16-first-data-warehouse.md @@ -75,7 +75,7 @@ For those new to pushing data via an API, it may seem intimidating. Let's simplify - sending data to an API endpoint for loading or updating an object is similar to making a `GET` request. Here's a straightforward example in Python: -```python +```py # Assuming data is in this format import requests # assume we have a table of contacts we want to push to Pipedrive. diff --git a/docs/website/blog/2023-10-19-dbt-runners.md b/docs/website/blog/2023-10-19-dbt-runners.md index 713815abb0..9eb22c050f 100644 --- a/docs/website/blog/2023-10-19-dbt-runners.md +++ b/docs/website/blog/2023-10-19-dbt-runners.md @@ -149,7 +149,7 @@ The Cloud runner we support can do the following: - Check the status of a dbt job in your account. Code example: -```python +```py from dlt.helpers.dbt_cloud import run_dbt_cloud_job # Trigger a job run with additional data @@ -179,7 +179,7 @@ The core runner does the following: - Execute the package and report the outcome. Code example: -```python +```py # Create a transformation on a new dataset called 'pipedrive_dbt' # we created a local dbt package # and added pipedrive_raw to its sources.yml @@ -210,7 +210,7 @@ for m in models: f"Model {m.model_name} materialized" + f"in {m.time}" + f"with status {m.status}" + - f"and message {m.message}" + f"and message {m.message}") ``` ## 4. A short demo on how to do that with dlt’s dbt runner. 
diff --git a/docs/website/blog/2023-10-23-arrow-loading.md b/docs/website/blog/2023-10-23-arrow-loading.md index 978586fa76..2f25511d73 100644 --- a/docs/website/blog/2023-10-23-arrow-loading.md +++ b/docs/website/blog/2023-10-23-arrow-loading.md @@ -18,13 +18,13 @@ Here we achieved ~30x speedups when loading data from (local) postgres database We’ll start with [ConnectorX library](https://github.com/sfu-db/connector-x) that creates Arrow tables from SQL queries on most of the popular database engines. -```python +```sh pip install connectorx ``` Lib has Rust inside, zero copy extraction and is amazingly fast. We’ll extract and normalize 10 000 000 [test rows](https://github.com/dlt-hub/verified-sources/blob/master/tests/sql_database/sql_source.py#L88) from local postgresql. The table **chat_message** looks like Slack messages dump. Messages have unique autoincrement **id** which we use to load in chunks: -```python +```py import connectorx as cx import dlt from dlt.sources.credentials import ConnectionStringCredentials @@ -49,7 +49,7 @@ chat_messages = dlt.resource( In this demo I just extract and normalize data and skip the loading step. -```python +```py pipeline = dlt.pipeline(destination="duckdb", full_refresh=True) # extract first pipeline.extract(chat_messages) @@ -78,7 +78,7 @@ Step normalize COMPLETED in 0.08 seconds. Here’s corresponding code working with **SqlAlchemy**. We process 10 000 000 rows, yielding in 100k rows packs and normalize to parquet in 3 parallel processes. -```python +```py from itertools import islice import dlt from sqlalchemy import create_engine diff --git a/docs/website/blog/2023-10-25-dlt-deepnote.md b/docs/website/blog/2023-10-25-dlt-deepnote.md index 864353a36d..2674ceae7d 100644 --- a/docs/website/blog/2023-10-25-dlt-deepnote.md +++ b/docs/website/blog/2023-10-25-dlt-deepnote.md @@ -37,7 +37,7 @@ likely than not, you spend more time fixing data pipelines or data formats then on ML algorithms or dashboard designs. 
We aren’t always lucky enough to get structured data to work with. Imagine a world where your training data is just this statement without no prior work: -```jsx +```sql select * from ``` diff --git a/docs/website/blog/2023-10-26-dlt-prefect.md b/docs/website/blog/2023-10-26-dlt-prefect.md index 8bd6321489..85fa47a5c8 100644 --- a/docs/website/blog/2023-10-26-dlt-prefect.md +++ b/docs/website/blog/2023-10-26-dlt-prefect.md @@ -82,8 +82,7 @@ It would take some effort to interpret even a simple response like this one for "updated": 1502138686, "is_app_user": false, "has_2fa": false - }, - // ... (more data) + } ] } ``` @@ -92,14 +91,14 @@ You can use dlt to build a Slack to BigQuery pipeline in just a few seconds with Seriously, it is that simple. In preparation, let’s make sure to install what we need: -```bash +```sh pip install dlt pip install prefect ```` Then just run a simple init command: -```bash +```sh dlt init slack bigquery ``` @@ -126,14 +125,13 @@ Note that we are redacting some of the code in the preview for brevity, to follow along completely navigate to the repo. -```python +```py # Pipeline to load Slack into BigQuery from typing import List import dlt import pendulum -from pendulum import datetime from slack import slack_source @@ -145,7 +143,7 @@ def get_resources() -> List[str]: """Fetch a list of available dlt resources so we can fetch them one at a time""" # ... -def load_channel_history(channel: str, start_date: datetime) -> None: +def load_channel_history(channel: str, start_date: pendulum.Date) -> None: """Execute a pipeline that will load the given Slack channel incrementally beginning at the given start date.""" # ... @@ -190,19 +188,18 @@ that can make sure your pipelines aren’t causing you stress in the middle of t Make sure you’re logged in to Prefect Cloud by [signing up](https://app.prefect.cloud/?utm_source=dltblog) and using the following command: -```bash +```sh prefect cloud login ``` Luckily, Prefect is also incredibly Pythonic.
Turning any pipeline into an observable, scheduled Prefect flow is as simple as adding decorators to your functions and `serving` it up. Here’s our `dlt` generated pipeline, scheduled daily: -```python +```py from typing import List import dlt import pendulum -from pendulum import datetime from prefect import flow, task from slack import slack_source @@ -215,7 +212,7 @@ def get_resources() -> List[str]: ... @task -def load_channel_history(channel: str, start_date: datetime) -> None: +def load_channel_history(channel: str, start_date: pendulum.Date) -> None: ... @task diff --git a/docs/website/blog/2023-10-30-data-modelling-tools.md b/docs/website/blog/2023-10-30-data-modelling-tools.md index e5839ee66e..960d80a569 100644 --- a/docs/website/blog/2023-10-30-data-modelling-tools.md +++ b/docs/website/blog/2023-10-30-data-modelling-tools.md @@ -71,7 +71,7 @@ Our database is based on the data published by [LivWell](https://www.nature.com/ Sample input structure: -```jsx +```py [{"survey_id": "AM2000DHS", "country": "Armenia", "marriage_related": [{...}, {...}, ...], @@ -81,7 +81,7 @@ Sample input structure: "health_related": [{...}, {...}, ...], "age_related": [{...}, {...}, ...] }, - {...}, {...}, {...}, ...}] + {...}, {...}, {...}, {...}] ``` To break it up into proper tables representing the different sections of the surveys, we gave this data to **dlt** to unpack it into a flat relational structure into BigQuery. dlt automatically unpacked the original data into connected tables. The various child tables link to the parent table `wellness` using foreign keys. `Wellness` contains surveys identified by ID and country. The final setup of indicators broken up into different categories can be found below, as displayed by Power BI. This structured database has been used to experiment with all three dashboarding tools in this article. 
diff --git a/docs/website/blog/2023-11-01-dlt-dagster.md b/docs/website/blog/2023-11-01-dlt-dagster.md index 4da685be73..687e8444c4 100644 --- a/docs/website/blog/2023-11-01-dlt-dagster.md +++ b/docs/website/blog/2023-11-01-dlt-dagster.md @@ -33,7 +33,7 @@ As we will be ingesting data into BigQuery we first need to create service accou Once we have the credentials we are ready to begin. Let’s first install Dagster and `dlt`. The below commands should install both. -```python +```sh pip install dlt pip install dagster dagster-webserver ``` @@ -42,13 +42,13 @@ pip install dagster dagster-webserver As a first step, we will create the GitHub issues pipeline using `dlt`. -```bash +```sh dlt init github_issues bigquery ``` This will generate a template for us to create a new pipeline. Under `.dlt/secrets.toml` add the service account credentials for BigQuery. Then in the `github_issues.py` delete the generated code and add the following: -```python +```py @dlt.resource(write_disposition="append") def github_issues_resource(api_secret_key=dlt.secrets.value): owner = 'dlt-hub' @@ -88,7 +88,7 @@ The above code creates a simple **github_issues** pipeline that gets the issues To run the pipeline execute the below commands: -```bash +```sh pip install -r requirements.txt python github_issues.py ``` @@ -103,7 +103,7 @@ We will need to adjust our pipeline a bit to orchestrate it using Dagster. - Create a new directory for your Dagster project and scaffold the basic structure: -```bash +```sh mkdir dagster_github_issues cd dagster_github_issues dagster project scaffold --name github-issues @@ -115,7 +115,7 @@ This will generate the default files for Dagster that we will use as a starting - Inside the `github-issues/github_issues` directory create the following folders: `assets`, `resources`, and `dlt`. -```bash +```sh . 
├── README.md ├── github_issues @@ -141,13 +141,13 @@ This will generate the default files for Dagster that we will use as a starting ### Step 4: Add configurable resources and define the asset -- Define a `DltResource` class in `resources/__init__.py` as a Dagster configurable resource. This class allows you to reuse pipeline code inside an asset. +- Define a `DDltResource` class in `resources/__init__.py` as a Dagster configurable resource. This class allows you to reuse pipeline code inside an asset. -```python +```py from dagster import ConfigurableResource import dlt -class DltResource(ConfigurableResource): +class DDltResource(ConfigurableResource): pipeline_name: str dataset_name: str destination: str @@ -167,20 +167,20 @@ class DltResource(ConfigurableResource): - Define the asset, `issues_pipeline`, in `assets/__init__.py`. This asset uses the configurable resource to create a dlt pipeline and ingests data into BigQuery. -```python +```py from dagster import asset, get_dagster_logger -from ..resources import DltResource +from ..resources import DDltResource from ..dlt import github_issues_resource @asset -def issues_pipeline(pipeline: DltResource): +def issues_pipeline(pipeline: DDltResource): logger = get_dagster_logger() results = pipeline.create_pipeline(github_issues_resource, table_name='github_issues') logger.info(results) ``` -The defined asset (**issues_pipeline**) takes as input the configurable resource (**DltResource**). In the asset, we use the configurable resource to create a dlt pipeline by using an instance of the configurable resource (**DltResource**) to call the `create_pipeline` function. The `dlt.resource` (**github_issues_resource**) is passed to the `create_pipeline` function. The `create_pipeline` function normalizes the data and ingests it into BigQuery. +The defined asset (**issues_pipeline**) takes as input the configurable resource (**DDltResource**). 
In the asset, we use the configurable resource to create a dlt pipeline by using an instance of the configurable resource (**DDltResource**) to call the `create_pipeline` function. The `dlt.resource` (**github_issues_resource**) is passed to the `create_pipeline` function. The `create_pipeline` function normalizes the data and ingests it into BigQuery. ### Step 5: Handle Schema Evolution @@ -188,12 +188,12 @@ The defined asset (**issues_pipeline**) takes as input the configurable resource - Add the schema evolution code to the asset to make our pipelines more resilient to changes. -```python +```py from dagster import AssetExecutionContext @asset -def issues_pipeline(context: AssetExecutionContext, pipeline: DltResource): -... -md_content="" +def issues_pipeline(context: AssetExecutionContext, pipeline: DDltResource): + ... + md_content="" for package in result.load_packages: for table_name, table in package.schema_update.items(): for column_name, column in table["columns"].items(): @@ -207,7 +207,7 @@ md_content="" - In the `__init.py__` under the **github_issues** folder add the definitions: -```python +```py all_assets = load_assets_from_modules([assets]) simple_pipeline = define_asset_job(name="simple_pipeline", selection= ['issues_pipeline']) @@ -215,7 +215,7 @@ defs = Definitions( assets=all_assets, jobs=[simple_pipeline], resources={ - "pipeline": DltResource( + "pipeline": DDltResource( pipeline_name = "github_issues", dataset_name = "dagster_github_issues", destination = "bigquery", @@ -255,20 +255,20 @@ One of the main strengths of `dlt` lies in its ability to extract, normalize, an - Start by creating a new Dagster project scaffold: -```python +```sh dagster project scaffold --name mongodb-dlt ``` - Follow the steps mentioned earlier and create an `assets`, and `resources` directory under `mongodb-dlt/mongodb_dlt`. 
- Initialize a `dlt` MongoDB pipeline in the same directory: -```python +```sh dlt init mongodb bigquery ``` This will create a template with all the necessary logic implemented for extracting data from MongoDB. After running the command your directory structure should be as follows: -```python +```text . ├── README.md ├── mongodb_dlt @@ -299,16 +299,16 @@ For this example, we are using MongoDB Atlas. Set up the account for MongoDB Atl Next, create a `.env` file and add the BigQuery and MongoDB credentials to the file. The `.env` file should reside in the root directory. -### Step 3: Adding the DltResource +### Step 3: Adding the DDltResource Create a `DltResouce` under the **resources** directory. Add the following code to the `__init__.py`: -```python +```py from dagster import ConfigurableResource import dlt -class DltResource(ConfigurableResource): +class DDltResource(ConfigurableResource): pipeline_name: str dataset_name: str destination: str @@ -335,9 +335,9 @@ In the `mongodb_pipeline.py` file, locate the `load_select_collection_hint_db` f In the `__init__.py` file under the **assets** directory, define the `dlt_asset_factory`: -```python +```py from ..mongodb import mongodb -from ..resources import DltResource +from ..resources import DDltResource import dlt import os @@ -363,7 +363,7 @@ def dlt_asset_factory(collection_list): for stream in collection_name} ) - def collections_asset(context: OpExecutionContext, pipeline: DltResource): + def collections_asset(context: OpExecutionContext, pipeline: DDltResource): # Getting Data From MongoDB data = mongodb(URL, db).with_resources(*collection_name) @@ -386,16 +386,16 @@ dlt_assets = dlt_asset_factory(DATABASE_COLLECTIONS) Add the definitions in the `__init__.py` in the root directory: -```python +```py from dagster import Definitions from .assets import dlt_assets -from .resources import DltResource +from .resources import DDltResource defs = Definitions( assets=dlt_assets, resources={ - "pipeline": 
DltResource( + "pipeline": DDltResource( pipeline_name = "mongo", dataset_name = "dagster_mongo", destination = "bigquery" diff --git a/docs/website/blog/2023-11-22-dlt-webhooks-event-based-ingestion.md b/docs/website/blog/2023-11-22-dlt-webhooks-event-based-ingestion.md index 292879fc95..94fb89790e 100644 --- a/docs/website/blog/2023-11-22-dlt-webhooks-event-based-ingestion.md +++ b/docs/website/blog/2023-11-22-dlt-webhooks-event-based-ingestion.md @@ -79,11 +79,11 @@ in-depth guide, please refer to the detailed documentation. 1. Click 'Create Function' in Cloud Functions, and select your region and environment setup. 1. Choose HTTP as the trigger, enable 'Allow unauthenticated invocations', save, and click 'Next'. 1. Set the environment to Python 3.10 and prepare to insert code into main.py: - ```python + ```py import dlt - import json import time from google.cloud import bigquery + from dlt.common import json def github_webhook(request): # Extract relevant data from the request payload @@ -106,7 +106,7 @@ in-depth guide, please refer to the detailed documentation. dlt[bigquery] ``` 1. Post-deployment, a webhook URL is generated, typically following a specific format. - ```bash + ```sh https://{region]-{project-id}.cloudfunctions.net/{cloud-function-name} ``` @@ -140,7 +140,7 @@ Set up the webhook by creating a cloud function, using the same steps as for the 1. Here’s what `main.py` looks like: - ```python + ```py import dlt from flask import jsonify @@ -215,7 +215,7 @@ Set up the webhook by creating a cloud function, using the same steps as for the 1. 
Here’s what `main.py`looks like: - ```python + ```py import dlt from flask import jsonify @@ -227,7 +227,8 @@ Set up the webhook by creating a cloud function, using the same steps as for the # Initialize and configure the DLT pipeline pipeline = dlt.pipeline( - pipeline_name=ßigquery', # Destination service for the data + pipeline_name="hubspot", + destination='bigquery', # Destination service for the data dataset_name='hubspot_webhooks_dataset', # BigQuery dataset name ) diff --git a/docs/website/blog/2023-11-27-dlt-data-lineage.md b/docs/website/blog/2023-11-27-dlt-data-lineage.md index 233ef58800..d91659eb6b 100644 --- a/docs/website/blog/2023-11-27-dlt-data-lineage.md +++ b/docs/website/blog/2023-11-27-dlt-data-lineage.md @@ -42,7 +42,7 @@ The **load_info** produced by `dlt` for both pipelines is also populated into Bi To get started install `dlt` and dbt: -```jsx +```sh pip install dlt pip install dbt-bigquery ``` @@ -59,13 +59,13 @@ We use the following CSV files as our data sources for this demo: To get started we initialize a dlt pipeline and selecting BigQuery as our destination by running the following command: -```python +```sh dlt init data_lineage bigquery ``` This will create default scaffolding to build our pipeline. Install the dependencies by running the following command: -```python +```sh pip install -r requirements.txt ``` @@ -76,7 +76,7 @@ As a first step, we will load the sales data from the online and physical store In the `data_lineage.py` file remove the default code and add the following: -```python +```py FILEPATH = "data/supermarket_sales.csv" FILEPATH_SHOPIFY = "data/orders_export_1.csv" @@ -109,7 +109,7 @@ Any changes in the underlying data are captured by the dlt **load_info**. To sho We will add the **load_info** back to BigQuery to use in our Dashboard. The Dashboard will provide an overview data lineage for our ingested data. 
-```python +```py if __name__ == "__main__": data_store = pd.read_csv(FILEPATH) @@ -134,7 +134,7 @@ if __name__ == "__main__": dataset_name='sales_shopify' ) - load_a = pipeline_store.run_pipeline( + load_a = pipeline_store.run_pipeline( data=select_c_data_store, table_name='sales_info', write_disposition='replace' @@ -161,7 +161,7 @@ if __name__ == "__main__": To run the pipeline, execute the following command: -```python +```sh python data_lineage.py ``` @@ -175,7 +175,7 @@ Now that both the Shopify and Store data are available in BigQuery, we will use To get started initialize a dbt project in the root directory: -```python +```sh dbt init sales_dbt ``` @@ -244,7 +244,7 @@ In the query, we combine the **load_info** for both sources by doing a union ove In the `data_lineage.py` add the code to run the dbt package using `dlt`. -```python +```py pipeline_transform = dlt.pipeline( pipeline_name='pipeline_transform', destination='bigquery', @@ -271,7 +271,7 @@ for m in models: Next, run the pipeline using the following command: -```python +```sh python data_lineage.py ``` diff --git a/docs/website/blog/2023-12-01-dlt-kestra-demo.md b/docs/website/blog/2023-12-01-dlt-kestra-demo.md index da47384194..9f1d7acba2 100644 --- a/docs/website/blog/2023-12-01-dlt-kestra-demo.md +++ b/docs/website/blog/2023-12-01-dlt-kestra-demo.md @@ -78,7 +78,7 @@ In my scenario, the email data doesn't have nested structures, so there's no nee Here's how the pipeline is defined and subsequently run in the first task of the main flow in **`Kestra`**: -```python +```py # Run dlt pipeline to load email data from gmail to BigQuery pipeline = dlt.pipeline( pipeline_name="standard_inbox", diff --git a/docs/website/blog/2023-12-13-dlt-aws-taktile-blog.md b/docs/website/blog/2023-12-13-dlt-aws-taktile-blog.md index c819f90741..296d303dcb 100644 --- a/docs/website/blog/2023-12-13-dlt-aws-taktile-blog.md +++ b/docs/website/blog/2023-12-13-dlt-aws-taktile-blog.md @@ -46,13 +46,13 @@ SAM is a 
lightweight Infrastructure-As-Code framework provided by AWS. Using SAM 1. Install the SAM CLI [add link or command here] - ```bash + ```sh pip install aws-sam-cli ``` 2. Define your resources in a `template.yml` file - ```yaml + ```text AWSTemplateFormatVersion: "2010-09-09" Transform: AWS::Serverless-2016-10-31 @@ -86,7 +86,7 @@ SAM is a lightweight Infrastructure-As-Code framework provided by AWS. Using SAM Effect: Allow Action: - secretsmanager:GetSecretValue - Resource: !Sub arn:aws:secretsmanager:${AWS::Region}:${AWS::AccountId}:secret:DLT_* + Resource: !Sub "arn:aws:secretsmanager:${AWS::Region}:${AWS::AccountId}:secret:DLT_*" Metadata: DockerTag: dlt-aws DockerContext: . @@ -99,13 +99,13 @@ SAM is a lightweight Infrastructure-As-Code framework provided by AWS. Using SAM 3. Build a deployment package - ```bash + ```sh sam build ``` 4. Test your setup locally - ```bash + ```sh sam local start-api # in a second terminal window @@ -114,7 +114,7 @@ SAM is a lightweight Infrastructure-As-Code framework provided by AWS. Using SAM 5. Deploy your resources to AWS - ```bash + ```sh sam deploy --stack-name= --resolve-image-repos --resolve-s3 --capabilities CAPABILITY_IAM ``` diff --git a/docs/website/blog/2024-01-08-streaming-pubsub-json-gcp.md b/docs/website/blog/2024-01-08-streaming-pubsub-json-gcp.md index d31d9a7e3a..e6e7d2ba18 100644 --- a/docs/website/blog/2024-01-08-streaming-pubsub-json-gcp.md +++ b/docs/website/blog/2024-01-08-streaming-pubsub-json-gcp.md @@ -125,7 +125,7 @@ By using this micro-batch architecture, we strive to maintain a balance of datab insert efficiency (by writing multiple records at a time) with near real-time insertion (by keeping the window size around 5 seconds). 
-```python +```py pipeline = dlt.pipeline( pipeline_name="pubsub_dlt", diff --git a/docs/website/blog/2024-01-10-dlt-mode.md b/docs/website/blog/2024-01-10-dlt-mode.md index b92425184d..1d6bf8ca0e 100644 --- a/docs/website/blog/2024-01-10-dlt-mode.md +++ b/docs/website/blog/2024-01-10-dlt-mode.md @@ -123,13 +123,13 @@ With the model we just created, called Products, a chart can be instantly create In this demo, we’ll forego the authentication issues of connecting to a data warehouse, and choose the DuckDB destination to show how the Python environment within Mode can be used to initialize a data pipeline and dump normalized data into a destination. In order to see how it works, we first install dlt[duckdb] into the Python environment. -```python +```sh !pip install dlt[duckdb] ``` Next, we initialize the dlt pipeline: -```python +```py # initializing the dlt pipeline with your # data warehouse destination pipeline = dlt.pipeline( diff --git a/docs/website/blog/2024-01-15-dlt-dbt-runner-on-cloud-functions.md b/docs/website/blog/2024-01-15-dlt-dbt-runner-on-cloud-functions.md index b36748aed9..059dd97a06 100644 --- a/docs/website/blog/2024-01-15-dlt-dbt-runner-on-cloud-functions.md +++ b/docs/website/blog/2024-01-15-dlt-dbt-runner-on-cloud-functions.md @@ -86,7 +86,7 @@ We recommend setting up and testing dbt-core locally before using it in cloud fu 1. Next, modify the `main.py` as follows: - ```python + ```py import os import subprocess import logging @@ -191,11 +191,12 @@ To integrate dlt and dbt in cloud functions, use the dlt-dbt runner; here’s ho 1. Next, configure the `main.py` as follows: - ```python + ```py import dlt - import logging, json + import logging from flask import jsonify from dlt.common.runtime.slack import send_slack_message + from dlt.common import json def run_pipeline(request): """ @@ -306,7 +307,7 @@ To integrate dlt and dbt in cloud functions, use the dlt-dbt runner; here’s ho 1. 
Next, list runtime-installable modules in `requirements.txt`: - ``` + ```sh dbt-core dbt-bigquery ``` diff --git a/docs/website/blog/2024-01-16-dlt-dbt-semantic-layer.md b/docs/website/blog/2024-01-16-dlt-dbt-semantic-layer.md index e67e203caf..415a55f9b9 100644 --- a/docs/website/blog/2024-01-16-dlt-dbt-semantic-layer.md +++ b/docs/website/blog/2024-01-16-dlt-dbt-semantic-layer.md @@ -38,7 +38,7 @@ Here’s how a pipeline could look: The data being used is of a questionnaire, which includes questions, the options of those questions, respondents and responses. This data is contained within a nested json object, that we’ll pass as a raw source to `dlt` to structure, normalize and dump into a BigQuery destination. -```python +```py # initializing the dlt pipeline with your data warehouse destination pipeline = dlt.pipeline( pipeline_name="survey_pipeline", @@ -89,20 +89,20 @@ measures: - name: surveys_total description: The total surveys for each --dimension. agg: count - # if all rows need to be counted then expr = 1 + # if all rows need to be counted then expr = 1 expr: 1 # where in SQL you would: group by columns dimensions: - # default dbt requirement + # default dbt requirement - name: surveyed_at type: time type_params: time_granularity: day # count entry per answer - - name: people_per_color + - name: people_per_color type: categorical expr: answer - # count entry per question + # count entry per question - name: question type: categorical expr: question @@ -117,10 +117,10 @@ metrics: type: simple label: Favorite Colors type_params: - # reference of the measure created in the semantic model + # reference of the measure created in the semantic model measure: surveys_total - filter: | # adding a filter on the "question" column for asking about favorite color - {{ Dimension('id__question') }} = 'What is your favorite color?' 
+ filter: | # adding a filter on the "question" column for asking about favorite color + {{ Dimension('id__question') }} = 'What is your favorite color?' ``` The DAG then looks like this: diff --git a/docs/website/blog/2024-02-21-pipelines-single-pane-of-glass.md b/docs/website/blog/2024-02-21-pipelines-single-pane-of-glass.md index 553284bc6f..ff54c463bd 100644 --- a/docs/website/blog/2024-02-21-pipelines-single-pane-of-glass.md +++ b/docs/website/blog/2024-02-21-pipelines-single-pane-of-glass.md @@ -42,7 +42,7 @@ Since “checking” things can be tedious, we rather forget about it and be not Here’s a gist of how to use it -```python +```py from dlt.common.runtime.slack import send_slack_message def run_pipeline_and_notify(pipeline, data): diff --git a/docs/website/blog/2024-03-07-openapi-generation-chargebee.md b/docs/website/blog/2024-03-07-openapi-generation-chargebee.md index 367f8db2ca..3d77c3ea4c 100644 --- a/docs/website/blog/2024-03-07-openapi-generation-chargebee.md +++ b/docs/website/blog/2024-03-07-openapi-generation-chargebee.md @@ -90,7 +90,7 @@ There were no great challenges. The most ~~difficult~~ tedious probably was to m 1) Authentication The provided Authentication was a bit off. The generated code assumed the using of a username and password but what was actually required was — an empty username + api_key as a password. 
So super easy fix was changing -```python +```py def to_http_params(self) -> CredentialsHttpParams: cred = f"{self.api_key}:{self.password}" if self.password else f"{self.username}" encoded = b64encode(f"{cred}".encode()).decode() @@ -99,9 +99,9 @@ def to_http_params(self) -> CredentialsHttpParams: to -```python +```py def to_http_params(self) -> CredentialsHttpParams: - encoded = b64encode(f"{self.api_key}".encode()).decode() + encoded = b64encode(f"{self.api_key}".encode()).decode() return dict(cookies={}, headers={"Authorization": "Basic " + encoded}, params={}) ``` @@ -111,13 +111,14 @@ Also I was pleasantly surprised that generator had several different authenticat For the code generator it’s hard to guess a pagination method by OpenAPI specification, so the generated code has no pagination 😞. So I had to replace a line -```python -yield _build_response(requests.request(**kwargs)) +```py +def f(): + yield _build_response(requests.request(**kwargs)) ``` with yielding form a 6-lines `get_page` function -```python +```py def get_pages(kwargs: Dict[str, Any], data_json_path): has_more = True while has_more: @@ -133,7 +134,7 @@ The downside — I had to do it for each resource. The code wouldn’t run because it wasn’t able to find some models. I found a commented line in generator script -```python +```py # self._build_models() ``` diff --git a/docs/website/blog/2024-03-11-moving-away-from-segment.md b/docs/website/blog/2024-03-11-moving-away-from-segment.md index f834e25060..4f4b7d0a80 100644 --- a/docs/website/blog/2024-03-11-moving-away-from-segment.md +++ b/docs/website/blog/2024-03-11-moving-away-from-segment.md @@ -67,7 +67,7 @@ Next, we focus on establishing the necessary permissions for our pipeline. A cru Please refer to the Google Cloud documentation [here](https://cloud.google.com/iam/docs/service-accounts-create#console) to set up a service account. Once created, it's important to assign the necessary permissions to the service account. 
The project [README](https://github.com/dlt-hub/dlt_pubsub_demo) lists the necessary permissions. Finally, generate a key for the created service account and download the JSON file. Pass the credentials as environment variables in the project root directory. -```bash +```sh export GOOGLE_APPLICATION_CREDENTIALS="/path/to/keyfile.json" ``` @@ -75,7 +75,7 @@ export GOOGLE_APPLICATION_CREDENTIALS="/path/to/keyfile.json" To set up our pipeline, start by cloning the [GitHub Repository](https://github.com/dlt-hub/dlt_pubsub_demo). The repository contains all the necessary components, structured as follows: -```bash +```sh . ├── README.md ├── cloud_functions @@ -102,7 +102,7 @@ Meanwhile, the **cloud_functions** folder includes the code for the Cloud Functi To begin, integrate the service account credentials with Terraform to enable authorization and resource management on Google Cloud. Edit the `terraform/main.tf` file to include the path to your service account's credentials file as follows: -```bash +```sh provider "google" { credentials = file("./../credentials.json") project = var.project_id @@ -114,7 +114,7 @@ provider "google" { Next, in the `terraform/variables.tf` define the required variables. These variables correspond to details within your `credentials.json` file and include your project's ID, the region for resource deployment, and any other parameters required by your Terraform configuration: -```bash +```sh variable "project_id" { type = string default = "Add Project ID" @@ -128,7 +128,6 @@ variable "region" { variable "service_account_email" { type = string default = "Add Service Account Email" - } ``` @@ -138,7 +137,7 @@ We are now ready to set up some cloud resources. To get started, navigate into t With the initialization complete, you're ready to proceed with the creation of your cloud resources. To do this, run the following Terraform commands in sequence. 
These commands instruct Terraform to plan and apply the configurations defined in your `.tf` files, setting up the infrastructure on Google Cloud as specified. -```bash +```sh terraform plan terraform apply ``` @@ -161,7 +160,7 @@ The following resources are created on Google Cloud once `terraform apply` comma Now that our cloud infrastructure is in place, it's time to activate the event publisher. Look for the `publisher.py` file in the project root directory. You'll need to provide specific details to enable the publisher to send events to the correct Pub/Sub topic. Update the file with the following: -```python +```py # TODO(developer) project_id = "Add GCP Project ID" topic_id = "telemetry_data_tera" @@ -169,7 +168,7 @@ topic_id = "telemetry_data_tera" The `publisher.py` script is designed to generate dummy events, simulating real-world data, and then sends these events to the specified Pub/Sub topic. This process is crucial for testing the end-to-end functionality of our event streaming pipeline, ensuring that data flows from the source (the publisher) to our intended destinations (BigQuery, via the Cloud Function and dlt). To run the publisher execute the following command: -```python +```sh python publisher.py ``` @@ -179,7 +178,7 @@ Once the publisher sends events to the Pub/Sub Topic, the pipeline is activated. The average completion time of the pipeline is approximately 12 minutes, accounting for the 10-minute time interval after which the subscriber pushes data to storage plus the Cloud Function execution time. 
The push interval of the subscriber can be adjusted by changing the **max_duration** in `pubsub.tf` -```bash +```sh cloud_storage_config { bucket = google_storage_bucket.tel_bucket_storage.name diff --git a/docs/website/docs/_book-onboarding-call.md b/docs/website/docs/_book-onboarding-call.md new file mode 100644 index 0000000000..5f6d5df81b --- /dev/null +++ b/docs/website/docs/_book-onboarding-call.md @@ -0,0 +1 @@ +<a href="https://calendar.app.google/kiLhuMsWKpZUpfho6">book a call</a> with our support engineer Violetta \ No newline at end of file diff --git a/docs/website/docs/dlt-ecosystem/destinations/destination.md b/docs/website/docs/dlt-ecosystem/destinations/destination.md index 60753d90b5..c9a0bff022 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/destination.md +++ b/docs/website/docs/dlt-ecosystem/destinations/destination.md @@ -54,7 +54,7 @@ The full signature of the destination decorator plus its function is the followi loader_file_format="jsonl", name="my_custom_destination", naming_convention="direct", - max_nesting_level=0, + max_table_nesting=0, skip_dlt_columns_and_tables=True ) def my_destination(items: TDataItems, table: TTableSchema) -> None: diff --git a/docs/website/docs/dlt-ecosystem/destinations/index.md b/docs/website/docs/dlt-ecosystem/destinations/index.md index 2c24d14312..fef79d4364 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/index.md +++ b/docs/website/docs/dlt-ecosystem/destinations/index.md @@ -4,11 +4,12 @@ description: List of destinations keywords: ['destinations'] --- import DocCardList from '@theme/DocCardList'; +import Link from '../../_book-onboarding-call.md'; Pick one of our high-quality destinations and load your data into a local database, warehouse, or data lake. Append, replace, or merge your data. Apply performance hints like partitions, clusters, or indexes. Load directly or via staging. Each of our destinations undergoes several hundred automated tests every day. * Is a destination or feature missing?
[Join our Slack community](https://dlthub.com/community) and ask for it. -* Need more info? [Join our Slack community](https://dlthub.com/community) and ask in the tech help channel or [Talk to an engineer](https://calendar.app.google/kiLhuMsWKpZUpfho6). +* Need more info? [Join our Slack community](https://dlthub.com/community) and ask in the tech help channel or <Link/>. Otherwise, pick a destination below: diff --git a/docs/website/docs/dlt-ecosystem/destinations/mssql.md b/docs/website/docs/dlt-ecosystem/destinations/mssql.md index c5c019725d..c0bf2bcebf 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/mssql.md +++ b/docs/website/docs/dlt-ecosystem/destinations/mssql.md @@ -61,6 +61,13 @@ You can also pass a SQLAlchemy-like database connection: destination.mssql.credentials="mssql://loader:@loader.database.windows.net/dlt_data?connect_timeout=15" ``` +To connect to an `mssql` server using Windows authentication, include `trusted_connection=yes` in the connection string. This method is useful when SQL logins aren't available, and you use Windows credentials. + +```toml +destination.mssql.credentials="mssql://username:password@loader.database.windows.net/dlt_data?trusted_connection=yes" +``` +> The username and password must be filled out with the appropriate login credentials or left untouched. Leaving these empty is not recommended. + To pass credentials directly, you can use the `credentials` argument passed to `dlt.pipeline` or `pipeline.run` methods.
```py pipeline = dlt.pipeline(pipeline_name='chess', destination='postgres', dataset_name='chess_data', credentials="mssql://loader:@loader.database.windows.net/dlt_data?connect_timeout=15") diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/_source-info-header.md b/docs/website/docs/dlt-ecosystem/verified-sources/_source-info-header.md new file mode 100644 index 0000000000..112dcf06bf --- /dev/null +++ b/docs/website/docs/dlt-ecosystem/verified-sources/_source-info-header.md @@ -0,0 +1,6 @@ +import Admonition from "@theme/Admonition"; +import Link from '../../_book-onboarding-call.md'; + +<Admonition type="info" title="Need help deploying these sources, or figuring out how to run them in your data stack?"> +<a href="https://dlthub.com/community">Join our Slack community</a> or <Link/>. +</Admonition> \ No newline at end of file diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/airtable.md b/docs/website/docs/dlt-ecosystem/verified-sources/airtable.md index bd04dbfcf3..43d99a02fd 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/airtable.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/airtable.md @@ -3,14 +3,11 @@ title: Airtable description: dlt verified source for Airtable keywords: [airtable api, airtable verified source, airtable] --- +import Header from './_source-info-header.md'; # Airtable -:::info Need help deploying these sources, or figuring out how to run them in your data stack? - -[Join our Slack community](https://dlthub.com/community) -or [book a call](https://calendar.app.google/kiLhuMsWKpZUpfho6) with our support engineer Adrian. -::: +<Header/>
[Airtable](https://www.airtable.com/) is a cloud-based platform that merges spreadsheet and database functionalities for easy data management and collaboration. @@ -215,7 +212,7 @@ verified source. base_id = base_id, table_names = table_names ) - load_info = pipeline.run(airtables, write_deposition = "replace") + load_info = pipeline.run(airtables, write_disposition = "replace") ``` > You have the option to use table names or table IDs in the code above, in place of "Table1" and diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/amazon_kinesis.md b/docs/website/docs/dlt-ecosystem/verified-sources/amazon_kinesis.md index 7867b2d54e..3e7dad9793 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/amazon_kinesis.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/amazon_kinesis.md @@ -3,14 +3,11 @@ title: Amazon Kinesis description: dlt verified source for Amazon Kinesis keywords: [amazon kinesis, verified source] --- +import Header from './_source-info-header.md'; # Amazon Kinesis -:::info Need help deploying these sources, or figuring out how to run them in your data stack? - -[Join our Slack community](https://dlthub.com/community) -or [book a call](https://calendar.app.google/kiLhuMsWKpZUpfho6) with our support engineer, Adrian. -::: +<Header/>
[Amazon Kinesis](https://docs.aws.amazon.com/streams/latest/dev/key-concepts.html) is a cloud-based service for real-time data streaming and analytics, enabling the processing and analysis of large diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/arrow-pandas.md b/docs/website/docs/dlt-ecosystem/verified-sources/arrow-pandas.md index 915a9d297a..426c090f94 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/arrow-pandas.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/arrow-pandas.md @@ -3,14 +3,11 @@ title: Arrow Table / Pandas description: dlt source for Arrow tables and Pandas dataframes keywords: [arrow, pandas, parquet, source] --- +import Header from './_source-info-header.md'; # Arrow Table / Pandas -:::info Need help deploying these sources, or figuring out how to run them in your data stack? - -[Join our Slack community](https://dlthub.com/community) -or [book a call](https://calendar.app.google/kiLhuMsWKpZUpfho6) with our support engineer Adrian. -::: +<Header/>
You can load data directly from an Arrow table or Pandas dataframe. This is supported by all destinations, but recommended especially when using destinations that support the `parquet` file format natively (e.g. [Snowflake](../destinations/snowflake.md) and [Filesystem](../destinations/filesystem.md)). diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/asana.md b/docs/website/docs/dlt-ecosystem/verified-sources/asana.md index 4980aa57cd..173cc42b8a 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/asana.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/asana.md @@ -3,14 +3,11 @@ title: Asana description: dlt verified source for Asana API keywords: [asana api, verified source, asana] --- +import Header from './_source-info-header.md'; # Asana -:::info Need help deploying these sources, or figuring out how to run them in your data stack? - -[Join our Slack community](https://dlthub.com/community) -or [book a call](https://calendar.app.google/kiLhuMsWKpZUpfho6) with our support engineer Adrian. -::: +<Header/>
[Asana](https://asana.com) is a widely used web-based project management and collaboration tool that helps teams stay organized, focused, and productive. With Asana, team members can easily create, assign, and track diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/chess.md b/docs/website/docs/dlt-ecosystem/verified-sources/chess.md index 6ae457d1e6..663dda7259 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/chess.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/chess.md @@ -3,14 +3,11 @@ title: Chess.com description: dlt verified source for Chess.com API keywords: [chess.com api, chess.com verified source, verified source, chess.com, chess] --- +import Header from './_source-info-header.md'; # Chess.com -:::info Need help deploying these sources, or figuring out how to run them in your data stack? - -[Join our Slack community](https://dlthub.com/community) -or [book a call](https://calendar.app.google/kiLhuMsWKpZUpfho6) with our support engineer Adrian. -::: +<Header/>
[Chess.com](https://www.chess.com/) is an online platform that offers services for chess enthusiasts. It includes online chess games, tournaments, lessons, and more. @@ -127,7 +124,9 @@ def players_profiles(players: List[str]) -> Iterator[TDataItem]: @dlt.defer def _get_profile(username: str) -> TDataItem: return get_path_with_retry(f"player/{username}") - ... + + for username in players: + yield _get_profile(username) ``` `players`: Is a list of player usernames for which you want to fetch profile data. @@ -158,10 +157,10 @@ specified otherwise. @dlt.resource(write_disposition="append") def players_games( players: List[str], start_month: str = None, end_month: str = None -) -> Iterator[Callable[[], List[TDataItem]]]: +) -> Iterator[TDataItems]: # gets a list of already checked(loaded) archives. checked_archives = dlt.current.resource_state().setdefault("archives", []) - ... + yield {} # return your retrieved data here ``` `players`: Is a list of player usernames for which you want to fetch games. diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/facebook_ads.md b/docs/website/docs/dlt-ecosystem/verified-sources/facebook_ads.md index a1b143bd68..c9b1ee5e34 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/facebook_ads.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/facebook_ads.md @@ -3,14 +3,11 @@ title: Facebook Ads description: dlt verified source for Facebook Ads keywords: [facebook ads api, verified source, facebook ads] --- +import Header from './_source-info-header.md'; # Facebook Ads -:::info Need help deploying these sources, or figuring out how to run them in your data stack? - -[Join our Slack community](https://dlthub.com/community) -or [book a call](https://calendar.app.google/kiLhuMsWKpZUpfho6) with our support engineer Adrian. -::: +<Header/>
Facebook Ads is the advertising platform that lets businesses and individuals create targeted ads on Facebook and its affiliated apps like Instagram and Messenger. diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/filesystem.md b/docs/website/docs/dlt-ecosystem/verified-sources/filesystem.md index bf30da8882..c4cb0e536e 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/filesystem.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/filesystem.md @@ -3,13 +3,11 @@ title: Filesystem description: dlt verified source for Readers Source and Filesystem keywords: [readers source and filesystem, filesystem, readers source] --- -# Readers Source and Filesystem +import Header from './_source-info-header.md'; -:::info Need help deploying these sources, or figuring out how to run them in your data stack? +# Readers Source and Filesystem -[Join our Slack community](https://dlthub.com/community) -or [book a call](https://calendar.app.google/kiLhuMsWKpZUpfho6) with our support engineer Adrian. -::: +<Header/>
This verified source easily streams files from AWS S3, Google Cloud Storage, Google Drive, Azure, or local filesystem using the reader source. @@ -453,7 +451,8 @@ verified source. ) # pretty print the information on data that was loaded print(load_info) - print(listing)(pipeline.last_trace.last_normalize_info) + print(listing) + print(pipeline.last_trace.last_normalize_info) ``` 1. Cleanup after loading: diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/github.md b/docs/website/docs/dlt-ecosystem/verified-sources/github.md index 8ab3e3d510..a5a338666e 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/github.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/github.md @@ -3,14 +3,11 @@ title: GitHub description: dlt verified source for GitHub API keywords: [github api, github verified source, github] --- +import Header from './_source-info-header.md'; # GitHub -:::info Need help deploying these sources, or figuring out how to run them in your data stack? - -[Join our Slack community](https://dlthub.com/community) -or [book a call](https://calendar.app.google/kiLhuMsWKpZUpfho6) with our support engineer Adrian. -::: +<Header/>
This verified source can be used to load data on issues or pull requests from any GitHub repository onto a [destination](../../dlt-ecosystem/destinations) of your choice using [GitHub API](https://docs.github.com/en/rest?apiVersion=2022-11-28). diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/google_analytics.md b/docs/website/docs/dlt-ecosystem/verified-sources/google_analytics.md index 09c36b93bc..7b4c1b0d5e 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/google_analytics.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/google_analytics.md @@ -1,10 +1,13 @@ -# Google Analytics +--- +title: Google Analytics +description: dlt verified source for Google Analytics API +keywords: [google analytics api, google analytics verified source, google analytics] +--- +import Header from './_source-info-header.md'; -:::info Need help deploying these sources, or figuring out how to run them in your data stack? +# Google Analytics -[Join our Slack community](https://dlthub.com/community) -or [book a call](https://calendar.app.google/kiLhuMsWKpZUpfho6) with our support engineer Adrian. -::: +<Header/>
[Google Analytics](https://marketingplatform.google.com/about/analytics/#?modal_active=none) is a service for web analytics that tracks and provides data regarding user engagement with your website diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/google_sheets.md b/docs/website/docs/dlt-ecosystem/verified-sources/google_sheets.md index 4471c291fb..3be72adfa0 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/google_sheets.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/google_sheets.md @@ -3,14 +3,11 @@ title: Google Sheets description: dlt verified source for Google Sheets API keywords: [google sheets api, google sheets verified source, google sheets] --- +import Header from './_source-info-header.md'; # Google Sheets -:::info Need help deploying these sources, or figuring out how to run them in your data stack? - -[Join our Slack community](https://dlthub.com/community) -or [book a call](https://calendar.app.google/kiLhuMsWKpZUpfho6) with our support engineer Adrian. -::: +<Header/>
[Google Sheets](https://www.google.com/sheets/about/) is a cloud-based spreadsheet application offered by Google as part of its Google Workspace suite. diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/hubspot.md b/docs/website/docs/dlt-ecosystem/verified-sources/hubspot.md index d43bfcb2e2..357d50582f 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/hubspot.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/hubspot.md @@ -3,14 +3,11 @@ title: Hubspot description: dlt verified source for Hubspot API keywords: [hubspot api, hubspot verified source, hubspot] --- +import Header from './_source-info-header.md'; # HubSpot -:::info Need help deploying these sources, or figuring out how to run them in your data stack? - -[Join our Slack community](https://dlthub.com/community) -or [book a call](https://calendar.app.google/kiLhuMsWKpZUpfho6) with our support engineer Adrian. -::: +<Header/>
HubSpot is a customer relationship management (CRM) software and inbound marketing platform that helps businesses to attract visitors, engage customers, and close leads. diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/inbox.md b/docs/website/docs/dlt-ecosystem/verified-sources/inbox.md index 894b7ca59b..aac77b9b0a 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/inbox.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/inbox.md @@ -3,14 +3,11 @@ title: Inbox description: dlt verified source for Mail Inbox keywords: [inbox, inbox verified source, inbox mail, email] --- +import Header from './_source-info-header.md'; # Inbox -:::info Need help deploying these sources, or figuring out how to run them in your data stack? - -[Join our Slack community](https://dlthub.com/community) -or [book a call](https://calendar.app.google/kiLhuMsWKpZUpfho6) with our support engineer, Adrian. -::: +<Header/>
This source collects inbox emails, retrieves attachments, and stores relevant email data. It uses the imaplib library for IMAP interactions and the dlt library for data processing. diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/index.md b/docs/website/docs/dlt-ecosystem/verified-sources/index.md index 500e5fada6..16249e41ca 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/index.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/index.md @@ -4,10 +4,11 @@ description: List of verified sources keywords: ['verified source'] --- import DocCardList from '@theme/DocCardList'; +import Link from '../../_book-onboarding-call.md'; Pick one of our verified sources that we wrote or maintain ourselves. All of them are constantly tested on real data and distributed as simple Python code so they can be easily customized or hacked. -* Need more info? [Join our Slack community](https://dlthub.com/community) and ask in the tech help channel or [Talk to an engineer](https://calendar.app.google/kiLhuMsWKpZUpfho6) +* Need more info? [Join our Slack community](https://dlthub.com/community) and ask in the tech help channel or <Link/>. Do you plan to run dlt in production and source is missing? We are happy to build it. * Source missing? [Request a new verified source](https://github.com/dlt-hub/verified-sources/issues/new?template=source-request.md) diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/jira.md b/docs/website/docs/dlt-ecosystem/verified-sources/jira.md index 38dacb0541..b4e8bb76de 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/jira.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/jira.md @@ -1,10 +1,13 @@ -# Jira +--- +title: Jira +description: dlt verified source for Atlassian Jira +keywords: [jira api, jira verified source, jira] +--- +import Header from './_source-info-header.md'; -:::info Need help deploying these sources, or figuring out how to run them in your data stack?
+# Jira -[Join our Slack community](https://dlthub.com/community) -or [book a call](https://calendar.app.google/kiLhuMsWKpZUpfho6) with our support engineer, Adrian. -::: +<Header/>
[Jira](https://www.atlassian.com/software/jira) by Atlassian helps teams manage projects and tasks efficiently, prioritize work, and collaborate. @@ -173,7 +176,8 @@ The resource function searches issues using JQL queries and then loads them to t ```py @dlt.resource(write_disposition="replace") def issues(jql_queries: List[str]) -> Iterable[TDataItem]: - api_path = "rest/api/3/search" + api_path = "rest/api/3/search" + return {} # return the retrieved values here ``` `jql_queries`: Accepts a list of JQL queries. diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/kafka.md b/docs/website/docs/dlt-ecosystem/verified-sources/kafka.md index 371e5af767..fe3c426819 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/kafka.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/kafka.md @@ -3,14 +3,11 @@ title: Kafka description: dlt verified source for Confluent Kafka keywords: [kafka api, kafka verified source, kafka] --- +import Header from './_source-info-header.md'; # Kafka -:::info Need help deploying these sources, or figuring out how to run them in your data stack? - -[Join our Slack community](https://join.slack.com/t/dlthub-community/shared_invite/zt-1n5193dbq-rCBmJ6p~ckpSFK4hCF2dYA) -or [book a call](https://calendar.app.google/kiLhuMsWKpZUpfho6) with our support engineer, Adrian. -::: +<Header/>
[Kafka](https://www.confluent.io/) is an open-source distributed event streaming platform, organized in the form of a log with message publishers and subscribers. diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/matomo.md b/docs/website/docs/dlt-ecosystem/verified-sources/matomo.md index 0b8e5ae1f9..296526b21a 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/matomo.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/matomo.md @@ -1,10 +1,13 @@ -# Matomo +--- +title: Matomo +description: dlt verified source for Matomo +keywords: [matomo api, matomo verified source, matomo] +--- +import Header from './_source-info-header.md'; -:::info Need help deploying these sources, or figuring out how to run them in your data stack? +# Matomo -[Join our Slack community](https://dlthub.com/community) -or [book a call](https://calendar.app.google/kiLhuMsWKpZUpfho6) with our support engineer Adrian. -::: +<Header/>
Matomo is a free and open-source web analytics platform that provides detailed insights into website and application performance with features like visitor maps, site search analytics, real-time visitor tracking, and custom reports. diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/mongodb.md b/docs/website/docs/dlt-ecosystem/verified-sources/mongodb.md index c8335f26d9..6fda0f8fe9 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/mongodb.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/mongodb.md @@ -3,14 +3,11 @@ title: MongoDB description: dlt verified source for MongoDB keywords: [mongodb, verified source, mongo database] --- +import Header from './_source-info-header.md'; # MongoDB -:::info Need help deploying these sources, or figuring out how to run them in your data stack? - -[Join our Slack community](https://dlthub.com/community) -or [book a call](https://calendar.app.google/kiLhuMsWKpZUpfho6) with our support engineer Adrian. -::: +<Header/>
[MongoDB](https://www.mongodb.com/what-is-mongodb) is a NoSQL database that stores JSON-like documents. diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/mux.md b/docs/website/docs/dlt-ecosystem/verified-sources/mux.md index 7ed87b410f..37368110e4 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/mux.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/mux.md @@ -1,10 +1,13 @@ -# Mux +--- +title: Mux +description: dlt verified source for Mux +keywords: [mux api, mux verified source, mux] +--- +import Header from './_source-info-header.md'; -:::info Need help deploying these sources, or figuring out how to run them in your data stack? +# Mux -[Join our Slack community](https://dlthub.com/community) -or [book a call](https://calendar.app.google/kiLhuMsWKpZUpfho6) with our support engineer Adrian. -::: +<Header/>
[Mux.com](http://mux.com/) is a video technology platform that provides infrastructure and tools for developers to build and stream high-quality video content. diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/notion.md b/docs/website/docs/dlt-ecosystem/verified-sources/notion.md index b1e943336f..69e66ed2aa 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/notion.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/notion.md @@ -1,10 +1,13 @@ -# Notion +--- +title: Notion +description: dlt pipeline for Notion API +keywords: [notion api, notion pipeline, notion] +--- +import Header from './_source-info-header.md'; -:::info Need help deploying these sources, or figuring out how to run them in your data stack? +# Notion -[Join our Slack community](https://dlthub.com/community) -or [book a call](https://calendar.app.google/kiLhuMsWKpZUpfho6) with our support engineer Adrian. -::: +
[Notion](https://www.notion.so/) is a flexible workspace tool for organizing personal and professional tasks, offering customizable notes, documents, databases, and more. diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/personio.md b/docs/website/docs/dlt-ecosystem/verified-sources/personio.md index fd1005e67b..9829c94786 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/personio.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/personio.md @@ -3,14 +3,11 @@ title: Personio description: dlt verified source for Personio API keywords: [personio api, personio verified source, personio] --- +import Header from './_source-info-header.md'; # Personio -:::info Need help deploying these sources, or figuring out how to run them in your data stack? - -[Join our Slack community](https://dlthub.com/community) -or [book a call](https://calendar.app.google/kiLhuMsWKpZUpfho6) with our support engineer Adrian. -::: +
Personio is a human resources management software that helps businesses streamline HR processes, including recruitment, employee data management, and payroll, in one platform. diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/pipedrive.md b/docs/website/docs/dlt-ecosystem/verified-sources/pipedrive.md index 3dc815d53b..d571e5d386 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/pipedrive.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/pipedrive.md @@ -3,14 +3,11 @@ title: Pipedrive description: dlt verified source for Pipedrive API keywords: [pipedrive api, pipedrive verified source, pipedrive] --- +import Header from './_source-info-header.md'; # Pipedrive -:::info Need help deploying these sources, or figuring out how to run them in your data stack? - -[Join our Slack community](https://dlthub.com/community) -or [book a call](https://calendar.app.google/kiLhuMsWKpZUpfho6) with our support engineer, Adrian. -::: +
[Pipedrive](https://developers.pipedrive.com/docs/api/v1) is a cloud-based sales Customer Relationship Management (CRM) tool designed to help businesses manage leads and deals, track @@ -213,10 +210,11 @@ create and store a mapping of custom fields for different entities in the source ```py @dlt.resource(selected=False) def create_state(pipedrive_api_key: str) -> Iterator[Dict[str, Any]]: - def _get_pages_for_rename( - entity: str, fields_entity: str, pipedrive_api_key: str - ) -> Dict[str, Any]: + def _get_pages_for_rename( + entity: str, fields_entity: str, pipedrive_api_key: str + ) -> Dict[str, Any]: ... + yield _get_pages_for_rename("", "", "") ``` It processes each entity in ENTITY_MAPPINGS, updating the custom fields mapping if a related fields diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/salesforce.md b/docs/website/docs/dlt-ecosystem/verified-sources/salesforce.md index a62c41c2dc..667ab8b0c1 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/salesforce.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/salesforce.md @@ -1,9 +1,13 @@ +--- +title: Salesforce +description: dlt pipeline for Salesforce API +keywords: [salesforce api, salesforce pipeline, salesforce] +--- +import Header from './_source-info-header.md'; + # Salesforce -:::info Need help deploying these sources, or figuring out how to run them in your data stack? -[Join our Slack community](https://dlthub.com/community) or -[book a call](https://calendar.app.google/kiLhuMsWKpZUpfho6) with our support engineer Adrian. -::: +
[Salesforce](https://www.salesforce.com) is a cloud platform that streamlines business operations and customer relationship management, encompassing sales, marketing, and customer service. @@ -271,11 +275,11 @@ To create your data pipeline using single loading and > overwriting existing data. Conversely, the "task" endpoint supports "merge" mode for > incremental loads, updating or adding data based on the 'last_timestamp' value without erasing > previously loaded data. - -1. Salesforce enforces specific limits on API data requests. These limits + +1. Salesforce enforces specific limits on API data requests. These limits vary based on the Salesforce edition and license type, as outlined in the [Salesforce API Request Limits documentation](https://developer.salesforce.com/docs/atlas.en-us.salesforce_app_limits_cheatsheet.meta/salesforce_app_limits_cheatsheet/salesforce_app_limits_platform_api.htm). - To limit the number of Salesforce API data requests, developers can control the environment for production or + To limit the number of Salesforce API data requests, developers can control the environment for production or development purposes. For development, you can set the `IS_PRODUCTION` variable to `False` in "[salesforce/settings.py](https://github.com/dlt-hub/verified-sources/blob/master/sources/salesforce/settings.py)", which limits API call requests to 100. To modify this limit, you can update the query limit in diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/scrapy.md b/docs/website/docs/dlt-ecosystem/verified-sources/scrapy.md new file mode 100644 index 0000000000..2e6b588c18 --- /dev/null +++ b/docs/website/docs/dlt-ecosystem/verified-sources/scrapy.md @@ -0,0 +1,189 @@ +--- +title: Scrapy +description: dlt verified source for Scraping using scrapy +keywords: [scraping, scraping verified source, scrapy] +--- + +# Scrapy + +This verified source utilizes Scrapy, an open-source and collaborative framework for web scraping. 
+Scrapy enables efficient extraction of required data from websites. + +## Setup Guide + +### Initialize the verified source + +To get started with your data pipeline, follow these steps: + +1. Enter the following command: + + ```sh + dlt init scraping duckdb + ``` + + [This command](../../reference/command-line-interface) will initialize + [the pipeline example](https://github.com/dlt-hub/verified-sources/blob/master/sources/scraping_pipeline.py) + with Scrapy as the [source](../../general-usage/source) and [duckdb](../destinations/duckdb.md) + as the [destination](../destinations). + +1. If you'd like to use a different destination, simply replace `duckdb` with the name of your + preferred [destination](../destinations). + +1. After running this command, a new directory will be created with the necessary files and + configuration settings to get started. + +For more information, read the guide on +[how to add a verified source.](../../walkthroughs/add-a-verified-source) + +### Add credentials + +1. The `config.toml` looks like: + ```toml + # put your configuration values here + [sources.scraping] + start_urls = ["URL to be scraped"] # please set me up! + start_urls_file = "/path/to/urls.txt" # please set me up! + ``` + > When both `start_urls` and `start_urls_file` are provided, they will be merged and deduplicated + > to ensure Scrapy gets a unique set of start URLs. + +1. Inside the `.dlt` folder, you'll find a file called `secrets.toml`, which is where you can securely + store your access tokens and other sensitive information. It's important to handle this + file with care and keep it safe. + +1. Next, follow the [destination documentation](../../dlt-ecosystem/destinations) instructions to + add credentials for your chosen destination, ensuring proper routing of your data to the final + destination. 
+For more information, read [Secrets and Configs.](../../general-usage/credentials) + +## Run the pipeline + +In this section, we demonstrate how to use the `MySpider` class defined in "scraping_pipeline.py" to +scrape data from "https://quotes.toscrape.com/page/1/". + +1. Start with configuring the `config.toml` as follows: + + ```toml + [sources.scraping] + start_urls = ["https://quotes.toscrape.com/page/1/"] # please set me up! + ``` + + Additionally, set destination credentials in `secrets.toml`, as [discussed](#add-credentials). + +1. Before running the pipeline, ensure that you have installed all the necessary dependencies by + running the command: + + ```sh + pip install -r requirements.txt + ``` + +1. You're now ready to run the pipeline! To get started, run the following command: + + ```sh + python scraping_pipeline.py + ``` + +## Customization + +### Create your own pipeline + +If you wish to create your own data pipeline, follow these steps: + +1. The first step requires creating a spider class that scrapes data + from the website. For example, class `MySpider` below scrapes data from + URL: "https://quotes.toscrape.com/page/1/". + + ```py + class MySpider(Spider): + def parse(self, response: Response, **kwargs: Any) -> Any: + # Iterate through each "next" page link found + for next_page in response.css("li.next a::attr(href)"): + if next_page: + yield response.follow(next_page.get(), self.parse) + + # Iterate through each quote block found on the page + for quote in response.css("div.quote"): + # Extract the quote details + result = { + "quote": { + "text": quote.css("span.text::text").get(), + "author": quote.css("small.author::text").get(), + "tags": quote.css("div.tags a.tag::text").getall(), + }, + } + yield result + + ``` + + > Define your own class tailored to the website you intend to scrape. + +1. 
Configure the pipeline by specifying the pipeline name, destination, and dataset as follows: + + ```py + pipeline = dlt.pipeline( + pipeline_name="scrapy_pipeline", # Use a custom name if desired + destination="duckdb", # Choose the appropriate destination (e.g., bigquery, redshift) + dataset_name="scrapy_data", # Use a custom name if desired + ) + ``` + + To read more about pipeline configuration, please refer to our + [documentation](../../general-usage/pipeline). + +1. To run the pipeline with customized scrapy settings: + + ```py + run_pipeline( + pipeline, + MySpider, + # you can pass scrapy settings overrides here + scrapy_settings={ + # How many sub pages to scrape + # https://docs.scrapy.org/en/latest/topics/settings.html#depth-limit + "DEPTH_LIMIT": 100, + "SPIDER_MIDDLEWARES": { + "scrapy.spidermiddlewares.depth.DepthMiddleware": 200, + "scrapy.spidermiddlewares.httperror.HttpErrorMiddleware": 300, + }, + "HTTPERROR_ALLOW_ALL": False, + }, + write_disposition="append", + ) + ``` + + In the above example, scrapy settings are passed as a parameter. For more information about + scrapy settings, please refer to the + [Scrapy documentation](https://docs.scrapy.org/en/latest/topics/settings.html). + +1. To limit the number of items processed, use the "on_before_start" function to set a limit on + the resources the pipeline processes. For instance, setting the resource limit to two allows + the pipeline to yield a maximum of two resources. + + ```py + def on_before_start(res: DltResource) -> None: + res.add_limit(2) + + run_pipeline( + pipeline, + MySpider, + batch_size=10, + scrapy_settings={ + "DEPTH_LIMIT": 100, + "SPIDER_MIDDLEWARES": { + "scrapy.spidermiddlewares.depth.DepthMiddleware": 200, + "scrapy.spidermiddlewares.httperror.HttpErrorMiddleware": 300, + } + }, + on_before_start=on_before_start, + write_disposition="append", + ) + ``` + +1. To create a pipeline using Scrapy host, use `create_pipeline_runner` defined in + `helpers.py`. 
Use it as follows: + + ```py + scraping_host = create_pipeline_runner(pipeline, MySpider, batch_size=10) + scraping_host.pipeline_runner.scraping_resource.add_limit(2) + scraping_host.run(dataset_name="quotes", write_disposition="append") + ``` diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/shopify.md b/docs/website/docs/dlt-ecosystem/verified-sources/shopify.md index 3350e19230..ae526668f2 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/shopify.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/shopify.md @@ -1,10 +1,13 @@ -# Shopify +--- +title: Shopify +description: dlt pipeline for Shopify API +keywords: [shopify api, shopify pipeline, shopify] +--- +import Header from './_source-info-header.md'; -:::info Need help deploying these sources, or figuring out how to run them in your data stack? +# Shopify -[Join our Slack community](https://dlthub.com/community) -or [book a call](https://calendar.app.google/kiLhuMsWKpZUpfho6) with our support engineer Adrian. -::: +
[Shopify](https://www.shopify.com/) is a user-friendly e-commerce solution that enables anyone to easily create and manage their own online store. Whereas a [Shopify partner](https://partners.shopify.com/) is an individual or company that diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/slack.md b/docs/website/docs/dlt-ecosystem/verified-sources/slack.md index 446da96123..970a891e60 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/slack.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/slack.md @@ -3,14 +3,11 @@ title: Slack description: dlt verified source for Slack API keywords: [slack api, slack verified source, slack] --- +import Header from './_source-info-header.md'; # Slack -:::info Need help deploying these sources, or figuring out how to run them in your data stack? - -[Join our Slack community](https://dlthub.com/community) -or [book a call](https://calendar.app.google/kiLhuMsWKpZUpfho6) with our support engineer, Adrian. -::: +
[Slack](https://slack.com/) is a popular messaging and collaboration platform for teams and organizations. diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/sql_database.md b/docs/website/docs/dlt-ecosystem/verified-sources/sql_database.md index aeb1408531..e6c8e47f28 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/sql_database.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/sql_database.md @@ -1,10 +1,13 @@ -# 30+ SQL Databases +--- +title: 30+ SQL Databases +description: dlt pipeline for SQL Database +keywords: [sql connector, sql database pipeline, sql database] +--- +import Header from './_source-info-header.md'; -:::info Need help deploying these sources, or figuring out how to run them in your data stack? +# 30+ SQL Databases -[Join our Slack community](https://dlthub.com/community) -or [book a call](https://calendar.app.google/kiLhuMsWKpZUpfho6) with our support engineer Adrian. -::: +
SQL databases are management systems (DBMS) that store data in a structured format, commonly used for efficient and reliable data retrieval. diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/strapi.md b/docs/website/docs/dlt-ecosystem/verified-sources/strapi.md index ab7780e971..caf5ae2359 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/strapi.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/strapi.md @@ -3,14 +3,11 @@ title: Strapi description: dlt verified source for Strapi API keywords: [strapi api, strapi verified source, strapi] --- +import Header from './_source-info-header.md'; # Strapi -:::info Need help deploying these sources, or figuring out how to run them in your data stack? - -[Join our Slack community](https://dlthub.com/community) -or [book a call](https://calendar.app.google/kiLhuMsWKpZUpfho6) with our support engineer Adrian. -::: +
[Strapi](https://strapi.io/) is a headless CMS (Content Management System) that allows developers to create API-driven content management systems without having to write a lot of custom code. diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/stripe.md b/docs/website/docs/dlt-ecosystem/verified-sources/stripe.md index 318e5a8b5e..5844844cca 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/stripe.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/stripe.md @@ -3,15 +3,11 @@ title: Stripe description: dlt verified source for Stripe API keywords: [stripe api, stripe verified source, stripe] --- +import Header from './_source-info-header.md'; # Stripe -:::info Need help deploying these sources, or figuring out how to run them in your data stack? - -[Join our Slack community](https://dlthub.com/community) -or [book a call](https://calendar.app.google/kiLhuMsWKpZUpfho6) with our support engineer Adrian. -::: - +
[Stripe](https://stripe.com) is an online payment platform that allows businesses to securely process and manage customer transactions over the Internet. diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/workable.md b/docs/website/docs/dlt-ecosystem/verified-sources/workable.md index 9f2886f708..472f48a28f 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/workable.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/workable.md @@ -1,10 +1,13 @@ -# Workable +--- +title: Workable +description: dlt pipeline for Workable API +keywords: [workable api, workable pipeline, workable] +--- +import Header from './_source-info-header.md'; -:::info Need help deploying these sources, or figuring out how to run them in your data stack? +# Workable -[Join our Slack community](https://dlthub.com/community) -or [book a call](https://calendar.app.google/kiLhuMsWKpZUpfho6) with our support engineer Adrian. -::: +
[Workable](https://www.workable.com/) is an online platform for posting jobs and managing the hiring process. With Workable, employers can create job listings, receive applications, track candidates, collaborate with team diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/zendesk.md b/docs/website/docs/dlt-ecosystem/verified-sources/zendesk.md index 58899ce5b4..b8993ae8d5 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/zendesk.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/zendesk.md @@ -3,14 +3,11 @@ title: Zendesk description: dlt pipeline for Zendesk API keywords: [zendesk api, zendesk pipeline, zendesk] --- +import Header from './_source-info-header.md'; # Zendesk -:::info Need help deploying these sources, or figuring out how to run them in your data stack? - -[Join our Slack community](https://dlthub.com/community) -or [book a call](https://calendar.app.google/kiLhuMsWKpZUpfho6) with our support engineer Adrian. -::: +
[Zendesk](https://www.zendesk.com/) is a cloud-based customer service and support platform. It offers a range of features, including ticket management, self-service options, knowledge base management, live chat, customer diff --git a/docs/website/docs/general-usage/credentials/config_specs.md b/docs/website/docs/general-usage/credentials/config_specs.md index e93e1c466a..30b401727d 100644 --- a/docs/website/docs/general-usage/credentials/config_specs.md +++ b/docs/website/docs/general-usage/credentials/config_specs.md @@ -94,7 +94,7 @@ credentials = ConnectionStringCredentials() credentials.drivername = "postgresql" credentials.database = "my_database" credentials.username = "my_user" -credentials.password = "my_password" +credentials.password = "my_password" # type: ignore credentials.host = "localhost" credentials.port = 5432 @@ -120,8 +120,8 @@ Usage: ```py credentials = OAuth2Credentials( client_id="CLIENT_ID", - client_secret="CLIENT_SECRET", - refresh_token="REFRESH_TOKEN", + client_secret="CLIENT_SECRET", # type: ignore + refresh_token="REFRESH_TOKEN", # type: ignore scopes=["scope1", "scope2"] ) diff --git a/docs/website/docs/general-usage/customising-pipelines/pseudonymizing_columns.md b/docs/website/docs/general-usage/customising-pipelines/pseudonymizing_columns.md index ba0b13636b..eff6f795ac 100644 --- a/docs/website/docs/general-usage/customising-pipelines/pseudonymizing_columns.md +++ b/docs/website/docs/general-usage/customising-pipelines/pseudonymizing_columns.md @@ -51,11 +51,11 @@ for row in dummy_source().dummy_data.add_map(pseudonymize_name): # 1. Create an instance of the source so you can edit it. data_source = dummy_source() # 2. Modify this source instance's resource -data_source = data_source.dummy_data.add_map(pseudonymize_name) +data_resource = data_source.dummy_data.add_map(pseudonymize_name) # 3. 
Inspect your result -for row in data_source: +for row in data_resource: print(row) pipeline = dlt.pipeline(pipeline_name='example', destination='bigquery', dataset_name='normalized_data') -load_info = pipeline.run(data_source) +load_info = pipeline.run(data_resource) ``` diff --git a/docs/website/docs/general-usage/customising-pipelines/renaming_columns.md b/docs/website/docs/general-usage/customising-pipelines/renaming_columns.md index 04e4d33b13..4cbb4d7b32 100644 --- a/docs/website/docs/general-usage/customising-pipelines/renaming_columns.md +++ b/docs/website/docs/general-usage/customising-pipelines/renaming_columns.md @@ -44,10 +44,10 @@ def replace_umlauts_in_dict_keys(d): data_source = dummy_source() # 2. Modify this source instance's resource -data_source = data_source.dummy_data().add_map(replace_umlauts_in_dict_keys) +data_resource = data_source.dummy_data().add_map(replace_umlauts_in_dict_keys) # 3. Inspect your result -for row in data_source: +for row in data_resource: print(row) # {'Objekt_0': {'Groesse': 0, 'Aequivalenzpruefung': True}} diff --git a/docs/website/docs/general-usage/data-enrichments/user_agent_device_data_enrichment.md b/docs/website/docs/general-usage/data-enrichments/user_agent_device_data_enrichment.md index 6b07845689..3aadb2f982 100644 --- a/docs/website/docs/general-usage/data-enrichments/user_agent_device_data_enrichment.md +++ b/docs/website/docs/general-usage/data-enrichments/user_agent_device_data_enrichment.md @@ -127,7 +127,7 @@ The first step is to register on [SerpAPI](https://serpapi.com/) and obtain the 1. 
Create `fetch_average_price()` function as follows: ```py - import datetime + from datetime import datetime, timedelta import requests # Uncomment transformer function if it is to be used as a transformer, @@ -160,7 +160,7 @@ The first step is to register on [SerpAPI](https://serpapi.com/) and obtain the device_info = dlt.current.resource_state().setdefault("devices", {}) # Current timestamp for checking the last update - current_timestamp = datetime.datetime.now() + current_timestamp = datetime.now() # Print the current device information # print(device_info) # if you need to check state @@ -172,10 +172,10 @@ The first step is to register on [SerpAPI](https://serpapi.com/) and obtain the # Calculate the time since the last update last_updated = ( current_timestamp - - device_data.get('timestamp', datetime.datetime.min) + device_data.get('timestamp', datetime.min) ) # Check if the device is not in state or data is older than 180 days - if device not in device_info or last_updated > datetime.timedelta(days=180): + if device not in device_info or last_updated > timedelta(days=180): try: # Make an API request to fetch device prices response = requests.get("https://serpapi.com/search", params={ diff --git a/docs/website/docs/general-usage/incremental-loading.md b/docs/website/docs/general-usage/incremental-loading.md index fe3bb8b61d..23b2218b46 100644 --- a/docs/website/docs/general-usage/incremental-loading.md +++ b/docs/website/docs/general-usage/incremental-loading.md @@ -298,13 +298,13 @@ We just yield all the events and `dlt` does the filtering (using `id` column dec Github returns events ordered from newest to oldest. So we declare the `rows_order` as **descending** to [stop requesting more pages once the incremental value is out of range](#declare-row-order-to-not-request-unnecessary-data). We stop requesting more data from the API after finding the first event with `created_at` earlier than `initial_value`. 
-:::note +:::note **Note on Incremental Cursor Behavior:** -When using incremental cursors for loading data, it's essential to understand how `dlt` handles records in relation to the cursor's +When using incremental cursors for loading data, it's essential to understand how `dlt` handles records in relation to the cursor's last value. By default, `dlt` will load only those records for which the incremental cursor value is higher than the last known value of the cursor. This means that any records with a cursor value lower than or equal to the last recorded value will be ignored during the loading process. -This behavior ensures efficiency by avoiding the reprocessing of records that have already been loaded, but it can lead to confusion if -there are expectations of loading older records that fall below the current cursor threshold. If your use case requires the inclusion of +This behavior ensures efficiency by avoiding the reprocessing of records that have already been loaded, but it can lead to confusion if +there are expectations of loading older records that fall below the current cursor threshold. If your use case requires the inclusion of such records, you can consider adjusting your data extraction logic, using a full refresh strategy where appropriate or using `last_value_func` as discussed in the subsquent section. ::: @@ -625,6 +625,35 @@ Before `dlt` starts executing incremental resources, it looks for `data_interval You can run DAGs manually but you must remember to specify the Airflow logical date of the run in the past (use Run with config option). For such run `dlt` will load all data from that past date until now. If you do not specify the past date, a run with a range (now, now) will happen yielding no data. +### Reading incremental loading parameters from configuration + +Consider the example below for reading incremental loading parameters from "config.toml". 
We create a `generate_incremental_records` resource that yields "id", "idAfter", and "name". This resource retrieves `cursor_path` and `initial_value` from "config.toml". + +1. In "config.toml", define the `cursor_path` and `initial_value` as: + ```toml + # Configuration snippet for an incremental resource + [pipeline_with_incremental.sources.id_after] + cursor_path = "idAfter" + initial_value = 10 + ``` + + `cursor_path` is assigned the value "idAfter" with an initial value of 10. + +1. Here's how the `generate_incremental_records` resource uses `cursor_path` defined in "config.toml": + ```py + @dlt.resource(table_name="incremental_records") + def generate_incremental_records(id_after: dlt.sources.incremental = dlt.config.value): + for i in range(150): + yield {"id": i, "idAfter": i, "name": "name-" + str(i)} + + pipeline = dlt.pipeline( + pipeline_name="pipeline_with_incremental", + destination="duckdb", + ) + + pipeline.run(generate_incremental_records) + ``` + `id_after` incrementally stores the latest `cursor_path` value for future pipeline runs. ## Doing a full refresh diff --git a/docs/website/docs/general-usage/resource.md b/docs/website/docs/general-usage/resource.md index e2e95d937f..66c4281d8d 100644 --- a/docs/website/docs/general-usage/resource.md +++ b/docs/website/docs/general-usage/resource.md @@ -63,7 +63,7 @@ accepts following arguments: ... 
# the `table_schema` method gets table schema generated by a resource - print(get_users().table_schema()) + print(get_users().compute_table_schema()) ``` > 💡 You can pass dynamic hints which are functions that take the data item as input and return a @@ -154,7 +154,7 @@ def repo_events() -> Iterator[TDataItems]: # the `table_schema` method gets table schema generated by a resource and takes optional # data item to evaluate dynamic hints -print(repo_events().table_schema({"type": "WatchEvent", id:...})) +print(repo_events().compute_table_schema({"type": "WatchEvent", id:...})) ``` In more advanced cases, you can dispatch data to different tables directly in the code of the @@ -283,7 +283,7 @@ def get_orders(): yield o # users and orders will be iterated in parallel in two separate threads -pipeline.run(get_users(), get_orders()) +pipeline.run([get_users(), get_orders()]) ``` Async generators are automatically extracted concurrently with other resources: diff --git a/docs/website/docs/general-usage/schema-contracts.md b/docs/website/docs/general-usage/schema-contracts.md index 1b5e67357a..c79d240520 100644 --- a/docs/website/docs/general-usage/schema-contracts.md +++ b/docs/website/docs/general-usage/schema-contracts.md @@ -124,7 +124,7 @@ As with any other exception coming from pipeline run, it will be re-raised via ` ```py try: pipeline.run() -except Exception as pip_ex: +except PipelineStepFailed as pip_ex: if pip_ex.step == "normalize": if isinstance(pip_ex.__context__.__context__, DataValidationError): ... 
diff --git a/docs/website/docs/general-usage/schema-evolution.md b/docs/website/docs/general-usage/schema-evolution.md index 24d0ff1d58..9e225fba01 100644 --- a/docs/website/docs/general-usage/schema-evolution.md +++ b/docs/website/docs/general-usage/schema-evolution.md @@ -163,7 +163,7 @@ data = [{ pipeline = dlt.pipeline("organizations_pipeline", destination="duckdb") # Adding not null constraint -pipeline.run(data, table_name="org", columns={"room": {"data_type": "integer", "nullable": False}}) +pipeline.run(data, table_name="org", columns={"room": {"data_type": "bigint", "nullable": False}}) ``` During pipeline execution a data validation error indicates that a removed column is being passed as null. diff --git a/docs/website/docs/general-usage/schema.md b/docs/website/docs/general-usage/schema.md index 164814010d..cb1c73c340 100644 --- a/docs/website/docs/general-usage/schema.md +++ b/docs/website/docs/general-usage/schema.md @@ -268,9 +268,44 @@ settings: re:^updated_at$: timestamp re:^_dlt_list_idx$: bigint ``` +### Applying data types directly with `@dlt.resource` and `apply_hints` +`dlt` offers the flexibility to directly apply data types and hints in your code, bypassing the need for importing and adjusting schemas. This approach is ideal for rapid prototyping and handling data sources with dynamic schema requirements. + +#### Direct specification in `@dlt.resource` +Directly define data types and their properties, such as nullability, within the `@dlt.resource` decorator. This eliminates the dependency on external schema files. For example: + +```py +@dlt.resource(name='my_table', columns={"my_column": {"data_type": "bool", "nullable": True}}) +def my_resource(): + for i in range(10): + yield {'my_column': i % 2 == 0} +``` +This code snippet sets up a nullable boolean column named `my_column` directly in the decorator. 
+ +#### Using `apply_hints` +When dealing with dynamically generated resources or needing to programmatically set hints, `apply_hints` is your tool. It's especially useful for applying hints across various collections or tables at once. + +For example, to apply a complex data type across all collections from a MongoDB source: + +```py +all_collections = ["collection1", "collection2", "collection3"] # replace with your actual collection names +source_data = mongodb().with_resources(*all_collections) + +for col in all_collections: + source_data.resources[col].apply_hints(columns={"column_name": {"data_type": "complex"}}) + +pipeline = dlt.pipeline( + pipeline_name="mongodb_pipeline", + destination="duckdb", + dataset_name="mongodb_data" +) +load_info = pipeline.run(source_data) +``` +This example iterates through MongoDB collections, applying the complex [data type](schema#data-types) to a specified column, and then processes the data with `pipeline.run`. ## Export and import schema files + Please follow the guide on [how to adjust a schema](../walkthroughs/adjust-a-schema.md) to export and import `yaml` schema files in your pipeline. @@ -317,7 +352,7 @@ def textual(nesting_level: int): schema.remove_type_detection("iso_timestamp") # convert UNIX timestamp (float, withing a year from NOW) into timestamp schema.add_type_detection("timestamp") - schema.compile_settings() + schema._compile_settings() - return dlt.resource(...) 
+ return dlt.resource([]) ``` diff --git a/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-gcp-cloud-function-as-webhook.md b/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-gcp-cloud-function-as-webhook.md index fc32aa2c30..29a0ae86f8 100644 --- a/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-gcp-cloud-function-as-webhook.md +++ b/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-gcp-cloud-function-as-webhook.md @@ -17,10 +17,10 @@ You can setup GCP cloud function webhook using `dlt` as follows: ```py import dlt - import json import time from google.cloud import bigquery - + from dlt.common import json + def your_webhook(request): # Extract relevant data from the request payload data = request.get_json() @@ -40,7 +40,7 @@ You can setup GCP cloud function webhook using `dlt` as follows: 7. Set the function name as "your_webhook" in the Entry point field. 8. In the requirements.txt file, specify the necessary packages: - ```py + ```text # Function dependencies, for example: # package>=version dlt diff --git a/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-with-dagster.md b/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-with-dagster.md new file mode 100644 index 0000000000..cca882ba38 --- /dev/null +++ b/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-with-dagster.md @@ -0,0 +1,157 @@ +--- +title: Deploy with Dagster +description: How to deploy a pipeline with Dagster +keywords: [how to, deploy a pipeline, Dagster] +--- + +# Deploy with Dagster + +## Introduction to Dagster + +Dagster is an orchestrator designed for developing and maintaining data assets, such as +tables, data sets, machine learning models, and reports. Dagster ensures these processes are +reliable and focuses on using software-defined assets (SDAs) to simplify complex data management, +enhance the ability to reuse code, and provide a better understanding of data. 
+ +To read more, please refer to Dagster’s +[documentation.](https://docs.dagster.io/getting-started?_gl=1*19ikq9*_ga*NTMwNTUxNDAzLjE3MDg5Mjc4OTk.*_ga_84VRQZG7TV*MTcwOTkwNDY3MS4zLjEuMTcwOTkwNTYzNi41Ny4wLjA.*_gcl_au*OTM3OTU1ODMwLjE3MDg5Mjc5MDA.) + +### Dagster Cloud Features + +Dagster Cloud offers an enterprise-level orchestration service with serverless or hybrid deployment +options. It incorporates native branching and built-in CI/CD to prioritize the developer experience. +It enables scalable, cost-effective operations without the hassle of infrastructure management. + +### Dagster deployment options: **Serverless** versus **Hybrid**: + +The *serverless* option fully hosts the orchestration engine, while the *hybrid* model offers +flexibility to use your computing resources, with Dagster managing the control plane, reducing +operational overhead and ensuring security. + +For more info, please refer to the Dagster Cloud [docs.](https://dagster.io/cloud) + +### Using Dagster for Free + +Dagster offers a 30-day free trial during which you can explore its features, such as pipeline +orchestration, data quality checks, and embedded ELTs. You can try Dagster using its open-source version or +by signing up for the trial. + +## Building Data Pipelines with `dlt` + +`dlt` is an open-source Python library that allows you to declaratively load data sources into +well-structured tables or datasets through automatic schema inference and evolution. It simplifies +building data pipelines with support for extract and load processes. + +**How does `dlt` integrate with Dagster for pipeline orchestration?** + +`dlt` integrates with Dagster for pipeline orchestration, providing a streamlined process for +building, enhancing, and managing data pipelines. This enables developers to leverage `dlt`'s +capabilities for handling data extraction and load and Dagster's orchestration features to efficiently manage and monitor data pipelines. 
+ +### Orchestrating `dlt` pipeline on Dagster + +Here's a concise guide to orchestrating a `dlt` pipeline with Dagster, using the project "Ingesting +GitHub issues data from a repository and storing it in BigQuery" as an example. + +More details can be found in the article +[“Orchestrating unstructured data pipelines with dagster and dlt."](https://dagster.io/blog/dagster-dlt) + +**The steps are as follows:** +1. Create a `dlt` pipeline. For more, please refer to the documentation: +[Creating a pipeline.](https://dlthub.com/docs/walkthroughs/create-a-pipeline) + +1. Set up a Dagster project, configure resources, and define the asset as follows: + + 1. To create a Dagster project: + ```sh + mkdir dagster_github_issues + cd dagster_github_issues + dagster project scaffold --name github-issues + ``` + + 1. Define `dlt` as a Dagster resource: + ```py + from dagster import ConfigurableResource + from dagster import ConfigurableResource + import dlt + + class DltPipeline(ConfigurableResource): + pipeline_name: str + dataset_name: str + destination: str + + def create_pipeline(self, resource_data, table_name): + + # configure the pipeline with your destination details + pipeline = dlt.pipeline( + pipeline_name=self.pipeline_name, + destination=self.destination, + dataset_name=self.dataset_name + ) + + # run the pipeline with your parameters + load_info = pipeline.run(resource_data, table_name=table_name) + + return load_info + ``` + 1. Define the asset as: + ```py + @asset + def issues_pipeline(pipeline: DltPipeline): + + logger = get_dagster_logger() + results = pipeline.create_pipeline(github_issues_resource, table_name='github_issues') + logger.info(results) + ``` + > For more information, please refer to + > [Dagster’s documentation.](https://docs.dagster.io/getting-started/quickstart) + +1. 
Next, define Dagster definitions as follows: + ```py + all_assets = load_assets_from_modules([assets]) + simple_pipeline = define_asset_job(name="simple_pipeline", selection= ['issues_pipeline']) + + defs = Definitions( + assets=all_assets, + jobs=[simple_pipeline], + resources={ + "pipeline": DltPipeline( + pipeline_name = "github_issues", + dataset_name = "dagster_github_issues", + destination = "bigquery", + ), + } + ) + ``` + +1. Finally, start the web server as: + + ```sh + dagster dev + ``` + +:::info +For the complete hands-on project on “Orchestrating unstructured data pipelines with dagster and +`dlt`", please refer to [article](https://dagster.io/blog/dagster-dlt). The author offers a +detailed overview and steps for ingesting GitHub issue data from a repository and storing it in +BigQuery. You can use a similar approach to build your pipelines. +::: + +### Additional Resources + +- A general configurable `dlt` resource orchestrated on Dagster: + [dlt resource](https://github.com/dagster-io/dagster-open-platform/blob/5030ff6828e2b001a557c6864f279c3b476b0ca0/dagster_open_platform/resources/dlt_resource.py#L29). + +- Configure `dlt` pipelines for Dagster: + [dlt pipelines](https://github.com/dagster-io/dagster-open-platform/tree/5030ff6828e2b001a557c6864f279c3b476b0ca0/dagster_open_platform/assets/dlt_pipelines). + +- Configure MongoDB source as an Asset factory: + > Dagster provides the feature of + > [@multi_asset](https://github.com/dlt-hub/dlt-dagster-demo/blob/21a8d18b6f0424f40f2eed5030989306af8b8edb/mongodb_dlt/mongodb_dlt/assets/__init__.py#L18) + > declaration that will allow us to convert each collection under a database into a separate + > asset. This will make our pipeline easy to debug in case of failure and the collections + > independent of each other. + +:::note +These are external repositories and are subject to change. 
+::: diff --git a/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-with-kestra.md b/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-with-kestra.md new file mode 100644 index 0000000000..cfb63ce808 --- /dev/null +++ b/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-with-kestra.md @@ -0,0 +1,116 @@ +--- +title: Deploy with Kestra +description: How to deploy a pipeline with Kestra +keywords: [how to, deploy a pipeline, Kestra] +--- + +# Deploy with Kestra + +## Introduction to Kestra + +[Kestra](https://kestra.io/docs) is an open-source, scalable orchestration platform that enables +engineers to manage business-critical workflows declaratively in code. By applying +infrastructure as code best practices to data, process, and microservice orchestration, you +can build and manage reliable workflows. + +Kestra facilitates reliable workflow management, offering advanced settings for resiliency, +triggers, real-time monitoring, and integration capabilities, making it a valuable tool for data +engineers and developers. + +### Kestra features + +Kestra provides a robust orchestration engine with features including: + +- Workflows accessible through a user interface, event-driven + automation, and an embedded Visual Studio Code editor. +- It also offers embedded documentation, a live-updating topology view, and access to over 400 + plugins, enhancing its versatility. +- Kestra supports Git & CI/CD integrations, basic authentication, and benefits from community + support. + +To know more, please refer to [Kestra's documentation.](https://kestra.io/docs) + +## Building Data Pipelines with `dlt` + +**`dlt`** is an open-source Python library that allows you to declaratively load data sources +into well-structured tables or datasets. It does this through automatic schema inference and evolution. +The library simplifies building data pipelines by providing functionality to support the entire extract +and load process. 
+ +### How does `dlt` integrate with Kestra for pipeline orchestration? + +To illustrate setting up a pipeline in Kestra, we’ll be using the following example: +[From Inbox to Insights AI-Enhanced Email Analysis with dlt and Kestra.](https://kestra.io/blogs/2023-12-04-dlt-kestra-usage) + +The example demonstrates automating a workflow to load data from Gmail to BigQuery using `dlt`, +complemented by AI-driven summarization and sentiment analysis. You can refer to the project's +GitHub repo by clicking [here.](https://github.com/dlt-hub/dlt-kestra-demo) + +:::info +For the detailed guide, please take a look at the project's [README](https://github.com/dlt-hub/dlt-kestra-demo/blob/main/README.md) section. +::: + +Here is the summary of the steps: + +1. Start by creating a virtual environment. + +1. Generate an `.env` File: Inside your project repository, create an `.env` file to store + credentials in "base64" format, prefixed with 'SECRET\_' for compatibility with Kestra's `secret()` + function. + +1. As per Kestra’s recommendation, install Docker Desktop on your machine. + +1. Ensure Docker is running, then download the Docker compose file with: + + ```sh + curl -o docker-compose.yml \ + https://raw.githubusercontent.com/kestra-io/kestra/develop/docker-compose.yml + ``` + +1. Configure Docker compose file: + Edit the downloaded Docker compose file to link the `.env` file for environment + variables. + + ```yaml + kestra: + image: kestra/kestra:develop-full + env_file: + - .env + ``` + +1. Enable Auto-Restart: In your `docker-compose.yml`, set `restart: always` for both postgres and + kestra services to ensure they reboot automatically after a system restart. + +1. Launch Kestra Server: Execute `docker compose up -d` to start the server. + +1. Access Kestra UI: Navigate to `http://localhost:8080/` to use the Kestra user interface. + +1. Create and Configure Flows: + + - Go to 'Flows', then 'Create'. + - Configure the flow files in the editor. 
+ - Save your flows. + +1. **Understand Flow Components**: + + - Each flow must have an `id`, `namespace`, and a list of `tasks` with their respective `id` and + `type`. + - The main flow orchestrates tasks like loading data from a source to a destination. + +By following these steps, you establish a structured workflow within Kestra, leveraging its powerful +features for efficient data pipeline orchestration. + +:::info +For detailed information on these steps, please consult the `README.md` in the +[dlt-kestra-demo](https://github.com/dlt-hub/dlt-kestra-demo/blob/main/README.md) repo. +::: + +### Additional Resources + +- Ingest Zendesk data into Weaviate using `dlt` with Kestra: + [here](https://kestra.io/blueprints/148-ingest-zendesk-data-into-weaviate-using-dlt). +- Ingest Zendesk data into DuckDb using dlt with Kestra: + [here.](https://kestra.io/blueprints/147-ingest-zendesk-data-into-duckdb-using-dlt) +- Ingest Pipedrive CRM data to BigQuery using `dlt` and schedule it to run every hour: + [here.](https://kestra.io/blueprints/146-ingest-pipedrive-crm-data-to-bigquery-using-dlt-and-schedule-it-to-run-every-hour) + diff --git a/docs/website/package.json b/docs/website/package.json index 70bca2d84f..1b1a5b1801 100644 --- a/docs/website/package.json +++ b/docs/website/package.json @@ -4,7 +4,7 @@ "private": true, "scripts": { "docusaurus": "docusaurus", - "start": "node tools/update_version_env.js && node tools/preprocess_docs.js && concurrently --kill-others \"node tools/preprocess_docs.js --watch\" \"docusaurus start\"", + "start": "PYTHONPATH=. poetry run pydoc-markdown && node tools/update_version_env.js && node tools/preprocess_docs.js && concurrently --kill-others \"node tools/preprocess_docs.js --watch\" \"docusaurus start\"", "build": "node tools/preprocess_docs.js && PYTHONPATH=. poetry run pydoc-markdown && node tools/update_version_env.js && docusaurus build", "build:netlify": "node tools/preprocess_docs.js && PYTHONPATH=. 
pydoc-markdown && node tools/update_version_env.js && docusaurus build --out-dir build/docs", "swizzle": "docusaurus swizzle", diff --git a/docs/website/sidebars.js b/docs/website/sidebars.js index a313367908..15c9c27512 100644 --- a/docs/website/sidebars.js +++ b/docs/website/sidebars.js @@ -69,6 +69,7 @@ const sidebars = { 'dlt-ecosystem/verified-sources/personio', 'dlt-ecosystem/verified-sources/pipedrive', 'dlt-ecosystem/verified-sources/salesforce', + 'dlt-ecosystem/verified-sources/scrapy', 'dlt-ecosystem/verified-sources/shopify', 'dlt-ecosystem/verified-sources/sql_database', 'dlt-ecosystem/verified-sources/slack', @@ -217,6 +218,8 @@ const sidebars = { 'reference/explainers/airflow-gcp-cloud-composer', 'walkthroughs/deploy-a-pipeline/deploy-with-google-cloud-functions', 'walkthroughs/deploy-a-pipeline/deploy-gcp-cloud-function-as-webhook', + 'walkthroughs/deploy-a-pipeline/deploy-with-kestra', + 'walkthroughs/deploy-a-pipeline/deploy-with-dagster', ] }, { diff --git a/mypy.ini b/mypy.ini index 829da1c6ce..089fde35aa 100644 --- a/mypy.ini +++ b/mypy.ini @@ -116,4 +116,7 @@ ignore_missing_imports = True ignore_missing_imports = True [mypy-dotenv.*] +ignore_missing_imports = True + +[mypy-pytz.*] ignore_missing_imports = True \ No newline at end of file diff --git a/poetry.lock b/poetry.lock index 094345d590..05025b827b 100644 --- a/poetry.lock +++ b/poetry.lock @@ -9106,4 +9106,4 @@ weaviate = ["weaviate-client"] [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<3.13" -content-hash = "3059208353b25cbd14865a8b59e8b1cb5aacfc988b60fb950051debe53abaf65" +content-hash = "689daf5e8e7a187e615f4055009988b80a783d2aeed6f2264c7503668433f02c" diff --git a/pyproject.toml b/pyproject.toml index c02d80d182..213db8d01d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "dlt" -version = "0.4.7" +version = "0.4.8a0" description = "dlt is an open-source python-first scalable data loading library that does not require any backend to 
run." authors = ["dltHub Inc. "] maintainers = [ "Marcin Rudolf ", "Adrian Brudaru ", "Ty Dunn "] @@ -147,6 +147,7 @@ google-api-python-client = ">=1.7.11" pytest-asyncio = "^0.23.5" types-sqlalchemy = "^1.4.53.38" ruff = "^0.3.2" +pyjwt = "^2.8.0" [tool.poetry.group.pipeline] optional = true diff --git a/tests/cli/common/test_telemetry_command.py b/tests/cli/common/test_telemetry_command.py index 18bd67a5e0..1b6588c9c8 100644 --- a/tests/cli/common/test_telemetry_command.py +++ b/tests/cli/common/test_telemetry_command.py @@ -139,7 +139,6 @@ def test_instrumentation_wrappers() -> None: COMMAND_DEPLOY_REPO_LOCATION, DeploymentMethods, ) - from dlt.common.exceptions import UnknownDestinationModule with patch("dlt.common.runtime.segment.before_send", _mock_before_send): start_test_telemetry() diff --git a/tests/common/configuration/test_configuration.py b/tests/common/configuration/test_configuration.py index a883f76ddb..5fbcd86d92 100644 --- a/tests/common/configuration/test_configuration.py +++ b/tests/common/configuration/test_configuration.py @@ -126,18 +126,14 @@ class MockProdConfiguration(RunConfiguration): @configspec class FieldWithNoDefaultConfiguration(RunConfiguration): - no_default: str - - if TYPE_CHECKING: - - def __init__(self, no_default: str = None, sentry_dsn: str = None) -> None: ... + no_default: str = None @configspec class InstrumentedConfiguration(BaseConfiguration): - head: str - tube: List[str] - heels: str + head: str = None + tube: List[str] = None + heels: str = None def to_native_representation(self) -> Any: return self.head + ">" + ">".join(self.tube) + ">" + self.heels @@ -156,63 +152,50 @@ def on_resolved(self) -> None: if self.head > self.heels: raise RuntimeError("Head over heels") - if TYPE_CHECKING: - - def __init__(self, head: str = None, tube: List[str] = None, heels: str = None) -> None: ... 
- @configspec class EmbeddedConfiguration(BaseConfiguration): - default: str - instrumented: InstrumentedConfiguration - sectioned: SectionedConfiguration - - if TYPE_CHECKING: - - def __init__( - self, - default: str = None, - instrumented: InstrumentedConfiguration = None, - sectioned: SectionedConfiguration = None, - ) -> None: ... + default: str = None + instrumented: InstrumentedConfiguration = None + sectioned: SectionedConfiguration = None @configspec class EmbeddedOptionalConfiguration(BaseConfiguration): - instrumented: Optional[InstrumentedConfiguration] + instrumented: Optional[InstrumentedConfiguration] = None @configspec class EmbeddedSecretConfiguration(BaseConfiguration): - secret: SecretConfiguration + secret: SecretConfiguration = None @configspec class NonTemplatedComplexTypesConfiguration(BaseConfiguration): - list_val: list # type: ignore[type-arg] - tuple_val: tuple # type: ignore[type-arg] - dict_val: dict # type: ignore[type-arg] + list_val: list = None # type: ignore[type-arg] + tuple_val: tuple = None # type: ignore[type-arg] + dict_val: dict = None # type: ignore[type-arg] @configspec class DynamicConfigA(BaseConfiguration): - field_for_a: str + field_for_a: str = None @configspec class DynamicConfigB(BaseConfiguration): - field_for_b: str + field_for_b: str = None @configspec class DynamicConfigC(BaseConfiguration): - field_for_c: str + field_for_c: str = None @configspec class ConfigWithDynamicType(BaseConfiguration): - discriminator: str - embedded_config: BaseConfiguration + discriminator: str = None + embedded_config: BaseConfiguration = None @resolve_type("embedded_config") def resolve_embedded_type(self) -> Type[BaseConfiguration]: @@ -240,8 +223,8 @@ def resolve_c_type(self) -> Type[BaseConfiguration]: @configspec class SubclassConfigWithDynamicType(ConfigWithDynamicType): - is_number: bool - dynamic_type_field: Any + is_number: bool = None + dynamic_type_field: Any = None @resolve_type("embedded_config") def 
resolve_embedded_type(self) -> Type[BaseConfiguration]: @@ -937,11 +920,7 @@ def test_is_valid_hint() -> None: def test_configspec_auto_base_config_derivation() -> None: @configspec class AutoBaseDerivationConfiguration: - auto: str - - if TYPE_CHECKING: - - def __init__(self, auto: str = None) -> None: ... + auto: str = None assert issubclass(AutoBaseDerivationConfiguration, BaseConfiguration) assert hasattr(AutoBaseDerivationConfiguration, "auto") diff --git a/tests/common/configuration/test_container.py b/tests/common/configuration/test_container.py index 9521f5960d..eddd0b21dc 100644 --- a/tests/common/configuration/test_container.py +++ b/tests/common/configuration/test_container.py @@ -20,19 +20,15 @@ @configspec class InjectableTestContext(ContainerInjectableContext): - current_value: str + current_value: str = None def parse_native_representation(self, native_value: Any) -> None: raise ValueError(native_value) - if TYPE_CHECKING: - - def __init__(self, current_value: str = None) -> None: ... 
- @configspec class EmbeddedWithInjectableContext(BaseConfiguration): - injected: InjectableTestContext + injected: InjectableTestContext = None @configspec @@ -47,12 +43,12 @@ class GlobalTestContext(InjectableTestContext): @configspec class EmbeddedWithNoDefaultInjectableContext(BaseConfiguration): - injected: NoDefaultInjectableContext + injected: NoDefaultInjectableContext = None @configspec class EmbeddedWithNoDefaultInjectableOptionalContext(BaseConfiguration): - injected: Optional[NoDefaultInjectableContext] + injected: Optional[NoDefaultInjectableContext] = None @pytest.fixture() diff --git a/tests/common/configuration/test_credentials.py b/tests/common/configuration/test_credentials.py index ae9b96e903..7c184c16e5 100644 --- a/tests/common/configuration/test_credentials.py +++ b/tests/common/configuration/test_credentials.py @@ -158,6 +158,34 @@ def test_connection_string_resolved_from_native_representation_env(environment: assert c.host == "aws.12.1" +def test_connection_string_from_init() -> None: + c = ConnectionStringCredentials("postgres://loader:pass@localhost:5432/dlt_data?a=b&c=d") + assert c.drivername == "postgres" + assert c.is_resolved() + assert not c.is_partial() + + c = ConnectionStringCredentials( + { + "drivername": "postgres", + "username": "loader", + "password": "pass", + "host": "localhost", + "port": 5432, + "database": "dlt_data", + "query": {"a": "b", "c": "d"}, + } + ) + assert c.drivername == "postgres" + assert c.username == "loader" + assert c.password == "pass" + assert c.host == "localhost" + assert c.port == 5432 + assert c.database == "dlt_data" + assert c.query == {"a": "b", "c": "d"} + assert c.is_resolved() + assert not c.is_partial() + + def test_gcp_service_credentials_native_representation(environment) -> None: with pytest.raises(InvalidGoogleNativeCredentialsType): GcpServiceAccountCredentials().parse_native_representation(1) diff --git a/tests/common/configuration/test_inject.py 
b/tests/common/configuration/test_inject.py index c6ab8aa756..1aa52c1919 100644 --- a/tests/common/configuration/test_inject.py +++ b/tests/common/configuration/test_inject.py @@ -167,7 +167,7 @@ def test_inject_with_sections() -> None: def test_inject_spec_in_func_params() -> None: @configspec class TestConfig(BaseConfiguration): - base_value: str + base_value: str = None # if any of args (ie. `init` below) is an instance of SPEC, we use it as initial value @@ -179,7 +179,7 @@ def test_spec_arg(base_value=dlt.config.value, init: TestConfig = None): spec = get_fun_spec(test_spec_arg) assert spec == TestConfig # call function with init, should resolve even if we do not provide the base_value in config - assert test_spec_arg(init=TestConfig(base_value="A")) == "A" # type: ignore[call-arg] + assert test_spec_arg(init=TestConfig(base_value="A")) == "A" def test_inject_with_sections_and_sections_context() -> None: @@ -272,7 +272,7 @@ def test_sections(value=dlt.config.value): def test_base_spec() -> None: @configspec class BaseParams(BaseConfiguration): - str_str: str + str_str: str = None @with_config(base=BaseParams) def f_explicit_base(str_str=dlt.config.value, opt: bool = True): diff --git a/tests/common/configuration/test_sections.py b/tests/common/configuration/test_sections.py index 9e0bc7e26d..bf6780e087 100644 --- a/tests/common/configuration/test_sections.py +++ b/tests/common/configuration/test_sections.py @@ -25,33 +25,33 @@ @configspec class SingleValConfiguration(BaseConfiguration): - sv: str + sv: str = None @configspec class EmbeddedConfiguration(BaseConfiguration): - sv_config: Optional[SingleValConfiguration] + sv_config: Optional[SingleValConfiguration] = None @configspec class EmbeddedWithSectionedConfiguration(BaseConfiguration): - embedded: SectionedConfiguration + embedded: SectionedConfiguration = None @configspec class EmbeddedIgnoredConfiguration(BaseConfiguration): # underscore prevents the field name to be added to embedded sections - 
_sv_config: Optional[SingleValConfiguration] + _sv_config: Optional[SingleValConfiguration] = None @configspec class EmbeddedIgnoredWithSectionedConfiguration(BaseConfiguration): - _embedded: SectionedConfiguration + _embedded: SectionedConfiguration = None @configspec class EmbeddedWithIgnoredEmbeddedConfiguration(BaseConfiguration): - ignored_embedded: EmbeddedIgnoredWithSectionedConfiguration + ignored_embedded: EmbeddedIgnoredWithSectionedConfiguration = None def test_sectioned_configuration(environment: Any, env_provider: ConfigProvider) -> None: diff --git a/tests/common/configuration/test_spec_union.py b/tests/common/configuration/test_spec_union.py index 4892967ab7..b1e316734d 100644 --- a/tests/common/configuration/test_spec_union.py +++ b/tests/common/configuration/test_spec_union.py @@ -26,8 +26,8 @@ def auth(self): @configspec class ZenEmailCredentials(ZenCredentials): - email: str - password: TSecretValue + email: str = None + password: TSecretValue = None def parse_native_representation(self, native_value: Any) -> None: assert isinstance(native_value, str) @@ -44,8 +44,8 @@ def auth(self): @configspec class ZenApiKeyCredentials(ZenCredentials): - api_key: str - api_secret: TSecretValue + api_key: str = None + api_secret: TSecretValue = None def parse_native_representation(self, native_value: Any) -> None: assert isinstance(native_value, str) @@ -62,14 +62,14 @@ def auth(self): @configspec class ZenConfig(BaseConfiguration): - credentials: Union[ZenApiKeyCredentials, ZenEmailCredentials] + credentials: Union[ZenApiKeyCredentials, ZenEmailCredentials] = None some_option: bool = False @configspec class ZenConfigOptCredentials: # add none to union to make it optional - credentials: Union[ZenApiKeyCredentials, ZenEmailCredentials, None] + credentials: Union[ZenApiKeyCredentials, ZenEmailCredentials, None] = None some_option: bool = False @@ -200,10 +200,10 @@ class GoogleAnalyticsCredentialsOAuth(GoogleAnalyticsCredentialsBase): This class is used to store 
credentials Google Analytics """ - client_id: str - client_secret: TSecretValue - project_id: TSecretValue - refresh_token: TSecretValue + client_id: str = None + client_secret: TSecretValue = None + project_id: TSecretValue = None + refresh_token: TSecretValue = None access_token: Optional[TSecretValue] = None diff --git a/tests/common/configuration/test_toml_provider.py b/tests/common/configuration/test_toml_provider.py index fcec881521..4f2219716a 100644 --- a/tests/common/configuration/test_toml_provider.py +++ b/tests/common/configuration/test_toml_provider.py @@ -42,12 +42,12 @@ @configspec class EmbeddedWithGcpStorage(BaseConfiguration): - gcp_storage: GcpServiceAccountCredentialsWithoutDefaults + gcp_storage: GcpServiceAccountCredentialsWithoutDefaults = None @configspec class EmbeddedWithGcpCredentials(BaseConfiguration): - credentials: GcpServiceAccountCredentialsWithoutDefaults + credentials: GcpServiceAccountCredentialsWithoutDefaults = None def test_secrets_from_toml_secrets(toml_providers: ConfigProvidersContext) -> None: @@ -378,7 +378,7 @@ def test_write_value(toml_providers: ConfigProvidersContext) -> None: # dict creates only shallow dict so embedded credentials will fail creds = WithCredentialsConfiguration() - creds.credentials = SecretCredentials({"secret_value": "***** ***"}) + creds.credentials = SecretCredentials(secret_value=TSecretValue("***** ***")) with pytest.raises(ValueError): provider.set_value("written_creds", dict(creds), None) diff --git a/tests/common/configuration/utils.py b/tests/common/configuration/utils.py index 73643561dc..670dcac87a 100644 --- a/tests/common/configuration/utils.py +++ b/tests/common/configuration/utils.py @@ -3,6 +3,7 @@ import datetime # noqa: I251 from typing import ( Any, + ClassVar, Iterator, List, Optional, @@ -71,19 +72,15 @@ class SecretCredentials(CredentialsConfiguration): @configspec class WithCredentialsConfiguration(BaseConfiguration): - credentials: SecretCredentials + credentials: 
SecretCredentials = None @configspec class SectionedConfiguration(BaseConfiguration): - __section__ = "DLT_TEST" + __section__: ClassVar[str] = "DLT_TEST" password: str = None - if TYPE_CHECKING: - - def __init__(self, password: str = None) -> None: ... - @pytest.fixture(scope="function") def environment() -> Any: diff --git a/tests/common/reflection/test_reflect_spec.py b/tests/common/reflection/test_reflect_spec.py index 092d25b717..952d0fc596 100644 --- a/tests/common/reflection/test_reflect_spec.py +++ b/tests/common/reflection/test_reflect_spec.py @@ -314,7 +314,7 @@ def f_kw_defaults_args( def test_reflect_custom_base() -> None: @configspec class BaseParams(BaseConfiguration): - str_str: str + str_str: str = None def _f_1(str_str=dlt.config.value, p_def: bool = True): pass diff --git a/tests/common/runtime/test_logging.py b/tests/common/runtime/test_logging.py index 19f67fe899..5ff92f7d94 100644 --- a/tests/common/runtime/test_logging.py +++ b/tests/common/runtime/test_logging.py @@ -3,7 +3,7 @@ from dlt.common import logger from dlt.common.runtime import exec_info -from dlt.common.runtime.logger import is_logging +from dlt.common.logger import is_logging from dlt.common.typing import StrStr, DictStrStr from dlt.common.configuration import configspec from dlt.common.configuration.specs import RunConfiguration diff --git a/tests/common/runtime/test_telemetry.py b/tests/common/runtime/test_telemetry.py index eece36aae7..e67f7e8360 100644 --- a/tests/common/runtime/test_telemetry.py +++ b/tests/common/runtime/test_telemetry.py @@ -35,16 +35,6 @@ class SentryLoggerConfiguration(RunConfiguration): class SentryLoggerCriticalConfiguration(SentryLoggerConfiguration): log_level: str = "CRITICAL" - if TYPE_CHECKING: - - def __init__( - self, - pipeline_name: str = "logger", - sentry_dsn: str = "https://sentry.io", - dlthub_telemetry_segment_write_key: str = "TLJiyRkGVZGCi2TtjClamXpFcxAA1rSB", - log_level: str = "CRITICAL", - ) -> None: ... 
- def test_sentry_log_level() -> None: from dlt.common.runtime.sentry import _get_sentry_log_level diff --git a/tests/common/schema/test_merges.py b/tests/common/schema/test_merges.py index 0bb7818b31..fe9e4b1476 100644 --- a/tests/common/schema/test_merges.py +++ b/tests/common/schema/test_merges.py @@ -132,7 +132,7 @@ def test_new_incomplete_column() -> None: def test_merge_columns() -> None: # tab_b overrides non default - col_a = utils.merge_columns(copy(COL_1_HINTS), copy(COL_2_HINTS), merge_defaults=False) + col_a = utils.merge_column(copy(COL_1_HINTS), copy(COL_2_HINTS), merge_defaults=False) # nullable is False - tab_b has it as default and those are not merged assert col_a == { "name": "test_2", @@ -146,7 +146,7 @@ def test_merge_columns() -> None: "prop": None, } - col_a = utils.merge_columns(copy(COL_1_HINTS), copy(COL_2_HINTS), merge_defaults=True) + col_a = utils.merge_column(copy(COL_1_HINTS), copy(COL_2_HINTS), merge_defaults=True) # nullable is True and primary_key is present - default values are merged assert col_a == { "name": "test_2", @@ -173,10 +173,10 @@ def test_diff_tables() -> None: empty = utils.new_table("table") del empty["resource"] print(empty) - partial = utils.diff_tables(empty, deepcopy(table)) + partial = utils.diff_table(empty, deepcopy(table)) # partial is simply table assert partial == table - partial = utils.diff_tables(deepcopy(table), empty) + partial = utils.diff_table(deepcopy(table), empty) # partial is empty assert partial == empty @@ -184,7 +184,7 @@ def test_diff_tables() -> None: changed = deepcopy(table) changed["description"] = "new description" changed["name"] = "new name" - partial = utils.diff_tables(deepcopy(table), changed) + partial = utils.diff_table(deepcopy(table), changed) print(partial) assert partial == {"name": "new name", "description": "new description", "columns": {}} @@ -192,7 +192,7 @@ def test_diff_tables() -> None: existing = deepcopy(table) changed["write_disposition"] = "append" 
changed["schema_contract"] = "freeze" - partial = utils.diff_tables(deepcopy(existing), changed) + partial = utils.diff_table(deepcopy(existing), changed) assert partial == { "name": "new name", "description": "new description", @@ -202,14 +202,14 @@ def test_diff_tables() -> None: } existing["write_disposition"] = "append" existing["schema_contract"] = "freeze" - partial = utils.diff_tables(deepcopy(existing), changed) + partial = utils.diff_table(deepcopy(existing), changed) assert partial == {"name": "new name", "description": "new description", "columns": {}} # detect changed column existing = deepcopy(table) changed = deepcopy(table) changed["columns"]["test"]["cluster"] = True - partial = utils.diff_tables(existing, changed) + partial = utils.diff_table(existing, changed) assert "test" in partial["columns"] assert "test_2" not in partial["columns"] assert existing["columns"]["test"] == table["columns"]["test"] != partial["columns"]["test"] @@ -218,7 +218,7 @@ def test_diff_tables() -> None: existing = deepcopy(table) changed = deepcopy(table) changed["columns"]["test"]["foreign_key"] = False - partial = utils.diff_tables(existing, changed) + partial = utils.diff_table(existing, changed) assert "test" in partial["columns"] # even if not present in tab_a at all @@ -226,7 +226,7 @@ def test_diff_tables() -> None: changed = deepcopy(table) changed["columns"]["test"]["foreign_key"] = False del existing["columns"]["test"]["foreign_key"] - partial = utils.diff_tables(existing, changed) + partial = utils.diff_table(existing, changed) assert "test" in partial["columns"] @@ -242,7 +242,7 @@ def test_diff_tables_conflicts() -> None: other = utils.new_table("table_2") with pytest.raises(TablePropertiesConflictException) as cf_ex: - utils.diff_tables(table, other) + utils.diff_table(table, other) assert cf_ex.value.table_name == "table" assert cf_ex.value.prop_name == "parent" @@ -250,7 +250,7 @@ def test_diff_tables_conflicts() -> None: changed = deepcopy(table) 
changed["columns"]["test"]["data_type"] = "bigint" with pytest.raises(CannotCoerceColumnException): - utils.diff_tables(table, changed) + utils.diff_table(table, changed) def test_merge_tables() -> None: @@ -261,6 +261,7 @@ def test_merge_tables() -> None: "x-special": 128, "columns": {"test": COL_1_HINTS, "test_2": COL_2_HINTS}, } + print(table) changed = deepcopy(table) changed["x-special"] = 129 # type: ignore[typeddict-unknown-key] changed["description"] = "new description" @@ -269,7 +270,7 @@ def test_merge_tables() -> None: changed["new-prop-3"] = False # type: ignore[typeddict-unknown-key] # drop column so partial has it del table["columns"]["test"] - partial = utils.merge_tables(table, changed) + partial = utils.merge_table(table, changed) assert "test" in table["columns"] assert table["x-special"] == 129 # type: ignore[typeddict-item] assert table["description"] == "new description" @@ -281,3 +282,39 @@ def test_merge_tables() -> None: # one column in partial assert len(partial["columns"]) == 1 assert partial["columns"]["test"] == COL_1_HINTS + # still has incomplete column + assert table["columns"]["test_2"] == COL_2_HINTS + # check order, we dropped test so it is added at the end + print(table) + assert list(table["columns"].keys()) == ["test_2", "test"] + + +def test_merge_tables_incomplete_columns() -> None: + table: TTableSchema = { + "name": "table", + "columns": {"test_2": COL_2_HINTS, "test": COL_1_HINTS}, + } + changed = deepcopy(table) + # reverse order, this order we want to have at the end + changed["columns"] = deepcopy({"test": COL_1_HINTS, "test_2": COL_2_HINTS}) + # it is completed now + changed["columns"]["test_2"]["data_type"] = "bigint" + partial = utils.merge_table(table, changed) + assert list(partial["columns"].keys()) == ["test_2"] + # test_2 goes to the end, it was incomplete in table so it got dropped before update + assert list(table["columns"].keys()) == ["test", "test_2"] + + table = { + "name": "table", + "columns": {"test_2": 
COL_2_HINTS, "test": COL_1_HINTS}, + } + + changed = deepcopy(table) + # reverse order, this order we want to have at the end + changed["columns"] = deepcopy({"test": COL_1_HINTS, "test_2": COL_2_HINTS}) + # still incomplete but changed + changed["columns"]["test_2"]["nullable"] = False + partial = utils.merge_table(table, changed) + assert list(partial["columns"].keys()) == ["test_2"] + # incomplete -> incomplete stays in place + assert list(table["columns"].keys()) == ["test_2", "test"] diff --git a/tests/common/schema/test_schema.py b/tests/common/schema/test_schema.py index 653e9cc351..887b0aa9a0 100644 --- a/tests/common/schema/test_schema.py +++ b/tests/common/schema/test_schema.py @@ -84,15 +84,27 @@ def test_normalize_schema_name(schema: Schema) -> None: def test_new_schema(schema: Schema) -> None: assert schema.name == "event" + assert_is_new_schema(schema) + assert_new_schema_props(schema) + stored_schema = schema.to_dict() # version hash is present - assert len(stored_schema["version_hash"]) > 0 + assert stored_schema["version"] == 1 + assert stored_schema["version_hash"] is not None utils.validate_stored_schema(stored_schema) - assert_new_schema_values(schema) + + # to dict without bumping version should be used only internally + stored_schema = schema.to_dict(bump_version=False) + # version hash is present + assert stored_schema["version"] is None + assert stored_schema["version_hash"] is None + with pytest.raises(DictValidationException): + utils.validate_stored_schema(stored_schema) def test_new_schema_custom_normalizers(cn_schema: Schema) -> None: - assert_new_schema_values_custom_normalizers(cn_schema) + assert_is_new_schema(cn_schema) + assert_new_schema_props_custom_normalizers(cn_schema) def test_schema_config_normalizers(schema: Schema, schema_storage_no_import: SchemaStorage) -> None: @@ -222,8 +234,9 @@ def test_replace_schema_content() -> None: eth_v5: TStoredSchema = load_yml_case("schemas/eth/ethereum_schema_v5") 
eth_v5["imported_version_hash"] = "IMP_HASH" schema_eth = Schema.from_dict(eth_v5) # type: ignore[arg-type] - schema.replace_schema_content(schema_eth) + schema.replace_schema_content(schema_eth.clone()) assert schema_eth.stored_version_hash == schema.stored_version_hash + assert schema_eth.stored_version == schema.stored_version assert schema_eth.version == schema.version assert schema_eth.version_hash == schema.version_hash assert schema_eth._imported_version_hash == schema._imported_version_hash @@ -239,16 +252,52 @@ def test_replace_schema_content() -> None: # make sure we linked the replaced schema to the incoming schema = Schema("simple") + # generate version and hash + schema._bump_version() eth_v5 = load_yml_case("schemas/eth/ethereum_schema_v5") - schema_eth = Schema.from_dict(eth_v5, bump_version=False) # type: ignore[arg-type] - schema_eth.bump_version() + schema_eth = Schema.from_dict(eth_v5) # type: ignore[arg-type] + assert not schema_eth.is_modified # modify simple schema by adding a table schema.update_table(schema_eth.get_table("blocks")) - replaced_stored_hash = schema.stored_version_hash + replaced_stored_hash = schema.version_hash schema.replace_schema_content(schema_eth, link_to_replaced_schema=True) assert replaced_stored_hash in schema.previous_hashes - assert replaced_stored_hash == schema.stored_version_hash - assert schema.stored_version_hash != schema.version_hash + assert schema_eth.stored_version_hash == schema.stored_version_hash + assert schema_eth.stored_version == schema.stored_version + assert schema_eth.version_hash == schema.version_hash + assert schema_eth.version == schema.version + assert not schema.is_modified + + # incoming schema still modified after replace + schema = Schema("simple") + # generate version and hash + schema._bump_version() + eth_v5 = load_yml_case("schemas/eth/ethereum_schema_v5") + schema_eth = Schema.from_dict(eth_v5, bump_version=False) # type: ignore[arg-type] + assert schema_eth.is_modified + 
schema.replace_schema_content(schema_eth, link_to_replaced_schema=True) + assert schema.is_modified + + # replace content of new schema + schema = Schema("simple") + eth_v5 = load_yml_case("schemas/eth/ethereum_schema_v5") + schema_eth = Schema.from_dict(eth_v5, bump_version=False) # type: ignore[arg-type] + schema_eth._bump_version() + schema.replace_schema_content(schema_eth, link_to_replaced_schema=True) + # nothing got added to prev hashes + assert schema.to_dict() == schema_eth.to_dict() + + # replace content with new schema + schema = Schema("simple") + eth_v5 = load_yml_case("schemas/eth/ethereum_schema_v5") + schema_eth = Schema.from_dict(eth_v5, bump_version=False) # type: ignore[arg-type] + schema_eth.replace_schema_content(schema, link_to_replaced_schema=True) + # schema tracked + assert schema_eth.name == "simple" + assert Schema.from_dict(eth_v5, bump_version=False).version_hash in schema.previous_hashes # type: ignore[arg-type] + # but still new + assert schema_eth.is_new + assert schema_eth.is_modified # replace with self eth_v5 = load_yml_case("schemas/eth/ethereum_schema_v5") @@ -270,6 +319,40 @@ def test_replace_schema_content() -> None: assert schema_eth.version_hash not in schema_eth.previous_hashes +def test_clone(schema: Schema) -> None: + # set normalizers but ignore them when cloning + os.environ["SCHEMA__NAMING"] = "direct" + + cloned = schema.clone() + assert cloned.to_dict(bump_version=False) == schema.to_dict(bump_version=False) + # dicts are not shared + assert id(cloned._settings) != id(schema._settings) + assert id(cloned._schema_tables) != id(schema._schema_tables) + # make sure version didn't change + assert cloned._stored_version == schema._stored_version + + # clone with name + cloned = schema.clone(with_name="second") + assert cloned.name == "second" + assert cloned.is_new + assert cloned.is_modified + assert cloned._imported_version_hash is None + assert cloned.previous_hashes == [] + + # clone with normalizers update + cloned = 
schema.clone("second", update_normalizers=True) + assert cloned._normalizers_config != schema._normalizers_config + assert cloned._normalizers_config["names"] == "direct" + + # clone modified schema + simple = Schema("simple") + cloned = simple.clone() + assert cloned.to_dict(bump_version=False) == simple.to_dict(bump_version=False) + assert cloned.is_new + assert cloned.is_modified + assert cloned._normalizers_config["names"] == "direct" + + @pytest.mark.parametrize( "columns,hint,value", [ @@ -300,13 +383,15 @@ def test_new_schema_alt_name() -> None: def test_save_store_schema(schema: Schema, schema_storage: SchemaStorage) -> None: assert not schema_storage.storage.has_file(EXPECTED_FILE_NAME) saved_file_name = schema_storage.save_schema(schema) + assert schema.is_modified is False + assert schema.is_new is False # return absolute path assert saved_file_name == schema_storage.storage.make_full_path(EXPECTED_FILE_NAME) assert schema_storage.storage.has_file(EXPECTED_FILE_NAME) schema_copy = schema_storage.load_schema("event") assert schema.name == schema_copy.name assert schema.version == schema_copy.version - assert_new_schema_values(schema_copy) + assert_new_schema_props(schema_copy) def test_save_store_schema_custom_normalizers( @@ -314,7 +399,7 @@ def test_save_store_schema_custom_normalizers( ) -> None: schema_storage.save_schema(cn_schema) schema_copy = schema_storage.load_schema(cn_schema.name) - assert_new_schema_values_custom_normalizers(schema_copy) + assert_new_schema_props_custom_normalizers(schema_copy) def test_save_load_incomplete_column( @@ -707,7 +792,7 @@ def test_normalize_table_identifiers_merge_columns() -> None: } -def assert_new_schema_values_custom_normalizers(schema: Schema) -> None: +def assert_new_schema_props_custom_normalizers(schema: Schema) -> None: # check normalizers config assert schema._normalizers_config["names"] == "tests.common.normalizers.custom_normalizers" assert ( @@ -727,13 +812,19 @@ def 
assert_new_schema_values_custom_normalizers(schema: Schema) -> None: assert row[0] == (("a_table", None), {"bool": True}) -def assert_new_schema_values(schema: Schema) -> None: - assert schema.version == 1 - assert schema.stored_version == 1 - assert schema.stored_version_hash is not None - assert schema.version_hash is not None +def assert_is_new_schema(schema: Schema) -> None: + assert schema.stored_version is None + assert schema.stored_version_hash is None assert schema.ENGINE_VERSION == 9 assert schema._stored_previous_hashes == [] + assert schema.is_modified + assert schema.is_new + + +def assert_new_schema_props(schema: Schema) -> None: + assert schema.version == 1 + assert schema.version_hash is not None + assert len(schema.settings["default_hints"]) > 0 # check settings assert ( diff --git a/tests/common/schema/test_versioning.py b/tests/common/schema/test_versioning.py index dde05001e8..b67b028161 100644 --- a/tests/common/schema/test_versioning.py +++ b/tests/common/schema/test_versioning.py @@ -22,6 +22,9 @@ def test_content_hash() -> None: assert utils.generate_version_hash(eth_v4) == hash2 eth_v4["version_hash"] = "xxxx" assert utils.generate_version_hash(eth_v4) == hash2 + # import schema hash is also excluded + eth_v4["imported_version_hash"] = "xxxx" + assert utils.generate_version_hash(eth_v4) == hash2 # changing table order does not impact the hash loads_table = eth_v4["tables"].pop("_dlt_loads") # insert at the end: _dlt_loads was first originally @@ -65,22 +68,22 @@ def test_infer_column_bumps_version() -> None: _, new_table = schema.coerce_row("event_user", None, row) schema.update_table(new_table) # schema version will be recomputed - assert schema.version == 2 + assert schema.version == 1 assert schema.version_hash is not None version_hash = schema.version_hash # another table _, new_table = schema.coerce_row("event_bot", None, row) schema.update_table(new_table) - # version is still 2 (increment of 1) - assert schema.version == 2 + # 
version is still 1 (increment of 1) + assert schema.version == 1 # but the hash changed assert schema.version_hash != version_hash # save saved_schema = schema.to_dict() assert saved_schema["version_hash"] == schema.version_hash - assert saved_schema["version"] == 2 + assert saved_schema["version"] == 1 def test_preserve_version_on_load() -> None: diff --git a/tests/common/storages/test_schema_storage.py b/tests/common/storages/test_schema_storage.py index 0e04554649..6cb76fba9d 100644 --- a/tests/common/storages/test_schema_storage.py +++ b/tests/common/storages/test_schema_storage.py @@ -28,36 +28,43 @@ ) -@pytest.fixture -def storage() -> SchemaStorage: - return init_storage(SchemaStorageConfiguration()) +@pytest.fixture(params=[LiveSchemaStorage, SchemaStorage]) +def storage(request) -> SchemaStorage: + return init_storage(request.param, SchemaStorageConfiguration()) @pytest.fixture -def synced_storage() -> SchemaStorage: +def live_storage() -> LiveSchemaStorage: + return init_storage(LiveSchemaStorage, SchemaStorageConfiguration()) # type: ignore[return-value] + + +@pytest.fixture(params=[LiveSchemaStorage, SchemaStorage]) +def synced_storage(request) -> SchemaStorage: # will be created in /schemas return init_storage( + request.param, SchemaStorageConfiguration( import_schema_path=TEST_STORAGE_ROOT + "/import", export_schema_path=TEST_STORAGE_ROOT + "/import", - ) + ), ) -@pytest.fixture -def ie_storage() -> SchemaStorage: +@pytest.fixture(params=[LiveSchemaStorage, SchemaStorage]) +def ie_storage(request) -> SchemaStorage: # will be created in /schemas return init_storage( + request.param, SchemaStorageConfiguration( import_schema_path=TEST_STORAGE_ROOT + "/import", export_schema_path=TEST_STORAGE_ROOT + "/export", - ) + ), ) -def init_storage(C: SchemaStorageConfiguration) -> SchemaStorage: +def init_storage(cls, C: SchemaStorageConfiguration) -> SchemaStorage: # use live schema storage for test which must be backward compatible with schema storage - s = 
LiveSchemaStorage(C, makedirs=True) + s = cls(C, makedirs=True) assert C is s.config if C.export_schema_path: os.makedirs(C.export_schema_path, exist_ok=True) @@ -101,13 +108,17 @@ def test_import_overwrites_existing_if_modified( def test_skip_import_if_not_modified(synced_storage: SchemaStorage, storage: SchemaStorage) -> None: storage_schema = assert_schema_imported(synced_storage, storage) + assert not storage_schema.is_modified + initial_version = storage_schema.stored_version # stored_version = storage_schema.stored_version # stored_version_hash = storage_schema.stored_version_hash # evolve schema row = {"floatX": 78172.128, "confidenceX": 1.2, "strX": "STR"} _, new_table = storage_schema.coerce_row("event_user", None, row) storage_schema.update_table(new_table) + assert storage_schema.is_modified storage.save_schema(storage_schema) + assert not storage_schema.is_modified # now use synced storage to load schema again reloaded_schema = synced_storage.load_schema("ethereum") # the schema was not overwritten @@ -119,6 +130,7 @@ def test_skip_import_if_not_modified(synced_storage: SchemaStorage, storage: Sch # the import schema gets modified storage_schema.tables["_dlt_loads"]["write_disposition"] = "append" storage_schema.tables.pop("event_user") + # we save the import schema (using export method) synced_storage._export_schema(storage_schema, synced_storage.config.export_schema_path) # now load will import again reloaded_schema = synced_storage.load_schema("ethereum") @@ -130,8 +142,8 @@ def test_skip_import_if_not_modified(synced_storage: SchemaStorage, storage: Sch assert reloaded_schema._imported_version_hash == storage_schema.version_hash assert storage_schema.previous_hashes == reloaded_schema.previous_hashes - # but original version has increased - assert reloaded_schema.stored_version == storage_schema.version + 1 + # but original version has increased twice (because it was modified twice) + assert reloaded_schema.stored_version == storage_schema.version 
== initial_version + 2 def test_store_schema_tampered(synced_storage: SchemaStorage, storage: SchemaStorage) -> None: @@ -188,7 +200,7 @@ def test_remove_schema(storage: SchemaStorage) -> None: assert storage.list_schemas() == [] -def test_mapping_interface(storage: SchemaStorage) -> None: +def test_getter(storage: SchemaStorage) -> None: # empty storage assert len(storage) == 0 assert "ethereum" not in storage @@ -219,6 +231,34 @@ def test_mapping_interface(storage: SchemaStorage) -> None: assert set(i[0] for i in items) == set(["ethereum", "event"]) +def test_getter_with_import(ie_storage: SchemaStorage) -> None: + with pytest.raises(KeyError): + ie_storage["ethereum"] + prepare_import_folder(ie_storage) + # schema will be imported + schema = ie_storage["ethereum"] + assert schema.name == "ethereum" + version_hash = schema.version_hash + # the import schema gets modified + schema.tables["_dlt_loads"]["write_disposition"] = "append" + mod_version_hash = schema.version_hash + assert schema.is_modified + ie_storage.save_schema(schema) + assert not schema.is_modified + # now load via getter + schema_copy = ie_storage["ethereum"] + assert schema_copy.version_hash == schema_copy.stored_version_hash == mod_version_hash + assert schema_copy._imported_version_hash == version_hash + + # now save the schema as import + ie_storage._export_schema(schema, ie_storage.config.import_schema_path) + # if you get the schema, import hash will change + schema = ie_storage["ethereum"] + assert schema._imported_version_hash == mod_version_hash + # only true for live schema + # assert id(schema) == id(schema_copy) + + def test_save_store_schema_over_import(ie_storage: SchemaStorage) -> None: prepare_import_folder(ie_storage) # we have ethereum schema to be imported but we create new schema and save it @@ -269,7 +309,11 @@ def test_save_store_schema(storage: SchemaStorage) -> None: d_n = explicit_normalizers() d_n["names"] = "tests.common.normalizers.custom_normalizers" schema = 
Schema("column_event", normalizers=d_n) + assert schema.is_new + assert schema.is_modified storage.save_schema(schema) + assert not schema.is_new + assert not schema.is_modified assert storage.storage.has_file( SchemaStorage.NAMED_SCHEMA_FILE_PATTERN % ("column_event", "json") ) @@ -309,6 +353,118 @@ def test_schema_from_file() -> None: ) +def test_live_schema_instances(live_storage: LiveSchemaStorage) -> None: + schema = Schema("simple") + live_storage.save_schema(schema) + + # get schema via getter + getter_schema = live_storage["simple"] + # same id + assert id(getter_schema) == id(schema) + + # live schema is same as in storage + assert live_storage.is_live_schema_committed("simple") + # modify getter schema + getter_schema._schema_description = "this is getter schema" + assert getter_schema.is_modified + # getter is not committed + assert not live_storage.is_live_schema_committed("simple") + + # separate instance via load + load_schema = live_storage.load_schema("simple") + assert id(load_schema) != id(schema) + # changes not visible + assert load_schema._schema_description is None + + # bypass live schema to simulate 3rd party change + SchemaStorage.save_schema(live_storage, getter_schema) + # committed because hashes are matching with file + assert live_storage.is_live_schema_committed("simple") + getter_schema = live_storage["simple"] + assert id(getter_schema) == id(schema) + + SchemaStorage.save_schema(live_storage, load_schema) + # still committed + assert live_storage.is_live_schema_committed("simple") + # and aware of changes in storage + getter_schema = live_storage["simple"] + assert id(getter_schema) == id(schema) + assert getter_schema._schema_description is None + getter_schema_mod_hash = getter_schema.version_hash + + # create a new "simple" schema + second_simple = Schema("simple") + second_simple._schema_description = "Second simple" + live_storage.save_schema(second_simple) + # got saved + load_schema = live_storage.load_schema("simple") + 
assert load_schema._schema_description == "Second simple" + # live schema seamlessly updated + assert schema._schema_description == "Second simple" + assert not schema.is_modified + assert getter_schema_mod_hash in schema.previous_hashes + + +def test_commit_live_schema(live_storage: LiveSchemaStorage) -> None: + with pytest.raises(SchemaNotFoundError): + live_storage.commit_live_schema("simple") + # set live schema + schema = Schema("simple") + set_schema = live_storage.set_live_schema(schema) + assert id(set_schema) == id(schema) + assert "simple" in live_storage.live_schemas + assert not live_storage.is_live_schema_committed("simple") + # nothing in storage + with pytest.raises(SchemaNotFoundError): + SchemaStorage.__getitem__(live_storage, "simple") + with pytest.raises(SchemaNotFoundError): + live_storage.load_schema("simple") + assert not live_storage.is_live_schema_committed("simple") + + # commit + assert live_storage.commit_live_schema("simple") is not None + # schema in storage + live_storage.load_schema("simple") + assert live_storage.is_live_schema_committed("simple") + + # second commit does not save + assert live_storage.commit_live_schema("simple") is None + + # mod the schema + schema._schema_description = "mod the schema" + assert not live_storage.is_live_schema_committed("simple") + mod_hash = schema.version_hash + + # save another instance under the same name + schema_2 = Schema("simple") + schema_2._schema_description = "instance 2" + live_storage.save_schema(schema_2) + assert live_storage.is_live_schema_committed("simple") + # content replaces in place + assert schema._schema_description == "instance 2" + assert mod_hash in schema.previous_hashes + + +def test_live_schema_getter_when_committed(live_storage: LiveSchemaStorage) -> None: + # getter on committed is aware of changes to storage (also import) + schema = Schema("simple") + live_storage.set_live_schema(schema) + set_schema = live_storage["simple"] + 
live_storage.commit_live_schema("simple") + # change content in storage + cloned = set_schema.clone() + cloned._schema_description = "cloned" + SchemaStorage.save_schema(live_storage, cloned) + set_schema_2 = live_storage["simple"] + assert set_schema_2._schema_description == "cloned" + assert id(set_schema_2) == id(set_schema) + + +def test_new_live_schema_committed(live_storage: LiveSchemaStorage) -> None: + with pytest.raises(SchemaNotFoundError): + live_storage.is_live_schema_committed("simple") + + # def test_save_empty_schema_name(storage: SchemaStorage) -> None: # schema = Schema("") # schema.settings["schema_sealed"] = True diff --git a/tests/common/test_destination.py b/tests/common/test_destination.py index b93cb5b483..24b0928463 100644 --- a/tests/common/test_destination.py +++ b/tests/common/test_destination.py @@ -2,7 +2,7 @@ from dlt.common.destination.reference import DestinationClientDwhConfiguration, Destination from dlt.common.destination import DestinationCapabilitiesContext -from dlt.common.exceptions import InvalidDestinationReference, UnknownDestinationModule +from dlt.common.destination.exceptions import InvalidDestinationReference, UnknownDestinationModule from dlt.common.schema import Schema from tests.utils import ACTIVE_DESTINATIONS @@ -78,7 +78,7 @@ def test_import_destination_config() -> None: dest = Destination.from_reference(ref="dlt.destinations.duckdb", environment="stage") assert dest.destination_type == "dlt.destinations.duckdb" assert dest.config_params["environment"] == "stage" - config = dest.configuration(dest.spec(dataset_name="dataset")) # type: ignore + config = dest.configuration(dest.spec()._bind_dataset_name(dataset_name="dataset")) # type: ignore assert config.destination_type == "duckdb" assert config.destination_name == "duckdb" assert config.environment == "stage" @@ -87,7 +87,7 @@ def test_import_destination_config() -> None: dest = Destination.from_reference(ref=None, destination_name="duckdb", 
environment="production") assert dest.destination_type == "dlt.destinations.duckdb" assert dest.config_params["environment"] == "production" - config = dest.configuration(dest.spec(dataset_name="dataset")) # type: ignore + config = dest.configuration(dest.spec()._bind_dataset_name(dataset_name="dataset")) # type: ignore assert config.destination_type == "duckdb" assert config.destination_name == "duckdb" assert config.environment == "production" @@ -98,7 +98,7 @@ def test_import_destination_config() -> None: ) assert dest.destination_type == "dlt.destinations.duckdb" assert dest.config_params["environment"] == "devel" - config = dest.configuration(dest.spec(dataset_name="dataset")) # type: ignore + config = dest.configuration(dest.spec()._bind_dataset_name(dataset_name="dataset")) # type: ignore assert config.destination_type == "duckdb" assert config.destination_name == "my_destination" assert config.environment == "devel" @@ -112,63 +112,63 @@ def test_normalize_dataset_name() -> None: # with schema name appended assert ( - DestinationClientDwhConfiguration( - dataset_name="ban_ana_dataset", default_schema_name="default" - ).normalize_dataset_name(Schema("banana")) + DestinationClientDwhConfiguration() + ._bind_dataset_name(dataset_name="ban_ana_dataset", default_schema_name="default") + .normalize_dataset_name(Schema("banana")) == "ban_ana_dataset_banana" ) # without schema name appended assert ( - DestinationClientDwhConfiguration( - dataset_name="ban_ana_dataset", default_schema_name="default" - ).normalize_dataset_name(Schema("default")) + DestinationClientDwhConfiguration() + ._bind_dataset_name(dataset_name="ban_ana_dataset", default_schema_name="default") + .normalize_dataset_name(Schema("default")) == "ban_ana_dataset" ) # dataset name will be normalized (now it is up to destination to normalize this) assert ( - DestinationClientDwhConfiguration( - dataset_name="BaNaNa", default_schema_name="default" - ).normalize_dataset_name(Schema("banana")) + 
DestinationClientDwhConfiguration() + ._bind_dataset_name(dataset_name="BaNaNa", default_schema_name="default") + .normalize_dataset_name(Schema("banana")) == "ba_na_na_banana" ) # empty schemas are invalid with pytest.raises(ValueError): - DestinationClientDwhConfiguration( - dataset_name="banana_dataset", default_schema_name=None + DestinationClientDwhConfiguration()._bind_dataset_name( + dataset_name="banana_dataset" ).normalize_dataset_name(Schema(None)) with pytest.raises(ValueError): - DestinationClientDwhConfiguration( + DestinationClientDwhConfiguration()._bind_dataset_name( dataset_name="banana_dataset", default_schema_name="" ).normalize_dataset_name(Schema("")) # empty dataset name is valid! assert ( - DestinationClientDwhConfiguration( - dataset_name="", default_schema_name="ban_schema" - ).normalize_dataset_name(Schema("schema_ana")) + DestinationClientDwhConfiguration() + ._bind_dataset_name(dataset_name="", default_schema_name="ban_schema") + .normalize_dataset_name(Schema("schema_ana")) == "_schema_ana" ) # empty dataset name is valid! assert ( - DestinationClientDwhConfiguration( - dataset_name="", default_schema_name="schema_ana" - ).normalize_dataset_name(Schema("schema_ana")) + DestinationClientDwhConfiguration() + ._bind_dataset_name(dataset_name="", default_schema_name="schema_ana") + .normalize_dataset_name(Schema("schema_ana")) == "" ) # None dataset name is valid! assert ( - DestinationClientDwhConfiguration( - dataset_name=None, default_schema_name="ban_schema" - ).normalize_dataset_name(Schema("schema_ana")) + DestinationClientDwhConfiguration() + ._bind_dataset_name(dataset_name=None, default_schema_name="ban_schema") + .normalize_dataset_name(Schema("schema_ana")) == "_schema_ana" ) # None dataset name is valid! 
assert ( - DestinationClientDwhConfiguration( - dataset_name=None, default_schema_name="schema_ana" - ).normalize_dataset_name(Schema("schema_ana")) + DestinationClientDwhConfiguration() + ._bind_dataset_name(dataset_name=None, default_schema_name="schema_ana") + .normalize_dataset_name(Schema("schema_ana")) is None ) @@ -176,9 +176,9 @@ def test_normalize_dataset_name() -> None: schema = Schema("barbapapa") schema._schema_name = "BarbaPapa" assert ( - DestinationClientDwhConfiguration( - dataset_name="set", default_schema_name="default" - ).normalize_dataset_name(schema) + DestinationClientDwhConfiguration() + ._bind_dataset_name(dataset_name="set", default_schema_name="default") + .normalize_dataset_name(schema) == "set_barba_papa" ) @@ -186,8 +186,8 @@ def test_normalize_dataset_name() -> None: def test_normalize_dataset_name_none_default_schema() -> None: # if default schema is None, suffix is not added assert ( - DestinationClientDwhConfiguration( - dataset_name="ban_ana_dataset", default_schema_name=None - ).normalize_dataset_name(Schema("default")) + DestinationClientDwhConfiguration() + ._bind_dataset_name(dataset_name="ban_ana_dataset", default_schema_name=None) + .normalize_dataset_name(Schema("default")) == "ban_ana_dataset" ) diff --git a/tests/common/test_utils.py b/tests/common/test_utils.py index 456ef3cb91..229ce17085 100644 --- a/tests/common/test_utils.py +++ b/tests/common/test_utils.py @@ -3,7 +3,7 @@ import binascii import pytest from typing import Dict -from dlt.common.exceptions import IdentifierTooLongException, PipelineException, TerminalValueError +from dlt.common.exceptions import PipelineException, TerminalValueError from dlt.common.runners import Venv from dlt.common.utils import ( @@ -231,6 +231,8 @@ def test_extend_list_deduplicated() -> None: def test_exception_traces() -> None: + from dlt.common.destination.exceptions import IdentifierTooLongException + # bare exception without stack trace trace = 
get_exception_trace(Exception("Message")) assert trace["message"] == "Message" @@ -243,7 +245,7 @@ def test_exception_traces() -> None: raise IdentifierTooLongException("postgres", "table", "too_long_table", 8) except Exception as exc: trace = get_exception_trace(exc) - assert trace["exception_type"] == "dlt.common.exceptions.IdentifierTooLongException" + assert trace["exception_type"] == "dlt.common.destination.exceptions.IdentifierTooLongException" assert isinstance(trace["stack_trace"], list) assert trace["exception_attrs"] == { "destination_name": "postgres", @@ -262,6 +264,8 @@ def test_exception_traces() -> None: def test_exception_trace_chain() -> None: + from dlt.common.destination.exceptions import IdentifierTooLongException + try: raise TerminalValueError("Val") except Exception: @@ -276,7 +280,10 @@ def test_exception_trace_chain() -> None: # outer exception first assert len(traces) == 3 assert traces[0]["exception_type"] == "dlt.common.exceptions.PipelineException" - assert traces[1]["exception_type"] == "dlt.common.exceptions.IdentifierTooLongException" + assert ( + traces[1]["exception_type"] + == "dlt.common.destination.exceptions.IdentifierTooLongException" + ) assert traces[2]["exception_type"] == "dlt.common.exceptions.TerminalValueError" diff --git a/tests/destinations/test_custom_destination.py b/tests/destinations/test_custom_destination.py index 7b74e5406c..cfefceac88 100644 --- a/tests/destinations/test_custom_destination.py +++ b/tests/destinations/test_custom_destination.py @@ -12,16 +12,16 @@ from dlt.common.schema import TTableSchema from dlt.common.data_writers.writers import TLoaderFileFormat from dlt.common.destination.reference import Destination -from dlt.pipeline.exceptions import PipelineStepFailed -from dlt.common.utils import uniq_id -from dlt.common.exceptions import DestinationTerminalException, InvalidDestinationReference +from dlt.common.destination.exceptions import InvalidDestinationReference from 
dlt.common.configuration.exceptions import ConfigFieldMissingException from dlt.common.configuration.specs import ConnectionStringCredentials -from dlt.destinations.impl.destination.factory import _DESTINATIONS -from dlt.destinations.impl.destination.configuration import CustomDestinationClientConfiguration from dlt.common.configuration.inject import get_fun_spec from dlt.common.configuration.specs import BaseConfiguration +from dlt.destinations.impl.destination.factory import _DESTINATIONS +from dlt.destinations.impl.destination.configuration import CustomDestinationClientConfiguration +from dlt.pipeline.exceptions import PipelineStepFailed + from tests.load.utils import ( TABLE_ROW_ALL_DATA_TYPES, TABLE_UPDATE_COLUMNS_SCHEMA, @@ -455,14 +455,13 @@ def my_gcp_sink( def test_destination_with_spec() -> None: @configspec class MyDestinationSpec(CustomDestinationClientConfiguration): - my_predefined_val: str + my_predefined_val: str = None # check destination without additional config params @dlt.destination(spec=MyDestinationSpec) def sink_func_with_spec( items: TDataItems, table: TTableSchema, my_predefined_val=dlt.config.value ) -> None: - # raise DestinationTerminalException("NEVER") pass wrapped_callable = sink_func_with_spec().config_params["destination_callable"] diff --git a/tests/extract/test_decorators.py b/tests/extract/test_decorators.py index 0f19239330..dca4c0be6e 100644 --- a/tests/extract/test_decorators.py +++ b/tests/extract/test_decorators.py @@ -45,6 +45,24 @@ from tests.common.utils import IMPORTED_VERSION_HASH_ETH_V9 +def test_default_resource() -> None: + @dlt.resource + def resource(): + yield [1, 2, 3] + + # simple generated table schema + assert resource().compute_table_schema() == { + "columns": {}, + "name": "resource", + "resource": "resource", + "write_disposition": "append", + } + assert resource().name == "resource" + assert resource._args_bound is False + assert resource.incremental is None + assert resource.write_disposition == 
"append" + + def test_none_returning_source() -> None: with pytest.raises(SourceNotAFunction): dlt.source("data")() # type: ignore[call-overload] diff --git a/tests/extract/test_extract.py b/tests/extract/test_extract.py index b86e198988..1879eaa9eb 100644 --- a/tests/extract/test_extract.py +++ b/tests/extract/test_extract.py @@ -15,6 +15,7 @@ from dlt.extract.extract import ExtractStorage, Extract from dlt.extract.hints import make_hints +from dlt.extract.items import TableNameMeta from tests.utils import clean_test_storage, TEST_STORAGE_ROOT from tests.extract.utils import expect_extracted_file @@ -164,6 +165,52 @@ def with_table_hints(): assert "pk" not in table["columns"] +def test_extract_hints_table_variant(extract_step: Extract) -> None: + os.environ["DATA_WRITER__DISABLE_COMPRESSION"] = "TRUE" + + @dlt.resource(primary_key="pk") + def with_table_hints(): + yield dlt.mark.with_hints( + {"id": 1, "pk": "A"}, + make_hints(table_name="table_a", columns=[{"name": "id", "data_type": "bigint"}]), + create_table_variant=True, + ) + # get the resource + resource = dlt.current.source().resources[dlt.current.resource_name()] + assert "table_a" in resource._hints_variants + # get table + table = resource.compute_table_schema(meta=TableNameMeta("table_a")) + assert "pk" in table["columns"] + assert "id" in table["columns"] + assert table["columns"]["pk"]["primary_key"] is True + assert table["columns"]["id"]["data_type"] == "bigint" + + schema = dlt.current.source_schema() + # table table_a will be created + assert "table_a" in schema.tables + schema_table = schema.tables["table_a"] + assert table == schema_table + + # dispatch to table b + yield dlt.mark.with_hints( + {"id": 2, "pk": "B"}, + make_hints(table_name="table_b", write_disposition="replace"), + create_table_variant=True, + ) + assert "table_b" in resource._hints_variants + # get table + table = resource.compute_table_schema(meta=TableNameMeta("table_b")) + assert table["write_disposition"] == "replace" + 
schema_table = schema.tables["table_b"] + assert table == schema_table + + # item to resource + yield {"id": 3, "pk": "C"} + + source = DltSource(dlt.Schema("hintable"), "module", [with_table_hints]) + extract_step.extract(source, 20, 1) + + # def test_extract_pipe_from_unknown_resource(): # pass diff --git a/tests/extract/test_sources.py b/tests/extract/test_sources.py index d9c73dfb20..6ff1a0bf5f 100644 --- a/tests/extract/test_sources.py +++ b/tests/extract/test_sources.py @@ -12,6 +12,7 @@ from dlt.common.typing import TDataItems from dlt.extract import DltResource, DltSource, Incremental +from dlt.extract.items import TableNameMeta from dlt.extract.source import DltResourceDict from dlt.extract.exceptions import ( DataItemRequiredForDynamicTableHints, @@ -1362,6 +1363,57 @@ def empty_gen(): assert table["columns"]["tags"] == {"name": "tags"} +def test_apply_hints_table_variants() -> None: + def empty_gen(): + yield [1, 2, 3] + + empty = DltResource.from_data(empty_gen) + + # table name must be a string + with pytest.raises(ValueError): + empty.apply_hints(write_disposition="append", create_table_variant=True) + with pytest.raises(ValueError): + empty.apply_hints( + table_name=lambda ev: ev["t"], write_disposition="append", create_table_variant=True + ) + + # table a with replace + empty.apply_hints(table_name="table_a", write_disposition="replace", create_table_variant=True) + table_a = empty.compute_table_schema(meta=TableNameMeta("table_a")) + assert table_a["name"] == "table_a" + assert table_a["write_disposition"] == "replace" + + # unknown table (without variant) - created out resource hints + table_unk = empty.compute_table_schema(meta=TableNameMeta("table_unk")) + assert table_unk["name"] == "empty_gen" + assert table_unk["write_disposition"] == "append" + + # resource hints are base for table variants + empty.apply_hints( + primary_key="id", + incremental=dlt.sources.incremental(cursor_path="x"), + columns=[{"name": "id", "data_type": "bigint"}], + ) + 
empty.apply_hints(table_name="table_b", write_disposition="merge", create_table_variant=True) + table_b = empty.compute_table_schema(meta=TableNameMeta("table_b")) + assert table_b["name"] == "table_b" + assert table_b["write_disposition"] == "merge" + assert len(table_b["columns"]) == 1 + assert table_b["columns"]["id"]["primary_key"] is True + # overwrite table_b, remove column def and primary_key + empty.apply_hints(table_name="table_b", columns=[], primary_key=(), create_table_variant=True) + table_b = empty.compute_table_schema(meta=TableNameMeta("table_b")) + assert table_b["name"] == "table_b" + assert table_b["write_disposition"] == "merge" + assert len(table_b["columns"]) == 0 + + # dyn hints not allowed + with pytest.raises(InconsistentTableTemplate): + empty.apply_hints( + table_name="table_b", write_disposition=lambda ev: ev["wd"], create_table_variant=True + ) + + def test_resource_no_template() -> None: empty = DltResource.from_data([1, 2, 3], name="table") assert empty.write_disposition == "append" diff --git a/tests/helpers/dbt_tests/local/utils.py b/tests/helpers/dbt_tests/local/utils.py index 7097140a83..8fd3dba44f 100644 --- a/tests/helpers/dbt_tests/local/utils.py +++ b/tests/helpers/dbt_tests/local/utils.py @@ -40,7 +40,9 @@ def setup_rasa_runner( runner = create_runner( Venv.restore_current(), # credentials are exported to env in setup_rasa_runner_client - DestinationClientDwhConfiguration(dataset_name=dataset_name or FIXTURES_DATASET_NAME), + DestinationClientDwhConfiguration()._bind_dataset_name( + dataset_name=dataset_name or FIXTURES_DATASET_NAME + ), TEST_STORAGE_ROOT, package_profile_name=profile_name, config=C, diff --git a/tests/helpers/providers/test_google_secrets_provider.py b/tests/helpers/providers/test_google_secrets_provider.py index d6d94774b9..00c54b5705 100644 --- a/tests/helpers/providers/test_google_secrets_provider.py +++ b/tests/helpers/providers/test_google_secrets_provider.py @@ -1,6 +1,5 @@ -import dlt from dlt import 
TSecretValue -from dlt.common import logger +from dlt.common.runtime.init import init_logging from dlt.common.configuration.specs import GcpServiceAccountCredentials from dlt.common.configuration.providers import GoogleSecretsProvider from dlt.common.configuration.accessors import secrets @@ -24,7 +23,7 @@ def test_regular_keys() -> None: - logger.init_logging(RunConfiguration()) + init_logging(RunConfiguration()) # copy bigquery credentials into providers credentials c = resolve_configuration( GcpServiceAccountCredentials(), sections=(known_sections.DESTINATION, "bigquery") diff --git a/tests/load/bigquery/test_bigquery_client.py b/tests/load/bigquery/test_bigquery_client.py index ac17bb8316..a97b612ad0 100644 --- a/tests/load/bigquery/test_bigquery_client.py +++ b/tests/load/bigquery/test_bigquery_client.py @@ -203,7 +203,8 @@ def test_get_oauth_access_token() -> None: def test_bigquery_configuration() -> None: config = resolve_configuration( - BigQueryClientConfiguration(dataset_name="dataset"), sections=("destination", "bigquery") + BigQueryClientConfiguration()._bind_dataset_name(dataset_name="dataset"), + sections=("destination", "bigquery"), ) assert config.location == "US" assert config.get_location() == "US" @@ -215,7 +216,8 @@ def test_bigquery_configuration() -> None: # credential location is deprecated os.environ["CREDENTIALS__LOCATION"] = "EU" config = resolve_configuration( - BigQueryClientConfiguration(dataset_name="dataset"), sections=("destination", "bigquery") + BigQueryClientConfiguration()._bind_dataset_name(dataset_name="dataset"), + sections=("destination", "bigquery"), ) assert config.location == "US" assert config.credentials.location == "EU" @@ -223,17 +225,21 @@ def test_bigquery_configuration() -> None: assert config.get_location() == "EU" os.environ["LOCATION"] = "ATLANTIS" config = resolve_configuration( - BigQueryClientConfiguration(dataset_name="dataset"), sections=("destination", "bigquery") + 
BigQueryClientConfiguration()._bind_dataset_name(dataset_name="dataset"), + sections=("destination", "bigquery"), ) assert config.get_location() == "ATLANTIS" os.environ["DESTINATION__FILE_UPLOAD_TIMEOUT"] = "20000" config = resolve_configuration( - BigQueryClientConfiguration(dataset_name="dataset"), sections=("destination", "bigquery") + BigQueryClientConfiguration()._bind_dataset_name(dataset_name="dataset"), + sections=("destination", "bigquery"), ) assert config.file_upload_timeout == 20000.0 # default fingerprint is empty - assert BigQueryClientConfiguration(dataset_name="dataset").fingerprint() == "" + assert ( + BigQueryClientConfiguration()._bind_dataset_name(dataset_name="dataset").fingerprint() == "" + ) def test_bigquery_job_errors(client: BigQueryClient, file_storage: FileStorage) -> None: diff --git a/tests/load/bigquery/test_bigquery_table_builder.py b/tests/load/bigquery/test_bigquery_table_builder.py index a223de9b26..fd58a6e033 100644 --- a/tests/load/bigquery/test_bigquery_table_builder.py +++ b/tests/load/bigquery/test_bigquery_table_builder.py @@ -4,10 +4,6 @@ from dlt.destinations.impl.bigquery.bigquery_adapter import ( PARTITION_HINT, CLUSTER_HINT, - TABLE_DESCRIPTION_HINT, - ROUND_HALF_EVEN_HINT, - ROUND_HALF_AWAY_FROM_ZERO_HINT, - TABLE_EXPIRATION_HINT, ) import google @@ -17,9 +13,12 @@ import dlt from dlt.common.configuration import resolve_configuration -from dlt.common.configuration.specs import GcpServiceAccountCredentialsWithoutDefaults +from dlt.common.configuration.specs import ( + GcpServiceAccountCredentialsWithoutDefaults, + GcpServiceAccountCredentials, +) from dlt.common.pendulum import pendulum -from dlt.common.schema import Schema, TColumnHint +from dlt.common.schema import Schema from dlt.common.utils import custom_environ from dlt.common.utils import uniq_id from dlt.destinations.exceptions import DestinationSchemaWillNotUpdate @@ -53,13 +52,13 @@ def test_configuration() -> None: @pytest.fixture def 
gcp_client(empty_schema: Schema) -> BigQueryClient: # return a client without opening connection - creds = GcpServiceAccountCredentialsWithoutDefaults() + creds = GcpServiceAccountCredentials() creds.project_id = "test_project_id" # noinspection PydanticTypeChecker return BigQueryClient( empty_schema, - BigQueryClientConfiguration( - dataset_name=f"test_{uniq_id()}", credentials=creds # type: ignore[arg-type] + BigQueryClientConfiguration(credentials=creds)._bind_dataset_name( + dataset_name=f"test_{uniq_id()}" ), ) diff --git a/tests/load/clickhouse/test_clickhouse_configuration.py b/tests/load/clickhouse/test_clickhouse_configuration.py index 9beb847f85..d6b41c0189 100644 --- a/tests/load/clickhouse/test_clickhouse_configuration.py +++ b/tests/load/clickhouse/test_clickhouse_configuration.py @@ -20,7 +20,7 @@ def test_connection_string_with_all_params() -> None: url = "clickhouse://user1:pass1@host1:9000/testdb?secure=0&connect_timeout=230&send_receive_timeout=1000" - creds = ClickhouseCredentials() + creds = ClickhouseCredentials() # type: ignore creds.parse_native_representation(url) assert creds.database == "testdb" diff --git a/tests/load/clickhouse/test_clickhouse_table_builder.py b/tests/load/clickhouse/test_clickhouse_table_builder.py index dad682a108..7efab56464 100644 --- a/tests/load/clickhouse/test_clickhouse_table_builder.py +++ b/tests/load/clickhouse/test_clickhouse_table_builder.py @@ -17,7 +17,7 @@ @pytest.fixture def clickhouse_client(empty_schema: Schema) -> ClickhouseClient: # Return a client without opening connection. 
- creds = ClickhouseCredentials() + creds = ClickhouseCredentials() # type: ignore return ClickhouseClient( empty_schema, ClickhouseClientConfiguration(dataset_name=f"test_{uniq_id()}", credentials=creds), @@ -34,7 +34,7 @@ def test_clickhouse_configuration() -> None: "DESTINATION__CLICKHOUSE__CREDENTIALS__PASSWORD": "fuss_do_rah", } ): - C = resolve_configuration(ClickhouseCredentials(), sections=("destination", "clickhouse")) + C = resolve_configuration(ClickhouseCredentials(), sections=("destination", "clickhouse")) # type: ignore assert C.database == "mydb" assert C.password == "fuss_do_rah" @@ -42,7 +42,7 @@ def test_clickhouse_configuration() -> None: assert ClickhouseClientConfiguration().fingerprint() == "" # Based on host. c = resolve_configuration( - ClickhouseCredentials(), + ClickhouseCredentials(), # type: ignore explicit_value="clickhouse://user1:pass@host1/db1", ) assert ClickhouseClientConfiguration(credentials=c).fingerprint() == digest128("host1") diff --git a/tests/load/databricks/test_databricks_configuration.py b/tests/load/databricks/test_databricks_configuration.py index 9127e39be4..8d30d05e42 100644 --- a/tests/load/databricks/test_databricks_configuration.py +++ b/tests/load/databricks/test_databricks_configuration.py @@ -17,7 +17,9 @@ def test_databricks_credentials_to_connector_params(): # JSON encoded dict of extra args os.environ["CREDENTIALS__CONNECTION_PARAMETERS"] = '{"extra_a": "a", "extra_b": "b"}' - config = resolve_configuration(DatabricksClientConfiguration(dataset_name="my-dataset")) + config = resolve_configuration( + DatabricksClientConfiguration()._bind_dataset_name(dataset_name="my-dataset") + ) credentials = config.credentials diff --git a/tests/load/duckdb/test_duckdb_client.py b/tests/load/duckdb/test_duckdb_client.py index ef151833e4..3deed7a77d 100644 --- a/tests/load/duckdb/test_duckdb_client.py +++ b/tests/load/duckdb/test_duckdb_client.py @@ -31,7 +31,9 @@ def test_duckdb_open_conn_default() -> None: 
delete_quack_db() try: get_resolved_traces().clear() - c = resolve_configuration(DuckDbClientConfiguration(dataset_name="test_dataset")) + c = resolve_configuration( + DuckDbClientConfiguration()._bind_dataset_name(dataset_name="test_dataset") + ) # print(str(c.credentials)) # print(str(os.getcwd())) # print(get_resolved_traces()) @@ -52,11 +54,15 @@ def test_duckdb_open_conn_default() -> None: def test_duckdb_database_path() -> None: # resolve without any path provided - c = resolve_configuration(DuckDbClientConfiguration(dataset_name="test_dataset")) + c = resolve_configuration( + DuckDbClientConfiguration()._bind_dataset_name(dataset_name="test_dataset") + ) assert c.credentials._conn_str().lower() == os.path.abspath("quack.duckdb").lower() # resolve without any path but with pipeline context p = dlt.pipeline(pipeline_name="quack_pipeline") - c = resolve_configuration(DuckDbClientConfiguration(dataset_name="test_dataset")) + c = resolve_configuration( + DuckDbClientConfiguration()._bind_dataset_name(dataset_name="test_dataset") + ) # still cwd db_path = os.path.abspath(os.path.join(".", "quack_pipeline.duckdb")) assert c.credentials._conn_str().lower() == db_path.lower() @@ -75,7 +81,9 @@ def test_duckdb_database_path() -> None: # test special :pipeline: path to create in pipeline folder c = resolve_configuration( - DuckDbClientConfiguration(dataset_name="test_dataset", credentials=":pipeline:") + DuckDbClientConfiguration(credentials=":pipeline:")._bind_dataset_name( + dataset_name="test_dataset" + ) ) db_path = os.path.abspath(os.path.join(p.working_dir, DEFAULT_DUCK_DB_NAME)) assert c.credentials._conn_str().lower() == db_path.lower() @@ -90,8 +98,8 @@ def test_duckdb_database_path() -> None: db_path = "_storage/test_quack.duckdb" c = resolve_configuration( DuckDbClientConfiguration( - dataset_name="test_dataset", credentials="duckdb:///_storage/test_quack.duckdb" - ) + credentials="duckdb:///_storage/test_quack.duckdb" + 
)._bind_dataset_name(dataset_name="test_dataset") ) assert c.credentials._conn_str().lower() == os.path.abspath(db_path).lower() conn = c.credentials.borrow_conn(read_only=False) @@ -102,7 +110,9 @@ def test_duckdb_database_path() -> None: # provide absolute path db_path = os.path.abspath("_storage/abs_test_quack.duckdb") c = resolve_configuration( - DuckDbClientConfiguration(dataset_name="test_dataset", credentials=f"duckdb:///{db_path}") + DuckDbClientConfiguration(credentials=f"duckdb:///{db_path}")._bind_dataset_name( + dataset_name="test_dataset", + ) ) assert os.path.isabs(c.credentials.database) assert c.credentials._conn_str().lower() == db_path.lower() @@ -114,7 +124,9 @@ def test_duckdb_database_path() -> None: # set just path as credentials db_path = "_storage/path_test_quack.duckdb" c = resolve_configuration( - DuckDbClientConfiguration(dataset_name="test_dataset", credentials=db_path) + DuckDbClientConfiguration(credentials=db_path)._bind_dataset_name( + dataset_name="test_dataset" + ) ) assert c.credentials._conn_str().lower() == os.path.abspath(db_path).lower() conn = c.credentials.borrow_conn(read_only=False) @@ -124,7 +136,9 @@ def test_duckdb_database_path() -> None: db_path = os.path.abspath("_storage/abs_path_test_quack.duckdb") c = resolve_configuration( - DuckDbClientConfiguration(dataset_name="test_dataset", credentials=db_path) + DuckDbClientConfiguration(credentials=db_path)._bind_dataset_name( + dataset_name="test_dataset" + ) ) assert os.path.isabs(c.credentials.database) assert c.credentials._conn_str().lower() == db_path.lower() @@ -138,7 +152,9 @@ def test_duckdb_database_path() -> None: with pytest.raises(duckdb.IOException): c = resolve_configuration( - DuckDbClientConfiguration(dataset_name="test_dataset", credentials=TEST_STORAGE_ROOT) + DuckDbClientConfiguration(credentials=TEST_STORAGE_ROOT)._bind_dataset_name( + dataset_name="test_dataset" + ) ) conn = c.credentials.borrow_conn(read_only=False) @@ -225,7 +241,7 @@ def 
test_external_duckdb_database() -> None: # pass explicit in memory database conn = duckdb.connect(":memory:") c = resolve_configuration( - DuckDbClientConfiguration(dataset_name="test_dataset", credentials=conn) + DuckDbClientConfiguration(credentials=conn)._bind_dataset_name(dataset_name="test_dataset") ) assert c.credentials._conn_borrows == 0 assert c.credentials._conn is conn diff --git a/tests/load/duckdb/test_duckdb_table_builder.py b/tests/load/duckdb/test_duckdb_table_builder.py index 9b12e04f77..542b18993c 100644 --- a/tests/load/duckdb/test_duckdb_table_builder.py +++ b/tests/load/duckdb/test_duckdb_table_builder.py @@ -14,7 +14,10 @@ @pytest.fixture def client(empty_schema: Schema) -> DuckDbClient: # return client without opening connection - return DuckDbClient(empty_schema, DuckDbClientConfiguration(dataset_name="test_" + uniq_id())) + return DuckDbClient( + empty_schema, + DuckDbClientConfiguration()._bind_dataset_name(dataset_name="test_" + uniq_id()), + ) def test_create_table(client: DuckDbClient) -> None: @@ -89,7 +92,9 @@ def test_create_table_with_hints(client: DuckDbClient) -> None: # same thing with indexes client = DuckDbClient( client.schema, - DuckDbClientConfiguration(dataset_name="test_" + uniq_id(), create_indexes=True), + DuckDbClientConfiguration(create_indexes=True)._bind_dataset_name( + dataset_name="test_" + uniq_id() + ), ) sql = client._get_table_update_sql("event_test_table", mod_update, False)[0] sqlfluff.parse(sql) diff --git a/tests/load/duckdb/test_motherduck_client.py b/tests/load/duckdb/test_motherduck_client.py index d57cf58f53..ba60e0de6d 100644 --- a/tests/load/duckdb/test_motherduck_client.py +++ b/tests/load/duckdb/test_motherduck_client.py @@ -19,13 +19,15 @@ def test_motherduck_database() -> None: # os.environ.pop("HOME", None) cred = MotherDuckCredentials("md:///?token=TOKEN") + print(dict(cred)) assert cred.password == "TOKEN" cred = MotherDuckCredentials() cred.parse_native_representation("md:///?token=TOKEN") 
assert cred.password == "TOKEN" config = resolve_configuration( - MotherDuckClientConfiguration(dataset_name="test"), sections=("destination", "motherduck") + MotherDuckClientConfiguration()._bind_dataset_name(dataset_name="test"), + sections=("destination", "motherduck"), ) # connect con = config.credentials.borrow_conn(read_only=False) diff --git a/tests/load/filesystem/test_aws_credentials.py b/tests/load/filesystem/test_aws_credentials.py index 7a0d42eb6d..62c2e3cd85 100644 --- a/tests/load/filesystem/test_aws_credentials.py +++ b/tests/load/filesystem/test_aws_credentials.py @@ -45,7 +45,7 @@ def test_aws_credentials_from_botocore(environment: Dict[str, str]) -> None: session = botocore.session.get_session() region_name = "eu-central-1" # session.get_config_variable('region') - c = AwsCredentials(session) + c = AwsCredentials.from_session(session) assert c.profile_name is None assert c.aws_access_key_id == "fake_access_key" assert c.region_name == region_name @@ -83,7 +83,7 @@ def test_aws_credentials_from_boto3(environment: Dict[str, str]) -> None: session = boto3.Session() - c = AwsCredentials(session) + c = AwsCredentials.from_session(session) assert c.profile_name is None assert c.aws_access_key_id == "fake_access_key" assert c.region_name == session.region_name diff --git a/tests/load/filesystem/utils.py b/tests/load/filesystem/utils.py index 1232be5c43..6e697fdef9 100644 --- a/tests/load/filesystem/utils.py +++ b/tests/load/filesystem/utils.py @@ -14,7 +14,7 @@ from dlt.common.configuration.container import Container from dlt.common.configuration.specs.config_section_context import ConfigSectionContext -from dlt.common.destination.reference import LoadJob, TDestination +from dlt.common.destination.reference import LoadJob from dlt.common.pendulum import timedelta, __utcnow from dlt.destinations import filesystem from dlt.destinations.impl.filesystem.filesystem import FilesystemClient @@ -24,11 +24,11 @@ def setup_loader(dataset_name: str) -> Load: - 
destination: TDestination = filesystem() # type: ignore[assignment] - config = filesystem.spec(dataset_name=dataset_name) + destination = filesystem() + config = destination.spec()._bind_dataset_name(dataset_name=dataset_name) # setup loader with Container().injectable_context(ConfigSectionContext(sections=("filesystem",))): - return Load(destination, initial_client_config=config) + return Load(destination, initial_client_config=config) # type: ignore[arg-type] @contextmanager diff --git a/tests/load/mssql/test_mssql_table_builder.py b/tests/load/mssql/test_mssql_table_builder.py index 75f46e8905..1b4a77a2ab 100644 --- a/tests/load/mssql/test_mssql_table_builder.py +++ b/tests/load/mssql/test_mssql_table_builder.py @@ -17,7 +17,9 @@ def client(empty_schema: Schema) -> MsSqlClient: # return client without opening connection return MsSqlClient( empty_schema, - MsSqlClientConfiguration(dataset_name="test_" + uniq_id(), credentials=MsSqlCredentials()), + MsSqlClientConfiguration(credentials=MsSqlCredentials())._bind_dataset_name( + dataset_name="test_" + uniq_id() + ), ) diff --git a/tests/load/pipeline/test_drop.py b/tests/load/pipeline/test_drop.py index 8614af4734..afae1c22ca 100644 --- a/tests/load/pipeline/test_drop.py +++ b/tests/load/pipeline/test_drop.py @@ -56,7 +56,11 @@ def droppable_d( dlt.state()["data_from_d"] = {"foo1": {"bar": 1}, "foo2": {"bar": 2}} yield [dict(o=55), dict(o=22)] - return [droppable_a(), droppable_b(), droppable_c(), droppable_d()] + @dlt.resource(selected=True) + def droppable_no_state(): + yield [1, 2, 3] + + return [droppable_a(), droppable_b(), droppable_c(), droppable_d(), droppable_no_state] RESOURCE_TABLES = dict( @@ -64,8 +68,11 @@ def droppable_d( droppable_b=["droppable_b", "droppable_b__items"], droppable_c=["droppable_c", "droppable_c__items", "droppable_c__items__labels"], droppable_d=["droppable_d"], + droppable_no_state=["droppable_no_state"], ) +NO_STATE_RESOURCES = {"droppable_no_state"} + def 
assert_dropped_resources(pipeline: Pipeline, resources: List[str]) -> None: assert_dropped_resource_tables(pipeline, resources) @@ -95,7 +102,7 @@ def assert_dropped_resource_tables(pipeline: Pipeline, resources: List[str]) -> def assert_dropped_resource_states(pipeline: Pipeline, resources: List[str]) -> None: # Verify only requested resource keys are removed from state - all_resources = set(RESOURCE_TABLES.keys()) + all_resources = set(RESOURCE_TABLES.keys()) - NO_STATE_RESOURCES expected_keys = all_resources - set(resources) sources_state = pipeline.state["sources"] result_keys = set(sources_state["droppable"]["resources"].keys()) @@ -109,6 +116,8 @@ def assert_destination_state_loaded(pipeline: Pipeline) -> None: destination_state = state_sync.load_pipeline_state_from_destination( pipeline.pipeline_name, client ) + # current pipeline schema available in the destination + client.get_stored_schema_by_hash(pipeline.default_schema.version_hash) pipeline_state = dict(pipeline.state) del pipeline_state["_local"] assert pipeline_state == destination_state @@ -144,8 +153,7 @@ def test_drop_command_resources_and_state(destination_config: DestinationTestCon "destination_config", destinations_configs(default_sql_configs=True), ids=lambda x: x.name ) def test_drop_command_only_state(destination_config: DestinationTestConfiguration) -> None: - """Test the drop command with resource and state path options and - verify correct data is deleted from destination and locally""" + """Test drop command that deletes part of the state and syncs with destination""" source = droppable_source() pipeline = destination_config.setup_pipeline("drop_test_" + uniq_id(), full_refresh=True) pipeline.run(source) @@ -164,6 +172,28 @@ def test_drop_command_only_state(destination_config: DestinationTestConfiguratio assert_destination_state_loaded(pipeline) +@pytest.mark.parametrize( + "destination_config", destinations_configs(default_sql_configs=True), ids=lambda x: x.name +) +def 
test_drop_command_only_tables(destination_config: DestinationTestConfiguration) -> None: + """Test drop only tables and makes sure that schema and state are synced""" + source = droppable_source() + pipeline = destination_config.setup_pipeline("drop_test_" + uniq_id(), full_refresh=True) + pipeline.run(source) + sources_state = pipeline.state["sources"] + + attached = _attach(pipeline) + helpers.drop(attached, resources=["droppable_no_state"]) + + attached = _attach(pipeline) + + assert_dropped_resources(attached, ["droppable_no_state"]) + # source state didn't change + assert pipeline.state["sources"] == sources_state + + assert_destination_state_loaded(pipeline) + + @pytest.mark.parametrize( "destination_config", destinations_configs(default_sql_configs=True), ids=lambda x: x.name ) @@ -202,7 +232,7 @@ def test_fail_after_drop_tables(destination_config: DestinationTestConfiguration attached = _attach(pipeline) with mock.patch.object( - helpers.DropCommand, "_drop_state_keys", side_effect=RuntimeError("Something went wrong") + helpers.DropCommand, "_extract_state", side_effect=RuntimeError("Something went wrong") ): with pytest.raises(RuntimeError): helpers.drop(attached, resources=("droppable_a", "droppable_b")) diff --git a/tests/load/pipeline/test_pipelines.py b/tests/load/pipeline/test_pipelines.py index 05c70e2f62..017bef2c01 100644 --- a/tests/load/pipeline/test_pipelines.py +++ b/tests/load/pipeline/test_pipelines.py @@ -6,13 +6,16 @@ import dlt -from dlt.common.pipeline import SupportsPipeline from dlt.common import json, sleep +from dlt.common.pipeline import SupportsPipeline from dlt.common.destination import Destination +from dlt.common.destination.exceptions import DestinationHasFailedJobs +from dlt.common.schema.exceptions import CannotCoerceColumnException from dlt.common.schema.schema import Schema from dlt.common.schema.typing import VERSION_TABLE_NAME from dlt.common.typing import TDataItem from dlt.common.utils import uniq_id + from 
dlt.destinations.exceptions import DatabaseUndefinedRelation from dlt.extract.exceptions import ResourceNameMissing from dlt.extract import DltSource @@ -21,8 +24,6 @@ PipelineConfigMissing, PipelineStepFailed, ) -from dlt.common.schema.exceptions import CannotCoerceColumnException -from dlt.common.exceptions import DestinationHasFailedJobs from tests.utils import TEST_STORAGE_ROOT, data_to_item_format, preserve_environ from tests.pipeline.utils import assert_data_table_counts, assert_load_info diff --git a/tests/load/pipeline/test_restore_state.py b/tests/load/pipeline/test_restore_state.py index 02da91cefe..e50654adcc 100644 --- a/tests/load/pipeline/test_restore_state.py +++ b/tests/load/pipeline/test_restore_state.py @@ -6,12 +6,12 @@ import dlt from dlt.common import pendulum -from dlt.common.schema.schema import Schema, utils -from dlt.common.utils import custom_environ, uniq_id -from dlt.common.exceptions import DestinationUndefinedEntity +from dlt.common.schema.schema import Schema +from dlt.common.utils import uniq_id +from dlt.common.destination.exceptions import DestinationUndefinedEntity + from dlt.load import Load from dlt.pipeline.exceptions import SqlClientNotAvailable - from dlt.pipeline.pipeline import Pipeline from dlt.pipeline.state_sync import ( STATE_TABLE_COLUMNS, @@ -207,6 +207,7 @@ def _make_dn_name(schema_name: str) -> str: job_client ) == default_schema.naming.normalize_table_identifier(dataset_name) schema_two = Schema("two") + schema_two._bump_version() with p._get_destination_clients(schema_two)[0] as job_client: # use the job_client to do that job_client.initialize_storage() diff --git a/tests/load/postgres/test_postgres_client.py b/tests/load/postgres/test_postgres_client.py index daabf6fc51..896e449b28 100644 --- a/tests/load/postgres/test_postgres_client.py +++ b/tests/load/postgres/test_postgres_client.py @@ -62,6 +62,10 @@ def test_postgres_credentials_native_value(environment) -> None: assert c.is_resolved() assert c.password == 
"loader" + c = PostgresCredentials("postgres://loader:loader@localhost/dlt_data") + assert c.password == "loader" + assert c.database == "dlt_data" + def test_postgres_credentials_timeout() -> None: # test postgres timeout diff --git a/tests/load/postgres/test_postgres_table_builder.py b/tests/load/postgres/test_postgres_table_builder.py index fde9d82cf7..0ab1343a3b 100644 --- a/tests/load/postgres/test_postgres_table_builder.py +++ b/tests/load/postgres/test_postgres_table_builder.py @@ -2,7 +2,6 @@ from copy import deepcopy import sqlfluff -from dlt.common.schema.utils import new_table from dlt.common.utils import uniq_id from dlt.common.schema import Schema @@ -20,8 +19,8 @@ def client(empty_schema: Schema) -> PostgresClient: # return client without opening connection return PostgresClient( empty_schema, - PostgresClientConfiguration( - dataset_name="test_" + uniq_id(), credentials=PostgresCredentials() + PostgresClientConfiguration(credentials=PostgresCredentials())._bind_dataset_name( + dataset_name="test_" + uniq_id() ), ) @@ -97,10 +96,9 @@ def test_create_table_with_hints(client: PostgresClient) -> None: client = PostgresClient( client.schema, PostgresClientConfiguration( - dataset_name="test_" + uniq_id(), create_indexes=False, credentials=PostgresCredentials(), - ), + )._bind_dataset_name(dataset_name="test_" + uniq_id()), ) sql = client._get_table_update_sql("event_test_table", mod_update, False)[0] sqlfluff.parse(sql, dialect="postgres") diff --git a/tests/load/redshift/test_redshift_table_builder.py b/tests/load/redshift/test_redshift_table_builder.py index c6981e5553..bc132c7818 100644 --- a/tests/load/redshift/test_redshift_table_builder.py +++ b/tests/load/redshift/test_redshift_table_builder.py @@ -20,8 +20,8 @@ def client(empty_schema: Schema) -> RedshiftClient: # return client without opening connection return RedshiftClient( empty_schema, - RedshiftClientConfiguration( - dataset_name="test_" + uniq_id(), credentials=RedshiftCredentials() + 
RedshiftClientConfiguration(credentials=RedshiftCredentials())._bind_dataset_name( + dataset_name="test_" + uniq_id() ), ) diff --git a/tests/load/snowflake/test_snowflake_table_builder.py b/tests/load/snowflake/test_snowflake_table_builder.py index 1e80a61f1c..5d7108803e 100644 --- a/tests/load/snowflake/test_snowflake_table_builder.py +++ b/tests/load/snowflake/test_snowflake_table_builder.py @@ -21,7 +21,9 @@ def snowflake_client(empty_schema: Schema) -> SnowflakeClient: creds = SnowflakeCredentials() return SnowflakeClient( empty_schema, - SnowflakeClientConfiguration(dataset_name="test_" + uniq_id(), credentials=creds), + SnowflakeClientConfiguration(credentials=creds)._bind_dataset_name( + dataset_name="test_" + uniq_id() + ), ) diff --git a/tests/load/synapse/test_synapse_table_builder.py b/tests/load/synapse/test_synapse_table_builder.py index 871ceecf96..8575835820 100644 --- a/tests/load/synapse/test_synapse_table_builder.py +++ b/tests/load/synapse/test_synapse_table_builder.py @@ -25,8 +25,8 @@ def client(empty_schema: Schema) -> SynapseClient: # return client without opening connection client = SynapseClient( empty_schema, - SynapseClientConfiguration( - dataset_name="test_" + uniq_id(), credentials=SynapseCredentials() + SynapseClientConfiguration(credentials=SynapseCredentials())._bind_dataset_name( + dataset_name="test_" + uniq_id() ), ) assert client.config.create_indexes is False @@ -39,8 +39,8 @@ def client_with_indexes_enabled(empty_schema: Schema) -> SynapseClient: client = SynapseClient( empty_schema, SynapseClientConfiguration( - dataset_name="test_" + uniq_id(), credentials=SynapseCredentials(), create_indexes=True - ), + credentials=SynapseCredentials(), create_indexes=True + )._bind_dataset_name(dataset_name="test_" + uniq_id()), ) assert client.config.create_indexes is True return client diff --git a/tests/load/test_dummy_client.py b/tests/load/test_dummy_client.py index d7884abcf0..c5e4f874fc 100644 --- a/tests/load/test_dummy_client.py 
+++ b/tests/load/test_dummy_client.py @@ -6,8 +6,7 @@ from typing import List from dlt.common.exceptions import TerminalException, TerminalValueError -from dlt.common.schema.typing import TWriteDisposition -from dlt.common.storages import FileStorage, LoadStorage, PackageStorage, ParsedLoadJobFileName +from dlt.common.storages import FileStorage, PackageStorage, ParsedLoadJobFileName from dlt.common.storages.load_package import LoadJobInfo from dlt.common.storages.load_storage import JobWithUnsupportedWriterException from dlt.common.destination.reference import LoadJob, TDestination @@ -814,7 +813,9 @@ def setup_loader( if filesystem_staging: # do not accept jsonl to not conflict with filesystem destination client_config = client_config or DummyClientConfiguration(loader_file_format="reference") - staging_system_config = FilesystemDestinationClientConfiguration(dataset_name="dummy") + staging_system_config = FilesystemDestinationClientConfiguration()._bind_dataset_name( + dataset_name="dummy" + ) staging_system_config.as_staging = True os.makedirs(REMOTE_FILESYSTEM) staging = filesystem(bucket_url=REMOTE_FILESYSTEM) diff --git a/tests/load/test_job_client.py b/tests/load/test_job_client.py index 63f9d3c28d..2e23086f81 100644 --- a/tests/load/test_job_client.py +++ b/tests/load/test_job_client.py @@ -111,7 +111,7 @@ def test_get_update_basic_schema(client: SqlJobClientBase) -> None: # modify schema schema.tables["event_slot"]["write_disposition"] = "replace" - schema.bump_version() + schema._bump_version() assert schema.version > this_schema.version # update in storage @@ -126,7 +126,7 @@ def test_get_update_basic_schema(client: SqlJobClientBase) -> None: # in that case the version will not change or go down first_schema = Schema.from_dict(json.loads(first_version_schema)) first_schema.tables["event_bot"]["write_disposition"] = "replace" - first_schema.bump_version() + first_schema._bump_version() assert first_schema.version == this_schema.version == 2 # wait to 
make load_newest_schema deterministic sleep(0.1) @@ -143,7 +143,7 @@ def test_get_update_basic_schema(client: SqlJobClientBase) -> None: # mock other schema in client and get the newest schema. it should not exist... client.schema = Schema("ethereum") assert client.get_stored_schema() is None - client.schema.bump_version() + client.schema._bump_version() schema_update = client.update_stored_schema() # no schema updates because schema has no tables assert schema_update == {} @@ -206,7 +206,7 @@ def test_schema_update_create_table_redshift(client: SqlJobClientBase) -> None: record_hash = schema._infer_column("_dlt_id", "m,i0392903jdlkasjdlk") assert record_hash["unique"] is True schema.update_table(new_table(table_name, columns=[timestamp, sender_id, record_hash])) - schema.bump_version() + schema._bump_version() schema_update = client.update_stored_schema() # check hints in schema update table_update = schema_update[table_name]["columns"] @@ -233,7 +233,7 @@ def test_schema_update_create_table_bigquery(client: SqlJobClientBase) -> None: # this will be not null record_hash = schema._infer_column("_dlt_id", "m,i0392903jdlkasjdlk") schema.update_table(new_table("event_test_table", columns=[timestamp, sender_id, record_hash])) - schema.bump_version() + schema._bump_version() schema_update = client.update_stored_schema() # check hints in schema update table_update = schema_update["event_test_table"]["columns"] @@ -259,7 +259,7 @@ def test_schema_update_alter_table(client: SqlJobClientBase) -> None: col1 = schema._infer_column("col1", "string") table_name = "event_test_table" + uniq_id() schema.update_table(new_table(table_name, columns=[col1])) - schema.bump_version() + schema._bump_version() schema_update = client.update_stored_schema() assert table_name in schema_update assert len(schema_update[table_name]["columns"]) == 1 @@ -267,7 +267,7 @@ def test_schema_update_alter_table(client: SqlJobClientBase) -> None: # with single alter table col2 = 
schema._infer_column("col2", 1) schema.update_table(new_table(table_name, columns=[col2])) - schema.bump_version() + schema._bump_version() schema_update = client.update_stored_schema() assert len(schema_update) == 1 assert len(schema_update[table_name]["columns"]) == 1 @@ -278,7 +278,7 @@ def test_schema_update_alter_table(client: SqlJobClientBase) -> None: col4 = schema._infer_column("col4", 182879721.182912) col4["data_type"] = "timestamp" schema.update_table(new_table(table_name, columns=[col3, col4])) - schema.bump_version() + schema._bump_version() schema_update = client.update_stored_schema() assert len(schema_update[table_name]["columns"]) == 2 assert schema_update[table_name]["columns"]["col3"]["data_type"] == "double" @@ -297,7 +297,7 @@ def test_drop_tables(client: SqlJobClientBase) -> None: # Add columns in all tables schema.tables["event_user"]["columns"] = dict(schema.tables["event_slot"]["columns"]) schema.tables["event_bot"]["columns"] = dict(schema.tables["event_slot"]["columns"]) - schema.bump_version() + schema._bump_version() client.update_stored_schema() # Create a second schema with 2 hashes @@ -312,10 +312,10 @@ def test_drop_tables(client: SqlJobClientBase) -> None: schema_2.tables[tbl_name + "_2"]["name"] = tbl_name + "_2" client.schema = schema_2 - client.schema.bump_version() + client.schema._bump_version() client.update_stored_schema() client.schema.tables["event_slot_2"]["columns"]["value"]["nullable"] = False - client.schema.bump_version() + client.schema._bump_version() client.update_stored_schema() # Drop tables from the first schema @@ -323,7 +323,7 @@ def test_drop_tables(client: SqlJobClientBase) -> None: tables_to_drop = ["event_slot", "event_user"] for tbl in tables_to_drop: del schema.tables[tbl] - schema.bump_version() + schema._bump_version() client.drop_tables(*tables_to_drop) if isinstance(client, WithStagingDataset): with contextlib.suppress(DatabaseUndefinedRelation): @@ -363,7 +363,7 @@ def 
test_get_storage_table_with_all_types(client: SqlJobClientBase) -> None: schema = client.schema table_name = "event_test_table" + uniq_id() schema.update_table(new_table(table_name, columns=TABLE_UPDATE)) - schema.bump_version() + schema._bump_version() schema_update = client.update_stored_schema() # we have all columns in the update table_update = schema_update[table_name]["columns"] @@ -407,7 +407,7 @@ def test_preserve_column_order(client: SqlJobClientBase) -> None: random.shuffle(columns) schema.update_table(new_table(table_name, columns=columns)) - schema.bump_version() + schema._bump_version() def _assert_columns_order(sql_: str) -> None: idx = 0 @@ -514,7 +514,7 @@ def test_load_with_all_types( table_name, write_disposition=write_disposition, columns=list(column_schemas.values()) ) ) - client.schema.bump_version() + client.schema._bump_version() client.update_stored_schema() if client.should_load_data_to_staging_dataset(client.schema.tables[table_name]): # type: ignore[attr-defined] @@ -569,7 +569,7 @@ def test_write_dispositions( client.schema.update_table( new_table(child_table, columns=TABLE_UPDATE, parent_table_name=table_name) ) - client.schema.bump_version() + client.schema._bump_version() client.update_stored_schema() if write_disposition == "merge": @@ -578,7 +578,7 @@ def test_write_dispositions( # create staging for merge dataset with client.with_staging_dataset(): # type: ignore[attr-defined] client.initialize_storage() - client.schema.bump_version() + client.schema._bump_version() client.update_stored_schema() for idx in range(2): # in the replace strategies, tables get truncated between loads @@ -728,7 +728,7 @@ def _load_something(_client: SqlJobClientBase, expected_rows: int) -> None: user_table = load_table("event_user")["event_user"] client.schema.update_table(new_table("event_user", columns=list(user_table.values()))) - client.schema.bump_version() + client.schema._bump_version() schema_update = client.update_stored_schema() assert 
len(schema_update) > 0 @@ -741,7 +741,7 @@ def _load_something(_client: SqlJobClientBase, expected_rows: int) -> None: event_2_schema = Schema.from_stored_schema(schema_dict) # swap schemas in client instance client.schema = event_2_schema - client.schema.bump_version() + client.schema._bump_version() schema_update = client.update_stored_schema() # no were detected - even if the schema is new. all the tables overlap assert schema_update == {} @@ -760,7 +760,7 @@ def _load_something(_client: SqlJobClientBase, expected_rows: int) -> None: event_3_schema.tables["event_user"]["columns"]["input_channel"]["nullable"] = False # swap schemas in client instance client.schema = event_3_schema - client.schema.bump_version() + client.schema._bump_version() schema_update = client.update_stored_schema() # no were detected - even if the schema is new. all the tables overlap and change in nullability does not do any updates assert schema_update == {} @@ -771,7 +771,7 @@ def _load_something(_client: SqlJobClientBase, expected_rows: int) -> None: event_3_schema.tables["event_user"]["columns"]["mandatory_column"] = new_column( "mandatory_column", "text", nullable=False ) - client.schema.bump_version() + client.schema._bump_version() with pytest.raises(DatabaseException) as py_ex: client.update_stored_schema() assert ( @@ -788,6 +788,6 @@ def prepare_schema(client: SqlJobClientBase, case: str) -> Tuple[List[Dict[str, table: TTableSchemaColumns = {k: client.schema._infer_column(k, v) for k, v in rows[0].items()} table_name = f"event_{case}_{uniq_id()}" client.schema.update_table(new_table(table_name, columns=list(table.values()))) - client.schema.bump_version() + client.schema._bump_version() client.update_stored_schema() return rows, table_name diff --git a/tests/load/test_sql_client.py b/tests/load/test_sql_client.py index 026e481ede..d82925a7d3 100644 --- a/tests/load/test_sql_client.py +++ b/tests/load/test_sql_client.py @@ -5,17 +5,17 @@ from time import sleep from dlt.common 
import pendulum, Decimal -from dlt.common.exceptions import IdentifierTooLongException +from dlt.common.destination.exceptions import IdentifierTooLongException from dlt.common.schema.typing import LOADS_TABLE_NAME, VERSION_TABLE_NAME from dlt.common.storages import FileStorage -from dlt.common.utils import derives_from_class_of_name, uniq_id +from dlt.common.utils import uniq_id + from dlt.destinations.exceptions import ( DatabaseException, DatabaseTerminalException, DatabaseTransientException, DatabaseUndefinedRelation, ) - from dlt.destinations.sql_client import DBApiCursor, SqlClientBase from dlt.destinations.job_client_impl import SqlJobClientBase from dlt.destinations.typing import TNativeConn @@ -570,7 +570,7 @@ def test_max_column_identifier_length(client: SqlJobClientBase) -> None: def test_recover_on_explicit_tx(client: SqlJobClientBase) -> None: if client.capabilities.supports_transactions is False: pytest.skip("Destination does not support tx") - client.schema.bump_version() + client.schema._bump_version() client.update_stored_schema() version_table = client.sql_client.make_qualified_table_name("_dlt_version") # simple syntax error diff --git a/tests/load/utils.py b/tests/load/utils.py index f5e0052770..b043a979cb 100644 --- a/tests/load/utils.py +++ b/tests/load/utils.py @@ -477,12 +477,12 @@ def prepare_table( table_name: str = "event_user", make_uniq_table: bool = True, ) -> str: - client.schema.bump_version() + client.schema._bump_version() client.update_stored_schema() user_table = load_table(case_name)[table_name] user_table_name = table_name + uniq_id() if make_uniq_table else table_name client.schema.update_table(new_table(user_table_name, columns=list(user_table.values()))) - client.schema.bump_version() + client.schema._bump_version() client.update_stored_schema() return user_table_name diff --git a/tests/load/weaviate/test_weaviate_client.py b/tests/load/weaviate/test_weaviate_client.py index 48153f7706..3f966c2330 100644 --- 
a/tests/load/weaviate/test_weaviate_client.py +++ b/tests/load/weaviate/test_weaviate_client.py @@ -76,7 +76,7 @@ def test_all_data_types( client.schema.update_table( new_table(class_name, write_disposition=write_disposition, columns=TABLE_UPDATE) ) - client.schema.bump_version() + client.schema._bump_version() client.update_stored_schema() # write row @@ -113,7 +113,7 @@ def test_case_sensitive_properties_create(client: WeaviateClient) -> None: client.schema.update_table( client.schema.normalize_table_identifiers(new_table(class_name, columns=table_create)) ) - client.schema.bump_version() + client.schema._bump_version() with pytest.raises(PropertyNameConflict): client.update_stored_schema() @@ -128,7 +128,7 @@ def test_case_insensitive_properties_create(ci_client: WeaviateClient) -> None: ci_client.schema.update_table( ci_client.schema.normalize_table_identifiers(new_table(class_name, columns=table_create)) ) - ci_client.schema.bump_version() + ci_client.schema._bump_version() ci_client.update_stored_schema() _, table_columns = ci_client.get_storage_table("ColClass") # later column overwrites earlier one so: double @@ -145,13 +145,13 @@ def test_case_sensitive_properties_add(client: WeaviateClient) -> None: client.schema.update_table( client.schema.normalize_table_identifiers(new_table(class_name, columns=table_create)) ) - client.schema.bump_version() + client.schema._bump_version() client.update_stored_schema() client.schema.update_table( client.schema.normalize_table_identifiers(new_table(class_name, columns=table_update)) ) - client.schema.bump_version() + client.schema._bump_version() with pytest.raises(PropertyNameConflict): client.update_stored_schema() @@ -166,7 +166,7 @@ def test_load_case_sensitive_data(client: WeaviateClient, file_storage: FileStor "col1": {"name": "col1", "data_type": "bigint", "nullable": False} } client.schema.update_table(new_table(class_name, columns=[table_create["col1"]])) - client.schema.bump_version() + 
client.schema._bump_version() client.update_stored_schema() # prepare a data item where is name clash due to Weaviate being CI data_clash = {"col1": 72187328, "coL1": 726171} @@ -185,7 +185,7 @@ def test_load_case_sensitive_data_ci(ci_client: WeaviateClient, file_storage: Fi "col1": {"name": "col1", "data_type": "bigint", "nullable": False} } ci_client.schema.update_table(new_table(class_name, columns=[table_create["col1"]])) - ci_client.schema.bump_version() + ci_client.schema._bump_version() ci_client.update_stored_schema() # prepare a data item where is name clash due to Weaviate being CI # but here we normalize the item diff --git a/tests/pipeline/test_dlt_versions.py b/tests/pipeline/test_dlt_versions.py index 8906958e0c..ccf926cc62 100644 --- a/tests/pipeline/test_dlt_versions.py +++ b/tests/pipeline/test_dlt_versions.py @@ -70,7 +70,7 @@ def test_pipeline_with_dlt_update(test_storage: FileStorage) -> None: } # check loads table without attaching to pipeline duckdb_cfg = resolve_configuration( - DuckDbClientConfiguration(dataset_name=GITHUB_DATASET), + DuckDbClientConfiguration()._bind_dataset_name(dataset_name=GITHUB_DATASET), sections=("destination", "duckdb"), ) with DuckDbSqlClient(GITHUB_DATASET, duckdb_cfg.credentials) as client: @@ -189,7 +189,7 @@ def test_load_package_with_dlt_update(test_storage: FileStorage) -> None: venv = Venv.restore_current() print(venv.run_script("../tests/pipeline/cases/github_pipeline/github_load.py")) duckdb_cfg = resolve_configuration( - DuckDbClientConfiguration(dataset_name=GITHUB_DATASET), + DuckDbClientConfiguration()._bind_dataset_name(dataset_name=GITHUB_DATASET), sections=("destination", "duckdb"), ) with DuckDbSqlClient(GITHUB_DATASET, duckdb_cfg.credentials) as client: diff --git a/tests/pipeline/test_import_export_schema.py b/tests/pipeline/test_import_export_schema.py index b1c2284f24..6f40e1d1eb 100644 --- a/tests/pipeline/test_import_export_schema.py +++ b/tests/pipeline/test_import_export_schema.py @@ -2,6 
+2,7 @@ from dlt.common.utils import uniq_id +from tests.pipeline.utils import assert_load_info from tests.utils import TEST_STORAGE_ROOT from dlt.common.schema import Schema from dlt.common.storages.schema_storage import SchemaStorage @@ -83,7 +84,17 @@ def test_import_schema_is_respected() -> None: export_schema_path=EXPORT_SCHEMA_PATH, ) p.run(EXAMPLE_DATA, table_name="person") + # initial schema + evolved in normalize == version 2 + assert p.default_schema.stored_version == 2 assert p.default_schema.tables["person"]["columns"]["id"]["data_type"] == "bigint" + # import schema got saved + import_schema = _get_import_schema(name) + assert "person" in import_schema.tables + # initial schema (after extract) got saved + assert import_schema.stored_version == 1 + # import schema hash is set + assert p.default_schema._imported_version_hash == import_schema.version_hash + assert not p.default_schema.is_modified # take default schema, modify column type and save it to import folder modified_schema = p.default_schema.clone() @@ -91,14 +102,12 @@ def test_import_schema_is_respected() -> None: with open(os.path.join(IMPORT_SCHEMA_PATH, name + ".schema.yaml"), "w", encoding="utf-8") as f: f.write(modified_schema.to_pretty_yaml()) - # this will provoke a CannotCoerceColumnException - with pytest.raises(PipelineStepFailed) as exc: - p.run(EXAMPLE_DATA, table_name="person") - assert type(exc.value.exception) == CannotCoerceColumnException - - # schema is changed + # import schema will be imported into pipeline + p.run(EXAMPLE_DATA, table_name="person") + # again: extract + normalize + assert p.default_schema.stored_version == 3 + # change in pipeline schema assert p.default_schema.tables["person"]["columns"]["id"]["data_type"] == "text" - # import schema is not overwritten assert _get_import_schema(name).tables["person"]["columns"]["id"]["data_type"] == "text" @@ -110,7 +119,15 @@ def test_import_schema_is_respected() -> None: export_schema_path=EXPORT_SCHEMA_PATH, 
full_refresh=True, ) - p.run(EXAMPLE_DATA, table_name="person") + p.extract(EXAMPLE_DATA, table_name="person") + # starts with import schema v 1 that is dirty -> 2 + assert p.default_schema.stored_version == 3 + p.normalize() + assert p.default_schema.stored_version == 3 + info = p.load() + assert_load_info(info) + assert p.default_schema.stored_version == 3 + assert p.default_schema.tables["person"]["columns"]["id"]["data_type"] == "text" # import schema is not overwritten diff --git a/tests/pipeline/test_pipeline.py b/tests/pipeline/test_pipeline.py index 2f221ac8a0..37356c2b44 100644 --- a/tests/pipeline/test_pipeline.py +++ b/tests/pipeline/test_pipeline.py @@ -20,12 +20,12 @@ from dlt.common.configuration.specs.gcp_credentials import GcpOAuthCredentials from dlt.common.destination import DestinationCapabilitiesContext from dlt.common.destination.reference import WithStateSync -from dlt.common.exceptions import ( +from dlt.common.destination.exceptions import ( DestinationHasFailedJobs, DestinationTerminalException, - PipelineStateNotAvailable, UnknownDestinationModule, ) +from dlt.common.exceptions import PipelineStateNotAvailable from dlt.common.pipeline import LoadInfo, PipelineContext from dlt.common.runtime.collector import LogCollector from dlt.common.schema.utils import new_column, new_table @@ -441,6 +441,86 @@ def with_mark(): assert p.default_schema.tables["spec_table"]["resource"] == "with_mark" +def test_mark_hints_with_variant() -> None: + @dlt.resource(primary_key="pk") + def with_table_hints(): + # dispatch to table a + yield dlt.mark.with_hints( + {"id": 1, "pk": "A"}, + dlt.mark.make_hints( + table_name="table_a", columns=[{"name": "id", "data_type": "bigint"}] + ), + create_table_variant=True, + ) + + # dispatch to table b + yield dlt.mark.with_hints( + {"id": 2, "pk": "B"}, + dlt.mark.make_hints(table_name="table_b", write_disposition="replace"), + create_table_variant=True, + ) + + # item to resource + yield {"id": 3, "pk": "C"} + # table a 
with table_hints + yield dlt.mark.with_table_name({"id": 4, "pk": "D"}, "table_a") + # table b with table_hints + yield dlt.mark.with_table_name({"id": 5, "pk": "E"}, "table_b") + + pipeline_name = "pipe_" + uniq_id() + pipeline = dlt.pipeline(pipeline_name=pipeline_name, destination="duckdb") + info = pipeline.run(with_table_hints) + assert_load_info(info) + assert pipeline.last_trace.last_normalize_info.row_counts == { + "_dlt_pipeline_state": 1, + "table_a": 2, + "table_b": 2, + "with_table_hints": 1, + } + # check table counts + assert_data_table_counts(pipeline, {"table_a": 2, "table_b": 2, "with_table_hints": 1}) + + +def test_mark_hints_variant_dynamic_name() -> None: + @dlt.resource(table_name=lambda item: "table_" + item["tag"]) + def with_table_hints(): + # dispatch to table a + yield dlt.mark.with_hints( + {"id": 1, "pk": "A", "tag": "a"}, + dlt.mark.make_hints( + table_name="table_a", + primary_key="pk", + columns=[{"name": "id", "data_type": "bigint"}], + ), + create_table_variant=True, + ) + + # dispatch to table b + yield dlt.mark.with_hints( + {"id": 2, "pk": "B", "tag": "b"}, + dlt.mark.make_hints(table_name="table_b", write_disposition="replace"), + create_table_variant=True, + ) + + # dispatch by tag + yield {"id": 3, "pk": "C", "tag": "c"} + yield {"id": 4, "pk": "D", "tag": "a"} + yield {"id": 5, "pk": "E", "tag": "b"} + + pipeline_name = "pipe_" + uniq_id() + pipeline = dlt.pipeline(pipeline_name=pipeline_name, destination="duckdb") + info = pipeline.run(with_table_hints) + assert_load_info(info) + assert pipeline.last_trace.last_normalize_info.row_counts == { + "_dlt_pipeline_state": 1, + "table_a": 2, + "table_b": 2, + "table_c": 1, + } + # check table counts + assert_data_table_counts(pipeline, {"table_a": 2, "table_b": 2, "table_c": 1}) + + def test_restore_state_on_dummy() -> None: os.environ["COMPLETED_PROB"] = "1.0" # make it complete immediately @@ -952,6 +1032,73 @@ def reverse_order(item): ] +def 
test_preserve_new_fields_order_on_append() -> None: + pipeline_name = "pipe_" + uniq_id() + p = dlt.pipeline(pipeline_name=pipeline_name, destination="dummy") + + item = {"c1": 1, "c2": 2, "c3": "list"} + p.extract([item], table_name="order_1") + p.normalize() + assert list(p.default_schema.get_table_columns("order_1").keys()) == [ + "c1", + "c2", + "c3", + "_dlt_load_id", + "_dlt_id", + ] + + # add columns + item = {"c1": 1, "c4": 2.0, "c3": "list", "c5": {"x": 1}} + p.extract([item], table_name="order_1") + p.normalize() + assert list(p.default_schema.get_table_columns("order_1").keys()) == [ + "c1", + "c2", + "c3", + "_dlt_load_id", + "_dlt_id", + "c4", + "c5__x", + ] + + +def test_preserve_fields_order_incomplete_columns() -> None: + p = dlt.pipeline(pipeline_name="column_order", destination="dummy") + # incomplete columns (without data type) will be added in order of fields in data + + @dlt.resource(columns={"c3": {"precision": 32}}, primary_key="c2") + def items(): + yield {"c1": 1, "c2": 1, "c3": 1} + + p.extract(items) + p.normalize() + assert list(p.default_schema.get_table_columns("items").keys()) == [ + "c1", + "c2", + "c3", + "_dlt_load_id", + "_dlt_id", + ] + + # complete columns preserve order in "columns" + p = p.drop() + + @dlt.resource(columns={"c3": {"precision": 32, "data_type": "decimal"}}, primary_key="c1") + def items2(): + yield {"c1": 1, "c2": 1, "c3": 1} + + p.extract(items2) + p.normalize() + # c3 was first so goes first + assert list(p.default_schema.get_table_columns("items2").keys()) == [ + "c3", + "c1", + "c2", + "_dlt_load_id", + "_dlt_id", + ] + + def test_pipeline_log_progress() -> None: os.environ["TIMEOUT"] = "3.0" @@ -1269,7 +1416,7 @@ def test_drop_with_new_name() -> None: assert new_pipeline.pipeline_name == new_test_name -def test_remove_autodetect() -> None: +def test_schema_version_increase_and_source_update() -> None: now = pendulum.now() @dlt.source @@ -1282,12 +1429,81 @@ def autodetect(): ) pipeline = 
dlt.pipeline(destination="duckdb") + # control version of the schema + auto_source = autodetect() + assert auto_source.schema.stored_version is None + pipeline.extract(auto_source) + # extract did a first save + assert pipeline.default_schema.stored_version == 1 + # only one prev hash + assert len(pipeline.default_schema.previous_hashes) == 1 + # source schema was updated in the pipeline + assert auto_source.schema.stored_version == 1 + # source has pipeline schema + assert pipeline.default_schema is auto_source.schema + + pipeline.normalize() + # columns added and schema was saved in between + assert pipeline.default_schema.stored_version == 2 + assert len(pipeline.default_schema.previous_hashes) == 2 + # source schema still updated + assert auto_source.schema.stored_version == 2 + assert pipeline.default_schema is auto_source.schema + pipeline.load() + # nothing changed in load + assert pipeline.default_schema.stored_version == 2 + assert pipeline.default_schema is auto_source.schema + + # run same source again + pipeline.extract(auto_source) + assert pipeline.default_schema.stored_version == 2 + assert pipeline.default_schema is auto_source.schema + pipeline.normalize() + assert pipeline.default_schema.stored_version == 2 + pipeline.load() + assert pipeline.default_schema.stored_version == 2 + + # run another instance of the same source pipeline.run(autodetect()) + assert pipeline.default_schema.stored_version == 2 + assert pipeline.default_schema is auto_source.schema + assert "timestamp" in pipeline.default_schema.settings["detections"] + + # data has compatible schema with "numbers" but schema is taken from pipeline + pipeline.run([1, 2, 3], table_name="numbers") + assert "timestamp" in pipeline.default_schema.settings["detections"] + assert pipeline.default_schema.stored_version == 2 + assert pipeline.default_schema is auto_source.schema + + # new table will evolve schema + pipeline.run([1, 2, 3], table_name="seq") + assert "timestamp" in 
pipeline.default_schema.settings["detections"] + assert pipeline.default_schema.stored_version == 4 + assert pipeline.default_schema is auto_source.schema + + +def test_remove_autodetect() -> None: + now = pendulum.now() + + @dlt.source + def autodetect(): + # add unix ts autodetection to current source schema + dlt.current.source_schema().add_type_detection("timestamp") + return dlt.resource( + [int(now.timestamp()), int(now.timestamp() + 1), int(now.timestamp() + 2)], + name="numbers", + ) + + pipeline = dlt.pipeline(destination="duckdb") + auto_source = autodetect() + pipeline.extract(auto_source) + pipeline.normalize() # unix ts recognized assert ( pipeline.default_schema.get_table("numbers")["columns"]["value"]["data_type"] == "timestamp" ) + pipeline.load() pipeline = pipeline.drop() @@ -1388,8 +1604,13 @@ def test_pipeline_list_packages() -> None: ) load_ids = pipeline.list_extracted_load_packages() assert len(load_ids) == 3 + extracted_package = pipeline.get_load_package_info(load_ids[1]) + assert extracted_package.schema_name == "airtable_emojis" + extracted_package = pipeline.get_load_package_info(load_ids[2]) + assert extracted_package.schema_name == "emojis_2" extracted_package = pipeline.get_load_package_info(load_ids[0]) assert extracted_package.state == "extracted" + assert extracted_package.schema_name == "airtable_emojis" # same load id continues till the end pipeline.normalize() load_ids_n = pipeline.list_normalized_load_packages() diff --git a/tests/pipeline/test_pipeline_file_format_resolver.py b/tests/pipeline/test_pipeline_file_format_resolver.py index 49a38c455b..588ad720a5 100644 --- a/tests/pipeline/test_pipeline_file_format_resolver.py +++ b/tests/pipeline/test_pipeline_file_format_resolver.py @@ -3,7 +3,7 @@ import dlt import pytest -from dlt.common.exceptions import ( +from dlt.common.destination.exceptions import ( DestinationIncompatibleLoaderFileFormatException, DestinationLoadingViaStagingNotSupported, DestinationNoStagingMode, diff 
--git a/tests/sources/helpers/rest_client/__init__.py b/tests/sources/helpers/rest_client/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/sources/helpers/rest_client/conftest.py b/tests/sources/helpers/rest_client/conftest.py new file mode 100644 index 0000000000..09676bdf37 --- /dev/null +++ b/tests/sources/helpers/rest_client/conftest.py @@ -0,0 +1,196 @@ +import re +from typing import NamedTuple, Callable, Pattern, List, TYPE_CHECKING +import base64 + +from urllib.parse import urlsplit, urlunsplit + +import pytest +import requests_mock + +from dlt.common import json + +if TYPE_CHECKING: + RequestCallback = Callable[[requests_mock.Request, requests_mock.Context], str] +else: + RequestCallback = Callable + +MOCK_BASE_URL = "https://api.example.com" + + +class Route(NamedTuple): + method: str + pattern: Pattern[str] + callback: RequestCallback + + +class APIRouter: + def __init__(self, base_url: str): + self.routes: List[Route] = [] + self.base_url = base_url + + def _add_route(self, method: str, pattern: str, func: RequestCallback) -> RequestCallback: + compiled_pattern = re.compile(f"{self.base_url}{pattern}") + self.routes.append(Route(method, compiled_pattern, func)) + return func + + def get(self, pattern: str) -> Callable[[RequestCallback], RequestCallback]: + def decorator(func: RequestCallback) -> RequestCallback: + return self._add_route("GET", pattern, func) + + return decorator + + def post(self, pattern: str) -> Callable[[RequestCallback], RequestCallback]: + def decorator(func: RequestCallback) -> RequestCallback: + return self._add_route("POST", pattern, func) + + return decorator + + def register_routes(self, mocker: requests_mock.Mocker) -> None: + for route in self.routes: + mocker.register_uri( + route.method, + route.pattern, + text=route.callback, + ) + + +router = APIRouter(MOCK_BASE_URL) + + +def serialize_page(records, page_number, total_pages, base_url, records_key="data"): + if records_key is None: + return 
json.dumps(records) + + response = { + records_key: records, + "page": page_number, + "total_pages": total_pages, + } + + if page_number < total_pages: + next_page = page_number + 1 + + scheme, netloc, path, _, _ = urlsplit(base_url) + next_page = urlunsplit([scheme, netloc, path, f"page={next_page}", ""]) + response["next_page"] = next_page + + return json.dumps(response) + + +def generate_posts(count=100): + return [{"id": i, "title": f"Post {i}"} for i in range(count)] + + +def generate_comments(post_id, count=50): + return [{"id": i, "body": f"Comment {i} for post {post_id}"} for i in range(count)] + + +def get_page_number(qs, key="page", default=1): + return int(qs.get(key, [default])[0]) + + +def paginate_response(request, records, page_size=10, records_key="data"): + page_number = get_page_number(request.qs) + total_records = len(records) + total_pages = (total_records + page_size - 1) // page_size + start_index = (page_number - 1) * 10 + end_index = start_index + 10 + records_slice = records[start_index:end_index] + return serialize_page(records_slice, page_number, total_pages, request.url, records_key) + + +@pytest.fixture(scope="module") +def mock_api_server(): + with requests_mock.Mocker() as m: + + @router.get(r"/posts_no_key(\?page=\d+)?$") + def posts_no_key(request, context): + return paginate_response(request, generate_posts(), records_key=None) + + @router.get(r"/posts(\?page=\d+)?$") + def posts(request, context): + return paginate_response(request, generate_posts()) + + @router.get(r"/posts/(\d+)/comments") + def post_comments(request, context): + post_id = int(request.url.split("/")[-2]) + return paginate_response(request, generate_comments(post_id)) + + @router.get(r"/posts/\d+$") + def post_detail(request, context): + post_id = request.url.split("/")[-1] + return json.dumps({"id": post_id, "body": f"Post body {post_id}"}) + + @router.get(r"/posts/\d+/some_details_404") + def post_detail_404(request, context): + """Return 404 for post with id > 
0. Used to test ignoring 404 errors.""" + post_id = int(request.url.split("/")[-2]) + if post_id < 1: + return json.dumps({"id": post_id, "body": f"Post body {post_id}"}) + else: + context.status_code = 404 + return json.dumps({"error": "Post not found"}) + + @router.get(r"/posts_under_a_different_key$") + def posts_with_results_key(request, context): + return paginate_response(request, generate_posts(), records_key="many-results") + + @router.get("/protected/posts/basic-auth") + def protected_basic_auth(request, context): + auth = request.headers.get("Authorization") + creds = "user:password" + creds_base64 = base64.b64encode(creds.encode()).decode() + if auth == f"Basic {creds_base64}": + return paginate_response(request, generate_posts()) + context.status_code = 401 + return json.dumps({"error": "Unauthorized"}) + + @router.get("/protected/posts/bearer-token") + def protected_bearer_token(request, context): + auth = request.headers.get("Authorization") + if auth == "Bearer test-token": + return paginate_response(request, generate_posts()) + context.status_code = 401 + return json.dumps({"error": "Unauthorized"}) + + @router.get("/protected/posts/bearer-token-plain-text-error") + def protected_bearer_token_plain_text_erorr(request, context): + auth = request.headers.get("Authorization") + if auth == "Bearer test-token": + return paginate_response(request, generate_posts()) + context.status_code = 401 + return "Unauthorized" + + @router.get("/protected/posts/api-key") + def protected_api_key(request, context): + api_key = request.headers.get("x-api-key") + if api_key == "test-api-key": + return paginate_response(request, generate_posts()) + context.status_code = 401 + return json.dumps({"error": "Unauthorized"}) + + @router.post("/oauth/token") + def oauth_token(request, context): + return json.dumps( + { + "access_token": "test-token", + "expires_in": 3600, + } + ) + + @router.post("/auth/refresh") + def refresh_token(request, context): + body = request.json() + 
if body.get("refresh_token") == "valid-refresh-token": + return json.dumps({"access_token": "new-valid-token"}) + context.status_code = 401 + return json.dumps({"error": "Invalid refresh token"}) + + router.register_routes(m) + + yield m + + +def assert_pagination(pages, expected_start=0, page_size=10): + for i, page in enumerate(pages): + assert page == [{"id": i, "title": f"Post {i}"} for i in range(i * 10, (i + 1) * 10)] diff --git a/tests/sources/helpers/rest_client/private_key.pem b/tests/sources/helpers/rest_client/private_key.pem new file mode 100644 index 0000000000..ce4592157b --- /dev/null +++ b/tests/sources/helpers/rest_client/private_key.pem @@ -0,0 +1,28 @@ +-----BEGIN PRIVATE KEY----- +MIIEvQIBADANBgkqhkiG9w0BAQEFAASCBKcwggSjAgEAAoIBAQDQQxVECHvO2Gs9 +MaRlD0HG5IpoJ3jhuG+nTgDEY7AU75nO74juOZuQR6AxO5nS/QeZS6bbjrzgz9P4 +vtDTksuSwXrgFJF1M5qiYwLZBr3ZNQA/e/D39+L2735craFsy8x6Xz5OCSCWaAyu +ufOMl1Yt2vRsDZ+x0OPPvKgUCBkgRMDxPbf4kuWnG/f4Z6czt3oReE6SiriT7EXS +ucNccSzgVs9HRopJ0M7jcbWPwGUfSlA3IO1G5sAEfVCihpzFlC7OoB+qAKj0wnAZ +Kr6gOuEFneoNUlErpLaeQwdRE+h61s5JybxZhFgr69n6kYIPG8ra6spVyB13WYt1 +FMEtL4P1AgMBAAECggEALv0vx2OdoaApZAt3Etk0J17JzrG3P8CIKqi6GhV+9V5R +JwRbMhrb21wZy/ntXVI7XG5aBbhJK/UgV8Of5Ni+Z0yRv4zMe/PqfCCYVCTGAYPI +nEpH5n7u3fXP3jPL0/sQlfy2108OY/kygVrR1YMQzfRUyStywGFIAUdI6gogtyt7 +cjh07mmMc8HUMhAVyluE5hpQCLDv5Xige2PY7zv1TqhI3OoJFi27VeBCSyI7x/94 +GM1XpzdFcvYPNPo6aE9vGnDq8TfYwjy+hkY+D9DRpnEmVEXmeBdsxsSD+ybyprO1 +C2sytiV9d3wJ96fhsYupLK88EGxU2uhmFntHuasMQQKBgQD9cWVo7B18FCV/NAdS +nV3KzNtlIrGRFZ7FMZuVZ/ZjOpvzbTVbla3YbRjTkXYpK9Meo8KczwzxQ2TQ1qxY +67SrhfFRRWzktMWqwBSKHPIig+DnqUCUo7OSA0pN+u6yUvFWdINZucB+yMWtgRrj +8GuAMXD/vaoCiNrHVf2V191fwQKBgQDSXP3cqBjBtDLP3qFwDzOG8cR9qiiDvesQ +DXf5seV/rBCXZvkw81t+PGz0O/UrUonv/FqxQR0GqpAdX1ZM3Jko0WxbfoCgsT0u +1aSzcMq1JQt0CI77T8tIPYvym9FO+Jz89kX0WliL/I7GLsmG5EYBK/+dcJBh1QCE +VaMCgrbxNQKBgB10zYWJU8/1A3qqUGOQuLL2ZlV11892BNMEdgHCaIeV60Q6oCX5 +2o+59lW4pVQZrNr1y4uwIN/1pkUDflqDYqdA1RBOEl7uh77Vvk1jGd1bGIu0RzY/ 
+ZIKG8V7o2E9Pho820YFfLnlN2nPU+owdiFEI7go7QAQ1ZcAfRW7h/O/BAoGBAJg+ +IKO/LBuUFGoIT4HQHpR9CJ2BtkyR+Drn5HpbWyKpHmDUb2gT15VmmduwQOEXnSiH +1AMQgrc+XYpEYyrBRD8cQXV9+g1R+Fua1tXevXWX19AkGYab2xzvHgd46WRj3Qne +GgacFBVLtPCND+CF+HwEobwJqRSEmRks+QpqG4g5AoGAXpw9CZb+gYfwl2hphFGO +kT/NOfk8PN7WeZAe7ktStZByiGhHWaxqYE0q5favhNG6tMxSdmSOzYF8liHWuvJm +cDHqNVJeTGT8rjW7Iz08wj5F+ZAJYCMkM9aDpDUKJIHnOwYZCGfZxRJCiHTReyR7 +u03hoszfCn13l85qBnYlwaw= +-----END PRIVATE KEY----- diff --git a/tests/sources/helpers/rest_client/test_client.py b/tests/sources/helpers/rest_client/test_client.py new file mode 100644 index 0000000000..b1038bced0 --- /dev/null +++ b/tests/sources/helpers/rest_client/test_client.py @@ -0,0 +1,167 @@ +import os +import pytest +from typing import Any, cast +from dlt.common.typing import TSecretStrValue +from dlt.sources.helpers.requests import Response, Request +from dlt.sources.helpers.rest_client import RESTClient +from dlt.sources.helpers.rest_client.client import Hooks +from dlt.sources.helpers.rest_client.paginators import JSONResponsePaginator + +from dlt.sources.helpers.rest_client.auth import AuthConfigBase +from dlt.sources.helpers.rest_client.auth import ( + BearerTokenAuth, + APIKeyAuth, + HttpBasicAuth, + OAuthJWTAuth, +) +from dlt.sources.helpers.rest_client.exceptions import IgnoreResponseException + +from .conftest import assert_pagination + + +def load_private_key(name="private_key.pem"): + key_path = os.path.join(os.path.dirname(__file__), name) + with open(key_path, "r", encoding="utf-8") as key_file: + return key_file.read() + + +TEST_PRIVATE_KEY = load_private_key() + + +@pytest.fixture +def rest_client() -> RESTClient: + return RESTClient( + base_url="https://api.example.com", + headers={"Accept": "application/json"}, + ) + + +@pytest.mark.usefixtures("mock_api_server") +class TestRESTClient: + def test_get_single_resource(self, rest_client): + response = rest_client.get("/posts/1") + assert response.status_code == 200 + assert response.json() == {"id": 
"1", "body": "Post body 1"} + + def test_pagination(self, rest_client: RESTClient): + pages_iter = rest_client.paginate( + "/posts", + paginator=JSONResponsePaginator(next_url_path="next_page"), + ) + + pages = list(pages_iter) + + assert_pagination(pages) + + def test_page_context(self, rest_client: RESTClient) -> None: + for page in rest_client.paginate( + "/posts", + paginator=JSONResponsePaginator(next_url_path="next_page"), + auth=AuthConfigBase(), + ): + # response that produced data + assert isinstance(page.response, Response) + # updated request + assert isinstance(page.request, Request) + # make request url should be same as next link in paginator + if page.paginator.has_next_page: + assert page.paginator.next_reference == page.request.url + + def test_default_paginator(self, rest_client: RESTClient): + pages_iter = rest_client.paginate("/posts") + + pages = list(pages_iter) + + assert_pagination(pages) + + def test_paginate_with_hooks(self, rest_client: RESTClient): + def response_hook(response: Response, *args: Any, **kwargs: Any) -> None: + if response.status_code == 404: + raise IgnoreResponseException + + hooks: Hooks = { + "response": response_hook, + } + + pages_iter = rest_client.paginate( + "/posts", + paginator=JSONResponsePaginator(next_url_path="next_page"), + hooks=hooks, + ) + + pages = list(pages_iter) + + assert_pagination(pages) + + pages_iter = rest_client.paginate( + "/posts/1/some_details_404", + paginator=JSONResponsePaginator(), + hooks=hooks, + ) + + pages = list(pages_iter) + assert pages == [] + + def test_basic_auth_success(self, rest_client: RESTClient): + response = rest_client.get( + "/protected/posts/basic-auth", + auth=HttpBasicAuth("user", cast(TSecretStrValue, "password")), + ) + assert response.status_code == 200 + assert response.json()["data"][0] == {"id": 0, "title": "Post 0"} + + pages_iter = rest_client.paginate( + "/protected/posts/basic-auth", + auth=HttpBasicAuth("user", cast(TSecretStrValue, "password")), + ) + + 
pages = list(pages_iter) + assert_pagination(pages) + + def test_bearer_token_auth_success(self, rest_client: RESTClient): + response = rest_client.get( + "/protected/posts/bearer-token", + auth=BearerTokenAuth(cast(TSecretStrValue, "test-token")), + ) + assert response.status_code == 200 + assert response.json()["data"][0] == {"id": 0, "title": "Post 0"} + + pages_iter = rest_client.paginate( + "/protected/posts/bearer-token", + auth=BearerTokenAuth(cast(TSecretStrValue, "test-token")), + ) + + pages = list(pages_iter) + assert_pagination(pages) + + def test_api_key_auth_success(self, rest_client: RESTClient): + response = rest_client.get( + "/protected/posts/api-key", + auth=APIKeyAuth(name="x-api-key", api_key=cast(TSecretStrValue, "test-api-key")), + ) + assert response.status_code == 200 + assert response.json()["data"][0] == {"id": 0, "title": "Post 0"} + + def test_oauth_jwt_auth_success(self, rest_client: RESTClient): + auth = OAuthJWTAuth( + client_id="test-client-id", + private_key=TEST_PRIVATE_KEY, + auth_endpoint="https://api.example.com/oauth/token", + scopes=["read", "write"], + headers={"Content-Type": "application/json"}, + ) + + response = rest_client.get( + "/protected/posts/bearer-token", + auth=auth, + ) + + assert response.status_code == 200 + assert "test-token" in response.request.headers["Authorization"] + + pages_iter = rest_client.paginate( + "/protected/posts/bearer-token", + auth=auth, + ) + + assert_pagination(list(pages_iter)) diff --git a/tests/sources/helpers/rest_client/test_detector.py b/tests/sources/helpers/rest_client/test_detector.py new file mode 100644 index 0000000000..933c9be9cc --- /dev/null +++ b/tests/sources/helpers/rest_client/test_detector.py @@ -0,0 +1,356 @@ +import pytest +from dlt.common import jsonpath + +from dlt.sources.helpers.rest_client.detector import ( + find_records, + find_next_page_path, + single_entity_path, +) + + +TEST_RESPONSES = [ + { + "response": { + "data": [{"id": 1, "name": "Item 1"}, {"id": 
2, "name": "Item 2"}], + "pagination": {"offset": 0, "limit": 2, "total": 100}, + }, + "expected": { + "type": "offset_limit", + "records_path": "data", + }, + }, + { + "response": { + "items": [ + {"id": 11, "title": "Page Item 1"}, + {"id": 12, "title": "Page Item 2"}, + ], + "page_info": {"current_page": 1, "items_per_page": 2, "total_pages": 50}, + }, + "expected": { + "type": "page_number", + "records_path": "items", + }, + }, + { + "response": { + "products": [ + {"id": 101, "name": "Product 1"}, + {"id": 102, "name": "Product 2"}, + ], + "next_cursor": "eyJpZCI6MTAyfQ==", + }, + "expected": { + "type": "cursor", + "records_path": "products", + "next_path": ["next_cursor"], + }, + }, + { + "response": { + "results": [ + {"id": 201, "description": "Result 1"}, + {"id": 202, "description": "Result 2"}, + ], + "cursors": {"next": "NjM=", "previous": "MTk="}, + }, + "expected": { + "type": "cursor", + "records_path": "results", + "next_path": ["cursors", "next"], + }, + }, + { + "response": { + "entries": [{"id": 31, "value": "Entry 1"}, {"id": 32, "value": "Entry 2"}], + "next_id": 33, + "limit": 2, + }, + "expected": { + "type": "cursor", + "records_path": "entries", + "next_path": ["next_id"], + }, + }, + { + "response": { + "comments": [ + {"id": 51, "text": "Comment 1"}, + {"id": 52, "text": "Comment 2"}, + ], + "page_number": 3, + "total_pages": 15, + }, + "expected": { + "type": "page_number", + "records_path": "comments", + }, + }, + { + "response": { + "count": 1023, + "next": "https://api.example.org/accounts/?page=5", + "previous": "https://api.example.org/accounts/?page=3", + "results": [{"id": 1, "name": "Account 1"}, {"id": 2, "name": "Account 2"}], + }, + "expected": { + "type": "json_link", + "records_path": "results", + "next_path": ["next"], + }, + }, + { + "response": { + "_embedded": {"items": [{"id": 1, "name": "Item 1"}, {"id": 2, "name": "Item 2"}]}, + "_links": { + "first": {"href": "http://api.example.com/items?page=0&size=2"}, + "self": 
{"href": "http://api.example.com/items?page=1&size=2"}, + "next": {"href": "http://api.example.com/items?page=2&size=2"}, + "last": {"href": "http://api.example.com/items?page=50&size=2"}, + }, + "page": {"size": 2, "totalElements": 100, "totalPages": 50, "number": 1}, + }, + "expected": { + "type": "json_link", + "records_path": "_embedded.items", + "next_path": ["_links", "next", "href"], + }, + }, + { + "response": { + "items": [{"id": 1, "name": "Item 1"}, {"id": 2, "name": "Item 2"}], + "meta": { + "currentPage": 1, + "pageSize": 2, + "totalPages": 50, + "totalItems": 100, + }, + "links": { + "firstPage": "/items?page=1&limit=2", + "previousPage": "/items?page=0&limit=2", + "nextPage": "/items?page=2&limit=2", + "lastPage": "/items?page=50&limit=2", + }, + }, + "expected": { + "type": "json_link", + "records_path": "items", + "next_path": ["links", "nextPage"], + }, + }, + { + "response": { + "data": [{"id": 1, "name": "Item 1"}, {"id": 2, "name": "Item 2"}], + "pagination": { + "currentPage": 1, + "pageSize": 2, + "totalPages": 5, + "totalItems": 10, + }, + }, + "expected": { + "type": "page_number", + "records_path": "data", + }, + }, + { + "response": { + "items": [{"id": 1, "title": "Item 1"}, {"id": 2, "title": "Item 2"}], + "pagination": {"page": 1, "perPage": 2, "total": 10, "totalPages": 5}, + }, + "expected": { + "type": "page_number", + "records_path": "items", + }, + }, + { + "response": { + "data": [ + {"id": 1, "description": "Item 1"}, + {"id": 2, "description": "Item 2"}, + ], + "meta": { + "currentPage": 1, + "itemsPerPage": 2, + "totalItems": 10, + "totalPages": 5, + }, + "links": { + "first": "/api/items?page=1", + "previous": None, + "next": "/api/items?page=2", + "last": "/api/items?page=5", + }, + }, + "expected": { + "type": "json_link", + "records_path": "data", + "next_path": ["links", "next"], + }, + }, + { + "response": { + "page": 2, + "per_page": 10, + "total": 100, + "pages": 10, + "data": [{"id": 1, "name": "Item 1"}, {"id": 2, 
"name": "Item 2"}], + }, + "expected": { + "type": "page_number", + "records_path": "data", + }, + }, + { + "response": { + "currentPage": 1, + "pageSize": 10, + "totalPages": 5, + "totalRecords": 50, + "items": [{"id": 1, "name": "Item 1"}, {"id": 2, "name": "Item 2"}], + }, + "expected": { + "type": "page_number", + "records_path": "items", + }, + }, + { + "response": { + "articles": [ + {"id": 21, "headline": "Article 1"}, + {"id": 22, "headline": "Article 2"}, + ], + "paging": {"current": 3, "size": 2, "total": 60}, + }, + "expected": { + "type": "page_number", + "records_path": "articles", + }, + }, + { + "response": { + "feed": [ + {"id": 41, "content": "Feed Content 1"}, + {"id": 42, "content": "Feed Content 2"}, + ], + "offset": 40, + "limit": 2, + "total_count": 200, + }, + "expected": { + "type": "offset_limit", + "records_path": "feed", + }, + }, + { + "response": { + "query_results": [ + {"id": 81, "snippet": "Result Snippet 1"}, + {"id": 82, "snippet": "Result Snippet 2"}, + ], + "page_details": { + "number": 1, + "size": 2, + "total_elements": 50, + "total_pages": 25, + }, + }, + "expected": { + "type": "page_number", + "records_path": "query_results", + }, + }, + { + "response": { + "posts": [ + {"id": 91, "title": "Blog Post 1"}, + {"id": 92, "title": "Blog Post 2"}, + ], + "pagination_details": { + "current_page": 4, + "posts_per_page": 2, + "total_posts": 100, + "total_pages": 50, + }, + }, + "expected": { + "type": "page_number", + "records_path": "posts", + }, + }, + { + "response": { + "catalog": [ + {"id": 101, "product_name": "Product A"}, + {"id": 102, "product_name": "Product B"}, + ], + "page_metadata": { + "index": 1, + "size": 2, + "total_items": 20, + "total_pages": 10, + }, + }, + "expected": { + "type": "page_number", + "records_path": "catalog", + }, + }, +] + + +@pytest.mark.parametrize("test_case", TEST_RESPONSES) +def test_find_records(test_case): + response = test_case["response"] + expected = 
test_case["expected"]["records_path"] + r = find_records(response) + # all of them look fine mostly because those are simple cases... + # case 7 fails because it is nested but in fact we select a right response + # assert r is create_nested_accessor(expected)(response) + assert r == jsonpath.find_values(expected, response)[0] + + +@pytest.mark.parametrize("test_case", TEST_RESPONSES) +def test_find_next_page_key(test_case): + response = test_case["response"] + expected = test_case.get("expected").get("next_path", None) # Some cases may not have next_path + assert find_next_page_path(response) == expected + + +@pytest.mark.skip +@pytest.mark.parametrize( + "path", + [ + "/users/{user_id}", + "/api/v1/products/{product_id}/", + "/api/v1/products/{product_id}//", + "/api/v1/products/{product_id}?param1=value1", + "/api/v1/products/{product_id}#section", + "/api/v1/products/{product_id}/#section", + "/users/{user_id}/posts/{post_id}", + "/users/{user_id}/posts/{post_id}/comments/{comment_id}", + "{entity}", + "/{entity}", + "/{user_123}", + ], +) +def test_single_entity_path_valid(path): + assert single_entity_path(path) is True + + +@pytest.mark.parametrize( + "path", + [ + "/users/user_id", + "/api/v1/products/product_id/", + "/users/{user_id}/details", + "/", + "/{}", + "/users/{123}", + "/users/{user-id}", + "/users/{user id}", + "/users/{user_id}/{", # Invalid ending + ], +) +def test_single_entity_path_invalid(path): + assert single_entity_path(path) is False diff --git a/tests/sources/helpers/rest_client/test_paginators.py b/tests/sources/helpers/rest_client/test_paginators.py new file mode 100644 index 0000000000..258099292b --- /dev/null +++ b/tests/sources/helpers/rest_client/test_paginators.py @@ -0,0 +1,78 @@ +import pytest +from unittest.mock import Mock + +from requests.models import Response + +from dlt.sources.helpers.rest_client.paginators import ( + SinglePagePaginator, + OffsetPaginator, + HeaderLinkPaginator, + JSONResponsePaginator, +) + + +class 
TestHeaderLinkPaginator: + def test_update_state_with_next(self): + paginator = HeaderLinkPaginator() + response = Mock(Response) + response.links = {"next": {"url": "http://example.com/next"}} + paginator.update_state(response) + assert paginator.next_reference == "http://example.com/next" + assert paginator.has_next_page is True + + def test_update_state_without_next(self): + paginator = HeaderLinkPaginator() + response = Mock(Response) + response.links = {} + paginator.update_state(response) + assert paginator.has_next_page is False + + +class TestJSONResponsePaginator: + def test_update_state_with_next(self): + paginator = JSONResponsePaginator() + response = Mock(Response, json=lambda: {"next": "http://example.com/next", "results": []}) + paginator.update_state(response) + assert paginator.next_reference == "http://example.com/next" + assert paginator.has_next_page is True + + def test_update_state_without_next(self): + paginator = JSONResponsePaginator() + response = Mock(Response, json=lambda: {"results": []}) + paginator.update_state(response) + assert paginator.next_reference is None + assert paginator.has_next_page is False + + +class TestSinglePagePaginator: + def test_update_state(self): + paginator = SinglePagePaginator() + response = Mock(Response) + paginator.update_state(response) + assert paginator.has_next_page is False + + def test_update_state_with_next(self): + paginator = SinglePagePaginator() + response = Mock(Response, json=lambda: {"next": "http://example.com/next", "results": []}) + response.links = {"next": {"url": "http://example.com/next"}} + paginator.update_state(response) + assert paginator.has_next_page is False + + +class TestOffsetPaginator: + def test_update_state(self): + paginator = OffsetPaginator(initial_offset=0, initial_limit=10) + response = Mock(Response, json=lambda: {"total": 20}) + paginator.update_state(response) + assert paginator.offset == 10 + assert paginator.has_next_page is True + + # Test for reaching the end + 
paginator.update_state(response) + assert paginator.has_next_page is False + + def test_update_state_without_total(self): + paginator = OffsetPaginator(0, 10) + response = Mock(Response, json=lambda: {}) + with pytest.raises(ValueError): + paginator.update_state(response) diff --git a/tests/sources/helpers/rest_client/test_requests_paginate.py b/tests/sources/helpers/rest_client/test_requests_paginate.py new file mode 100644 index 0000000000..43b2a412db --- /dev/null +++ b/tests/sources/helpers/rest_client/test_requests_paginate.py @@ -0,0 +1,17 @@ +import pytest + +from dlt.sources.helpers.rest_client import paginate +from dlt.sources.helpers.rest_client.paginators import JSONResponsePaginator +from .conftest import assert_pagination + + +@pytest.mark.usefixtures("mock_api_server") +def test_requests_paginate(): + pages_iter = paginate( + "https://api.example.com/posts", + paginator=JSONResponsePaginator(next_url_path="next_page"), + ) + + pages = list(pages_iter) + + assert_pagination(pages) diff --git a/tests/sources/helpers/rest_client/test_utils.py b/tests/sources/helpers/rest_client/test_utils.py new file mode 100644 index 0000000000..0de9729a42 --- /dev/null +++ b/tests/sources/helpers/rest_client/test_utils.py @@ -0,0 +1,90 @@ +import pytest +from dlt.sources.helpers.rest_client.utils import join_url + + +@pytest.mark.parametrize( + "base_url, path, expected", + [ + # Normal cases + ( + "http://example.com", + "path/to/resource", + "http://example.com/path/to/resource", + ), + ( + "http://example.com/", + "/path/to/resource", + "http://example.com/path/to/resource", + ), + ( + "http://example.com/", + "path/to/resource", + "http://example.com/path/to/resource", + ), + ( + "http://example.com", + "//path/to/resource", + "http://example.com/path/to/resource", + ), + ( + "http://example.com///", + "//path/to/resource", + "http://example.com/path/to/resource", + ), + # Trailing and leading slashes + ("http://example.com/", "/", "http://example.com/"), + 
("http://example.com", "/", "http://example.com/"), + ("http://example.com/", "///", "http://example.com/"), + ("http://example.com", "///", "http://example.com/"), + ("/", "path/to/resource", "/path/to/resource"), + ("/", "/path/to/resource", "/path/to/resource"), + # Empty strings + ("", "", ""), + ( + "", + "http://example.com/path/to/resource", + "http://example.com/path/to/resource", + ), + ("", "path/to/resource", "path/to/resource"), + ("http://example.com", "", "http://example.com"), + # Query parameters and fragments + ( + "http://example.com", + "path/to/resource?query=123", + "http://example.com/path/to/resource?query=123", + ), + ( + "http://example.com/", + "path/to/resource#fragment", + "http://example.com/path/to/resource#fragment", + ), + # Special characters in the path + ( + "http://example.com", + "/path/to/resource with spaces", + "http://example.com/path/to/resource with spaces", + ), + ("http://example.com", "/path/with/中文", "http://example.com/path/with/中文"), + # Protocols and subdomains + ("https://sub.example.com", "path", "https://sub.example.com/path"), + ("ftp://example.com", "/path", "ftp://example.com/path"), + # Missing protocol in base_url + ("example.com", "path", "example.com/path"), + ], +) +def test_join_url(base_url, path, expected): + assert join_url(base_url, path) == expected + + +@pytest.mark.parametrize( + "base_url, path, exception", + [ + (None, "path", ValueError), + ("http://example.com", None, AttributeError), + (123, "path", AttributeError), + ("http://example.com", 123, AttributeError), + ], +) +def test_join_url_invalid_input_types(base_url, path, exception): + with pytest.raises(exception): + join_url(base_url, path) diff --git a/tests/utils.py b/tests/utils.py index e36641ca71..5203ccbe6e 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -19,7 +19,7 @@ ConfigProvidersContext, ) from dlt.common.pipeline import PipelineContext -from dlt.common.runtime.logger import init_logging +from dlt.common.runtime.init import 
init_logging from dlt.common.runtime.telemetry import start_telemetry, stop_telemetry from dlt.common.schema import Schema from dlt.common.storages import FileStorage