Merge pull request #3104 from catalyst-cooperative/nightly-build-2023-12-01

Merge dev into main for 2023-12-01
zaneselvans authored Dec 1, 2023
2 parents 0ae7039 + 7cc80da commit b188ea2
Showing 96 changed files with 2,739 additions and 1,815 deletions.
6 changes: 6 additions & 0 deletions .github/workflows/pytest.yml
@@ -43,6 +43,9 @@ jobs:
conda config --show
printenv | sort
- name: Make input, output and dagster dirs
run: mkdir -p ${{ env.PUDL_OUTPUT }} ${{ env.PUDL_INPUT}} ${{ env.DAGSTER_HOME }}

- name: Lint and build PUDL documentation with Sphinx
run: |
pip install --no-deps --editable .
@@ -82,6 +85,9 @@ jobs:
conda config --show
printenv | sort
- name: Make input, output and dagster dirs
run: mkdir -p ${{ env.PUDL_OUTPUT }} ${{ env.PUDL_INPUT}} ${{ env.DAGSTER_HOME }}

- name: Log SQLite3 version
run: |
which sqlite3
2 changes: 0 additions & 2 deletions .github/workflows/zenodo-cache-sync.yml
@@ -41,8 +41,6 @@ jobs:
- name: Checkout desired branch
uses: actions/checkout@v4
with:
ref: ${{ env.GITHUB_REF }}

- name: Install Conda environment using mamba
uses: mamba-org/setup-micromamba@v1
2 changes: 2 additions & 0 deletions .gitignore
@@ -8,6 +8,8 @@
docs/data_dictionaries/pudl_db.rst
.ipynb_checkpoints/
.cache/
.ruff_cache/
.mypy_cache/
.pytest_cache/*
.DS_Store
build/
3 changes: 3 additions & 0 deletions .pre-commit-config.yaml
@@ -16,11 +16,14 @@ repos:
- id: check-added-large-files # Don't accidentally commit giant files.
- id: check-merge-conflict # Watch for lingering merge markers.
- id: check-yaml # Validate all YAML files.
- id: check-toml
- id: check-case-conflict # Avoid case sensitivity in file names.
- id: debug-statements # Watch for lingering debugger calls.
- id: mixed-line-ending # Use Unix line-endings to avoid big no-op CSV diffs.
args: ["--fix=lf"]
- id: trailing-whitespace
- id: name-tests-test # Follow PyTest naming convention.
- id: end-of-file-fixer

####################################################################################
# Formatters: hooks that re-write Python & documentation files
5 changes: 3 additions & 2 deletions Makefile
@@ -7,7 +7,7 @@ etl_fast_yml := src/pudl/package_data/settings/etl_fast.yml
etl_full_yml := src/pudl/package_data/settings/etl_full.yml

# We use mamba locally, but micromamba in CI, so choose the right binary:
ifdef GITHUB_ACTION
ifdef GITHUB_ACTIONS
mamba := micromamba
else
mamba := mamba
@@ -39,7 +39,7 @@ conda-clean:

# Regenerate the conda lockfile and render platform specific conda environments.
conda-lock.yml: pyproject.toml
${mamba} run --name base ${mamba} install --yes conda-lock prettier
${mamba} run --name base ${mamba} install --quiet --yes conda-lock prettier
${mamba} run --name base conda-lock \
--${mamba} \
--file=pyproject.toml \
@@ -53,6 +53,7 @@ conda-lock.yml: pyproject.toml
# Create the pudl-dev conda environment based on the universal lockfile
.PHONY: pudl-dev
pudl-dev:
${mamba} run --name base ${mamba} install --quiet --yes conda-lock
${mamba} run --name base ${mamba} env remove --name pudl-dev
${mamba} run --name base conda-lock install \
--name pudl-dev \
8 changes: 4 additions & 4 deletions README.rst
@@ -52,11 +52,11 @@ What data is available?

PUDL currently integrates data from:

* `EIA Form 860 <https://www.eia.gov/electricity/data/eia860/>`__: 2001-2022
* `EIA Form 860 <https://www.eia.gov/electricity/data/eia860/>`__: 2001 - 2022
* `EIA Form 860m <https://www.eia.gov/electricity/data/eia860m/>`__: 2023-06
* `EIA Form 861 <https://www.eia.gov/electricity/data/eia861/>`__: 2001-2022
* `EIA Form 923 <https://www.eia.gov/electricity/data/eia923/>`__: 2001-2022
* `EPA Continuous Emissions Monitoring System (CEMS) <https://campd.epa.gov/>`__: 1995-2022
* `EIA Form 861 <https://www.eia.gov/electricity/data/eia861/>`__: 2001 - 2022
* `EIA Form 923 <https://www.eia.gov/electricity/data/eia923/>`__: 2001 - 2023-08
* `EPA Continuous Emissions Monitoring System (CEMS) <https://campd.epa.gov/>`__: 1995 - 2022
* `FERC Form 1 <https://www.ferc.gov/industries-data/electric/general-information/electric-industry-forms/form-1-electric-utility-annual>`__: 1994-2021
* `FERC Form 714 <https://www.ferc.gov/industries-data/electric/general-information/electric-industry-forms/form-no-714-annual-electric/data>`__: 2006-2020
* `US Census Demographic Profile 1 Geodatabase <https://www.census.gov/geographies/mapping-files/2010/geo/tiger-data.html>`__: 2010
2 changes: 1 addition & 1 deletion devtools/datasette/fly/fly.toml
@@ -31,4 +31,4 @@ primary_region = "bos"
timeout = 2000

[deploy]
wait_timeout = "15m"
wait_timeout = "15m"
2 changes: 1 addition & 1 deletion devtools/datasette/fly/run.sh
@@ -7,4 +7,4 @@ find /data/ -name '*.sqlite' -delete
mv all_dbs.tar.zst /data
zstd -f -d /data/all_dbs.tar.zst -o /data/all_dbs.tar
tar -xf /data/all_dbs.tar --directory /data
datasette serve --host 0.0.0.0 /data/*.sqlite --cors --inspect-file inspect-data.json --metadata metadata.yml --setting sql_time_limit_ms 5000 --port $PORT
datasette serve --host 0.0.0.0 /data/*.sqlite --cors --inspect-file inspect-data.json --metadata metadata.yml --setting sql_time_limit_ms 5000 --port $PORT
122 changes: 90 additions & 32 deletions devtools/datasette/publish.py
@@ -17,13 +17,15 @@
Apart from that: the Dockerfile and dataset-specific
metadata.yml/inspect-data.json are generated by this script.
"""

import json
import logging
import secrets
import sys
from pathlib import Path
from subprocess import check_call, check_output

import click

from pudl.metadata.classes import DatasetteMetadata
from pudl.workspace.setup import PudlPaths

@@ -46,7 +48,7 @@
"""


def make_dockerfile():
def make_dockerfile() -> str:
"""Write a dockerfile from template, to use in fly deploy.
We write this from template so we can generate a datasette secret. This way
@@ -56,7 +58,7 @@ def make_dockerfile():
return DOCKERFILE_TEMPLATE.format(datasette_secret=datasette_secret)


def inspect_data(datasets, pudl_out):
def inspect_data(datasets: list[str], pudl_out: Path) -> str:
"""Pre-inspect databases to generate some metadata for Datasette.
This is done in the image build process in datasette-publish-fly, but since
@@ -80,43 +82,99 @@ def inspect_data(datasets, pudl_out):
return inspect_output


def metadata(pudl_out) -> str:
def metadata(pudl_out: Path) -> str:
"""Return human-readable metadata for Datasette."""
return DatasetteMetadata.from_data_source_ids(pudl_out).to_yaml()


def main():
@click.command(context_settings={"help_option_names": ["-h", "--help"]})
@click.option(
"--fly",
"-f",
"deploy",
flag_value="fly",
help="Deploy Datasette to fly.io.",
default=True,
)
@click.option(
"--local",
"-l",
"deploy",
flag_value="local",
help="Deploy Datasette locally for testing or debugging purposes.",
)
@click.option(
"--metadata",
"-m",
"deploy",
flag_value="metadata",
help="Generate the Datasette metadata.yml in current directory, but do not deploy.",
)
def deploy_datasette(deploy: str) -> int:
"""Generate deployment files and run the deploy."""
fly_dir = Path(__file__).parent.absolute() / "fly"
docker_path = fly_dir / "Dockerfile"
inspect_path = fly_dir / "inspect-data.json"
metadata_path = fly_dir / "metadata.yml"

pudl_out = PudlPaths().pudl_output
datasets = [str(p.name) for p in pudl_out.glob("*.sqlite")]
logging.info(f"Inspecting DBs for datasette: {datasets}...")
inspect_output = inspect_data(datasets, pudl_out)
with inspect_path.open("w") as f:
f.write(json.dumps(inspect_output))

logging.info("Writing metadata...")
with metadata_path.open("w") as f:
f.write(metadata(pudl_out))

logging.info("Writing Dockerfile...")
with docker_path.open("w") as f:
f.write(make_dockerfile())

logging.info(f"Compressing {datasets} and putting into docker context...")
check_call(
["tar", "-a", "-czvf", fly_dir / "all_dbs.tar.zst"] + datasets, # noqa: S603
cwd=pudl_out,
metadata_yml = metadata(pudl_out)
# Order the databases to highlight PUDL
datasets = (
["pudl.sqlite"]
+ sorted(str(p.name) for p in pudl_out.glob("ferc*.sqlite"))
+ ["censusdp1tract.sqlite"]
)

logging.info("Running fly deploy...")
check_call(["/usr/bin/env", "flyctl", "deploy"], cwd=fly_dir) # noqa: S603
logging.info("Deploy finished!")
if deploy == "fly":
logging.info("Deploying to fly.io...")
fly_dir = Path(__file__).parent.absolute() / "fly"
docker_path = fly_dir / "Dockerfile"
inspect_path = fly_dir / "inspect-data.json"
metadata_path = fly_dir / "metadata.yml"

logging.info(f"Inspecting DBs for datasette: {datasets}...")
inspect_output = inspect_data(datasets, pudl_out)
with inspect_path.open("w") as f:
f.write(json.dumps(inspect_output))

logging.info(f"Writing Datasette metadata to: {metadata_path}")
with metadata_path.open("w") as f:
f.write(metadata_yml)

logging.info("Writing Dockerfile...")
with docker_path.open("w") as f:
f.write(make_dockerfile())

logging.info(f"Compressing {datasets} and putting into docker context...")
check_call(
["tar", "-a", "-czvf", fly_dir / "all_dbs.tar.zst"] + datasets, # noqa: S603
cwd=pudl_out,
)

logging.info("Running fly deploy...")
check_call(["/usr/bin/env", "flyctl", "deploy"], cwd=fly_dir) # noqa: S603
logging.info("Deploy finished!")

elif deploy == "local":
logging.info("Running Datasette locally...")
metadata_path = pudl_out / "metadata.yml"
logging.info(f"Writing Datasette metadata to: {metadata_path}")
with metadata_path.open("w") as f:
f.write(metadata_yml)

check_call(
["/usr/bin/env", "datasette", "serve", "-m", "metadata.yml"] + datasets, # noqa: S603
cwd=pudl_out,
)

elif deploy == "metadata":
metadata_path = Path.cwd() / "metadata.yml"
logging.info(f"Writing Datasette metadata to: {metadata_path}")
with metadata_path.open("w") as f:
f.write(metadata_yml)

else:
logging.error(f"Unrecognized deployment destination: {deploy=}")
return 1

return 0


if __name__ == "__main__":
main()
sys.exit(deploy_datasette())
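Note: the click interface above can be exercised without deploying anything by using click's test runner together with the --metadata flag, which only writes metadata.yml to the current directory. A minimal sketch, assuming the PUDL package is installed, a PUDL workspace is configured, and devtools/datasette is on sys.path:

from click.testing import CliRunner

from publish import deploy_datasette  # assumes devtools/datasette is importable

runner = CliRunner()
# --metadata only generates metadata.yml in the current directory; no fly.io deploy happens.
result = runner.invoke(deploy_datasette, ["--metadata"])
print(result.exit_code, result.output)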
2 changes: 1 addition & 1 deletion devtools/materialize_asset.py
@@ -38,7 +38,7 @@ def main(asset_id):
config={
"resources": {
"dataset_settings": {
"config": etl_fast_settings.dict(),
"config": etl_fast_settings.model_dump(),
},
},
},
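Note: the one-line change above follows the Pydantic v2 API, in which BaseModel.dict() is deprecated in favor of model_dump(). A minimal sketch of the renamed call on a toy settings model (not the real PUDL settings class):

from pydantic import BaseModel


class ToyDatasetSettings(BaseModel):
    """Stand-in for the real dataset settings model."""

    years: list[int] = [2021, 2022]


settings = ToyDatasetSettings()
# Pydantic v1 spelling (deprecated in v2): settings.dict()
config = settings.model_dump()  # {'years': [2021, 2022]}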
69 changes: 69 additions & 0 deletions devtools/sqlite_to_duckdb.py
@@ -0,0 +1,69 @@
"""A naive script for converting SQLite to DuckDB."""
import logging
from pathlib import Path

import click
import duckdb

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


@click.command()
@click.argument("sqlite_path", type=click.Path(exists=True, resolve_path=True))
@click.argument(
"duckdb_path", type=click.Path(resolve_path=True, writable=True, allow_dash=False)
)
def convert_sqlite_to_duckdb(sqlite_path, duckdb_path):
"""Convert an SQLite database to DuckDB format.
Args:
sqlite_path (str): Path to the existing SQLite database file.
duckdb_path (str): Path to the new DuckDB database file (should not exist).
Example:
python sqlite_to_duckdb.py sqlite.db duckdb.db
"""
sqlite_path = Path(sqlite_path)
duckdb_path = Path(duckdb_path)

# Check if DuckDB file already exists
if duckdb_path.exists():
click.echo(
f"Error: DuckDB file '{duckdb_path}' already exists. Please provide a new filename."
)
return

# Connect to DuckDB database
duckdb_conn = duckdb.connect(database=str(duckdb_path))
duckdb_cursor = duckdb_conn.cursor()

# Fetch table names from SQLite database using DuckDB
duckdb_cursor.execute(f"ATTACH DATABASE '{sqlite_path}' AS sqlite_db;")
duckdb_cursor.execute("SELECT name FROM main.sqlite_master WHERE type='table';")
table_names = [row[0] for row in duckdb_cursor.fetchall()]

# Copy tables from SQLite to DuckDB
for table_name in table_names:
logger.info(f"Working on table: {table_name}")
# Fetch column names and types from SQLite table using DuckDB
duckdb_cursor.execute(f"PRAGMA table_info(sqlite_db.{table_name});")
columns_info = duckdb_cursor.fetchall()
column_definitions = ", ".join([f"{col[1]} {col[2]}" for col in columns_info])

# Create equivalent table in DuckDB
duckdb_cursor.execute(f"CREATE TABLE {table_name} ({column_definitions});")

# Copy data from SQLite to DuckDB using DuckDB
duckdb_cursor.execute(
f"INSERT INTO {table_name} SELECT * FROM sqlite_db.{table_name};" # noqa: S608
)

# Commit and close connections
duckdb_conn.commit()
duckdb_conn.close()


if __name__ == "__main__":
convert_sqlite_to_duckdb()
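Note: after a conversion, the result can be sanity-checked by opening the new DuckDB file and listing the copied tables; a quick sketch using the output filename from the docstring example above:

import duckdb

# Open the newly created DuckDB database and list the copied tables.
con = duckdb.connect("duckdb.db")
print(con.execute("SHOW TABLES").fetchall())
con.close()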