Retrieve databundle light data size check (#911)

* retrieve_databundle size check commit #1
* retrieve_databundle size check commit #2
* [pre-commit.ci] auto fixes from pre-commit.com hooks; for more information, see https://pre-commit.ci
* Revised get_best_bundles_by_category() function
* [pre-commit.ci] auto fixes from pre-commit.com hooks; for more information, see https://pre-commit.ci
* retrieve_databundle_light PR update
* release note is added to PR911

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Davide Fioriti <[email protected]>
pypsa-meets-earth · Nov 22, 2023 · 5f77c98 · 5f77c98
1 parent 84cef6e
commit 5f77c98
Show file tree

Hide file tree

Showing 2 changed files with 23 additions and 16 deletions.
diff --git a/doc/release_notes.rst b/doc/release_notes.rst
@@ -18,6 +18,8 @@ E.g. if a new rule becomes available describe how to use it `snakemake -j1 run_t
 
 * Function added in clean_osm_data script to allow the use of custom network data instead of or on top of OSM data. `PR #842 <https://github.com/pypsa-meets-earth/pypsa-earth/pull/842>`__
 
+* Improve retrieve_databundle to prioritize smallest databundles `PR #911 <https://github.com/pypsa-meets-earth/pypsa-earth/pull/911>`__
+
 
 PyPSA-Earth 0.2.3
 =================

diff --git a/scripts/retrieve_databundle_light.py b/scripts/retrieve_databundle_light.py
@@ -85,6 +85,7 @@
 import re
 from zipfile import ZipFile
 
+import pandas as pd
 import yaml
 from _helpers import (
     configure_logging,
@@ -483,28 +484,32 @@ def get_best_bundles_by_category(
         List of bundles to download
     """
     # dictionary with the number of match by configuration for tutorial/non-tutorial configurations
-    dict_n_matched = {
-        bname: config_bundles[bname]["n_matched"]
-        for bname in config_bundles
-        if config_bundles[bname]["category"] == category
-        and config_bundles[bname].get("tutorial", False) == tutorial
-        and _check_disabled_by_opt(config_bundles[bname], config_enable) != ["all"]
-    }
+    df_matches = pd.DataFrame(columns=["bundle_name", "bundle_size", "n_matched"])
+
+    for bname, bvalue in config_bundles.items():
+        if (
+            bvalue["category"] == category
+            and bvalue.get("tutorial", False) == tutorial
+            and _check_disabled_by_opt(bvalue, config_enable) != ["all"]
+        ):
+            df_matches.loc[bname] = [
+                bname,
+                len(bvalue["countries"]),
+                bvalue["n_matched"],
+            ]
 
-    returned_bundles = []
+    df_matches["neg_bundle_size"] = -df_matches["bundle_size"]
+    df_matches.sort_values(
+        by=["n_matched", "neg_bundle_size"], inplace=True, ascending=False
+    )
 
-    # check if non-empty dictionary
-    if dict_n_matched:
-        # if non-empty, then pick bundles until all countries are selected
-        # or no more bundles are found
-        dict_sort = sorted(dict_n_matched.items(), key=lambda d: d[1])
+    returned_bundles = []
 
+    if not df_matches.empty:
         current_matched_countries = []
         remaining_countries = set(country_list)
 
-        for d_val in dict_sort:
-            bname = d_val[0]
-
+        for bname in df_matches.index:
             cbundle_list = set(config_bundles[bname]["countries"])
 
             # list of countries in the bundle that are not yet matched