diff --git a/doc/release_notes.rst b/doc/release_notes.rst index 5a48de0c3..3f50f3920 100644 --- a/doc/release_notes.rst +++ b/doc/release_notes.rst @@ -18,6 +18,8 @@ E.g. if a new rule becomes available describe how to use it `snakemake -j1 run_t * Function added in clean_osm_data script to allow the use of custom network data instead or on-top of OSM data. `PR #842 <'https://github.com/pypsa-meets-earth/pypsa-earth/pull/842>`__ +* Improve retrieve_databundle to prioritize smallest databundles `PR #911 <https://github.com/pypsa-meets-earth/pypsa-earth/pull/911>`__ + PyPSA-Earth 0.2.3 ================= diff --git a/scripts/retrieve_databundle_light.py b/scripts/retrieve_databundle_light.py index 7b9b6e125..c05586642 100644 --- a/scripts/retrieve_databundle_light.py +++ b/scripts/retrieve_databundle_light.py @@ -85,6 +85,7 @@ import re from zipfile import ZipFile +import pandas as pd import yaml from _helpers import ( configure_logging, @@ -483,28 +484,32 @@ def get_best_bundles_by_category( List of bundles to download """ # dictionary with the number of match by configuration for tutorial/non-tutorial configurations - dict_n_matched = { - bname: config_bundles[bname]["n_matched"] - for bname in config_bundles - if config_bundles[bname]["category"] == category - and config_bundles[bname].get("tutorial", False) == tutorial - and _check_disabled_by_opt(config_bundles[bname], config_enable) != ["all"] - } + df_matches = pd.DataFrame(columns=["bundle_name", "bundle_size", "n_matched"]) + + for bname, bvalue in config_bundles.items(): + if ( + bvalue["category"] == category + and bvalue.get("tutorial", False) == tutorial + and _check_disabled_by_opt(bvalue, config_enable) != ["all"] + ): + df_matches.loc[bname] = [ + bname, + len(bvalue["countries"]), + bvalue["n_matched"], + ] - returned_bundles = [] + df_matches["neg_bundle_size"] = -df_matches["bundle_size"] + df_matches.sort_values( + by=["n_matched", "neg_bundle_size"], inplace=True, ascending=False + ) - # check if non-empty dictionary - if dict_n_matched: - # if 
non-empty, then pick bundles until all countries are selected - # or no more bundles are found - dict_sort = sorted(dict_n_matched.items(), key=lambda d: d[1]) + returned_bundles = [] + if not df_matches.empty: current_matched_countries = [] remaining_countries = set(country_list) - for d_val in dict_sort: - bname = d_val[0] - + for bname in df_matches.index: cbundle_list = set(config_bundles[bname]["countries"]) # list of countries in the bundle that are not yet matched