Skip to content

Commit

Permalink
SciGlass data is now downloaded from Zenodo
Browse files Browse the repository at this point in the history
  • Loading branch information
drcassar committed Aug 27, 2023
1 parent e105e63 commit 1712f20
Show file tree
Hide file tree
Showing 7 changed files with 75 additions and 18 deletions.
1 change: 0 additions & 1 deletion MANIFEST.in
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
include *.txt
recursive-include docs *
recursive-include glasspy/data/datafiles *
recursive-include glasspy/predict/models *
recursive-include glasspy/chemistry/data *
Binary file removed glasspy/data/datafiles/select_AtMol.csv.zip
Binary file not shown.
Binary file removed glasspy/data/datafiles/select_Gcomp.csv.zip
Binary file not shown.
Binary file removed glasspy/data/datafiles/select_SciGK.csv.zip
Binary file not shown.
90 changes: 73 additions & 17 deletions glasspy/data/load.py
Original file line number Diff line number Diff line change
@@ -1,26 +1,71 @@
"""This is the module to load available data in GlassPy.
Right now, the main source of GlassPy data is the SciGlass database. The SciGlass database is available at https://github.com/epam/SciGlass licensed under ODC Open Database License (ODbL). For a plain text version of this database, see the for at
https://github.com/drcassar/SciGlass. Data that ships with GlassPy is the same as the data in the plain text fork.
Right now, the main source of GlassPy data is the SciGlass database. The
SciGlass database is available at https://github.com/epam/SciGlass licensed
under ODC Open Database License (ODbL). For a plain text version of this
database, see the fork at https://github.com/drcassar/SciGlass. Data that ships
with GlassPy is the same as the data in the plain text fork.
Typical usage example:
source = SciGlass()
df = source.data
"""

import io
import shutil
import zipfile
from pathlib import Path
import requests
import pandas as pd
import numpy as np
import os
from platformdirs import user_data_dir

from glasspy.chemistry.convert import to_element_array, wt_to_mol
from .translators import AtMol_translation, SciGK_translation

# Paths to the zipped SciGlass tables, resolved relative to this module's
# directory (the "datafiles" folder next to this file).
__CUR_PATH = os.path.dirname(__file__)
_ELEMENTS_PATH = os.path.join(__CUR_PATH, "datafiles/select_AtMol.csv.zip")
_PROPERTIES_PATH = os.path.join(__CUR_PATH, "datafiles/select_SciGK.csv.zip")
_COMPOUNDS_PATH = os.path.join(__CUR_PATH, "datafiles/select_Gcomp.csv.zip")

def _download_sciglass_data(path_dict):
    """Downloads the SciGlass database to your computer.

    The Zenodo record is queried for the URL of its first file, that file is
    downloaded into memory as a zip archive, and every archive member whose
    name contains one of the requested names is copied to the matching path.

    Args:
      path_dict:
        Dictionary whose values are ``(name, path)`` tuples. ``name`` is a
        substring used to find a member of the downloaded archive and
        ``path`` is the ``pathlib.Path`` where that member is stored.

    Raises:
      requests.HTTPError:
        If any of the two HTTP requests returns an error status code.
    """

    print("Downloading SciGlass database to your computer...")
    print("This is only required once and may take a few minutes.")

    record_id = "8287159"
    api_url = f"https://zenodo.org/api/records/{record_id}"
    response = requests.get(api_url, timeout=60)
    # Fail early with a clear HTTP error instead of a confusing JSON/KeyError.
    response.raise_for_status()
    record_data = response.json()
    url = record_data["files"][0]["links"]["self"]

    download = requests.get(url, timeout=3600)
    # An error page must not be handed to zipfile as if it were the archive.
    download.raise_for_status()
    zip_file = io.BytesIO(download.content)

    with zipfile.ZipFile(zip_file, "r") as zip_ref:
        for item in zip_ref.namelist():
            for name, path in path_dict.values():
                if name in item:
                    path.parent.mkdir(parents=True, exist_ok=True)
                    # Write to a temporary sibling file and rename it when
                    # the copy is complete. An interrupted extraction then
                    # never leaves a truncated file at the final path, which
                    # would otherwise be mistaken for a finished download.
                    tmp_path = path.with_name(path.name + ".part")
                    with zip_ref.open(item) as source_file:
                        with open(tmp_path, "wb") as target_file:
                            shutil.copyfileobj(source_file, target_file)
                    tmp_path.replace(path)

    print("Download completed!")


def _sciglass_path_dict():
    """Return where the SciGlass files are stored, downloading if needed.

    The files are looked up in the user data directory of "GlassPy". If at
    least one of them is not present as a regular file, the whole database is
    downloaded before returning.

    Returns:
      Dictionary with the keys "elements", "properties", and "compounds".
      Each value is a tuple ``(name, path)`` with the short name of the table
      and the ``pathlib.Path`` of the corresponding file.
    """

    base_dir = Path(user_data_dir("GlassPy"))

    path_dict = {
        "elements": ("AtMol", base_dir / "select_AtMol.csv.zip"),
        "properties": ("SciGK", base_dir / "select_SciGK.csv.zip"),
        "compounds": ("Gcomp", base_dir / "select_Gcomp.csv.zip"),
    }

    # A single missing file is enough to trigger one download of everything.
    for _, file_path in path_dict.values():
        if not file_path.is_file():
            _download_sciglass_data(path_dict)
            break

    return path_dict


class SciGlass:
Expand Down Expand Up @@ -57,18 +102,19 @@ def __init__(
autocleanup: bool = True,
metadata: bool = True,
):
path_dict = _sciglass_path_dict()

# default behavior is returning everything if no config is given
if (not elements_cfg) and (not properties_cfg) and (not compounds_cfg):
elements_cfg = {
"path": _ELEMENTS_PATH,
"path": path_dict["elements"][1],
"translate": AtMol_translation,
"acceptable_sum_deviation": 1,
"final_sum": 1,
}

compounds_cfg = {
"path": _COMPOUNDS_PATH,
"path": path_dict["compounds"][1],
"acceptable_sum_deviation": 1,
"final_sum": 1,
"return_weight": False,
Expand All @@ -92,7 +138,7 @@ def __init__(
}

properties_cfg = {
"path": _PROPERTIES_PATH,
"path": path_dict["properties"][1],
"translate": SciGK_translation,
"keep": self.available_properties(),
}
Expand Down Expand Up @@ -122,7 +168,7 @@ def __init__(

if metadata:
metadata_cfg = {
"path": _PROPERTIES_PATH,
"path": path_dict["properties"][1],
"translate": SciGK_translation,
"keep": self.available_properties_metadata(),
}
Expand All @@ -148,7 +194,7 @@ def __init__(
dfs = {
k: dfs[k]
for k in ["elements", "compounds", "property", "metadata"]
if k in dfs.keys()
if k in dfs
}

self.data = pd.concat(dfs, axis=1, join="inner")
Expand Down Expand Up @@ -179,8 +225,12 @@ def get_properties(self, **kwargs):
integer that merges both numbers
"""

path_dict = _sciglass_path_dict()

df = pd.read_csv(
kwargs.get("path", _PROPERTIES_PATH), sep="\t", low_memory=False
kwargs.get("path", path_dict["properties"][1]),
sep="\t",
low_memory=False,
)
df = df.assign(ID=lambda x: x.KOD * 100000000 + x.GLASNO)
df = df.drop(["KOD", "GLASNO"], axis=1)
Expand Down Expand Up @@ -225,9 +275,12 @@ def get_elements(self, **kwargs):
has a glass number and a paper number. This ID used in GlassPy is an
integer that merges both numbers
"""
path_dict = _sciglass_path_dict()

df = pd.read_csv(
kwargs.get("path", _ELEMENTS_PATH), sep="\t", low_memory=False
kwargs.get("path", path_dict["elements"][1]),
sep="\t",
low_memory=False,
)
df = df.assign(ID=lambda x: x.Kod * 100000000 + x.GlasNo)
df = df.drop(["Kod", "GlasNo"], axis=1)
Expand Down Expand Up @@ -268,9 +321,12 @@ def get_compounds(self, **kwargs):
has a glass number and a paper number. This ID used in GlassPy is an
integer that merges both numbers
"""
path_dict = _sciglass_path_dict()

df = pd.read_csv(
kwargs.get("path", _COMPOUNDS_PATH), sep="\t", low_memory=False
kwargs.get("path", path_dict["compounds"][1]),
sep="\t",
low_memory=False,
)
df = df.assign(ID=lambda x: x.Kod * 100000000 + x.GlasNo)
df = df.drop(["Kod", "GlasNo"], axis=1)
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,4 @@ scikit-learn==1.2.0
compress_pickle>=2.1.0
torch
lightning>=2.0.0
platformdirs
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
"compress_pickle>=2.1.0",
"torch",
"lightning>=2.0.0",
"platformdirs",
],
extras_require={
"extra": ["glasspy_extra"],
Expand Down

0 comments on commit 1712f20

Please sign in to comment.