From bcc30d78e2857fc332510405f4a09005d128c023 Mon Sep 17 00:00:00 2001
From: Peter Sefton <pt@ptsefton.com>
Date: Fri, 10 Jan 2025 06:17:46 +1100
Subject: [PATCH] Update root-data-entity.md

---
 .../1.2-DRAFT/root-data-entity.md             | 397 +-----------------
 1 file changed, 1 insertion(+), 396 deletions(-)

diff --git a/docs/_specification/1.2-DRAFT/root-data-entity.md b/docs/_specification/1.2-DRAFT/root-data-entity.md
index 547ce47d..63873702 100644
--- a/docs/_specification/1.2-DRAFT/root-data-entity.md
+++ b/docs/_specification/1.2-DRAFT/root-data-entity.md
@@ -127,402 +127,7 @@ be minimally valid.
 
 ## Direct properties of the Root Data Entity
 
-The _Root Data Entity_ of _Valid RO-Crate Dataset_ 
-Skip to content
-Search Gists
-All gists
-Back to GitHub
-@SamHames
-SamHames/rocrate_relational.py
-Created January 9, 2025 13:31 • Report abuse
-
-Code
-Revisions 1
-Clone this repository at &lt;script src=&quot;https://gist.github.com/SamHames/b969b4c99300b9c5726bfd5e31c94aba.js&quot;&gt;&lt;/script&gt;
-Turn an ROCrate into a relational SQLite schema
-rocrate_relational.py
-from collections import defaultdict, Counter
-import json
-import sqlite3
-import textwrap
-
-
-class ROCrateToSQLite:
-
-    def __init__(self, rocrate, db_path):
-
-        self.rocrate = rocrate
-        self.db_path = db_path
-
-        self._id_type_map = None
-        self.db = sqlite3.connect(self.db_path, isolation_level=None)
-
-    @property
-    def id_type_map(self):
-        """
-        Mapping from a given @id to types of entities.
-        Cached on generation as it's needed in a few different places.
-        """
-
-        if self._id_type_map is None:
-            id_type_map = defaultdict(set)
-
-            for entity in self.rocrate["@graph"]:
-
-                entity_id = entity["@id"]
-                entity_type = entity["@type"]
-
-                if isinstance(entity_type, list):
-                    for e_t in entity_type:
-                        id_type_map[entity_id].add(e_t)
-                else:
-                    id_type_map[entity_id].add(entity_type)
-
-            self._id_type_map = id_type_map
-
-        return self._id_type_map
-
-    def collect_column_details(self):
-        """
-        Infer the required columns and their details.
-        The details calculated are:
-        1. The maximum cardinality (effectively whether the value has 0, 1, or many possible
-           values - this effects whether this is a 1:1 relationship or a 1:many relationship)
-        2. Whether the values are literal values, references to other tables, or a mixture
-           of the two.
-        These leads to a choice table for how to represent these relationships:
-        cardinality: at most once | many
-        reference | nullable fk | bridge table (two sided fk)
-        literal | nullable | one:many table
-        mixed | (two columns?) | (two or more outrider tables?)
-        """
-
-        # The first number is the max cardinality, the second is the set of value kinds
-        # literal or reference to another entity observed.
-        column_details = defaultdict(Counter)
-
-        for entity in self.rocrate["@graph"]:
-
-            # Mandatory properties
-            entity_id = entity["@id"]
-            entity_types = entity["@type"]
-
-            if not isinstance(entity_types, list):
-                entity_types = [entity_types]
-
-            for key, values in entity.items():
-                if key in ("@id", "@type"):
-                    continue
-
-                # A value can be a single value, or a list of values - if it's not a
-                # list, convert to a list of a single value.
-                if not isinstance(values, list):
-                    values = [values]
-
-                # Keep cardinality counts for each reference type - there might be
-                # arrays of two kinds of information that can be consistently mapped to
-                # a single column.
-                cardinality_counts = Counter()
-
-                for value in values:
-                    # TODO: confirm ROCrate doesn't allow nesting and that @id will
-                    # always be present in a valid ROCrate.
-                    if isinstance(value, dict):
-                        referenced_id = value["@id"]
-                        for referenced_type in self.id_type_map[referenced_id]:
-                            cardinality_counts[referenced_type] += 1
-                    else:
-                        # Empty reference -> literal type
-                        cardinality_counts[""] += 1
-
-                # Max observed cardinality sets how we model this relationally
-                for entity_type in entity_types:
-                    for kind, cardinality in cardinality_counts.items():
-                        column_details[entity_type][(key, kind)] = max(
-                            column_details[entity_type][(key, kind)], cardinality
-                        )
-
-        return column_details
-
-    def generate_sql_statements_column_details(self, column_details):
-        """
-        Generate the necessary SQL table create and insert statements.
-        The return represents a dictionary mapping table names to the create table
-        statements, and an insert statement that uses named bound parameters that
-        can be executed by passing the parameters as a dictionary.
-        This also returns an annotated version of column details, mapping the details
-        of the ROCrate data back to these generated column names to make inserting data
-        a lookup table.
-        """
-
-        tables = {}
-
-        annotated_column_details = defaultdict(dict)
-
-        for entity_type, columns in column_details.items():
-
-            cardinality_indexed = defaultdict(list)
-
-            # Break columns into two groups: those containing at most one reference, the
-            # primary table, and those containing up to more than one, indicating
-            # 1:many or many:many relationships.
-            for col, max_cardinality in columns.items():
-                # Bucket into two groups: at most one, or more than one
-                cardinality_indexed[min(max_cardinality, 2)].append(col)
-
-            # Create the main table for this type (columns of cardinality 1)
-            main_table_columns = cardinality_indexed[1]
-            column_statements = ["id primary key"]
-            insert_keys = ["id"]
-
-            for col_name, ref_table in main_table_columns:
-                if ref_table:
-                    # Generate a name including the reference table - since a single
-                    # entity with a named property can reference any other entity, we
-                    # might have multiple columns with the same property names pointing
-                    # at different types of things, so we need to split them out.
-                    gen_name = f"{col_name}_{ref_table}"
-                    column_statements.append(f'"{gen_name}" references "{ref_table}"')
-                    insert_keys.append(gen_name)
-                    annotated_column_details[entity_type][col_name, ref_table] = (
-                        "1:1",
-                        entity_type,
-                    )
-                else:
-                    column_statements.append(f'"{col_name}"')
-                    insert_keys.append(col_name)
-                    annotated_column_details[entity_type][col_name, ref_table] = (
-                        "1:1",
-                        entity_type,
-                    )
-
-            all_columns = ",\n  ".join(column_statements)
-            create_table_statement = (
-                f'create table "{entity_type}" (\n  {all_columns}\n)'
-            )
-
-            # SQLite let's you escape identifiers like table names, but not placeholders
-            # for bound parameters, so we need to add an extra layer of indirection
-            # here.
-            insert_mapping = {key: str(i) for i, key in enumerate(insert_keys)}
-            value_bindings = ", ".join(f":{i}" for i, _ in enumerate(insert_keys))
-            insert_statement = f'insert into "{entity_type}" values ({value_bindings})'
-
-            tables[entity_type] = (
-                create_table_statement,
-                insert_statement,
-                insert_mapping,
-            )
-
-            # Generate 1:many and many:many tables
-            for col_name, ref_table in cardinality_indexed[2]:
-
-                table_name = f"{entity_type}_{col_name}"
-
-                # many-many bridge table
-                if ref_table:
-                    create_table_statement = textwrap.dedent(
-                        f"""
-                        create table "{table_name}" (
-                            "{entity_type}" references "{entity_type}",
-                            "{ref_table}" references "{ref_table}",
-                            primary key ("{entity_type}", "{ref_table}")
-                        )
-                        """
-                    )
-                    insert_statement = (
-                        f'insert into "{table_name}" values (:entity_type, :ref_table)'
-                    )
-
-                else:
-                    create_table_statement = textwrap.dedent(
-                        f"""
-                        create table "{table_name}" (
-                            "{entity_type}" references "{entity_type}",
-                            "{col_name}",
-                            primary key ("{entity_type}", "{col_name}")
-                        )
-                        """
-                    )
-                    insert_statement = (
-                        f'insert into "{table_name}" values (:entity_type, :col_name)'
-                    )
-
-                annotated_column_details[entity_type][col_name, ref_table] = (
-                    "1:many",
-                    table_name,
-                )
-
-                tables[table_name] = (create_table_statement, insert_statement, {})
-
-        return tables, annotated_column_details
-
-    def insert_rocrate_data(self, sql_statements, annotated_column_details):
-        """Insert the rocrate data into the database."""
-
-        try:
-            self.db.execute("begin")
-
-            # Drop and create all necessary tables first.
-
-            # TODO: do we want to handle cases where the database already exists
-            # separately? Perhaps a default don't overwrite data unless flagged?
-            for table_name in sql_statements:
-                self.db.execute(f"drop table if exists [{table_name}]")
-
-            for table_statement, _, _ in sql_statements.values():
-                self.db.execute(table_statement)
-
-            for entity in self.rocrate["@graph"]:
-
-                # Mandatory properties
-                entity_id = entity["@id"]
-                entity_types = entity["@type"]
-
-                if not isinstance(entity_types, list):
-                    entity_types = [entity_types]
-
-                # TODO: can probably handle multiply typed entities better? This is
-                # expanding to one table per type, but I'm pretty sure some of the
-                # doubly typed ones are always doubly typed and might be handlable
-                # better.
-                for entity_type in entity_types:
-
-                    # TODO: add the 'id' column!
-                    entity_main = {"id": entity_id}
-                    entity_extra_rows = defaultdict(list)
-
-                    for key, values in entity.items():
-                        if key in ("@id", "@type"):
-                            continue
-
-                        # A value can be a single value, or a list of values - if it's
-                        # not a list, convert to a list of a single value.
-                        if not isinstance(values, list):
-                            values = [values]
-
-                        # Strategy - decompose all of the values into the types, so we
-                        # can lookup where they need to be inserted in
-                        # annotated_column_details, then insert them with an
-                        # accumulated dictionary with the statements. Ultimately this
-                        # comes down to two choices: this is either a part of the main
-                        # table for the relevant types, in which case we accumulate a
-                        # single dict for this entity, or it's one of the :many tables,
-                        # in which case we can immediately generate a tuple of data for
-                        # inserting.
-                        for value in values:
-
-                            if isinstance(value, dict):
-                                referenced_id = value["@id"]
-                                for ref_table in self.id_type_map[referenced_id]:
-
-                                    cardinality, insert_table = (
-                                        annotated_column_details[entity_type][
-                                            (key, ref_table)
-                                        ]
-                                    )
-
-                                    if cardinality == "1:1":
-                                        gen_name = f"{key}_{ref_table}"
-                                        entity_main[gen_name] = referenced_id
-
-                                    elif cardinality == "1:many":
-                                        entity_extra_rows[insert_table].append(
-                                            {
-                                                "entity_type": entity_id,
-                                                "ref_table": referenced_id,
-                                            }
-                                        )
-
-                            else:
-                                cardinality, insert_table = annotated_column_details[
-                                    entity_type
-                                ][(key, "")]
-
-                                if cardinality == "1:1":
-                                    entity_main[key] = value
-
-                                elif cardinality == "1:many":
-                                    entity_extra_rows[insert_table].append(
-                                        {
-                                            "entity_type": entity_id,
-                                            "col_name": value,
-                                        }
-                                    )
-
-                    # Now actually insert the data, mapping the actual names to the
-                    # generated insert statements, and generating explicit nulls for
-                    # missing data.
-                    col_mapping = sql_statements[entity_type][2]
-                    entity_main = {
-                        value: entity_main.get(key, None)
-                        for key, value in col_mapping.items()
-                    }
-                    self.db.execute(sql_statements[entity_type][1], entity_main)
-
-                    for table, rows in entity_extra_rows.items():
-                        self.db.executemany(sql_statements[table][1], rows)
-
-            self.db.execute("commit")
-
-        except Exception:
-            self.db.execute("rollback")
-            raise
-
-
-# def insert_rocrate_sql(crate_json, column_specification, db):
-#     """
-#     Insert the rocrate values into the given database.
-
-#     Assumes that the db has already been setup.
-
-#     """
-
-
-# column_details = collect_schema_details(crate_json)
-
-# tables = generate_schema_statements(column_details)
-
-
-# Work plan: Collect additional column information in either collect_column_details or
-# generate_schema_statements to enable easier insert processing.
-
-if __name__ == "__main__":
-    import requests
-
-    crate_url = (
-        "https://data.ldaca.edu.au/api/object/meta"
-        "?resolve-parts&noUrid&id=arcp%3A%2F%2Fname%2Chdl10.26180~23961609"
-    )
-
-    crate = requests.get(crate_url)
-
-    crate_json = crate.json()
-
-    crater = ROCrateToSQLite(crate_json, "graph.db")
-
-    columns = crater.collect_column_details()
-    statements, annotated_columns = crater.generate_sql_statements_column_details(
-        columns
-    )
-
-    crater.insert_rocrate_data(statements, annotated_columns)
-@ptsefton
-Comment
-
-Leave a comment
-Footer
-© 2025 GitHub, Inc.
-Footer navigation
-
-    Terms
-    Privacy
-    Security
-    Status
-    Docs
-    Contact
-
-MUST have the following properties:
+The _Root Data Entity_ of a _Valid RO-Crate Dataset_ MUST have the following properties:
 
 *  `@type`: MUST be [Dataset] or an array that contains `Dataset`
 *  `@id`:  SHOULD be the string `./` or an absolute URI (see [below](#root-data-entity-identifier))