From bcc30d78e2857fc332510405f4a09005d128c023 Mon Sep 17 00:00:00 2001 From: Peter Sefton Date: Fri, 10 Jan 2025 06:17:46 +1100 Subject: [PATCH] Update root-data-entity.md --- .../1.2-DRAFT/root-data-entity.md | 397 +----------------- 1 file changed, 1 insertion(+), 396 deletions(-) diff --git a/docs/_specification/1.2-DRAFT/root-data-entity.md b/docs/_specification/1.2-DRAFT/root-data-entity.md index 547ce47d..63873702 100644 --- a/docs/_specification/1.2-DRAFT/root-data-entity.md +++ b/docs/_specification/1.2-DRAFT/root-data-entity.md @@ -127,402 +127,7 @@ be minimally valid. ## Direct properties of the Root Data Entity -The _Root Data Entity_ of _Valid RO-Crate Dataset_ -Skip to content -Search Gists -All gists -Back to GitHub -@SamHames -SamHames/rocrate_relational.py -Created January 9, 2025 13:31 • Report abuse - -Code -Revisions 1 -Clone this repository at <script src="https://gist.github.com/SamHames/b969b4c99300b9c5726bfd5e31c94aba.js"></script> -Turn an ROCrate into a relational SQLite schema -rocrate_relational.py -from collections import defaultdict, Counter -import json -import sqlite3 -import textwrap - - -class ROCrateToSQLite: - - def __init__(self, rocrate, db_path): - - self.rocrate = rocrate - self.db_path = db_path - - self._id_type_map = None - self.db = sqlite3.connect(self.db_path, isolation_level=None) - - @property - def id_type_map(self): - """ - Mapping from a given @id to types of entities. - Cached on generation as it's needed in a few different places. - """ - - if self._id_type_map is None: - id_type_map = defaultdict(set) - - for entity in self.rocrate["@graph"]: - - entity_id = entity["@id"] - entity_type = entity["@type"] - - if isinstance(entity_type, list): - for e_t in entity_type: - id_type_map[entity_id].add(e_t) - else: - id_type_map[entity_id].add(entity_type) - - self._id_type_map = id_type_map - - return self._id_type_map - - def collect_column_details(self): - """ - Infer the required columns and their details. - The details calculated are: - 1. The maximum cardinality (effectively whether the value has 0, 1, or many possible - values - this effects whether this is a 1:1 relationship or a 1:many relationship) - 2. Whether the values are literal values, references to other tables, or a mixture - of the two. - These leads to a choice table for how to represent these relationships: - cardinality: at most once | many - reference | nullable fk | bridge table (two sided fk) - literal | nullable | one:many table - mixed | (two columns?) | (two or more outrider tables?) - """ - - # The first number is the max cardinality, the second is the set of value kinds - # literal or reference to another entity observed. - column_details = defaultdict(Counter) - - for entity in self.rocrate["@graph"]: - - # Mandatory properties - entity_id = entity["@id"] - entity_types = entity["@type"] - - if not isinstance(entity_types, list): - entity_types = [entity_types] - - for key, values in entity.items(): - if key in ("@id", "@type"): - continue - - # A value can be a single value, or a list of values - if it's not a - # list, convert to a list of a single value. - if not isinstance(values, list): - values = [values] - - # Keep cardinality counts for each reference type - there might be - # arrays of two kinds of information that can be consistently mapped to - # a single column. - cardinality_counts = Counter() - - for value in values: - # TODO: confirm ROCrate doesn't allow nesting and that @id will - # always be present in a valid ROCrate. - if isinstance(value, dict): - referenced_id = value["@id"] - for referenced_type in self.id_type_map[referenced_id]: - cardinality_counts[referenced_type] += 1 - else: - # Empty reference -> literal type - cardinality_counts[""] += 1 - - # Max observed cardinality sets how we model this relationally - for entity_type in entity_types: - for kind, cardinality in cardinality_counts.items(): - column_details[entity_type][(key, kind)] = max( - column_details[entity_type][(key, kind)], cardinality - ) - - return column_details - - def generate_sql_statements_column_details(self, column_details): - """ - Generate the necessary SQL table create and insert statements. - The return represents a dictionary mapping table names to the create table - statements, and an insert statement that uses named bound parameters that - can be executed by passing the parameters as a dictionary. - This also returns an annotated version of column details, mapping the details - of the ROCrate data back to these generated column names to make inserting data - a lookup table. - """ - - tables = {} - - annotated_column_details = defaultdict(dict) - - for entity_type, columns in column_details.items(): - - cardinality_indexed = defaultdict(list) - - # Break columns into two groups: those containing at most one reference, the - # primary table, and those containing up to more than one, indicating - # 1:many or many:many relationships. - for col, max_cardinality in columns.items(): - # Bucket into two groups: at most one, or more than one - cardinality_indexed[min(max_cardinality, 2)].append(col) - - # Create the main table for this type (columns of cardinality 1) - main_table_columns = cardinality_indexed[1] - column_statements = ["id primary key"] - insert_keys = ["id"] - - for col_name, ref_table in main_table_columns: - if ref_table: - # Generate a name including the reference table - since a single - # entity with a named property can reference any other entity, we - # might have multiple columns with the same property names pointing - # at different types of things, so we need to split them out. - gen_name = f"{col_name}_{ref_table}" - column_statements.append(f'"{gen_name}" references "{ref_table}"') - insert_keys.append(gen_name) - annotated_column_details[entity_type][col_name, ref_table] = ( - "1:1", - entity_type, - ) - else: - column_statements.append(f'"{col_name}"') - insert_keys.append(col_name) - annotated_column_details[entity_type][col_name, ref_table] = ( - "1:1", - entity_type, - ) - - all_columns = ",\n ".join(column_statements) - create_table_statement = ( - f'create table "{entity_type}" (\n {all_columns}\n)' - ) - - # SQLite let's you escape identifiers like table names, but not placeholders - # for bound parameters, so we need to add an extra layer of indirection - # here. - insert_mapping = {key: str(i) for i, key in enumerate(insert_keys)} - value_bindings = ", ".join(f":{i}" for i, _ in enumerate(insert_keys)) - insert_statement = f'insert into "{entity_type}" values ({value_bindings})' - - tables[entity_type] = ( - create_table_statement, - insert_statement, - insert_mapping, - ) - - # Generate 1:many and many:many tables - for col_name, ref_table in cardinality_indexed[2]: - - table_name = f"{entity_type}_{col_name}" - - # many-many bridge table - if ref_table: - create_table_statement = textwrap.dedent( - f""" - create table "{table_name}" ( - "{entity_type}" references "{entity_type}", - "{ref_table}" references "{ref_table}", - primary key ("{entity_type}", "{ref_table}") - ) - """ - ) - insert_statement = ( - f'insert into "{table_name}" values (:entity_type, :ref_table)' - ) - - else: - create_table_statement = textwrap.dedent( - f""" - create table "{table_name}" ( - "{entity_type}" references "{entity_type}", - "{col_name}", - primary key ("{entity_type}", "{col_name}") - ) - """ - ) - insert_statement = ( - f'insert into "{table_name}" values (:entity_type, :col_name)' - ) - - annotated_column_details[entity_type][col_name, ref_table] = ( - "1:many", - table_name, - ) - - tables[table_name] = (create_table_statement, insert_statement, {}) - - return tables, annotated_column_details - - def insert_rocrate_data(self, sql_statements, annotated_column_details): - """Insert the rocrate data into the database.""" - - try: - self.db.execute("begin") - - # Drop and create all necessary tables first. - - # TODO: do we want to handle cases where the database already exists - # separately? Perhaps a default don't overwrite data unless flagged? - for table_name in sql_statements: - self.db.execute(f"drop table if exists [{table_name}]") - - for table_statement, _, _ in sql_statements.values(): - self.db.execute(table_statement) - - for entity in self.rocrate["@graph"]: - - # Mandatory properties - entity_id = entity["@id"] - entity_types = entity["@type"] - - if not isinstance(entity_types, list): - entity_types = [entity_types] - - # TODO: can probably handle multiply typed entities better? This is - # expanding to one table per type, but I'm pretty sure some of the - # doubly typed ones are always doubly typed and might be handlable - # better. - for entity_type in entity_types: - - # TODO: add the 'id' column! - entity_main = {"id": entity_id} - entity_extra_rows = defaultdict(list) - - for key, values in entity.items(): - if key in ("@id", "@type"): - continue - - # A value can be a single value, or a list of values - if it's - # not a list, convert to a list of a single value. - if not isinstance(values, list): - values = [values] - - # Strategy - decompose all of the values into the types, so we - # can lookup where they need to be inserted in - # annotated_column_details, then insert them with an - # accumulated dictionary with the statements. Ultimately this - # comes down to two choices: this is either a part of the main - # table for the relevant types, in which case we accumulate a - # single dict for this entity, or it's one of the :many tables, - # in which case we can immediately generate a tuple of data for - # inserting. - for value in values: - - if isinstance(value, dict): - referenced_id = value["@id"] - for ref_table in self.id_type_map[referenced_id]: - - cardinality, insert_table = ( - annotated_column_details[entity_type][ - (key, ref_table) - ] - ) - - if cardinality == "1:1": - gen_name = f"{key}_{ref_table}" - entity_main[gen_name] = referenced_id - - elif cardinality == "1:many": - entity_extra_rows[insert_table].append( - { - "entity_type": entity_id, - "ref_table": referenced_id, - } - ) - - else: - cardinality, insert_table = annotated_column_details[ - entity_type - ][(key, "")] - - if cardinality == "1:1": - entity_main[key] = value - - elif cardinality == "1:many": - entity_extra_rows[insert_table].append( - { - "entity_type": entity_id, - "col_name": value, - } - ) - - # Now actually insert the data, mapping the actual names to the - # generated insert statements, and generating explicit nulls for - # missing data. - col_mapping = sql_statements[entity_type][2] - entity_main = { - value: entity_main.get(key, None) - for key, value in col_mapping.items() - } - self.db.execute(sql_statements[entity_type][1], entity_main) - - for table, rows in entity_extra_rows.items(): - self.db.executemany(sql_statements[table][1], rows) - - self.db.execute("commit") - - except Exception: - self.db.execute("rollback") - raise - - -# def insert_rocrate_sql(crate_json, column_specification, db): -# """ -# Insert the rocrate values into the given database. - -# Assumes that the db has already been setup. - -# """ - - -# column_details = collect_schema_details(crate_json) - -# tables = generate_schema_statements(column_details) - - -# Work plan: Collect additional column information in either collect_column_details or -# generate_schema_statements to enable easier insert processing. - -if __name__ == "__main__": - import requests - - crate_url = ( - "https://data.ldaca.edu.au/api/object/meta" - "?resolve-parts&noUrid&id=arcp%3A%2F%2Fname%2Chdl10.26180~23961609" - ) - - crate = requests.get(crate_url) - - crate_json = crate.json() - - crater = ROCrateToSQLite(crate_json, "graph.db") - - columns = crater.collect_column_details() - statements, annotated_columns = crater.generate_sql_statements_column_details( - columns - ) - - crater.insert_rocrate_data(statements, annotated_columns) -@ptsefton -Comment - -Leave a comment -Footer -© 2025 GitHub, Inc. -Footer navigation - - Terms - Privacy - Security - Status - Docs - Contact - -MUST have the following properties: +The _Root Data Entity_ of a _Valid RO-Crate Dataset_ MUST have the following properties: * `@type`: MUST be [Dataset] or an array that contains `Dataset` * `@id`: SHOULD be the string `./` or an absolute URI (see [below](#root-data-entity-identifier))