Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Throw an error in writer if property isn't defined #154

Open
wants to merge 12 commits into
base: main
Choose a base branch
from
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,17 @@ node_properties:
- 'id'
- 'category'
- 'provided_by'
- 'iri'
- 'name'
- 'synonym'
- 'has_attribute'
- 'deprecated'
- 'full_name'
- 'in_taxon'
- 'xref'
- 'in_taxon_label'
- 'description'
- 'type'

edge_properties:
- 'id'
Expand All @@ -46,4 +57,35 @@ edge_properties:
- 'object'
- 'category'
- 'relation'
- 'provided_by'
- 'provided_by'
- 'object_closure'
- 'negated'
- 'qualifier'
- 'name'
- 'deprecated'
- 'original_subject'
- 'has_evidence'
- 'description'
- 'subject_label_closure'
- 'aggregator_knowledge_source'
- 'has_attribute'
- 'type'
- 'timepoint'
- 'subject_category_closure'
- 'object_category'
- 'primary_knowledge_source'
- 'original_object'
- 'knowledge_source'
- 'iri'
- 'subject_namespace'
- 'subject_closure'
- 'object_namespace'
- 'object_category_closure'
- 'object_label_closure'
- 'agent_type'
- 'knowledge_level'
- 'publications'
- 'retrieval_source_ids'
- 'original_predicate'
- 'subject_category'
- 'qualifiers'
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,19 @@ node_properties:
- 'id'
- 'category'
- 'provided_by'
- 'deprecated'
- 'full_name'
- 'in_taxon_label'
- 'has_attribute'
- 'type'
- 'symbol'
- 'in_taxon'
- 'has_biological_sequence'
- 'xref'
- 'name'
- 'iri'
- 'synonym'
- 'description'

edge_properties:
- 'id'
Expand All @@ -43,4 +56,35 @@ edge_properties:
- 'object'
- 'category'
- 'relation'
- 'provided_by'
- 'provided_by'
- 'knowledge_level'
- 'type'
- 'has_attribute'
- 'original_subject'
- 'subject_category'
- 'object_closure'
- 'description'
- 'object_category_closure'
- 'subject_closure'
- 'original_predicate'
- 'has_evidence'
- 'object_category'
- 'subject_label_closure'
- 'iri'
- 'aggregator_knowledge_source'
- 'original_object'
- 'name'
- 'primary_knowledge_source'
- 'subject_namespace'
- 'subject_category_closure'
- 'deprecated'
- 'timepoint'
- 'qualifiers'
- 'agent_type'
- 'object_namespace'
- 'retrieval_source_ids'
- 'object_label_closure'
- 'publications'
- 'qualifier'
- 'knowledge_source'
- 'negated'
46 changes: 45 additions & 1 deletion examples/string-w-map/map-protein-links-detailed.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,19 @@ node_properties:
- 'id'
- 'category'
- 'provided_by'
- 'deprecated'
- 'has_attribute'
- 'iri'
- 'in_taxon'
- 'xref'
- 'symbol'
- 'description'
- 'type'
- 'name'
- 'synonym'
- 'full_name'
- 'in_taxon_label'
- 'has_biological_sequence'

edge_properties:
- 'id'
Expand All @@ -43,4 +56,35 @@ edge_properties:
- 'object'
- 'category'
- 'relation'
- 'provided_by'
- 'provided_by'
- 'subject_closure'
- 'object_closure'
- 'name'
- 'subject_namespace'
- 'aggregator_knowledge_source'
- 'object_category'
- 'type'
- 'original_predicate'
- 'subject_label_closure'
- 'retrieval_source_ids'
- 'agent_type'
- 'primary_knowledge_source'
- 'iri'
- 'knowledge_source'
- 'qualifiers'
- 'timepoint'
- 'object_namespace'
- 'negated'
- 'object_category_closure'
- 'deprecated'
- 'original_object'
- 'original_subject'
- 'subject_category'
- 'has_attribute'
- 'publications'
- 'subject_category_closure'
- 'qualifier'
- 'object_label_closure'
- 'description'
- 'knowledge_level'
- 'has_evidence'
42 changes: 42 additions & 0 deletions examples/string/protein-links-detailed.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,17 @@ node_properties:
- 'id'
- 'category'
- 'provided_by'
- 'iri'
- 'name'
- 'synonym'
- 'has_attribute'
- 'deprecated'
- 'full_name'
- 'in_taxon'
- 'xref'
- 'in_taxon_label'
- 'description'
- 'type'

edge_properties:
- 'id'
Expand All @@ -33,3 +44,34 @@ edge_properties:
- 'category'
- 'relation'
- 'provided_by'
- 'object_closure'
- 'negated'
- 'qualifier'
- 'name'
- 'deprecated'
- 'original_subject'
- 'has_evidence'
- 'description'
- 'subject_label_closure'
- 'aggregator_knowledge_source'
- 'has_attribute'
- 'type'
- 'timepoint'
- 'subject_category_closure'
- 'object_category'
- 'primary_knowledge_source'
- 'original_object'
- 'knowledge_source'
- 'iri'
- 'subject_namespace'
- 'subject_closure'
- 'object_namespace'
- 'object_category_closure'
- 'object_label_closure'
- 'agent_type'
- 'knowledge_level'
- 'publications'
- 'retrieval_source_ids'
- 'original_predicate'
- 'subject_category'
- 'qualifiers'
30 changes: 9 additions & 21 deletions src/koza/io/writer/jsonl_writer.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
import json
import os
from typing import Iterable, List, Optional
from typing import List, Optional

from koza.converter.kgx_converter import KGXConverter
from koza.io.writer.writer import KozaWriter
from koza.model.config.sssom_config import SSSOMConfig

Expand All @@ -13,35 +12,24 @@ def __init__(
output_dir: str,
source_name: str,
node_properties: List[str],
edge_properties: Optional[List[str]] = [],
edge_properties: Optional[List[str]] = None,
sssom_config: SSSOMConfig = None,
):
self.output_dir = output_dir
self.source_name = source_name
self.sssom_config = sssom_config

self.converter = KGXConverter()
super().__init__(output_dir, source_name, node_properties, edge_properties, sssom_config)

os.makedirs(output_dir, exist_ok=True)
if node_properties:
self.nodeFH = open(f"{output_dir}/{source_name}_nodes.jsonl", "w")
if edge_properties:
self.edgeFH = open(f"{output_dir}/{source_name}_edges.jsonl", "w")

def write(self, entities: Iterable):
(nodes, edges) = self.converter.convert(entities)

if nodes:
for n in nodes:
node = json.dumps(n, ensure_ascii=False)
self.nodeFH.write(node + '\n')
def write_edge(self, edge: dict):
    """Append one edge record to the edges .jsonl file as a single JSON line.

    Non-ASCII characters are written verbatim (ensure_ascii=False), matching
    the node path so both output files share one encoding convention.
    """
    self.edgeFH.write(json.dumps(edge, ensure_ascii=False) + '\n')

if edges:
for e in edges:
if self.sssom_config:
e = self.sssom_config.apply_mapping(e)
edge = json.dumps(e, ensure_ascii=False)
self.edgeFH.write(edge + '\n')
def write_node(self, node: dict):
    """Append one node record to the nodes .jsonl file as a single JSON line.

    Mirrors write_edge: serialize with ensure_ascii=False and terminate with
    a newline so each record occupies exactly one line.
    """
    self.nodeFH.write(json.dumps(node, ensure_ascii=False) + '\n')

def finalize(self):
if hasattr(self, 'nodeFH'):
Expand Down
58 changes: 34 additions & 24 deletions src/koza/io/writer/tsv_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,12 @@
# NOTE - May want to rename to KGXWriter at some point, if we develop writers for other models non biolink/kgx specific

from pathlib import Path
from typing import Dict, Iterable, List, Literal, Set, Union
from typing import Dict, List, Literal, Set, Tuple, Union

from numpy.f2py.auxfuncs import throw_error
from ordered_set import OrderedSet

from koza.converter.kgx_converter import KGXConverter
# from koza.converter.kgx_converter import KGXConverter
from koza.io.utils import build_export_row
from koza.io.writer.writer import KozaWriter
from koza.model.config.sssom_config import SSSOMConfig
Expand All @@ -21,43 +22,31 @@ def __init__(
edge_properties: List[str] = None,
sssom_config: SSSOMConfig = None,
):
self.basename = source_name
self.dirname = output_dir
super().__init__(output_dir, source_name, node_properties, edge_properties, sssom_config)
self.delimiter = "\t"
self.list_delimiter = "|"
self.converter = KGXConverter()
self.sssom_config = sssom_config

Path(self.dirname).mkdir(parents=True, exist_ok=True)
Path(self.output_dir).mkdir(parents=True, exist_ok=True)

if node_properties: # Make node file
self.node_columns = TSVWriter._order_columns(node_properties, "node")
self.nodes_file_name = Path(self.dirname if self.dirname else "", f"{self.basename}_nodes.tsv")
self.node_columns = TSVWriter._order_columns(set(node_properties), "node")
self.nodes_file_name = Path(self.output_dir if self.output_dir else "", f"{self.source_name}_nodes.tsv")
self.nodeFH = open(self.nodes_file_name, "w")
self.nodeFH.write(self.delimiter.join(self.node_columns) + "\n")

if edge_properties: # Make edge file
if sssom_config:
edge_properties = self.add_sssom_columns(edge_properties)
self.edge_columns = TSVWriter._order_columns(edge_properties, "edge")
self.edges_file_name = Path(self.dirname if self.dirname else "", f"{self.basename}_edges.tsv")
self.edge_columns = TSVWriter._order_columns(set(edge_properties), "edge")
self.edges_file_name = Path(self.output_dir if self.output_dir else "", f"{self.source_name}_edges.tsv")
self.edgeFH = open(self.edges_file_name, "w")
self.edgeFH.write(self.delimiter.join(self.edge_columns) + "\n")

def write(self, entities: Iterable) -> None:
"""Write an entities object to separate node and edge .tsv files"""
def write_edge(self, edge: dict):
    """Route a single edge record through the shared TSV row writer."""
    self.write_row(record=edge, record_type="edge")

nodes, edges = self.converter.convert(entities)

if nodes:
for node in nodes:
self.write_row(node, record_type="node")

if edges:
for edge in edges:
if self.sssom_config:
edge = self.sssom_config.apply_mapping(edge)
self.write_row(edge, record_type="edge")
def write_node(self, node: dict):
    """Route a single node record through the shared TSV row writer."""
    self.write_row(record=node, record_type="node")

def write_row(self, record: Dict, record_type: Literal["node", "edge"]) -> None:
"""Write a row to the underlying store.
Expand All @@ -69,6 +58,13 @@ def write_row(self, record: Dict, record_type: Literal["node", "edge"]) -> None:
fh = self.nodeFH if record_type == "node" else self.edgeFH
columns = self.node_columns if record_type == "node" else self.edge_columns
row = build_export_row(record, list_delimiter=self.list_delimiter)

# Throw error if the record has extra columns
columns_tuple = tuple(columns)
row_keys_tuple = tuple(row.keys())
if self.has_extra_columns(row_keys_tuple, columns_tuple):
throw_error(f"Record has extra columns: {set(row.keys()) - set(columns)} not defined in {record_type}")

values = []
if record_type == "node":
row["id"] = record["id"]
Expand All @@ -87,6 +83,19 @@ def finalize(self):
if hasattr(self, "edgeFH"):
self.edgeFH.close()

@staticmethod
def has_extra_columns(row_keys: Tuple[str, ...], columns_tuple: Tuple[str, ...]) -> bool:
"""Check if a row has extra columns.

Args:
row_keys: Tuple[str, ...] - A tuple of row keys
columns_tuple: Tuple[str, ...] - A tuple of columns

Returns:
bool - True if row has extra columns, False otherwise
"""
return not set(row_keys).issubset(set(columns_tuple))

@staticmethod
def _order_columns(cols: Set, record_type: Literal["node", "edge"]) -> OrderedSet:
"""Arrange node or edge columns in a defined order.
Expand All @@ -97,6 +106,7 @@ def _order_columns(cols: Set, record_type: Literal["node", "edge"]) -> OrderedSe
Returns:
OrderedSet - A set with elements in a defined order
"""
core_columns = set()
if record_type == "node":
core_columns = OrderedSet(["id", "category", "name", "description", "xref", "provided_by", "synonym"])
elif record_type == "edge":
Expand Down
Loading