Skip to content

Commit

Permalink
Merge pull request #27 from Sefaria/edemirci-aai-patch-2
Browse files Browse the repository at this point in the history
fix(query): improve performance by restricting MATCH to 'Chunk' or 'Records' labels
  • Loading branch information
Paul-Yu-Chun-Chang authored Sep 5, 2024
2 parents b3ecc21 + 344b546 commit c8448d7
Showing 1 changed file with 4 additions and 35 deletions.
39 changes: 4 additions & 35 deletions VirtualHavruta/vh.py
Original file line number Diff line number Diff line change
Expand Up @@ -491,7 +491,7 @@ def get_graph_neighbors_by_url(self, url: str, relationship: str, depth: int, fi
for i in range(1, depth + 1):
source_filter = f'AND {"NOT" if filter_mode_nodes == "secondary" else ""} neighbor.primaryDocCategory IN $primaryDocCategories' if filter_mode_nodes else ''
query = f"""
MATCH (start {{url: $url}})
MATCH (start:Records {{url: $url}})
WITH start
MATCH (start){start_node_operator}[:FROM_TO*{i}]{related_node_operator}(neighbor)
WHERE neighbor <> start
Expand All @@ -506,37 +506,6 @@ def get_graph_neighbors_by_url(self, url: str, relationship: str, depth: int, fi
nodes.extend(neighbor_nodes)
self.logger.info(f"MsgID={msg_id}. [GRAGH NEIGHBOR RETRIEVAL] Retrieved graph neighbors: {nodes}.")
return nodes

def query_node_by_url(self, url: str,) -> str|None:
    """Return the id of one graph node whose ``metadata.url`` equals *url*.

    The Cypher query is capped with ``LIMIT 1``, so if more than one node
    has the same url, only one of them is returned.

    Parameters
    ----------
    url
        url of the node to look up

    Returns
    -------
    the ``id`` property of the matching node, or ``None`` when no node
    has that url
    """
    # Read the knowledge-graph connection settings once instead of
    # re-walking the nested config for every field.
    kg_config = self.config["database"]["kg"]
    query_string = """
    MATCH (n)
    WHERE n.`metadata.url`=$url
    RETURN n.id
    LIMIT 1
    """
    with neo4j.GraphDatabase.driver(
        kg_config["url"],
        auth=(kg_config["username"], kg_config["password"]),
    ) as driver:
        # execute_query yields a 3-tuple; only the first element (the
        # records) is needed. Named `records` to avoid shadowing the
        # builtin `id`.
        records, _, _ = driver.execute_query(
            query_string,
            parameters_={"url": url},
            database_=kg_config["name"],
        )
    if not records:
        return None
    return records[0].data()["n.id"]

def query_graph_db_by_url(self, urls: list[str]) -> list[Document]:
"""Given a list of urls, query the graph database for the nodes with those urls.
Expand All @@ -557,7 +526,7 @@ def query_graph_db_by_url(self, urls: list[str]) -> list[Document]:
"""
query_parameters = {"urls": urls}
query_string="""
MATCH (n)
MATCH (n:Records)
WHERE any(substring IN $urls WHERE n.url CONTAINS substring)
RETURN n
"""
Expand Down Expand Up @@ -1459,7 +1428,7 @@ def get_chunks_corresponding_to_nodes(self, nodes: list[Document], batch_size: i
self.logger.info(f"MsgID={msg_id}. [NODE2CHUNK] Using the following nodes to find corresponding chunks: {query_parameters}")
query_string = """
UNWIND $params AS param
MATCH (n)
MATCH (n:Chunk)
WHERE n.versionTitle = param.versionTitle AND n.url = param.url
RETURN n
"""
Expand Down Expand Up @@ -1494,7 +1463,7 @@ def get_node_corresponding_to_chunk(self, chunk: Document, msg_id: str = '') ->
query_parameters = {"url": chunk.metadata["url"], "versionTitle": chunk.metadata["versionTitle"]}
self.logger.info(f"MsgID={msg_id}. [CHUNK2NODE] Using the following chunk to find a corresponding node: {query_parameters}")
query_string="""
MATCH (n)
MATCH (n:Records)
WHERE n.url=$url
AND n.versionTitle=$versionTitle
RETURN n
Expand Down

0 comments on commit c8448d7

Please sign in to comment.