From 07a0387250eff61231d925cebeeb390c710f6c06 Mon Sep 17 00:00:00 2001
From: edemirci-aai <138793418+edemirci-aai@users.noreply.github.com>
Date: Thu, 5 Sep 2024 14:00:23 +0300
Subject: [PATCH 1/5] fix(query): improve performance by restricting MATCH to
'Chunk' label
Changed `MATCH (n)` to `MATCH (n:Chunk)` on line 1462 so that the Neo4j query planner can use the existing index.
---
VirtualHavruta/vh.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/VirtualHavruta/vh.py b/VirtualHavruta/vh.py
index afbb7a2..846ddfd 100644
--- a/VirtualHavruta/vh.py
+++ b/VirtualHavruta/vh.py
@@ -1459,7 +1459,7 @@ def get_chunks_corresponding_to_nodes(self, nodes: list[Document], batch_size: i
self.logger.info(f"MsgID={msg_id}. [NODE2CHUNK] Using the following nodes to find corresponding chunks: {query_parameters}")
query_string = """
UNWIND $params AS param
- MATCH (n)
+ MATCH (n:Chunk)
WHERE n.versionTitle = param.versionTitle AND n.url = param.url
RETURN n
"""
From 208c6f73ea386fdf373f6b4a32ae7ede7004a2bb Mon Sep 17 00:00:00 2001
From: edemirci-aai <138793418+edemirci-aai@users.noreply.github.com>
Date: Thu, 5 Sep 2024 14:41:42 +0300
Subject: [PATCH 2/5] fix(query): improve performance by restricting MATCH to
'Records' label for KG
Changed `MATCH (n)` to `MATCH (n:Records)` on line 560 so that the KG query can use the existing indexes.
---
VirtualHavruta/vh.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/VirtualHavruta/vh.py b/VirtualHavruta/vh.py
index 846ddfd..1a0d039 100644
--- a/VirtualHavruta/vh.py
+++ b/VirtualHavruta/vh.py
@@ -557,7 +557,7 @@ def query_graph_db_by_url(self, urls: list[str]) -> list[Document]:
"""
query_parameters = {"urls": urls}
query_string="""
- MATCH (n)
+ MATCH (n:Records)
WHERE any(substring IN $urls WHERE n.url CONTAINS substring)
RETURN n
"""
From 4013ec0092402181342d26f4c9a1cb81873f30f2 Mon Sep 17 00:00:00 2001
From: Paul Yu-Chun Chang
Date: Thu, 5 Sep 2024 11:46:56 +0000
Subject: [PATCH 3/5] fix: query kg node by url
---
VirtualHavruta/vh.py | 19 +++++++++----------
1 file changed, 9 insertions(+), 10 deletions(-)
diff --git a/VirtualHavruta/vh.py b/VirtualHavruta/vh.py
index 846ddfd..7ba2a3e 100644
--- a/VirtualHavruta/vh.py
+++ b/VirtualHavruta/vh.py
@@ -507,7 +507,7 @@ def get_graph_neighbors_by_url(self, url: str, relationship: str, depth: int, fi
self.logger.info(f"MsgID={msg_id}. [GRAGH NEIGHBOR RETRIEVAL] Retrieved graph neighbors: {nodes}.")
return nodes
- def query_node_by_url(self, url: str,) -> str|None:
+ def query_kg_node_by_url(self, url: str) -> str|None:
"""Given a url, query the graph database for the node with that url.
If more than one node has the same url, return only one.
@@ -519,22 +519,21 @@ def query_node_by_url(self, url: str,) -> str|None:
Returns
-------
- unique id of the node
- """
+ node
+ """
query_parameters = {"url": url}
query_string="""
- MATCH (n)
- WHERE n.`metadata.url`=$url
- RETURN n.id
- LIMIT 1
+ MATCH (n:Records)
+ WHERE n.url=$url
+ RETURN n
"""
with neo4j.GraphDatabase.driver(self.config["database"]["kg"]["url"], auth=(self.config["database"]["kg"]["username"], self.config["database"]["kg"]["password"])) as driver:
- id, _, _ = driver.execute_query(
+ nodes, _, _ = driver.execute_query(
query_string,
parameters_=query_parameters,
database_=self.config["database"]["kg"] ["name"],)
- if id:
- return id[0].data()["n.id"]
+ if nodes:
+ return nodes
else:
return None
From 12977cb24e4c6a742eb292f772c2ceb91b056084 Mon Sep 17 00:00:00 2001
From: edemirci-aai <138793418+edemirci-aai@users.noreply.github.com>
Date: Thu, 5 Sep 2024 14:53:58 +0300
Subject: [PATCH 4/5] fix(query): improve performance by restricting MATCH to
'Records' label
Changed `MATCH (n)` to `MATCH (n:Records)` on line 1496 so that the KG query can use the existing indexes.
---
VirtualHavruta/vh.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/VirtualHavruta/vh.py b/VirtualHavruta/vh.py
index 1db0ccf..ccc08f6 100644
--- a/VirtualHavruta/vh.py
+++ b/VirtualHavruta/vh.py
@@ -1493,7 +1493,7 @@ def get_node_corresponding_to_chunk(self, chunk: Document, msg_id: str = '') ->
query_parameters = {"url": chunk.metadata["url"], "versionTitle": chunk.metadata["versionTitle"]}
self.logger.info(f"MsgID={msg_id}. [CHUNK2NODE] Using the following chunk to find a corresponding node: {query_parameters}")
query_string="""
- MATCH (n)
+ MATCH (n:Records)
WHERE n.url=$url
AND n.versionTitle=$versionTitle
RETURN n
From 344b5460d4f39ceffcdbe9527453c189067b0686 Mon Sep 17 00:00:00 2001
From: Paul Yu-Chun Chang
Date: Thu, 5 Sep 2024 12:12:40 +0000
Subject: [PATCH 5/5] fix: add node labels to neighbor queries
---
VirtualHavruta/vh.py | 32 +-------------------------------
1 file changed, 1 insertion(+), 31 deletions(-)
diff --git a/VirtualHavruta/vh.py b/VirtualHavruta/vh.py
index ccc08f6..0fb1166 100644
--- a/VirtualHavruta/vh.py
+++ b/VirtualHavruta/vh.py
@@ -491,7 +491,7 @@ def get_graph_neighbors_by_url(self, url: str, relationship: str, depth: int, fi
for i in range(1, depth + 1):
source_filter = f'AND {"NOT" if filter_mode_nodes == "secondary" else ""} neighbor.primaryDocCategory IN $primaryDocCategories' if filter_mode_nodes else ''
query = f"""
- MATCH (start {{url: $url}})
+ MATCH (start:Records {{url: $url}})
WITH start
MATCH (start){start_node_operator}[:FROM_TO*{i}]{related_node_operator}(neighbor)
WHERE neighbor <> start
@@ -506,36 +506,6 @@ def get_graph_neighbors_by_url(self, url: str, relationship: str, depth: int, fi
nodes.extend(neighbor_nodes)
self.logger.info(f"MsgID={msg_id}. [GRAGH NEIGHBOR RETRIEVAL] Retrieved graph neighbors: {nodes}.")
return nodes
-
- def query_kg_node_by_url(self, url: str) -> str|None:
- """Given a url, query the graph database for the node with that url.
-
- If more than one node has the same url, return only one.
-
- Parameters
- ----------
- url
- of node
-
- Returns
- -------
- node
- """
- query_parameters = {"url": url}
- query_string="""
- MATCH (n:Records)
- WHERE n.url=$url
- RETURN n
- """
- with neo4j.GraphDatabase.driver(self.config["database"]["kg"]["url"], auth=(self.config["database"]["kg"]["username"], self.config["database"]["kg"]["password"])) as driver:
- nodes, _, _ = driver.execute_query(
- query_string,
- parameters_=query_parameters,
- database_=self.config["database"]["kg"] ["name"],)
- if nodes:
- return nodes
- else:
- return None
def query_graph_db_by_url(self, urls: list[str]) -> list[Document]:
"""Given a list of urls, query the graph database for the nodes with those urls.