From 07a0387250eff61231d925cebeeb390c710f6c06 Mon Sep 17 00:00:00 2001 From: edemirci-aai <138793418+edemirci-aai@users.noreply.github.com> Date: Thu, 5 Sep 2024 14:00:23 +0300 Subject: [PATCH 1/5] fix(query): improve performance by restricting MATCH to 'Chunk' label Changed `MATCH (n)` to `MATCH (n:Chunk)` on line 1462 so neo4j query planner uses the existing index. --- VirtualHavruta/vh.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VirtualHavruta/vh.py b/VirtualHavruta/vh.py index afbb7a2..846ddfd 100644 --- a/VirtualHavruta/vh.py +++ b/VirtualHavruta/vh.py @@ -1459,7 +1459,7 @@ def get_chunks_corresponding_to_nodes(self, nodes: list[Document], batch_size: i self.logger.info(f"MsgID={msg_id}. [NODE2CHUNK] Using the following nodes to find corresponding chunks: {query_parameters}") query_string = """ UNWIND $params AS param - MATCH (n) + MATCH (n:Chunk) WHERE n.versionTitle = param.versionTitle AND n.url = param.url RETURN n """ From 208c6f73ea386fdf373f6b4a32ae7ede7004a2bb Mon Sep 17 00:00:00 2001 From: edemirci-aai <138793418+edemirci-aai@users.noreply.github.com> Date: Thu, 5 Sep 2024 14:41:42 +0300 Subject: [PATCH 2/5] fix(query): improve performance by restricting MATCH to 'Records' label for KG Changed `MATCH (n)` to `MATCH (n:Records)` on line 560 to make the KG use the indices created. 
--- VirtualHavruta/vh.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VirtualHavruta/vh.py b/VirtualHavruta/vh.py index 846ddfd..1a0d039 100644 --- a/VirtualHavruta/vh.py +++ b/VirtualHavruta/vh.py @@ -557,7 +557,7 @@ def query_graph_db_by_url(self, urls: list[str]) -> list[Document]: """ query_parameters = {"urls": urls} query_string=""" - MATCH (n) + MATCH (n:Records) WHERE any(substring IN $urls WHERE n.url CONTAINS substring) RETURN n """ From 4013ec0092402181342d26f4c9a1cb81873f30f2 Mon Sep 17 00:00:00 2001 From: Paul Yu-Chun Chang Date: Thu, 5 Sep 2024 11:46:56 +0000 Subject: [PATCH 3/5] fix: query kg node by url --- VirtualHavruta/vh.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/VirtualHavruta/vh.py b/VirtualHavruta/vh.py index 846ddfd..7ba2a3e 100644 --- a/VirtualHavruta/vh.py +++ b/VirtualHavruta/vh.py @@ -507,7 +507,7 @@ def get_graph_neighbors_by_url(self, url: str, relationship: str, depth: int, fi self.logger.info(f"MsgID={msg_id}. [GRAGH NEIGHBOR RETRIEVAL] Retrieved graph neighbors: {nodes}.") return nodes - def query_node_by_url(self, url: str,) -> str|None: + def query_kg_node_by_url(self, url: str) -> str|None: """Given a url, query the graph database for the node with that url. If more than one node has the same url, return only one. 
@@ -519,22 +519,21 @@ def query_node_by_url(self, url: str,) -> str|None: Returns ------- - unique id of the node - """ + node + """ query_parameters = {"url": url} query_string=""" - MATCH (n) - WHERE n.`metadata.url`=$url - RETURN n.id - LIMIT 1 + MATCH (n:Records) + WHERE n.url=$url + RETURN n """ with neo4j.GraphDatabase.driver(self.config["database"]["kg"]["url"], auth=(self.config["database"]["kg"]["username"], self.config["database"]["kg"]["password"])) as driver: - id, _, _ = driver.execute_query( + nodes, _, _ = driver.execute_query( query_string, parameters_=query_parameters, database_=self.config["database"]["kg"] ["name"],) - if id: - return id[0].data()["n.id"] + if nodes: + return nodes else: return None From 12977cb24e4c6a742eb292f772c2ceb91b056084 Mon Sep 17 00:00:00 2001 From: edemirci-aai <138793418+edemirci-aai@users.noreply.github.com> Date: Thu, 5 Sep 2024 14:53:58 +0300 Subject: [PATCH 4/5] fix(query): improve performance by restricting MATCH to 'Records' label Changed `MATCH (n)` to `MATCH (n:Records)` on line 1496 so KG uses indexes created. --- VirtualHavruta/vh.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VirtualHavruta/vh.py b/VirtualHavruta/vh.py index 1db0ccf..ccc08f6 100644 --- a/VirtualHavruta/vh.py +++ b/VirtualHavruta/vh.py @@ -1493,7 +1493,7 @@ def get_node_corresponding_to_chunk(self, chunk: Document, msg_id: str = '') -> query_parameters = {"url": chunk.metadata["url"], "versionTitle": chunk.metadata["versionTitle"]} self.logger.info(f"MsgID={msg_id}. 
[CHUNK2NODE] Using the following chunk to find a corresponding node: {query_parameters}") query_string=""" - MATCH (n) + MATCH (n:Records) WHERE n.url=$url AND n.versionTitle=$versionTitle RETURN n From 344b5460d4f39ceffcdbe9527453c189067b0686 Mon Sep 17 00:00:00 2001 From: Paul Yu-Chun Chang Date: Thu, 5 Sep 2024 12:12:40 +0000 Subject: [PATCH 5/5] fix: add node labels to neighbor queries --- VirtualHavruta/vh.py | 32 +------------------------------- 1 file changed, 1 insertion(+), 31 deletions(-) diff --git a/VirtualHavruta/vh.py b/VirtualHavruta/vh.py index ccc08f6..0fb1166 100644 --- a/VirtualHavruta/vh.py +++ b/VirtualHavruta/vh.py @@ -491,7 +491,7 @@ def get_graph_neighbors_by_url(self, url: str, relationship: str, depth: int, fi for i in range(1, depth + 1): source_filter = f'AND {"NOT" if filter_mode_nodes == "secondary" else ""} neighbor.primaryDocCategory IN $primaryDocCategories' if filter_mode_nodes else '' query = f""" - MATCH (start {{url: $url}}) + MATCH (start:Records {{url: $url}}) WITH start MATCH (start){start_node_operator}[:FROM_TO*{i}]{related_node_operator}(neighbor) WHERE neighbor <> start @@ -506,36 +506,6 @@ def get_graph_neighbors_by_url(self, url: str, relationship: str, depth: int, fi nodes.extend(neighbor_nodes) self.logger.info(f"MsgID={msg_id}. [GRAGH NEIGHBOR RETRIEVAL] Retrieved graph neighbors: {nodes}.") return nodes - - def query_kg_node_by_url(self, url: str) -> str|None: - """Given a url, query the graph database for the node with that url. - - If more than one node has the same url, return only one. 
- - Parameters - ---------- - url - of node - - Returns - ------- - node - """ - query_parameters = {"url": url} - query_string=""" - MATCH (n:Records) - WHERE n.url=$url - RETURN n - """ - with neo4j.GraphDatabase.driver(self.config["database"]["kg"]["url"], auth=(self.config["database"]["kg"]["username"], self.config["database"]["kg"]["password"])) as driver: - nodes, _, _ = driver.execute_query( - query_string, - parameters_=query_parameters, - database_=self.config["database"]["kg"] ["name"],) - if nodes: - return nodes - else: - return None def query_graph_db_by_url(self, urls: list[str]) -> list[Document]: """Given a list of urls, query the graph database for the nodes with those urls.