bug: citation duplicates and accordion overflow (#51)

Co-authored-by: Kevin Boyer <[email protected]>
navapbc · Aug 13, 2024 · 1e283f6
1 parent 43fc8da
commit 1e283f6
Show file tree

Hide file tree

Showing 4 changed files with 11 additions and 9 deletions.
diff --git a/app/src/chainlit.py b/app/src/chainlit.py
@@ -169,7 +169,7 @@ def _get_retrieval_metadata(chunks_with_scores: Sequence[ChunkWithScore]) -> dic
         "chunks": [
             {
                 "document.name": chunk_with_score.chunk.document.name,
-                "chunk.id": chunk_with_score.chunk.id,
+                "chunk.id": str(chunk_with_score.chunk.id),
                 "score": chunk_with_score.score,
             }
             for chunk_with_score in chunks_with_scores

diff --git a/app/src/format.py b/app/src/format.py
@@ -99,11 +99,13 @@ def _format_to_accordion_html(document: Document, score: float) -> str:
 def _format_to_accordion_group_html(documents: OrderedDict[Document, list[ChunkWithScore]]) -> str:
     global _accordion_id
     html = ""
-    internal_citation = ""
     for document in documents:
+        internal_citation = ""
         _accordion_id += 1
         for index, chunk in enumerate(documents[document], start=1):
-            formatted_chunk = re.sub(r"\n+", "\n", chunk.chunk.content).strip()
+            chunk_lines = chunk.chunk.content.splitlines()
+            formatted_chunk = " ".join(chunk_lines)
+            formatted_chunk = re.sub(r"\t+", "", formatted_chunk).strip()
             formatted_chunk = f"<p>{formatted_chunk} </p>" if formatted_chunk else ""
             citation = f"<h4>Citation #{index} (score: {str(chunk.score)})</h4>"
             similarity_score = f"<p>Similarity Score: {str(chunk.score)}</p>"

diff --git a/app/tests/src/test_chainlit.py b/app/tests/src/test_chainlit.py
@@ -24,17 +24,17 @@ def test__get_retrieval_metadata(chunks_with_scores):
         "chunks": [
             {
                 "document.name": chunks_with_scores[0].chunk.document.name,
-                "chunk.id": chunks_with_scores[0].chunk.id,
+                "chunk.id": str(chunks_with_scores[0].chunk.id),
                 "score": chunks_with_scores[0].score,
             },
             {
                 "document.name": chunks_with_scores[1].chunk.document.name,
-                "chunk.id": chunks_with_scores[1].chunk.id,
+                "chunk.id": str(chunks_with_scores[1].chunk.id),
                 "score": chunks_with_scores[1].score,
             },
             {
                 "document.name": chunks_with_scores[2].chunk.document.name,
-                "chunk.id": chunks_with_scores[2].chunk.id,
+                "chunk.id": str(chunks_with_scores[2].chunk.id),
                 "score": chunks_with_scores[2].score,
             },
         ]

diff --git a/app/tests/src/test_format.py b/app/tests/src/test_format.py
@@ -93,9 +93,9 @@ def test_format_bem_documents():
         chunks_shown_max_num=2, chunks_shown_min_score=0.91, chunks_with_scores=chunks_with_scores
     )
 
-    assert docs[0].content not in html
-    assert docs[1].content not in html
-    assert docs[3].content in html
+    assert docs[0].content.replace("\n", " ") not in html
+    assert docs[1].content.replace("\n", " ") not in html
+    assert docs[3].content.replace("\n", " ") in html
     assert "Citation #2" in html
     assert "Citation #3" not in html
     assert "<p>Similarity Score: 0.95</p>" in html