Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

refactor: remove BEM-specific code and generalize PDF processing #159

Merged
merged 14 commits into from
Dec 18, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 0 additions & 3 deletions app/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -254,9 +254,6 @@ ingest-guru-cards: check-ingest-arguments
# Runs the ingest-policy-pdfs command with DATASET_ID, BENEFIT_PROGRAM,
# BENEFIT_REGION and FILEPATH; check-ingest-arguments is a prerequisite.
ingest-policy-pdfs: check-ingest-arguments
$(PY_RUN_CMD) ingest-policy-pdfs "$(DATASET_ID)" "$(BENEFIT_PROGRAM)" "$(BENEFIT_REGION)" "$(FILEPATH)"

# Runs the ingest-bem-pdfs command with DATASET_ID, BENEFIT_PROGRAM,
# BENEFIT_REGION and FILEPATH; check-ingest-arguments is a prerequisite.
ingest-bem-pdfs: check-ingest-arguments
$(PY_RUN_CMD) ingest-bem-pdfs "$(DATASET_ID)" "$(BENEFIT_PROGRAM)" "$(BENEFIT_REGION)" "$(FILEPATH)"

# Runs the ingest-edd-web command with DATASET_ID, BENEFIT_PROGRAM,
# BENEFIT_REGION and FILEPATH, then appends any extra INGEST_ARGS;
# check-ingest-arguments is a prerequisite.
ingest-edd-web: check-ingest-arguments
$(PY_RUN_CMD) ingest-edd-web "$(DATASET_ID)" "$(BENEFIT_PROGRAM)" "$(BENEFIT_REGION)" "$(FILEPATH)" $(INGEST_ARGS)

Expand Down
1 change: 0 additions & 1 deletion app/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,6 @@ db-migrate-down = "src.db.migrations.run:down"
db-migrate-down-all = "src.db.migrations.run:downall"
ingest-guru-cards = "src.ingest_guru_cards:main"
ingest-policy-pdfs = "src.ingest_policy_pdfs:main"
fg-nava marked this conversation as resolved.
Show resolved Hide resolved
ingest-bem-pdfs = "src.ingest_bem_pdfs:main"
ingest-edd-web = "src.ingest_edd_web:main"
scrape-edd-web = "src.ingestion.scrape_edd_web:main"
ingest-imagine-la = "src.ingestion.imagine_la.ingest:main"
Expand Down
17 changes: 1 addition & 16 deletions app/src/chat_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
split_into_subsections,
)
from src.db.models.document import ChunkWithScore, Subsection
from src.format import BemFormattingConfig, FormattingConfig, format_guru_cards
from src.format import FormattingConfig, format_guru_cards
from src.generate import PROMPT, ChatHistory, MessageAttributes, analyze_message, generate
from src.retrieve import retrieve_with_scores
from src.util.class_utils import all_subclasses
Expand Down Expand Up @@ -170,21 +170,6 @@ class GuruSnapEngine(BaseEngine):
formatter = staticmethod(format_guru_cards)


class BridgesEligibilityManualEngine(BaseEngine):
    """Chat engine configured for the "bridges-eligibility-manual" dataset."""

    # Identity of this engine and the dataset it draws from.
    engine_id: str = "bridges-eligibility-manual"
    name: str = "Michigan Bridges Eligibility Manual Chat Engine"
    datasets = ["bridges-eligibility-manual"]

    # Retrieval settings.
    retrieval_k: int = 10
    retrieval_k_min_score: float = -1

    # Note: currently not used
    chunks_shown_max_num: int = 8
    chunks_shown_min_score: float = -1

    # Use the BEM-specific formatting configuration for this engine.
    formatting_config = BemFormattingConfig()


class CaEddWebEngine(BaseEngine):
retrieval_k: int = 50
retrieval_k_min_score: float = -1
Expand Down
128 changes: 1 addition & 127 deletions app/src/format.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@

from src.citations import CITATION_PATTERN, remap_citation_ids
from src.db.models.document import Chunk, ChunkWithScore, Document, Subsection
from src.util.bem_util import get_bem_url, replace_bem_with_link

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -40,32 +39,6 @@ def format_accordion_body(self, citation_body: str) -> str:
return to_html(citation_body)


class BemFormattingConfig(FormattingConfig):
    """BEM-specific formatting configuration.

    Citation links are emitted once per subsection and point at the URL
    returned by ``get_bem_url`` for the chunk's document, with a ``#page=N``
    fragment appended when the chunk has a page number.
    """

    def __init__(self) -> None:
        # NOTE(review): the base-class __init__ is not invoked here (same as
        # the original) -- confirm FormattingConfig needs no initialisation.
        self.add_citation_link_per_subsection = True

    def get_citation_link(self, subsection: Subsection) -> str:
        """Return an HTML link to the cited page, or "" if there is no page number."""
        chunk = subsection.chunk
        if not chunk.page_number:
            # Without a page number there is nothing to link to.
            return ""
        bem_url_for_page = f"{get_bem_url(chunk.document.name)}#page={chunk.page_number}"
        # `!r` wraps the URL in quotes so it forms a quoted href attribute.
        return f"<p><a href={bem_url_for_page!r}>Open document to page {chunk.page_number}</a></p>"

    def get_superscript_link(self, chunk: Chunk) -> str:
        """Return the link target for a citation superscript.

        Documents whose name does not contain "BEM" get "#" as the base link.
        A ``#page=N`` fragment is appended whenever the chunk has a page number.
        """
        link = get_bem_url(chunk.document.name) if "BEM" in chunk.document.name else "#"
        if chunk.page_number:
            link += f"#page={chunk.page_number}"
        return link

    def format_accordion_body(self, citation_body: str) -> str:
        """Replace BEM references with links, then convert the body to HTML."""
        return to_html(replace_bem_with_link(citation_body))


def format_guru_cards(
chunks_shown_max_num: int,
chunks_shown_min_score: float,
Expand All @@ -92,7 +65,7 @@ def format_guru_cards(
return response_with_citations + "<h3>Related Guru cards</h3>" + cards_html


def _get_bem_documents_to_show(
def _get_documents_to_show(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If this is not called by any remaining code, remove it.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Removed

chunks_shown_max_num: int,
chunks_shown_min_score: float,
chunks_with_scores: list[ChunkWithScore],
Expand Down Expand Up @@ -207,12 +180,10 @@ def _build_citation_body(
)
if config.add_citation_link_per_subsection:
citation_link = config.get_citation_link(subsection)
# generated citation links for BEM redirect to specific pages
citation_body += f"<div>{citation_link}</div>"

if not config.add_citation_link_per_subsection:
citation_link = config.get_document_link(document)
# display source link once
citation_body += f"<div>{citation_link}</div>"
return citation_body

Expand All @@ -234,23 +205,6 @@ def _get_breadcrumb_html(headings: Sequence[str] | None, document_name: str) ->
return f"<div><b>{' → '.join(headings)}</b></div>"


# TODO: This is not called. Remove it?
def format_bem_documents(
    chunks_shown_max_num: int,
    chunks_shown_min_score: float,
    chunks_with_scores: Sequence[ChunkWithScore],
    subsections: Sequence[Subsection],
    raw_response: str,
) -> str:
    """Return the response with reified citations followed by the BEM source accordions."""
    config = BemFormattingConfig()
    response_with_citations = reify_citations(raw_response, subsections, config)

    # The helper receives a concrete list, so copy the incoming sequence.
    scored_chunks = list(chunks_with_scores)
    documents_to_show = _get_bem_documents_to_show(
        chunks_shown_max_num, chunks_shown_min_score, scored_chunks
    )

    sources_html = _format_bem_to_accordion_group_html(documents_to_show)
    return response_with_citations + sources_html


def _format_guru_to_accordion_html(document: Document, score: float) -> str:
global _accordion_id
_accordion_id += 1
Expand All @@ -276,86 +230,6 @@ def _format_guru_to_accordion_html(document: Document, score: float) -> str:
</div>"""


def _format_bem_to_accordion_group_html(
    documents: OrderedDict[Document, list[ChunkWithScore]]
) -> str:
    """Render one accordion per document, listing its chunks as numbered citations.

    Citation numbers start at 1 and keep increasing across documents. Each
    accordion takes a fresh value of the module-level ``_accordion_id``
    counter for its element ids.

    Returns:
        A "Source(s)" heading followed by the accordions, or "" when
        ``documents`` is empty.
    """
    global _accordion_id
    accordions: list[str] = []
    citation_number = 1
    for document, chunks_with_scores in documents.items():
        _accordion_id += 1
        # The document URL is the same for every chunk, so look it up once.
        document_url = get_bem_url(document.name)
        citation_number_start = citation_number
        citation_parts: list[str] = []

        for chunk_with_score in chunks_with_scores:
            chunk = chunk_with_score.chunk

            formatted_chunk = _add_ellipses_for_bem(chunk)
            formatted_chunk = replace_bem_with_link(formatted_chunk)

            # Adjust markdown for lists so Chainlit renders correctly
            formatted_chunk = re.sub("^ - ", "- ", formatted_chunk, flags=re.MULTILINE)
            if formatted_chunk.startswith("- "):
                formatted_chunk = "\n" + formatted_chunk

            citation_heading = f"<h4>Citation {citation_number}:</h4>"
            chunk_headings = (
                "<p>" + " → ".join(chunk.headings) + "</p>" if chunk.headings else ""
            )
            citation_body = f'<div class="margin-left-2 border-left-1 border-base-lighter padding-left-2">{formatted_chunk}</div>'

            # A page link is only emitted when the chunk has a page number.
            if chunk.page_number:
                bem_url_for_page = document_url + "#page=" + str(chunk.page_number)
                citation_link = f"<p><a href={bem_url_for_page!r}>Open document to page {chunk.page_number}</a></p>"
            else:
                citation_link = ""

            citation_parts.append(
                citation_heading + chunk_headings + citation_body + citation_link
            )
            citation_number += 1

        citations = "".join(citation_parts)
        citation_number_end = citation_number - 1
        citation_range = (
            f"Citation {citation_number_start}"
            if citation_number_start == citation_number_end
            else f"Citations {citation_number_start} - {citation_number_end}"
        )

        accordions.append(
            f"""
<div class="usa-accordion" id=accordion-{_accordion_id}>
<h4 class="usa-accordion__heading">
<button
type="button"
class="usa-accordion__button"
aria-expanded="false"
aria-controls="a-{_accordion_id}"
>
<a href="{document_url}">{document.name}</a> ({citation_range})
</button>
</h4>
<div id="a-{_accordion_id}" class="usa-accordion__content usa-prose" hidden>
{citations}
</div>
</div>"""  # noqa: B907
        )

    html = "".join(accordions)
    return "\n<h3>Source(s)</h3>" + html if html else ""


def _add_ellipses_for_bem(chunk: Chunk) -> str:
chunk_content = chunk.content
if chunk.num_splits != 0:
if chunk.split_index == 0:
return f"{chunk_content} ..."
elif chunk.split_index == chunk.num_splits:
return f"... {chunk_content}"
else:
return f"... {chunk_content} ..."
return chunk_content


def reify_citations(
response: str, subsections: Sequence[Subsection], config: FormattingConfig
) -> str:
Expand Down
Loading
Loading