Skip to content

Commit

Permalink
fix: Commit after ingesting Guru cards (#42)
Browse files (browse the repository at this point in the history)
  • Loading branch information
KevinJBoyer authored Aug 7, 2024
1 parent 60c75f3 commit 8531fc8
Show file tree
Hide file tree
Showing 5 changed files with 10 additions and 10 deletions.
3 changes: 2 additions & 1 deletion app/src/ingest_guru_cards.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -4,6 +4,7 @@

from smart_open import open

from src.adapters import db
from src.app_config import app_config
from src.db.models.document import Chunk, Document
from src.util.html import get_text_from_html
Expand All @@ -21,10 +22,10 @@


def _ingest_cards(
db_session: db.Session,
guru_cards_filepath: str,
doc_attribs: dict[str, str],
) -> None:
db_session = app_config.db_session()
with open(guru_cards_filepath, "r") as guru_cards_file:
cards_as_json = json.load(guru_cards_file)

Expand Down
3 changes: 1 addition & 2 deletions app/src/ingest_policy_pdfs.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -20,12 +20,12 @@


def _ingest_policy_pdfs(
db_session: db.Session,
pdf_file_dir: str,
doc_attribs: dict[str, str],
) -> None:
file_list = get_files(pdf_file_dir)
embedding_model = app_config.sentence_transformer
db_session = app_config.db_session()

logger.info(f"Processing pdfs {pdf_file_dir} using {embedding_model} with {doc_attribs}")
for file in file_list:
Expand All @@ -37,7 +37,6 @@ def _ingest_policy_pdfs(
parse_pdf_and_add_to_db(
contents=output_string, doc_attribs=doc_attribs, db_session=db_session
)
db_session.commit()


def parse_pdf_and_add_to_db(
Expand Down
2 changes: 1 addition & 1 deletion app/src/util/ingest_utils.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -29,7 +29,7 @@ def process_and_ingest_sys_args(sys: ModuleType, logger: Logger, ingestion_call:
}

with app_config.db_session() as db_session:
ingestion_call(pdf_file_dir, doc_attribs)
ingestion_call(db_session, pdf_file_dir, doc_attribs)
db_session.commit()

logger.info("Finished processing")
8 changes: 4 additions & 4 deletions app/tests/src/test_ingest_guru_cards.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -50,9 +50,9 @@ def test__ingest_cards(app_config, db_session, guru_local_file, guru_s3_file, fi
db_session.execute(delete(Document))

if file_location == "local":
_ingest_cards(guru_local_file, doc_attribs)
_ingest_cards(db_session, guru_local_file, doc_attribs)
else:
_ingest_cards(guru_s3_file, doc_attribs)
_ingest_cards(db_session, guru_s3_file, doc_attribs)

documents = db_session.execute(select(Document).order_by(Document.name)).scalars().all()
assert len(documents) == 3
Expand All @@ -79,7 +79,7 @@ def test__ingest_cards(app_config, db_session, guru_local_file, guru_s3_file, fi
assert documents[2].chunks[0].content == "This is a test content for card 2.\nWith extra HTML."


def test__ingest_cards_warns_on_max_seq_length(caplog, app_config, guru_local_file):
def test__ingest_cards_warns_on_max_seq_length(caplog, app_config, db_session, guru_local_file):
with caplog.at_level(logging.WARNING):
_ingest_cards(guru_local_file, doc_attribs)
_ingest_cards(db_session, guru_local_file, doc_attribs)
assert "exceeds the embedding model's max sequence length" in caplog.messages[0]
4 changes: 2 additions & 2 deletions app/tests/src/test_ingest_policy_pdfs.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -28,9 +28,9 @@ def test__ingest_policy_pdfs(caplog, app_config, db_session, policy_s3_file, fil

with caplog.at_level(logging.INFO):
if file_location == "local":
_ingest_policy_pdfs("/app/tests/docs/", doc_attribs)
_ingest_policy_pdfs(db_session, "/app/tests/docs/", doc_attribs)
else:
_ingest_policy_pdfs(policy_s3_file, doc_attribs)
_ingest_policy_pdfs(db_session, policy_s3_file, doc_attribs)

assert any(text.startswith("Processing pdf file:") for text in caplog.messages)
documents = db_session.execute(select(Document).order_by(Document.name)).scalars().all()
Expand Down

0 comments on commit 8531fc8

Please sign in to comment.