refactor: Add general ingest_runner (#189)

navapbc · Jan 22, 2025 · 53a1ffb · 53a1ffb
1 parent ae29de6
commit 53a1ffb
Show file tree

Hide file tree

Showing 15 changed files with 314 additions and 314 deletions.
diff --git a/app/Makefile b/app/Makefile
@@ -255,15 +255,9 @@ endif
 scrape-ca-public-charge:
 	$(PY_RUN_CMD) scrapy-runner ca_public_charge
 
-ingest-ca-public-charge: check-ingest-arguments
-	$(PY_RUN_CMD) ingest-ca-public-charge "$(DATASET_ID)" "$(BENEFIT_PROGRAM)" "$(BENEFIT_REGION)" "$(FILEPATH)" $(INGEST_ARGS)
-
 scrape-edd-web:
 	$(PY_RUN_CMD) scrapy-runner edd
 
-ingest-edd-web: check-ingest-arguments
-	$(PY_RUN_CMD) ingest-edd-web "$(DATASET_ID)" "$(BENEFIT_PROGRAM)" "$(BENEFIT_REGION)" "$(FILEPATH)" $(INGEST_ARGS)
-
 
 scrape-imagine-la:
 	cd src/ingestion/imagine_la/scrape; uv run --no-project scrape_content_hub.py https://socialbenefitsnavigator25.web.app/contenthub $(CONTENTHUB_PASSWORD)
@@ -279,16 +273,17 @@ scrape-la-county-policy:
 	# Now that we have the expanded nav bar, scrape all the links in the nav bar
 	$(PY_RUN_CMD) scrapy-runner la_policy 2>&1 | tee out.log
 
-ingest-la-county-policy: check-ingest-arguments
-	$(PY_RUN_CMD) ingest-la-policy "$(DATASET_ID)" "$(BENEFIT_PROGRAM)" "$(BENEFIT_REGION)" "$(FILEPATH)" $(INGEST_ARGS)
-
-
 scrape-irs-web:
 	$(PY_RUN_CMD) scrapy-runner irs
 
-ingest-irs-web: check-ingest-arguments
-	$(PY_RUN_CMD) ingest-irs-web "$(DATASET_ID)" "$(BENEFIT_PROGRAM)" "$(BENEFIT_REGION)" "$(FILEPATH)" $(INGEST_ARGS)
-
-
 scrape-ca-ftb:
 	$(PY_RUN_CMD) scrapy-runner ca_ftb
+
+# Run the ingest-runner command for DATASET_ID on the file at FILEPATH.
+# DATASET_ID and FILEPATH are required; anything in INGEST_ARGS is appended
+# unquoted after them, so it can carry several extra flags.
+# The ifndef tests are evaluated when the Makefile is parsed, but the
+# $(error ...) lines are tab-indented recipe lines, so make only aborts
+# when this target is actually run with a variable undefined (or empty).
+ingest-runner:
+ifndef DATASET_ID
+	$(error DATASET_ID is undefined)
+endif
+ifndef FILEPATH
+	$(error FILEPATH is undefined)
+endif
+	$(PY_RUN_CMD) ingest-runner "$(DATASET_ID)" "$(FILEPATH)" $(INGEST_ARGS)
diff --git a/app/pyproject.toml b/app/pyproject.toml
@@ -68,15 +68,12 @@ build-backend = "poetry.core.masonry.api"
 db-migrate = "src.db.migrations.run:up"
 db-migrate-down = "src.db.migrations.run:down"
 db-migrate-down-all = "src.db.migrations.run:downall"
-ingest-ca-public-charge = "src.ingest_ca_public_charge:main"
-ingest-edd-web = "src.ingest_edd_web:main"
 scrape-edd-web = "src.ingestion.scrape_edd_web:main"
 ingest-imagine-la = "src.ingestion.imagine_la.ingest:main"
 scrape-la-policy = "src.ingestion.scrape_la_policy:main"
-ingest-la-policy = "src.ingest_la_county_policy:main"
-ingest-irs-web = "src.ingest_irs_web:main"
 
 scrapy-runner = "src.ingestion.scrapy_runner:main"
+ingest-runner = "src.ingest_runner:main"
 
 [tool.black]
 line-length = 100

diff --git a/app/src/ingest_ca_public_charge.py b/app/src/ingest_ca_public_charge.py
diff --git a/app/src/ingest_irs_web.py b/app/src/ingest_irs_web.py
diff --git a/app/src/ingest_la_county_policy.py b/app/src/ingest_la_county_policy.py
diff --git a/app/src/ingest_runner.py b/app/src/ingest_runner.py
@@ -0,0 +1,130 @@
+import argparse
+import logging
+import re
+import sys
+
+from src.ingester import ingest_json
+from src.util.ingest_utils import DefaultChunkingConfig, IngestConfig, start_ingestion
+
+logger = logging.getLogger(__name__)
+
+
+def edd_web_config(dataset_id: str, benefit_program: str, benefit_region: str) -> IngestConfig:
+    def _fix_input_markdown(markdown: str) -> str:
+        # Fix ellipsis text that causes markdown parsing errors
+        # '. . .' is parsed as sublists on the same line
+        # in https://edd.ca.gov/en/uibdg/total_and_partial_unemployment_tpu_5/
+        markdown = markdown.replace(". . .", "...")
+
+        # Nested sublist '* + California's New Application' created without parent list
+        # in https://edd.ca.gov/en/about_edd/eddnext
+        markdown = markdown.replace("* + ", "    + ")
+
+        # Blank sublist '* ###" in https://edd.ca.gov/en/unemployment/Employer_Information/
+        # Tab labels are parsed into list items with headings; remove them
+        markdown = re.sub(r"^\s*\* #+", "", markdown, flags=re.MULTILINE)
+
+        # Blank sublist '* +" in https://edd.ca.gov/en/unemployment/Employer_Information/
+        # Empty sublist '4. * ' in https://edd.ca.gov/en/about_edd/your-benefit-payment-options/
+        # Remove empty nested sublists
+        markdown = re.sub(
+            r"^\s*(\w+\.|\*|\+|\-) (\w+\.|\*|\+|\-)\s*$", "", markdown, flags=re.MULTILINE
+        )
+        return markdown
+
+    def prep_json_item(item: dict[str, str]) -> None:
+        markdown = item.get("main_content", item.get("main_primary", None))
+        assert markdown, f"Item {item['url']} has no main_content or main_primary"
+        item["markdown"] = _fix_input_markdown(markdown)
+
+    return IngestConfig(
+        dataset_id,
+        benefit_program,
+        benefit_region,
+        "https://edd.ca.gov/en/",
+        "edd_web_md",
+        prep_json_item,
+    )
+
+
+def la_county_policy_config(
+    dataset_id: str, benefit_program: str, benefit_region: str
+) -> IngestConfig:
+    chunking_config = DefaultChunkingConfig()
+    # The document name is the same as item["h2"], so it is redundant to include it in the headings
+    chunking_config.include_doc_name_in_headings = False
+
+    def prep_json_item(item: dict[str, str]) -> None:
+        # More often than not, the h2 heading is better suited as the title
+        item["title"] = item["h2"]
+
+        # Include the program name in the document title
+        program_name = item["h1"]
+        item["title"] = f"{program_name}: {item['title']}"
+
+    return IngestConfig(
+        dataset_id,
+        benefit_program,
+        benefit_region,
+        "https://epolicy.dpss.lacounty.gov/epolicy/epolicy/server/general/projects_responsive/ePolicyMaster/mergedProjects/",
+        "la_policy_md",
+        prep_json_item,
+        chunking_config,
+    )
+
+
+def ca_public_charge_config(
+    dataset_id: str, benefit_program: str, benefit_region: str
+) -> IngestConfig:
+    def prep_json_item(item: dict[str, str]) -> None:
+        markdown = item.get("main_content", item.get("main_primary", None))
+        assert markdown, f"Item {item['url']} has no main_content or main_primary"
+        item["markdown"] = markdown
+
+    return IngestConfig(
+        dataset_id,
+        benefit_program,
+        benefit_region,
+        "https://keepyourbenefits.org/en/ca/",
+        "ca_public_charge_md",
+        prep_json_item,
+    )
+
+
+def get_ingester_config(dataset_id: str) -> IngestConfig:
+    match dataset_id:
+        case "CA EDD":
+            return edd_web_config(dataset_id, "employment", "California")
+        case "DPSS Policy":
+            return la_county_policy_config(dataset_id, "mixed", "California:LA County")
+        case "IRS":
+            return IngestConfig(
+                dataset_id, "tax credit", "US", "https://www.irs.gov/", "irs_web_md"
+            )
+        case "Keep Your Benefits":
+            return ca_public_charge_config(dataset_id, "mixed", "California")
+        case _:
+            raise ValueError(f"Unknown dataset_id: {dataset_id}")
+
+
+# Print INFO messages since this is often run from the terminal during local development.
+# NOTE: this call is at module level, so it runs whenever this module is imported,
+# not only when main() is invoked.
+logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
+
+
+def main() -> None:  # pragma: no cover
+    parser = argparse.ArgumentParser()
+    parser.add_argument("dataset_id")
+    parser.add_argument("file_path")
+    parser.add_argument("--resume", action="store_true")
+    parser.add_argument("--skip_db", action="store_true")
+    args = parser.parse_args(sys.argv[1:])
+
+    config = get_ingester_config(sys.argv[1])
+    start_ingestion(
+        logger,
+        ingest_json,
+        args.file_path,
+        config,
+        skip_db=args.skip_db,
+        resume=args.resume,
+    )