Skip to content

Commit

Permalink
refactor: Add general ingest_runner (#189)
Browse files Browse the repository at this point in the history
  • Loading branch information
yoomlam authored Jan 22, 2025
1 parent ae29de6 commit 53a1ffb
Show file tree
Hide file tree
Showing 15 changed files with 314 additions and 314 deletions.
23 changes: 9 additions & 14 deletions app/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -255,15 +255,9 @@ endif
# Run the `scrapy-runner` command with the `ca_public_charge` argument via $(PY_RUN_CMD).
scrape-ca-public-charge:
$(PY_RUN_CMD) scrapy-runner ca_public_charge

# Run the `ingest-ca-public-charge` command, passing DATASET_ID, BENEFIT_PROGRAM,
# BENEFIT_REGION and FILEPATH as quoted arguments, followed by INGEST_ARGS unquoted.
# The `check-ingest-arguments` prerequisite is defined outside this view
# (presumably it validates those variables — confirm).
ingest-ca-public-charge: check-ingest-arguments
$(PY_RUN_CMD) ingest-ca-public-charge "$(DATASET_ID)" "$(BENEFIT_PROGRAM)" "$(BENEFIT_REGION)" "$(FILEPATH)" $(INGEST_ARGS)

# Run the `scrapy-runner` command with the `edd` argument via $(PY_RUN_CMD).
scrape-edd-web:
$(PY_RUN_CMD) scrapy-runner edd

# Run the `ingest-edd-web` command with the same argument layout as the other
# ingest-* targets: four quoted variables, then INGEST_ARGS unquoted.
ingest-edd-web: check-ingest-arguments
$(PY_RUN_CMD) ingest-edd-web "$(DATASET_ID)" "$(BENEFIT_PROGRAM)" "$(BENEFIT_REGION)" "$(FILEPATH)" $(INGEST_ARGS)


scrape-imagine-la:
cd src/ingestion/imagine_la/scrape; uv run --no-project scrape_content_hub.py https://socialbenefitsnavigator25.web.app/contenthub $(CONTENTHUB_PASSWORD)
Expand All @@ -279,16 +273,17 @@ scrape-la-county-policy:
# Now that we have the expanded nav bar, scrape all the links in the nav bar
$(PY_RUN_CMD) scrapy-runner la_policy 2>&1 | tee out.log

# Run the `ingest-la-policy` command (note: the command name differs from the target name),
# passing DATASET_ID, BENEFIT_PROGRAM, BENEFIT_REGION and FILEPATH as quoted arguments,
# followed by INGEST_ARGS unquoted. Requires the `check-ingest-arguments` target first.
ingest-la-county-policy: check-ingest-arguments
$(PY_RUN_CMD) ingest-la-policy "$(DATASET_ID)" "$(BENEFIT_PROGRAM)" "$(BENEFIT_REGION)" "$(FILEPATH)" $(INGEST_ARGS)


# Run the `scrapy-runner` command with the `irs` argument via $(PY_RUN_CMD).
scrape-irs-web:
$(PY_RUN_CMD) scrapy-runner irs

# Run the `ingest-irs-web` command with the same argument layout as the other
# ingest-* targets: four quoted variables, then INGEST_ARGS unquoted.
ingest-irs-web: check-ingest-arguments
$(PY_RUN_CMD) ingest-irs-web "$(DATASET_ID)" "$(BENEFIT_PROGRAM)" "$(BENEFIT_REGION)" "$(FILEPATH)" $(INGEST_ARGS)


# Run the `scrapy-runner` command with the `ca_ftb` argument via $(PY_RUN_CMD).
scrape-ca-ftb:
$(PY_RUN_CMD) scrapy-runner ca_ftb

# Run the general `ingest-runner` command for dataset DATASET_ID on the file at FILEPATH.
# DATASET_ID and FILEPATH are required: when either is undefined (or empty), the
# corresponding $(error ...) aborts make. INGEST_ARGS is optional and is passed through
# unquoted, so it may carry several space-separated options.
# NOTE(review): leading tabs are not visible in this view — confirm the $(error ...) lines
# are tab-indented recipe lines so the checks fire only when this target is run.
ingest-runner:
ifndef DATASET_ID
$(error DATASET_ID is undefined)
endif
ifndef FILEPATH
$(error FILEPATH is undefined)
endif
$(PY_RUN_CMD) ingest-runner "$(DATASET_ID)" "$(FILEPATH)" $(INGEST_ARGS)
5 changes: 1 addition & 4 deletions app/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -68,15 +68,12 @@ build-backend = "poetry.core.masonry.api"
db-migrate = "src.db.migrations.run:up"
db-migrate-down = "src.db.migrations.run:down"
db-migrate-down-all = "src.db.migrations.run:downall"
ingest-ca-public-charge = "src.ingest_ca_public_charge:main"
ingest-edd-web = "src.ingest_edd_web:main"
scrape-edd-web = "src.ingestion.scrape_edd_web:main"
ingest-imagine-la = "src.ingestion.imagine_la.ingest:main"
scrape-la-policy = "src.ingestion.scrape_la_policy:main"
ingest-la-policy = "src.ingest_la_county_policy:main"
ingest-irs-web = "src.ingest_irs_web:main"

scrapy-runner = "src.ingestion.scrapy_runner:main"
ingest-runner = "src.ingest_runner:main"

[tool.black]
line-length = 100
Expand Down
41 changes: 0 additions & 41 deletions app/src/ingest_ca_public_charge.py

This file was deleted.

33 changes: 0 additions & 33 deletions app/src/ingest_irs_web.py

This file was deleted.

46 changes: 0 additions & 46 deletions app/src/ingest_la_county_policy.py

This file was deleted.

130 changes: 130 additions & 0 deletions app/src/ingest_runner.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
import argparse
import logging
import re
import sys

from src.ingester import ingest_json
from src.util.ingest_utils import DefaultChunkingConfig, IngestConfig, start_ingestion

logger = logging.getLogger(__name__)


def edd_web_config(dataset_id: str, benefit_program: str, benefit_region: str) -> IngestConfig:
    """Create the IngestConfig used for pages scraped from https://edd.ca.gov/en/.

    The config carries a per-item hook that stores a repaired copy of the item's
    "main_content" text (or "main_primary" when the "main_content" key is absent)
    under the item's "markdown" key.
    """
    # Tab labels are parsed into list items with headings ('* ###'), seen in
    # https://edd.ca.gov/en/unemployment/Employer_Information/ ; the marker prefix is removed.
    heading_bullet = re.compile(r"^\s*\* #+", re.MULTILINE)

    # Nested list markers with nothing after them: '* +' on the page above and
    # '4. * ' in https://edd.ca.gov/en/about_edd/your-benefit-payment-options/
    empty_sublist = re.compile(r"^\s*(\w+\.|\*|\+|\-) (\w+\.|\*|\+|\-)\s*$", re.MULTILINE)

    def _fix_input_markdown(markdown: str) -> str:
        """Repair known patterns that cause markdown parsing errors; order matters."""
        cleaned = (
            markdown
            # '. . .' is parsed as sublists on the same line
            # (https://edd.ca.gov/en/uibdg/total_and_partial_unemployment_tpu_5/)
            .replace(". . .", "...")
            # '* + California's New Application' is a nested sublist without a parent list
            # (https://edd.ca.gov/en/about_edd/eddnext)
            .replace("* + ", " + ")
        )
        cleaned = heading_bullet.sub("", cleaned)
        return empty_sublist.sub("", cleaned)

    def prep_json_item(item: dict[str, str]) -> None:
        """Set item["markdown"] from the scraped content, failing if there is none."""
        source_text = item.get("main_content", item.get("main_primary", None))
        assert source_text, f"Item {item['url']} has no main_content or main_primary"
        item["markdown"] = _fix_input_markdown(source_text)

    return IngestConfig(
        dataset_id, benefit_program, benefit_region,
        "https://edd.ca.gov/en/",
        "edd_web_md",
        prep_json_item,
    )


def la_county_policy_config(
    dataset_id: str, benefit_program: str, benefit_region: str
) -> IngestConfig:
    """Create the IngestConfig used for scraped LA County ePolicy pages.

    Uses a DefaultChunkingConfig with ``include_doc_name_in_headings`` turned off and a
    per-item hook that rewrites each item's "title" as "<h1>: <h2>".
    """

    def prep_json_item(item: dict[str, str]) -> None:
        """Replace item["title"] with the program name (h1) followed by the h2 heading."""
        # More often than not, the h2 heading is better suited as the title
        section_heading = item["h2"]
        item["title"] = section_heading

        # Prefix the title with the program name, which is found in the h1 heading
        program_name = item["h1"]
        item["title"] = f"{program_name}: {section_heading}"

    # The document name equals item["h2"], so repeating it in the headings is redundant
    chunking_config = DefaultChunkingConfig()
    chunking_config.include_doc_name_in_headings = False

    return IngestConfig(
        dataset_id, benefit_program, benefit_region,
        "https://epolicy.dpss.lacounty.gov/epolicy/epolicy/server/general/projects_responsive/ePolicyMaster/mergedProjects/",
        "la_policy_md",
        prep_json_item,
        chunking_config,
    )


def ca_public_charge_config(
    dataset_id: str, benefit_program: str, benefit_region: str
) -> IngestConfig:
    """Create the IngestConfig used for pages scraped from https://keepyourbenefits.org/en/ca/.

    The per-item hook copies the item's "main_content" text (or "main_primary" when the
    "main_content" key is absent) into the item's "markdown" key, unmodified.
    """

    def prep_json_item(item: dict[str, str]) -> None:
        """Set item["markdown"] from the scraped content, failing if there is none."""
        source_text = item.get("main_content", item.get("main_primary", None))
        assert source_text, f"Item {item['url']} has no main_content or main_primary"
        item["markdown"] = source_text

    return IngestConfig(
        dataset_id, benefit_program, benefit_region,
        "https://keepyourbenefits.org/en/ca/",
        "ca_public_charge_md",
        prep_json_item,
    )


def get_ingester_config(dataset_id: str) -> IngestConfig:
    """Return the IngestConfig for one of the known dataset IDs.

    Known IDs are "CA EDD", "DPSS Policy", "IRS" and "Keep Your Benefits"; each is paired
    here with its fixed benefit program and benefit region.

    Raises:
        ValueError: if ``dataset_id`` is not one of the known IDs.
    """
    if dataset_id == "CA EDD":
        return edd_web_config(dataset_id, "employment", "California")

    if dataset_id == "DPSS Policy":
        return la_county_policy_config(dataset_id, "mixed", "California:LA County")

    if dataset_id == "IRS":
        # IRS needs no item preparation hook, so the config is built inline
        return IngestConfig(dataset_id, "tax credit", "US", "https://www.irs.gov/", "irs_web_md")

    if dataset_id == "Keep Your Benefits":
        return ca_public_charge_config(dataset_id, "mixed", "California")

    raise ValueError(f"Unknown dataset_id: {dataset_id}")


# Print INFO messages since this is often run from the terminal during local development.
# NOTE: this call sits at module level, so it runs when this module is imported,
# not only when main() is invoked.
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")


def main() -> None:  # pragma: no cover
    """Command-line entry point: ingest a JSON file for a known dataset.

    Positional arguments:
        dataset_id: one of the IDs accepted by get_ingester_config()
        file_path: path handed to start_ingestion()

    Optional flags ``--resume`` and ``--skip_db`` are forwarded to start_ingestion()
    as the ``resume`` and ``skip_db`` keyword arguments.

    Raises:
        ValueError: (from get_ingester_config) if dataset_id is not a known dataset.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("dataset_id")
    parser.add_argument("file_path")
    parser.add_argument("--resume", action="store_true")
    parser.add_argument("--skip_db", action="store_true")
    args = parser.parse_args(sys.argv[1:])

    # Use the parsed positional rather than sys.argv[1]: argparse allows options to come
    # before positionals (e.g. `--resume "CA EDD" file.json`), in which case sys.argv[1]
    # would be the option string instead of the dataset ID.
    config = get_ingester_config(args.dataset_id)
    start_ingestion(
        logger,
        ingest_json,
        args.file_path,
        config,
        skip_db=args.skip_db,
        resume=args.resume,
    )
Loading

0 comments on commit 53a1ffb

Please sign in to comment.