Skip to content

Commit

Permalink
Index pages for suggestions and full-text search
Browse files Browse the repository at this point in the history
  • Loading branch information
benoit74 committed Oct 29, 2024
1 parent 44483b1 commit 4b51e41
Show file tree
Hide file tree
Showing 3 changed files with 61 additions and 10 deletions.
13 changes: 3 additions & 10 deletions scraper/src/mindtouch2zim/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
logger,
web_session,
)
from mindtouch2zim.html import get_soup


class MindtouchParsingError(Exception):
Expand Down Expand Up @@ -173,7 +174,7 @@ def get_home(self) -> MindtouchHome:
"""Retrieves data about home page by crawling home page"""
home_content = self._get_text("/")

soup = _get_soup(home_content)
soup = get_soup(home_content)
self.deki_token = _get_deki_token_from_home(soup)
return MindtouchHome(
welcome_text_paragraphs=_get_welcome_text_from_home(soup),
Expand All @@ -192,7 +193,7 @@ def get_deki_token(self) -> str:

home_content = self._get_text("/")

soup = _get_soup(home_content)
soup = get_soup(home_content)
self.deki_token = _get_deki_token_from_home(soup)
return self.deki_token

Expand Down Expand Up @@ -290,14 +291,6 @@ def get_page_content(self, page: LibraryPage) -> LibraryPageContent:
return LibraryPageContent(html_body=tree["body"][0])


def _get_soup(content: str) -> BeautifulSoup:
    """Parse textual content into a BeautifulSoup document.

    Centralizing the construction here keeps the parser choice ("lxml")
    identical everywhere a soup is built in the codebase.
    """
    return BeautifulSoup(content, features="lxml")


def _get_welcome_image_url_from_home(soup: BeautifulSoup) -> str:
"""Return the URL of the image found on home header"""
branding_div = soup.find("div", class_="LTBranding")
Expand Down
17 changes: 17 additions & 0 deletions scraper/src/mindtouch2zim/html.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
from bs4 import BeautifulSoup


def get_soup(content: str) -> BeautifulSoup:
    """Build a BeautifulSoup document from an HTML string.

    All callers go through this helper so that a single parser is used
    consistently across the whole codebase.
    """
    # Single place where the parser backend is chosen.
    parser_name = "lxml"
    return BeautifulSoup(content, parser_name)


def get_text(content: str) -> str:
    """Extract the textual data contained in an HTML string.

    The text fragments are stripped and joined with newlines. This is
    typically meant to produce the content to index in the ZIM.
    """
    soup = get_soup(content)
    # `get_text` is the documented spelling of the `getText` alias.
    return soup.get_text(separator="\n", strip=True)
41 changes: 41 additions & 0 deletions scraper/src/mindtouch2zim/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
logger,
web_session,
)
from mindtouch2zim.html import get_text
from mindtouch2zim.ui import (
ConfigModel,
PageContentModel,
Expand Down Expand Up @@ -539,6 +540,13 @@ def _process_page(
by_alias=True
),
)
self._add_indexing_item_to_zim(
creator=creator,
title=page.title,
content=get_text(rewriten.content),
fname=f"page_{page.id}",
zimui_redirect=page.path,
)

def _report_progress(self):
"""report progress to stats file"""
Expand Down Expand Up @@ -604,6 +612,39 @@ def _fetch_favicon_from_illustration(self, illustration: BytesIO) -> BytesIO:
)
return favicon

def _add_indexing_item_to_zim(
    self,
    creator: Creator,
    title: str,
    content: str,
    fname: str,
    zimui_redirect: str,
):
    """Add a 'fake' item to the ZIM, with proper indexing data.

    This is mandatory for suggestions and full-text search to work properly,
    since we do not really have pages to search for.

    The item is a very basic HTML document which automatically redirects to
    the proper Vue.JS URL.

    Args:
        creator: ZIM creator the item is added to.
        title: title of the item, used both in the HTML stub and in the
            index data.
        content: plain text to index for full-text search.
        fname: file name of the item; it is stored under the "index/" path.
        zimui_redirect: path appended to "../index.html#/" to build the
            redirection target.
    """
    # Local import of the standard library helper: keeps this block
    # self-contained without touching the module-level imports.
    from html import escape

    redirect_url = f"../index.html#/{zimui_redirect}"

    # Title and URL are arbitrary text: escape them so that characters such
    # as `<`, `&` or `"` cannot break the markup of the stub document.
    safe_title = escape(title)
    safe_redirect_url = escape(redirect_url, quote=True)

    html_content = (
        f"<html><head><title>{safe_title}</title>"
        f'<meta http-equiv="refresh" content="0;URL=\'{safe_redirect_url}\'" />'
        f"</head><body></body></html>"
    )

    logger.debug(f"Adding {fname} to ZIM index")
    add_item_for(
        creator=creator,
        title=title,
        path="index/" + fname,
        content=bytes(html_content, "utf-8"),
        mimetype="text/html",
        # Index data is plain text, not markup: keep the raw (unescaped) title.
        index_data=IndexData(title=title, content=content),
    )


# remove all standard rules, they are not adapted to Vue.JS UI
html_rules.rewrite_attribute_rules.clear()
Expand Down

0 comments on commit 4b51e41

Please sign in to comment.