
Commit

Update config and crawler scripts
walnuts1018 committed Jan 1, 2024
1 parent 0963dcc commit 387b8a4
Showing 2 changed files with 17 additions and 7 deletions.
config/scrapbox.py (6 changes: 4 additions & 2 deletions)
@@ -1,6 +1,8 @@
+import os
+
 SCRAPBOX_ENDPOINT = "https://scrapbox.io/"
 SCRAPBOX_PROJECT = "kmc"
-SCRAPBOX_CONNECT_SID = "<obtained from the cookie>"
-ELASTIC_SEARCH_ENDPOINT = "http://heineken-elasticsearch.default.svc.cluster.local:9200/"
+SCRAPBOX_CONNECT_SID = os.getenv("SCRAPBOX_CONNECT_SID")
+ELASTIC_SEARCH_ENDPOINT = os.getenv("ELASTIC_SEARCH_ENDPOINT", "http://heineken-elasticsearch.default.svc.cluster.local:9200/")
 INDEX = "scrapbox"
 INDEX_FILE = "index/scrapbox.json"
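With this change the session cookie and the Elasticsearch endpoint are read from the environment instead of being hard-coded: SCRAPBOX_CONNECT_SID has no default (so it must be set), while ELASTIC_SEARCH_ENDPOINT falls back to the in-cluster service URL. A minimal sketch of how the new config behaves, with placeholder values that are only for illustration:

import os

# Placeholder for illustration; the real sid comes from the deployment environment.
os.environ["SCRAPBOX_CONNECT_SID"] = "dummy-sid"

connect_sid = os.getenv("SCRAPBOX_CONNECT_SID")      # "dummy-sid"; None if the variable is unset
es_endpoint = os.getenv(
    "ELASTIC_SEARCH_ENDPOINT",
    "http://heineken-elasticsearch.default.svc.cluster.local:9200/",
)                                                    # default used because the variable is unset

print(connect_sid, es_endpoint)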
scrapbox-crawler.py (18 changes: 13 additions & 5 deletions)
@@ -89,17 +89,19 @@ def crawl(args):
     for i in range(0, len(modified_page_titles), BULK_SIZE):
         logger.info(f"start bulk create: {i} - {i+BULK_SIZE} / {len(modified_page_titles)}")
         bulk_string = "\n".join(_create_page_json_for_bulk(_get_page_data(cookies, x)) for x in modified_page_titles[i:i+BULK_SIZE]) + "\n"
+        if bulk_string.strip() == "":
+            continue
         logger.info(client.bulk(bulk_string).read().decode("utf-8"))

     # Collect the pages to delete from the stream events
     response = requests.get(urllib.parse.urljoin(config.SCRAPBOX_ENDPOINT, f"/api/stream/{config.SCRAPBOX_PROJECT}"), cookies=cookies)
     response.raise_for_status()
     raw_stream = response.json()
     deleted_page_ids =[]
-    for stream in raw_stream:
-        if stream["type"] == "page.delete" and stream["id"] in els_ids and stream["id"] not in deleted_page_ids:
-            deleted_page_ids.append(stream["id"])
+    for event in raw_stream["events"]:
+        if event["type"] == "page.delete" and event["pageId"] in els_ids and event["pageId"] not in deleted_page_ids:
+            deleted_page_ids.append(event["pageId"])
     logger.info(f"number of deleted pages: {len(deleted_page_ids)}")
     if len(deleted_page_ids) > 0:
         deleted_page_query = {
             "query": {
@@ -116,12 +118,18 @@ def crawl(args):
         logger.info(client.delete_by_query(json.dumps(deleted_page_query)).read().decode("utf-8"))

 def _create_page_json_for_bulk(data):
+    if data == {}:
+        return ""
     head = json.dumps({"index" : { "_index": config.INDEX, "_id": data.pop("_id") }})
     return head + "\n" + json.dumps(data)

 def _get_page_data(cookies, page_title):
+    # There is a page whose page_title is ".", which would leave the path blank, so skip it
+    if page_title == ".":
+        return {}
     # "/" is not encoded by default, so pass safe=''
-    url = urllib.parse.urljoin(config.SCRAPBOX_ENDPOINT, f"/api/pages/{config.SCRAPBOX_PROJECT}/{urllib.parse.quote(page_title, safe='')}")
+    path = f"/api/pages/{config.SCRAPBOX_PROJECT}/{urllib.parse.quote(page_title, safe='')}"
+    url = urllib.parse.urljoin(config.SCRAPBOX_ENDPOINT, path)
     response = requests.get(url, cookies=cookies)
     response.raise_for_status()
     page = response.json()
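The second hunk skips the page literally titled "." (its API path would otherwise end in a blank segment), lets the bulk helper drop that empty result, and keeps the existing safe='' quoting so that a "/" inside a title is percent-encoded instead of being treated as a path separator. A small illustration of the quoting behaviour, reusing the "kmc" project name from the config above and a made-up title:

import urllib.parse

title = "foo/bar"
print(urllib.parse.quote(title))             # 'foo/bar'   -> '/' survives and would split the API path
print(urllib.parse.quote(title, safe=""))    # 'foo%2Fbar' -> stays a single path segment

path = f"/api/pages/kmc/{urllib.parse.quote(title, safe='')}"
print(urllib.parse.urljoin("https://scrapbox.io/", path))
# https://scrapbox.io/api/pages/kmc/foo%2Fbar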
