
Commit

Merge pull request #6 from kmc-jp/add-scrapbox
Add a Scrapbox crawler
walnuts1018 authored Jan 1, 2024
2 parents 4941e14 + 387b8a4 commit fbe645d
Showing 7 changed files with 427 additions and 2 deletions.
22 changes: 22 additions & 0 deletions README.md
@@ -58,6 +58,28 @@ $ poetry run python3 paragate-crawler.py crawl
$ poetry run python3 paragate-crawler.py add-index
```

#### Scrapbox

- Setup

```shell
$ edit config/scrapbox.py
```

For `SCRAPBOX_CONNECT_SID`, copy the value of `cookie[connect.sid]` from your browser's developer tools.
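
Since `config/scrapbox.py` reads this value with `os.getenv`, one option is to export it before running the crawler; the value below is only a placeholder:

```shell
$ export SCRAPBOX_CONNECT_SID="s%3A..."  # paste the cookie value copied from the browser
```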

- Crawl

```shell
$ poetry run python3 scrapbox-crawler.py crawl
```

- Create index

```shell
$ poetry run python3 scrapbox-crawler.py add-index
```

## Tips

To access dev app in kubernetes...
8 changes: 8 additions & 0 deletions config/scrapbox.py
@@ -0,0 +1,8 @@
import os

SCRAPBOX_ENDPOINT = "https://scrapbox.io/"
SCRAPBOX_PROJECT = "kmc"
SCRAPBOX_CONNECT_SID = os.getenv("SCRAPBOX_CONNECT_SID")
ELASTIC_SEARCH_ENDPOINT = os.getenv("ELASTIC_SEARCH_ENDPOINT", "http://heineken-elasticsearch.default.svc.cluster.local:9200/")
INDEX = "scrapbox"
INDEX_FILE = "index/scrapbox.json"
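
Both `SCRAPBOX_CONNECT_SID` and `ELASTIC_SEARCH_ENDPOINT` are taken from the environment, with the latter defaulting to the in-cluster service above. When running outside the cluster you could, for example, point it at a port-forwarded Elasticsearch; the address below is illustrative and not part of this commit:

```shell
$ export ELASTIC_SEARCH_ENDPOINT="http://localhost:9200/"
```
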
31 changes: 31 additions & 0 deletions index/scrapbox-memo.md
@@ -0,0 +1,31 @@
[See the pukiwiki memo for the parts shared with pukiwiki](./pukiwiki-memo.md)

### Settings

#### Mapping

```json
"mappings": {
"properties": {
"title": {
"type": "text",
"analyzer": "jp_analyzer",
"term_vector" : "with_positions_offsets",
"fields": {
"keyword": { "type": "keyword" }
}
},
"body": {
"type": "text",
"analyzer": "jp_analyzer",
"term_vector" : "with_positions_offsets"
},
"modified": {
"type": "date",
"format": "strict_date_optional_time||epoch_millis"
}
}
}
```

- Almost the same as the pukiwiki mapping
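
A quick way to sanity-check the 2-gram `jp_analyzer` once the index exists is the `_analyze` API. This is only a sketch: it assumes Elasticsearch is reachable at a placeholder `localhost:9200` and that the `analysis-icu` plugin providing `icu_normalizer` is installed.

```shell
$ curl -s -H 'Content-Type: application/json' \
    'http://localhost:9200/scrapbox/_analyze' \
    -d '{"analyzer": "jp_analyzer", "text": "日本語のテスト"}'
```
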
44 changes: 44 additions & 0 deletions index/scrapbox.json
@@ -0,0 +1,44 @@
{
  "settings": {
    "analysis": {
      "analyzer": {
        "jp_analyzer": {
          "tokenizer": "jp_tokenizer",
          "char_filter": ["html_strip", "icu_normalizer"],
          "filter": []
        }
      },

      "tokenizer": {
        "jp_tokenizer": {
          "type": "ngram",
          "min_gram": 2,
          "max_gram": 2,
          "token_chars": ["letter", "digit", "symbol", "punctuation"]
        }
      }
    }
  },

  "mappings": {
    "properties": {
      "title": {
        "type": "text",
        "analyzer": "jp_analyzer",
        "term_vector": "with_positions_offsets",
        "fields": {
          "keyword": { "type": "keyword" }
        }
      },
      "body": {
        "type": "text",
        "analyzer": "jp_analyzer",
        "term_vector": "with_positions_offsets"
      },
      "modified": {
        "type": "date",
        "format": "strict_date_optional_time||epoch_millis"
      }
    }
  }
}
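
The `add-index` subcommand of `scrapbox-crawler.py` applies this file through `ElsClient`. A roughly equivalent manual call, assuming a local Elasticsearch at a placeholder address and running from the repository root, would be:

```shell
$ curl -s -XPUT -H 'Content-Type: application/json' \
    'http://localhost:9200/scrapbox' \
    --data-binary @index/scrapbox.json
```
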
162 changes: 160 additions & 2 deletions poetry.lock

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions pyproject.toml
@@ -9,6 +9,7 @@ license = "CC0"
python = "^3.9"
beautifulsoup4 = "*"
lxml = "^4.9.3"
requests = "^2.31.0"

[tool.poetry.dev-dependencies]

161 changes: 161 additions & 0 deletions scrapbox-crawler.py
@@ -0,0 +1,161 @@
import argparse
import json
import urllib.parse
import urllib.request
from urllib.error import HTTPError
import requests
from datetime import datetime
import time

from config import scrapbox as config
from els.client import ElsClient
import logging

INTERVAL_SEC = 1
BULK_SIZE = 30

client = ElsClient(config.ELASTIC_SEARCH_ENDPOINT, config.INDEX)

fmt = "%(asctime)s %(levelname)s %(name)s :%(message)s"
logging.basicConfig(level=logging.INFO, format=fmt)
logger = logging.getLogger(__name__)

def add_index(args):
    # Add index if not exists
    try:
        client.get_index()
    except HTTPError as e:
        if e.status == 404:
            with open(config.INDEX_FILE) as f:
                logger.info(client.add_index(f.read()).read().decode("utf-8"))
        else:
            raise


def delete_index(args):
    logger.info(client.delete_index().read().decode("utf-8"))


def crawl(args):
    cookies = {"connect.sid": config.SCRAPBOX_CONNECT_SID}
    all_query = {
        "sort": {"modified": "desc"},
        "query": {"match_all": {}},
        "_source": ["modified"],
        # Elasticsearch size limit
        # TODO: paging
        "size": 10000
    }

    all_entries = json.loads(
        client.search(json.dumps(all_query)).read().decode("utf-8")
    )
    if all_entries["hits"]["total"]["relation"] in ("gte", "eq") and all_entries["hits"]["total"]["value"] > 0:
        last_modified = all_entries["hits"]["hits"][0]["_source"]["modified"]
    else:
        last_modified = 0

    logger.info(f"last modified: {datetime.fromtimestamp(last_modified)}")
    els_ids = set(map(lambda x: x["_id"], all_entries["hits"]["hits"]))

    modified_page_titles = []
    offset = 0
    logger.info("start list pages")
    while True:
        params = {"skip": offset, "limit": 100, "sort": "modified"}
        response = requests.get(
            urllib.parse.urljoin(config.SCRAPBOX_ENDPOINT, f"/api/pages/{config.SCRAPBOX_PROJECT}"),
            cookies=cookies,
            params=params,
        )
        response.raise_for_status()
        raw_pages = response.json()

        if raw_pages["pages"]:
            for page in raw_pages["pages"]:
                requires_update = page["updated"] > last_modified or page["id"] not in els_ids
                if requires_update:
                    modified_page_titles.append(page["title"])

            # If the last page in this batch was not modified, stop fetching further pages
            if len(modified_page_titles) != 0 and raw_pages["pages"][-1]["title"] != modified_page_titles[-1]:
                break
        offset += 100
        if offset > raw_pages["count"]:
            break

    logger.info("number of modified pages: " + str(len(modified_page_titles)))

    if len(modified_page_titles) > 0:
        # Bulk-create the modified pages in chunks of BULK_SIZE
        for i in range(0, len(modified_page_titles), BULK_SIZE):
            logger.info(f"start bulk create: {i} - {i+BULK_SIZE} / {len(modified_page_titles)}")
            bulk_string = "\n".join(_create_page_json_for_bulk(_get_page_data(cookies, x)) for x in modified_page_titles[i:i+BULK_SIZE]) + "\n"
            if bulk_string.strip() == "":
                continue
            logger.info(client.bulk(bulk_string).read().decode("utf-8"))

    # Collect pages to delete from the stream events
    response = requests.get(urllib.parse.urljoin(config.SCRAPBOX_ENDPOINT, f"/api/stream/{config.SCRAPBOX_PROJECT}"), cookies=cookies)
    response.raise_for_status()
    raw_stream = response.json()
    deleted_page_ids = []
    for event in raw_stream["events"]:
        if event["type"] == "page.delete" and event["pageId"] in els_ids and event["pageId"] not in deleted_page_ids:
            deleted_page_ids.append(event["pageId"])
    logger.info(f"number of deleted pages: {len(deleted_page_ids)}")
    if len(deleted_page_ids) > 0:
        # Delete only the documents whose ids appear in deleted_page_ids
        deleted_page_query = {
            "query": {
                "terms": {
                    "_id": deleted_page_ids
                }
            },
            "_source": ["title"]
        }
        logger.info(client.delete_by_query(json.dumps(deleted_page_query)).read().decode("utf-8"))

def _create_page_json_for_bulk(data):
    if data == {}:
        return ""
    head = json.dumps({"index": {"_index": config.INDEX, "_id": data.pop("_id")}})
    return head + "\n" + json.dumps(data)


def _get_page_data(cookies, page_title):
    # There is a page titled ".", which would produce an empty path, so skip it
    if page_title == ".":
        return {}
    # "/" is not encoded by default, so pass safe=''
    path = f"/api/pages/{config.SCRAPBOX_PROJECT}/{urllib.parse.quote(page_title, safe='')}"
    url = urllib.parse.urljoin(config.SCRAPBOX_ENDPOINT, path)
    response = requests.get(url, cookies=cookies)
    response.raise_for_status()
    page = response.json()
    time.sleep(INTERVAL_SEC)
    logger.info("fetched: " + url)
    return {
        "_id": page["id"],
        "title": page["title"],
        "body": "\n".join(x["text"] for x in page["lines"]),
        "modified": page["updated"]
    }

parser = argparse.ArgumentParser(description='Scrapbox crawler for elasticsearch')
subparsers = parser.add_subparsers()

parser_add = subparsers.add_parser('add-index', help='add index')
parser_add.set_defaults(func=add_index)

parser_delete = subparsers.add_parser('delete-index', help='delete index')
parser_delete.set_defaults(func=delete_index)

parser_crawl = subparsers.add_parser('crawl', help='crawl')
parser_crawl.set_defaults(func=crawl)

args = parser.parse_args()
if hasattr(args, 'func'):
    args.func(args)
else:
    parser.print_help()
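
For reference, `_create_page_json_for_bulk` builds the newline-delimited body expected by the Elasticsearch `_bulk` API: one action line followed by one document line per page. A hypothetical hand-written equivalent with `curl` (placeholder id and field values, local endpoint assumed) would look like:

```shell
$ curl -s -XPOST -H 'Content-Type: application/x-ndjson' \
    'http://localhost:9200/scrapbox/_bulk' \
    --data-binary $'{"index": {"_index": "scrapbox", "_id": "<page id>"}}\n{"title": "...", "body": "...", "modified": 1700000000}\n'
```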
