Merge pull request #6 from kmc-jp/add-scrapbox
Add Scrapbox crawl
Showing 7 changed files with 427 additions and 2 deletions.
@@ -0,0 +1,8 @@
import os

SCRAPBOX_ENDPOINT = "https://scrapbox.io/"
SCRAPBOX_PROJECT = "kmc"
SCRAPBOX_CONNECT_SID = os.getenv("SCRAPBOX_CONNECT_SID")
ELASTIC_SEARCH_ENDPOINT = os.getenv("ELASTIC_SEARCH_ENDPOINT", "http://heineken-elasticsearch.default.svc.cluster.local:9200/")
INDEX = "scrapbox"
INDEX_FILE = "index/scrapbox.json"
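As a rough sketch of how these settings are consumed (the crawler below does the same thing), assuming `SCRAPBOX_CONNECT_SID` is exported in the environment:

```python
import urllib.parse

from config import scrapbox as config

# connect.sid is the Scrapbox session cookie; it grants access to the private
# project. Assumption: SCRAPBOX_CONNECT_SID is set before running.
cookies = {"connect.sid": config.SCRAPBOX_CONNECT_SID}

# Page-listing endpoint built from the configured endpoint and project name.
url = urllib.parse.urljoin(config.SCRAPBOX_ENDPOINT, f"/api/pages/{config.SCRAPBOX_PROJECT}")
print(url)  # https://scrapbox.io/api/pages/kmc
```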
@@ -0,0 +1,31 @@
[See the pukiwiki notes for the parts in common](./pukiwiki-memo.md)

### Settings

# mapping

```json
"mappings": {
  "properties": {
    "title": {
      "type": "text",
      "analyzer": "jp_analyzer",
      "term_vector": "with_positions_offsets",
      "fields": {
        "keyword": { "type": "keyword" }
      }
    },
    "body": {
      "type": "text",
      "analyzer": "jp_analyzer",
      "term_vector": "with_positions_offsets"
    },
    "modified": {
      "type": "date",
      "format": "strict_date_optional_time||epoch_millis"
    }
  }
}
```

- Mostly the same as pukiwiki
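The crawler's `add-index` subcommand (below) registers this mapping through `ElsClient`. As a rough equivalent, a minimal sketch of creating the index directly with `requests`, assuming Elasticsearch is reachable at the default endpoint from the config and that `index/scrapbox.json` holds the full settings-plus-mappings body:

```python
import requests

# Assumption: the in-cluster default from config/scrapbox.py; adjust for local testing.
ENDPOINT = "http://heineken-elasticsearch.default.svc.cluster.local:9200/"

with open("index/scrapbox.json") as f:
    body = f.read()

# PUT /<index> creates the index with the given settings and mappings.
resp = requests.put(
    ENDPOINT + "scrapbox",
    data=body.encode("utf-8"),
    headers={"Content-Type": "application/json"},
)
resp.raise_for_status()
print(resp.json())  # e.g. {"acknowledged": true, "index": "scrapbox", ...}
```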
@@ -0,0 +1,44 @@
{
  "settings": {
    "analysis": {
      "analyzer": {
        "jp_analyzer": {
          "tokenizer": "jp_tokenizer",
          "char_filter": ["html_strip", "icu_normalizer"],
          "filter": []
        }
      },

      "tokenizer": {
        "jp_tokenizer": {
          "type": "ngram",
          "min_gram": 2,
          "max_gram": 2,
          "token_chars": ["letter", "digit", "symbol", "punctuation"]
        }
      }
    }
  },

  "mappings": {
    "properties": {
      "title": {
        "type": "text",
        "analyzer": "jp_analyzer",
        "term_vector": "with_positions_offsets",
        "fields": {
          "keyword": { "type": "keyword" }
        }
      },
      "body": {
        "type": "text",
        "analyzer": "jp_analyzer",
        "term_vector": "with_positions_offsets"
      },
      "modified": {
        "type": "date",
        "format": "strict_date_optional_time||epoch_millis"
      }
    }
  }
}
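For intuition, `jp_tokenizer` above is a plain 2-gram tokenizer, so Japanese text is indexed as overlapping two-character tokens and no language-specific dictionary is needed. A minimal sketch of checking this via the `_analyze` API, assuming the index has been created from the JSON above:

```python
import requests

# Assumption: same endpoint default as in config/scrapbox.py.
ENDPOINT = "http://heineken-elasticsearch.default.svc.cluster.local:9200/"

resp = requests.post(
    ENDPOINT + "scrapbox/_analyze",
    json={"analyzer": "jp_analyzer", "text": "全文検索"},
)
resp.raise_for_status()
print([t["token"] for t in resp.json()["tokens"]])
# Expected 2-grams: ['全文', '文検', '検索']
```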
@@ -0,0 +1,161 @@
import argparse
import json
import urllib.parse
import urllib.request
from urllib.error import HTTPError
import requests
from datetime import datetime
import time

from config import scrapbox as config
from els.client import ElsClient
import logging

INTERVAL_SEC = 1
BULK_SIZE = 30

client = ElsClient(config.ELASTIC_SEARCH_ENDPOINT, config.INDEX)

fmt = "%(asctime)s %(levelname)s %(name)s :%(message)s"
logging.basicConfig(level=logging.INFO, format=fmt)
logger = logging.getLogger(__name__)

def add_index(args):
    # Add index if not exists
    try:
        client.get_index()
    except HTTPError as e:
        if e.status == 404:
            with open(config.INDEX_FILE) as f:
                logger.info(client.add_index(f.read()).read().decode("utf-8"))
        else:
            raise


def delete_index(args):
    logger.info(client.delete_index().read().decode("utf-8"))

def crawl(args):
    cookies = {"connect.sid": config.SCRAPBOX_CONNECT_SID}
    all_query = {
        "sort": {"modified": "desc"},
        "query": {"match_all": {}},
        "_source": ["modified"],
        # els size limit
        # TODO: paging
        "size": 10000
    }

    # Fetch what is already indexed to learn the newest "modified" timestamp
    # and the set of known document ids.
    all_entries = json.loads(
        client.search(json.dumps(all_query)).read().decode("utf-8")
    )
    if all_entries["hits"]["total"]["relation"] in ("gte", "eq") and all_entries["hits"]["total"]["value"] > 0:
        last_modified = all_entries["hits"]["hits"][0]["_source"]["modified"]
    else:
        last_modified = 0

    logger.info(f"last modified: {datetime.fromtimestamp(last_modified)}")
    els_ids = set(map(lambda x: x["_id"], all_entries["hits"]["hits"]))

    modified_page_titles = []
    offset = 0
    logger.info("start list pages")
    while True:
        params = {"skip": offset, "limit": 100, "sort": "modified"}
        response = requests.get(urllib.parse.urljoin(config.SCRAPBOX_ENDPOINT, f'/api/pages/{config.SCRAPBOX_PROJECT}'), cookies=cookies, params=params)
        response.raise_for_status()
        raw_pages = response.json()

        if raw_pages["pages"]:
            for page in raw_pages["pages"]:
                isRequireUpdate = page["updated"] > last_modified or page["id"] not in els_ids
                if isRequireUpdate:
                    modified_page_titles.append(page["title"])

        # If the very last page of this batch was not collected, it has not been
        # updated since the last crawl, so stop paging.
        if len(modified_page_titles) != 0 and raw_pages["pages"][-1]["title"] != modified_page_titles[-1]:
            break
        offset += 100
        if offset > raw_pages["count"]:
            break

    logger.info("number of modified pages: " + str(len(modified_page_titles)))

    if len(modified_page_titles) > 0:
        # Bulk-create BULK_SIZE pages at a time
        for i in range(0, len(modified_page_titles), BULK_SIZE):
            logger.info(f"start bulk create: {i} - {i+BULK_SIZE} / {len(modified_page_titles)}")
            bulk_string = "\n".join(_create_page_json_for_bulk(_get_page_data(cookies, x)) for x in modified_page_titles[i:i+BULK_SIZE]) + "\n"
            if bulk_string.strip() == "":
                continue
            logger.info(client.bulk(bulk_string).read().decode("utf-8"))

    # Collect pages to delete from the project's stream events
    response = requests.get(urllib.parse.urljoin(config.SCRAPBOX_ENDPOINT, f"/api/stream/{config.SCRAPBOX_PROJECT}"), cookies=cookies)
    response.raise_for_status()
    raw_stream = response.json()
    deleted_page_ids = []
    for event in raw_stream["events"]:
        if event["type"] == "page.delete" and event["pageId"] in els_ids and event["pageId"] not in deleted_page_ids:
            deleted_page_ids.append(event["pageId"])
    logger.info(f"number of deleted pages: {len(deleted_page_ids)}")
    if len(deleted_page_ids) > 0:
        # Remove the deleted pages from the index, matching them by id.
        deleted_page_query = {
            "query": {
                "terms": {
                    "_id": deleted_page_ids
                }
            },
            "_source": ["title"]
        }
        logger.info(client.delete_by_query(json.dumps(deleted_page_query)).read().decode("utf-8"))

def _create_page_json_for_bulk(data):
    if data == {}:
        return ""
    head = json.dumps({"index": {"_index": config.INDEX, "_id": data.pop("_id")}})
    return head + "\n" + json.dumps(data)


def _get_page_data(cookies, page_title):
    # There is a page titled ".", which would leave the request path blank, so skip it
    if page_title == ".":
        return {}
    # "/" is not percent-encoded by default, so pass safe=''
    path = f"/api/pages/{config.SCRAPBOX_PROJECT}/{urllib.parse.quote(page_title, safe='')}"
    url = urllib.parse.urljoin(config.SCRAPBOX_ENDPOINT, path)
    response = requests.get(url, cookies=cookies)
    response.raise_for_status()
    page = response.json()
    time.sleep(INTERVAL_SEC)
    logger.info("fetched: " + url)
    return {
        "_id": page["id"],
        "title": page["title"],
        "body": "\n".join(x["text"] for x in page["lines"]),
        "modified": page["updated"]
    }

parser = argparse.ArgumentParser(description='Scrapbox crawler for elasticsearch')
subparsers = parser.add_subparsers()

parser_add = subparsers.add_parser('add-index', help='add index')
parser_add.set_defaults(func=add_index)

parser_delete = subparsers.add_parser('delete-index', help='delete index')
parser_delete.set_defaults(func=delete_index)

parser_crawl = subparsers.add_parser('crawl', help='crawl')
parser_crawl.set_defaults(func=crawl)

args = parser.parse_args()
if hasattr(args, 'func'):
    args.func(args)
else:
    parser.print_help()
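For reference, the payload that `_create_page_json_for_bulk` hands to `client.bulk` is newline-delimited JSON in the Elasticsearch bulk format: an `index` action line followed by the document source line for each page. A minimal sketch with a made-up page dict (the field values are placeholders, not real data):

```python
import json

# Hypothetical page data in the shape returned by _get_page_data above.
data = {"_id": "dummy-page-id", "title": "example page", "body": "line 1\nline 2", "modified": 1700000000}

head = json.dumps({"index": {"_index": "scrapbox", "_id": data.pop("_id")}})
bulk_string = head + "\n" + json.dumps(data) + "\n"
print(bulk_string, end="")
# {"index": {"_index": "scrapbox", "_id": "dummy-page-id"}}
# {"title": "example page", "body": "line 1\nline 2", "modified": 1700000000}
```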