-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathretrieve.py
80 lines (61 loc) · 2.24 KB
/
retrieve.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
import datetime
from pathlib import Path
import time
import requests
# Name of the channel whose logs are mirrored; used in both the source URL
# and the local log directory.
CHANNEL = 'tfwiki'

# Step between two consecutive logs (there is one log per calendar day).
DAY = datetime.timedelta(hours=24)

# Earliest logs since Wind's DB got killed
FIRST = datetime.date(year=2017, month=4, day=19)

# Seconds to wait between requests.
# Wind recommended one request every five seconds
DELAY = 5

# Hours subtracted from the local clock before taking "today" as the
# exclusive end date of a download run.
# NOTE(review): presumably the offset between local time and the log
# server's day boundary - confirm.
TIMEZONE_OFFSET = 5
def get_last_in_dir(path: Path) -> Path:
    """Return the entry of ``path`` whose stem sorts last.

    Entries are ordered by their name without the final suffix (``Path.stem``)
    using plain string comparison.

    Raises:
        IndexError: if ``path`` contains no entries.
    """
    entries = list(path.iterdir())
    entries.sort(key=lambda entry: entry.stem)
    return entries[-1]
def get_last_date(target: Path) -> datetime.date:
    """Return the date of the newest log stored under ``target``.

    Logs are looked up at ``<target>/<year>/<month>/<YYYY-MM-DD>.txt``. Every
    such file is considered, so an empty year or month directory (for example
    one created by a run that had nothing to save) does not hide older logs.
    Files whose name is not an ISO date are ignored.

    Args:
        target: Root directory of the downloaded logs.

    Returns:
        The latest date for which a log file exists.

    Raises:
        ValueError: if no log file is found under ``target``.
    """
    dates = []
    for log in target.glob('*/*/*.txt'):
        try:
            dates.append(datetime.date.fromisoformat(log.stem))
        except ValueError:
            # Stray file that is not named after a date; not a log.
            continue
    if not dates:
        raise ValueError(f'no log files found under {target}')
    return max(dates)
def retrieve(source: str, target: Path, after: datetime.date, until: datetime.date,
             *, timeout: float = 30.0) -> None:
    """Download one log per day from ``after`` up to, but excluding, ``until``.

    Each day's log is requested from ``<source>/<YYYY-MM-DD>.txt``. When the
    request succeeds and the body is not blank it is saved, UTF-8 encoded, as
    ``<target>/<YYYY>/<MM>/<YYYY-MM-DD>.txt``. A failed day is reported on
    stdout and skipped so that one bad day does not abort the whole run.
    The function waits ``DELAY`` seconds after every request.

    Args:
        source: Base URL of the channel's logs, without a trailing slash.
        target: Root directory the logs are written under.
        after: First date to download (inclusive).
        until: Date to stop at; the until parameter is exclusive.
        timeout: Seconds to wait for the server before giving up on a day.
    """
    date = after
    while date < until:
        day = date.isoformat()
        url = source + f'/{day}.txt'
        file = target / str(date.year) / str(date.month).zfill(2) / f'{day}.txt'
        try:
            print("Retrieving", url)
            response = requests.get(url, timeout=timeout)
            # Without this check an HTTP error page would be stored as a log.
            response.raise_for_status()
            text = response.text
            if text.strip():
                # Create the month directory only when there is something to
                # store, so runs with nothing to save leave no empty
                # directories behind.
                file.parent.mkdir(parents=True, exist_ok=True)
                file.write_text(text, 'utf-8')
        except (requests.RequestException, OSError) as err:
            print(date.isoformat(), err)
        date += DAY
        time.sleep(DELAY)
def main(source: str, target: Path):
    """Bring the local copy of the logs under ``target`` up to date.

    When ``target`` does not exist yet it is created and the download starts
    at ``FIRST``; otherwise it resumes on the day after the newest stored log.
    The download stops before the current date as seen on the local clock
    shifted back by ``TIMEZONE_OFFSET`` hours.

    Args:
        source: Base URL of the channel's logs.
        target: Root directory of the local log copy.
    """
    if target.exists():
        start_date = get_last_date(target) + DAY
    else:
        target.mkdir(parents=True)
        start_date = FIRST

    shifted_now = datetime.datetime.now() - datetime.timedelta(hours=TIMEZONE_OFFSET)
    end_date = shifted_now.date()
    retrieve(source, target, start_date, end_date)
if __name__ == '__main__':
    # Mirror the configured channel's logs into ./logs/<channel>.
    channel_url = f'https://irc.biringa.com/channel/{CHANNEL}'
    log_root = Path(f'./logs/{CHANNEL}')
    main(channel_url, log_root)