-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathpreprocess.py
84 lines (71 loc) · 2.65 KB
/
preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
#! python
import os
import re
import sys
from copy import copy
from getopt import gnu_getopt, GetoptError

from bs4 import BeautifulSoup as bfs
# Usage summary printed by getargs() when -h/--help is passed.
HELP_MSG = 'preprocess.py <HTML file> -s <CSS file> -o <Target HTML file>'
def getargs(args):
    """Parse the command line and return the file paths to work with.

    Args:
        args: Argument vector in ``sys.argv`` form. ``args[0]`` is the
            program name (used only in error messages); the remainder is
            parsed with ``gnu_getopt`` using the options ``-s/--style``,
            ``-o/--output`` and ``-h/--help``.

    Returns:
        A ``(origin_file, css_file, output_file)`` tuple. ``origin_file`` is
        the first positional argument. ``css_file`` defaults to
        ``'whu-cs-report.css'``. When no output file is given,
        ``output_file`` is the input path with its extension (whatever its
        length, or none at all) replaced by ``_processed.html``.

    Exits via ``sys.exit`` with status 0 after printing the help text, or
    with status 2 when the options cannot be parsed or no HTML file is given.
    """
    css_file = 'whu-cs-report.css'
    output_file = ''

    try:
        opts, positional = gnu_getopt(
            args[1:], 's:o:h', ['style=', 'output=', 'help'])
    except GetoptError:
        print('Parse Error! Type "{} -h" to get help'.format(args[0]))
        sys.exit(2)

    for opt, arg in opts:
        if opt in ('-h', '--help'):
            print(HELP_MSG)
            sys.exit(0)
        elif opt in ('-s', '--style'):
            css_file = arg
        elif opt in ('-o', '--output'):
            output_file = arg
        else:
            print('unknown argumnet "{}". Type "{} -h" to get help'.format(opt, args[0]))

    if not positional:
        print('Please specify an html file! Type "{} -h" to get help'.format(args[0]))
        sys.exit(2)
    origin_file = positional[0]

    if not output_file:
        # Strip the real extension instead of blindly cutting five characters,
        # so inputs such as "page.htm" or "page" still get a sensible name.
        stem, _ext = os.path.splitext(origin_file)
        output_file = stem + '_processed' + '.html'

    return origin_file, css_file, output_file
def _restore_custom_css(html, css):
    """Return *html* with its exported custom-CSS chunk replaced by *css*."""
    # The chunk starts on the line containing ".custom-css-head" and extends
    # over the consecutive non-empty lines that follow it.
    pattern = r'\.custom-css-head.*\n(.+\n)+'
    # Use a callable so the CSS is inserted literally. As a replacement
    # *string*, re.sub would process its backslashes as template escapes:
    # the CSS escape "\3000" is read as the octal escape "\300" ("À")
    # followed by "0", and other escapes can raise re.error.
    return re.sub(pattern, lambda _match: css, html)


def _attach_table_captions(soup):
    """Move each ``.table-caption`` element into the following table as its <caption>."""
    for marker in soup.select('.table-caption'):
        # True matches any tag but no text node, so whitespace between the
        # marker and the table wrapper does not get in the way.
        holder = marker.find_next_sibling(True)
        if holder is None:
            continue
        table = holder if holder.name == 'table' else holder.find('table')
        if table is None:
            # No table to attach to: leave the marker in place untouched.
            continue
        caption = copy(marker)
        del caption['class']
        caption.name = 'caption'
        table.insert(0, caption)
        marker.decompose()


def _renumber_toc_links(soup, level):
    """Assign sequential ``<level>-<n>`` ids to the anchors the TOC entries of *level* point at."""
    number = 1
    for toc_link in soup.select('.md-toc-{} a'.format(level)):
        # The href has the form "#<anchor name>"; drop the leading "#".
        target_name = toc_link.get('href', '')[1:]
        # Attribute lookup instead of a formatted CSS selector, so quotes or
        # other special characters in the name cannot break the query.
        target = soup.find('a', attrs={'name': target_name}) if target_name else None
        if target is None:
            # Dangling TOC entry: keep its original href and move on.
            continue
        anchor_id = '{}-{}'.format(level, number)
        target['id'] = anchor_id
        toc_link['href'] = '#' + anchor_id
        number += 1


def main(args):
    """Post-process an exported HTML report and write the result to disk.

    Steps, in order:
      1. Replace the custom CSS embedded in the HTML with the contents of
         the CSS file.
      2. Convert every element with class ``table-caption`` into a
         ``<caption>`` of the table that follows it.
      3. Give the anchors referenced by the h2 and h3 table-of-contents
         entries ids of the form ``h2-<n>`` / ``h3-<n>`` and point the TOC
         links at them.

    Args:
        args: Argument vector in ``sys.argv`` form, forwarded to ``getargs``.
    """
    origin_file, css_file, output_file = getargs(args)

    with open(origin_file, encoding='utf-8') as f:
        origin_html = f.read()
    with open(css_file, encoding='utf-8') as f:
        custom_css = f.read()

    soup = bfs(_restore_custom_css(origin_html, custom_css), 'lxml')
    _attach_table_captions(soup)
    for level in ('h2', 'h3'):
        _renumber_toc_links(soup, level)

    # No "À0" -> "\3000" patch-up is needed here: that corruption came from
    # re.sub's replacement-template escapes, which _restore_custom_css avoids.
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(str(soup))
# Script entry point: hand the raw command line to main() when executed
# directly rather than imported.
if __name__ == "__main__":
    main(sys.argv)