-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathdocx_parser.py
106 lines (96 loc) · 4.17 KB
/
docx_parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
#!/usr/bin/env python3
#
# docx_parser.py
#
# VERSION: 2.0.0
# UPDATED: 2021-12-20
#
##############################################################################
# PUBLIC DOMAIN NOTICE #
##############################################################################
# This software is freely available to the public for use. #
# #
# Although all reasonable efforts have been taken to ensure the accuracy and #
# reliability of the software, the author does not and cannot warrant the #
# performance or results that may be obtained by using this software. #
# The author disclaims all warranties, express or implied, including #
# warranties of performance, merchantability or fitness for any particular #
# purpose. #
# #
# Please cite the author in any work or product based on this material. #
# Tyler W. Davis, PhD #
# https://github.com/dt-woods/ #
##############################################################################
#
##############################################################################
# REQUIRED MODULES
##############################################################################
import docx
from docx_utils import delete_paragraph
from docx_utils import find_word_files
from docx_utils import list_paragraph_styles
##############################################################################
# FUNCTIONS
##############################################################################
def get_title(text):
    """
    Name:     get_title
    Inputs:   str, paragraph text taken from a break-style heading (text)
    Outputs:  str, output file name ending in ".docx"
    Features: Builds an output file name from heading text; spaces,
              control characters, and characters that are unsafe in file
              names are each replaced with an underscore.

    TODO: create a specialized method of handling text from break styles
          to return a title for the output file.
    """
    # Path separators would make the later save() point into a (probably
    # missing) sub-directory, and the other symbols are reserved in Windows
    # file names, so they get the same treatment as spaces.
    unsafe = set(' \\/:*?"<>|')
    cleaned = "".join(
        "_" if (ch in unsafe or ord(ch) < 32) else ch for ch in text
    )
    return "%s.docx" % cleaned
def parse_file(doc, style, idx):
    """
    Name:     parse_file
    Inputs:   - str, file path to .docx (doc)
              - str, the .docx paragraph style ID to break on (style)
              - int, the index of style ID to parse; zero indexed (idx)
    Outputs:  None; the trimmed document is saved to disk
    Features: Opens the document, keeps the idx-th paragraph of the given
              style together with every paragraph that follows it up to
              the next paragraph of that style, deletes all other
              paragraphs, and saves the result. The output name comes from
              get_title applied to the kept heading's text, or is
              "DOCUMENT-<idx>.docx" when no heading with that index exists.
    Depends:  - delete_paragraph
              - get_title
    """
    d = docx.Document(doc)
    out_name = None
    match_count = 0     # number of break-style paragraphs seen so far
    keeping = False     # True while inside the section selected by idx

    # NOTE(review): paragraphs are deleted while looping over d.paragraphs;
    # this presumably relies on that attribute being a list snapshot --
    # confirm against python-docx and delete_paragraph.
    for para in d.paragraphs:
        if para.style.style_id == style:
            if match_count == idx:
                # Start of the requested section: keep this heading and
                # use its text to name the output file.
                keeping = True
                out_name = get_title(para.text)
            else:
                # Any other break-style heading closes the kept section
                # (if open) and is itself removed.
                keeping = False
                delete_paragraph(para)
            match_count += 1
        elif not keeping:
            delete_paragraph(para)

    if out_name is None:
        # No heading matched idx; fall back to an index-based name.
        out_name = "DOCUMENT-%d.docx" % (idx)
    d.save(out_name)
##############################################################################
# MAIN
##############################################################################
if __name__ == '__main__':
    # Script entry point: locate one input .docx, then write one output
    # document per paragraph that uses the break style.

    # User inputs:
    my_dir = "examples"     # where to look for the input document
    my_key = "example-1"    # keyword for finding the right input document
    br_style = "Heading1"   # the paragraph style used to parse the input document

    # Step 1: find the input word file(s)
    my_files = find_word_files(my_dir, my_key)

    # Start from None so the check below is defined on every branch;
    # previously the "several files" branch left my_file unassigned and
    # the script crashed with a NameError.
    my_file = None
    if len(my_files) == 1:
        my_file = my_files[0]
    elif len(my_files) > 1:
        print("Found several word files; "
              "please use keywords to specify the one you want.")
    else:
        print("Failed to find docx. Please check and try again.")

    if my_file:
        # Step 2 - Find all styles and see if break style is there
        my_doc = docx.Document(my_file)
        my_styles = list_paragraph_styles(my_doc)
        if br_style in my_styles:
            # Step 3 - For each break style, parse:
            for i in range(my_styles[br_style]['count']):
                parse_file(my_file, br_style, i)
        else:
            # Report the no-op instead of exiting silently.
            print("Break style '%s' not found in %s." % (br_style, my_file))