-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfiles_manager.py
120 lines (106 loc) · 4.35 KB
/
files_manager.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
import io
import json
import os
import shutil
import tempfile
from collections import deque

import mammoth
import pypandoc
import requests
from pdfminer.high_level import extract_text as fallback_text_extraction
from pypdf import PdfReader
def manage_pdf(file_content, timeout=60):
    """Download a PDF and return its plain text.

    The document is fetched from ``file_content.download_url`` and staged in
    a private temporary directory, so no file in the current working
    directory is ever overwritten or deleted. Text is extracted with pypdf
    first; if pypdf fails for any reason, the whole document is extracted
    again with pdfminer.

    Args:
        file_content: Object exposing a ``download_url`` attribute
            (presumably a PyGithub ``ContentFile`` -- confirm against caller).
        timeout: Seconds to wait for the HTTP server before giving up.

    Returns:
        The extracted text (one newline appended after each page on the
        pypdf path), or an empty string when both extractors fail.

    Raises:
        requests.RequestException: If the download fails, times out, or the
            server answers with an HTTP error status.
    """
    response = requests.get(
        file_content.download_url,
        params={"downloadformat": "pdf"},
        timeout=timeout,
    )
    # Fail loudly on 4xx/5xx instead of trying to parse an error page as PDF.
    response.raise_for_status()

    with tempfile.TemporaryDirectory() as staging_dir:
        # Both extractors are given a path, so the bytes are staged on disk;
        # the directory and its content are removed when the block exits.
        pdf_path = os.path.join(staging_dir, "download.pdf")
        with open(pdf_path, mode="wb") as pdf_file:
            pdf_file.write(response.content)

        try:
            reader = PdfReader(pdf_path)
            text = "".join(page.extract_text() + "\n" for page in reader.pages)
            reader.close()
        except Exception:
            # Best effort: pypdf could not handle the document, so discard
            # any partial result and let pdfminer try the whole file.
            try:
                text = fallback_text_extraction(pdf_path)
            except Exception:
                text = ""
    return text
def manage_word(file_content, timeout=60):
    """Download a .docx document and convert it to Markdown with mammoth.

    The document is converted straight from memory, so nothing is written
    to (or deleted from) the current working directory.

    Args:
        file_content: Object exposing a ``download_url`` attribute
            (presumably a PyGithub ``ContentFile`` -- confirm against caller).
        timeout: Seconds to wait for the HTTP server before giving up.

    Returns:
        The mammoth conversion result; the Markdown text is in its
        ``value`` attribute.

    Raises:
        requests.RequestException: If the download fails, times out, or the
            server answers with an HTTP error status.
    """
    response = requests.get(
        file_content.download_url,
        params={"downloadformat": "docx"},
        timeout=timeout,
    )
    # Fail loudly on 4xx/5xx instead of handing an error page to mammoth.
    response.raise_for_status()

    # mammoth reads from any binary file-like object, so an in-memory
    # buffer replaces the temporary file on disk.
    return mammoth.convert_to_markdown(io.BytesIO(response.content))
def manage_json(file_content, timeout=60):
    """Download a JSON file and return it as a Markdown fenced code block.

    The payload is pretty-printed when it parses as JSON; otherwise the raw
    text is embedded unchanged. The result is plain Markdown and does not
    require pandoc.

    NOTE(review): the previous implementation called
    ``pypandoc.convert_text(text, 'json', 'md')``. That function's
    positional order is ``(source, to, format)``, so it converted *from*
    Markdown *to* pandoc-AST JSON -- the reverse of what a ``.md`` export
    needs -- and pandoc's ``json`` reader cannot ingest arbitrary JSON
    anyway. Confirm that a fenced ``json`` block is the desired rendition.

    Args:
        file_content: Object exposing a ``download_url`` attribute
            (presumably a PyGithub ``ContentFile`` -- confirm against caller).
        timeout: Seconds to wait for the HTTP server before giving up.

    Returns:
        A Markdown string containing the JSON inside a fenced code block.

    Raises:
        requests.RequestException: If the download fails, times out, or the
            server answers with an HTTP error status.
        UnicodeDecodeError: If the payload is not valid UTF-8.
    """
    response = requests.get(
        file_content.download_url,
        params={"downloadformat": "json"},
        timeout=timeout,
    )
    response.raise_for_status()

    # "utf-8-sig" also accepts plain UTF-8 and strips a leading BOM, which
    # json.loads would otherwise reject.
    raw_text = response.content.decode("utf-8-sig")
    try:
        body = json.dumps(json.loads(raw_text), indent=2, ensure_ascii=False)
    except ValueError:
        # Not valid JSON: keep the text as-is rather than losing the file.
        body = raw_text

    # Grow the fence until it cannot be terminated by backticks that occur
    # inside the payload itself.
    fence = "```"
    while fence in body:
        fence += "`"
    return f"{fence}json\n{body}\n{fence}\n"
def _export_as_markdown(file_content, out_dir):
    """Write the Markdown rendition of one repository file into ``out_dir``.

    Returns True when a file was written, False when the name has no
    extension or the extension is not one of the handled types.
    """
    # Split on the LAST dot so "my.notes.md" is recognised as Markdown and
    # "report.v2.pdf" is exported as "report.v2.md".
    stem, dot, ext = file_content.name.rpartition('.')
    if not dot:
        return False
    ext = ext.lower()
    converted_path = os.path.join(out_dir, stem + '.md')

    if ext == 'md':
        # Markdown is copied verbatim under its original file name.
        with open(os.path.join(out_dir, file_content.name), 'wb') as out:
            out.write(file_content.decoded_content)
    elif ext == 'txt':
        with open(converted_path, 'wb') as out:
            out.write(file_content.decoded_content)
    elif ext == 'pdf':
        # Convert before opening the target so a failed conversion does not
        # leave an empty file behind.
        text = manage_pdf(file_content)
        with open(converted_path, 'w', encoding="utf-8") as out:
            out.write(text)
    elif ext == 'docx':
        result = manage_word(file_content)
        with open(converted_path, 'w', encoding="utf-8") as out:
            out.write(result.value)
    elif ext == 'json':
        result = manage_json(file_content)
        with open(converted_path, 'w', encoding="utf-8") as out:
            out.write(result)
    else:
        return False
    return True


def run(github_access, organization="FAIRiCUBE"):
    """Export the documents of every repository of an organization as Markdown.

    Each repository is walked breadth-first. Files with a ``pdf``, ``md``,
    ``txt``, ``docx`` or ``json`` extension (case-insensitive) are written
    to ``repos/<repo name>/files/``; everything else is skipped. The output
    is flat: files from different directories that map to the same output
    name overwrite each other. A repository's ``files`` directory is removed
    again if it is still empty after the walk.

    A failure on a single file is reported on stdout and does not stop the
    export.

    Args:
        github_access: Client exposing ``get_organization(name)``
            (presumably a PyGithub ``Github`` instance -- confirm).
        organization: Name of the organization whose repositories are
            exported.

    Returns:
        The number of files written.
    """
    org = github_access.get_organization(organization)
    files = 0
    for repo in org.get_repos():
        out_dir = os.path.join('repos', repo.name, 'files')
        os.makedirs(out_dir, exist_ok=True)

        # FIFO queue of entries still to visit; directories enqueue their
        # children, which keeps the original breadth-first order.
        pending = deque(repo.get_contents(""))
        while pending:
            file_content = pending.popleft()
            if file_content.type == "dir":
                pending.extend(repo.get_contents(file_content.path))
                continue
            try:
                if _export_as_markdown(file_content, out_dir):
                    files += 1
            except Exception as err:
                # Best effort per file: report the actual error and go on.
                print(f"{type(err).__name__}: {err}")
                print(file_content.name)

        if not os.listdir(out_dir):
            shutil.rmtree(out_dir)
    return files