Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

添加“解析PDF到md(MinerU)”插件 #2112

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
69 changes: 37 additions & 32 deletions crazy_functional.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from toolbox import trimmed_format_exc
from loguru import logger


def get_crazy_functions():
from crazy_functions.读文章写摘要 import 读文章写摘要
from crazy_functions.生成函数注释 import 批量生成函数注释
Expand Down Expand Up @@ -105,7 +106,7 @@ def get_crazy_functions():
"Group": "对话",
"Color": "stop",
"AsButton": False,
"Info" : "基于当前对话或文件生成多种Mermaid图表,图表类型由模型判断",
"Info": "基于当前对话或文件生成多种Mermaid图表,图表类型由模型判断",
"Function": None,
"Class": Mermaid_Gen
},
Expand All @@ -115,7 +116,7 @@ def get_crazy_functions():
"AsButton": True,
"Info": "Arixv论文精细翻译 | 输入参数arxiv论文的ID,比如1812.10695",
"Function": HotReload(Latex翻译中文并重新编译PDF), # 当注册Class后,Function旧接口仅会在“虚空终端”中起作用
"Class": Arxiv_Localize, # 新一代插件需要注册Class
"Class": Arxiv_Localize, # 新一代插件需要注册Class
},
"批量总结Word文档": {
"Group": "学术",
Expand Down Expand Up @@ -229,8 +230,8 @@ def get_crazy_functions():
"Color": "stop",
"AsButton": True,
"Info": "保存当前的对话 | 不需要输入参数",
"Function": HotReload(对话历史存档), # 当注册Class后,Function旧接口仅会在“虚空终端”中起作用
"Class": Conversation_To_File_Wrap # 新一代插件需要注册Class
"Function": HotReload(对话历史存档), # 当注册Class后,Function旧接口仅会在“虚空终端”中起作用
"Class": Conversation_To_File_Wrap # 新一代插件需要注册Class
},
"[多线程Demo]解析此项目本身(源码自译解)": {
"Group": "对话|编程",
Expand All @@ -245,22 +246,22 @@ def get_crazy_functions():
"AsButton": True, # 加入下拉菜单中
# "Info": "连接网络回答问题(需要访问谷歌)| 输入参数是一个问题",
"Function": HotReload(连接网络回答问题),
"Class": NetworkGPT_Wrap # 新一代插件需要注册Class
"Class": NetworkGPT_Wrap # 新一代插件需要注册Class
},
"历史上的今天": {
"Group": "对话",
"Color": "stop",
"AsButton": False,
"Info": "查看历史上的今天事件 (这是一个面向开发者的插件Demo) | 不需要输入参数",
"Function": None,
"Class": Demo_Wrap, # 新一代插件需要注册Class
"Class": Demo_Wrap, # 新一代插件需要注册Class
},
"精准翻译PDF论文": {
"Group": "学术",
"Color": "stop",
"AsButton": True,
"Info": "精准翻译PDF论文为中文 | 输入参数为路径",
"Function": HotReload(批量翻译PDF文档), # 当注册Class后,Function旧接口仅会在“虚空终端”中起作用
"Function": HotReload(批量翻译PDF文档), # 当注册Class后,Function旧接口仅会在“虚空终端”中起作用
"Class": PDF_Tran, # 新一代插件需要注册Class
},
"询问多个GPT模型": {
Expand Down Expand Up @@ -354,7 +355,7 @@ def get_crazy_functions():
r'If the term "agent" is used in this section, it should be translated to "智能体". ',
"Info": "Arixv论文精细翻译 | 输入参数arxiv论文的ID,比如1812.10695",
"Function": HotReload(Latex翻译中文并重新编译PDF), # 当注册Class后,Function旧接口仅会在“虚空终端”中起作用
"Class": Arxiv_Localize, # 新一代插件需要注册Class
"Class": Arxiv_Localize, # 新一代插件需要注册Class
},
"📚本地Latex论文精细翻译(上传Latex项目)[需Latex]": {
"Group": "学术",
Expand All @@ -376,8 +377,8 @@ def get_crazy_functions():
r"例如当单词'agent'翻译不准确时, 请尝试把以下指令复制到高级参数区: "
r'If the term "agent" is used in this section, it should be translated to "智能体". ',
"Info": "PDF翻译中文,并重新编译PDF | 输入参数为路径",
"Function": HotReload(PDF翻译中文并重新编译PDF), # 当注册Class后,Function旧接口仅会在“虚空终端”中起作用
"Class": PDF_Localize # 新一代插件需要注册Class
"Function": HotReload(PDF翻译中文并重新编译PDF), # 当注册Class后,Function旧接口仅会在“虚空终端”中起作用
"Class": PDF_Localize # 新一代插件需要注册Class
}
}

Expand All @@ -388,7 +389,7 @@ def get_crazy_functions():
"Color": "stop",
"AsButton": False,
"Info": "使用 DALLE2/DALLE3 生成图片 | 输入参数字符串,提供图像的内容",
"Function": HotReload(图片生成_DALLE2), # 当注册Class后,Function旧接口仅会在“虚空终端”中起作用
"Function": HotReload(图片生成_DALLE2), # 当注册Class后,Function旧接口仅会在“虚空终端”中起作用
"Class": ImageGen_Wrap # 新一代插件需要注册Class
},
}
Expand All @@ -407,14 +408,6 @@ def get_crazy_functions():
}
)









# -=--=- 尚未充分测试的实验性插件 & 需要额外依赖的插件 -=--=-
try:
from crazy_functions.下载arxiv论文翻译摘要 import 下载arxiv论文并翻译摘要
Expand Down Expand Up @@ -475,7 +468,8 @@ def get_crazy_functions():
"Color": "stop",
"AsButton": False,
"AdvancedArgs": True, # 调用时,唤起高级参数输入区(默认False)
"ArgsReminder": '输入时用逗号隔开, *代表通配符, 加了^代表不匹配; 不输入代表全部匹配。例如: "*.c, ^*.cpp, config.toml, ^*.toml"', # 高级参数输入区的显示提示
"ArgsReminder": '输入时用逗号隔开, *代表通配符, 加了^代表不匹配; 不输入代表全部匹配。例如: "*.c, ^*.cpp, config.toml, ^*.toml"',
# 高级参数输入区的显示提示
"Function": HotReload(解析任意code项目),
},
}
Expand Down Expand Up @@ -503,8 +497,6 @@ def get_crazy_functions():
logger.error(trimmed_format_exc())
logger.error("Load function plugin failed")



try:
from crazy_functions.总结音视频 import 总结音视频

Expand Down Expand Up @@ -617,7 +609,6 @@ def get_crazy_functions():
logger.error(trimmed_format_exc())
logger.error("Load function plugin failed")


try:
from toolbox import get_conf

Expand Down Expand Up @@ -726,6 +717,23 @@ def get_crazy_functions():
logger.error(trimmed_format_exc())
logger.error("Load function plugin failed")

try:
from crazy_functions.PDF_Convert import 解析PDF文档

function_plugins.update({
"解析PDF到md(MinerU)": {
"Group": "学术",
"Color": "stop",
"AsButton": False,
"AdvancedArgs": True,
"ArgsReminder": "请输入Conda环境名称,默认为“MinerU”",
"Info": "上传PDF,并转换为Markdown | 输入参数为Anaconda环境名称",
"Function": HotReload(解析PDF文档),
}
})
except:
logger.error(trimmed_format_exc())
logger.error("Load function plugin failed")

# try:
# from crazy_functions.高级功能函数模板 import 测试图表渲染
Expand All @@ -741,7 +749,6 @@ def get_crazy_functions():
# logger.error(trimmed_format_exc())
# print('Load function plugin failed')


"""
设置默认值:
- 默认 Group = 对话
Expand All @@ -762,21 +769,19 @@ def get_crazy_functions():
return function_plugins




def get_multiplex_button_functions():
    """Map each mode of the multiplexed main submit button to a plugin name.

    Returns:
        dict: button label -> name of the plugin to invoke. The first entry
        maps to an empty string.
    """
    # (button label, plugin name) pairs, in display order. The non-empty
    # plugin names refer to plugins registered in `get_crazy_functions`.
    label_plugin_pairs = (
        ("常规对话", ""),
        ("多模型对话", "询问多个GPT模型"),
        ("智能召回 RAG", "Rag智能召回"),
        ("多媒体查询", "多媒体智能体"),
    )
    return dict(label_plugin_pairs)
79 changes: 79 additions & 0 deletions crazy_functions/PDF_Convert.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
from toolbox import CatchException, report_exception, get_log_folder, gen_time_str
from toolbox import update_ui, promote_file_to_downloadzone, update_ui_lastest_msg, disable_auto_promotion
from crazy_functions.crazy_utils import request_gpt_model_in_new_thread_with_ui_alive
from crazy_functions.crazy_utils import request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency
from crazy_functions.crazy_utils import read_and_clean_pdf_text
from .pdf_fns.parse_pdf import parse_pdf, get_avail_grobid_url, translate_pdf
from shared_utils.colorful import *
import copy
import os
import math
import logging
import time


@CatchException
def 解析PDF文档(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, user_request):
    """Plugin entry point: convert the PDF file(s) located via ``txt`` to Markdown with MinerU.

    Args:
        txt: Text from the input box; passed to ``get_files_from_everything``
            to locate ``.pdf`` files.
        llm_kwargs: Forwarded unchanged to ``解析PDF_基于MinerU``.
        plugin_kwargs: Forwarded unchanged to ``解析PDF_基于MinerU``.
        chatbot: Chat display handle; status messages are appended to it.
        history: Chat history; replaced by an empty list before processing.
        system_prompt: Forwarded unchanged to ``解析PDF_基于MinerU``.
        user_request: Not used by this function.

    Yields:
        Whatever ``update_ui`` and the worker generator yield (UI refreshes).
    """
    disable_auto_promotion(chatbot)
    # Basic information: what the plugin does, how to set it up, and who contributed it.
    chatbot.append([
        "函数插件功能?",
        "使用`MinerU`解析PDF文档到Markdown。(支持版本-1.0.1)\n\n"
        "由于MinerU环境与gpt_academic冲突,需要事先创建好名字为`MinerU`的Conda环境。\n\n"
        "安装命令如下:\n\n"
        "```sh\n"
        "conda create -n MinerU python=3.10\n"
        "conda activate MinerU\n"
        "pip install -U 'magic-pdf[full]' --extra-index-url https://wheels.myhloli.com\n```\n\n"
        "默认使用CPU,使用GPU加速至少需要8GB显存,需要修改 `~/magic-pdf.json` 中的 `device-mode` 为 `cuda`\n\n"
        "函数插件贡献者: Xunge-Jiang"])
    yield from update_ui(chatbot=chatbot, history=history)  # refresh the UI

    # Clear the history so the input does not overflow.
    history = []

    from crazy_functions.crazy_utils import get_files_from_everything
    success, file_manifest, project_folder = get_files_from_everything(txt, type='.pdf')

    # os.path.basename understands the platform's separators, so only file
    # names (not whole paths) are shown on Windows as well as POSIX.
    chatbot.append(["文件列表:", ", ".join([os.path.basename(e) for e in file_manifest])])
    yield from update_ui(chatbot=chatbot, history=history)

    # With an empty input box, substitute a readable placeholder for the
    # error message below.
    if not success and txt == "":
        txt = '空空如也的输入栏'

    # Nothing to do when no PDF file was found.
    if not file_manifest:
        report_exception(chatbot, history,
                         a=f"解析项目: {txt}", b=f"找不到任何.pdf拓展名的文件: {txt}")
        yield from update_ui(chatbot=chatbot, history=history)  # refresh the UI
        return

    # Start the actual work.
    yield from 解析PDF_基于MinerU(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history,
                              system_prompt)


def 解析PDF_基于MinerU(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt):
    """Convert every PDF in ``file_manifest`` to Markdown through ``mineru_interface``.

    For each PDF the resulting Markdown file and zip archive are offered in
    the download zone. Entries without a ``.pdf`` suffix are reported in the
    chat and skipped.

    Args:
        file_manifest: Iterable of file paths to process.
        project_folder: Not used by this function.
        llm_kwargs: Not used by this function.
        plugin_kwargs: Mapping; ``"advanced_arg"`` (if present and non-empty)
            is used as the Conda environment name, otherwise ``'MinerU'``.
        chatbot: Chat display handle; progress messages are appended to it.
        history: Chat history passed through to the UI refresh calls.
        system_prompt: Not used by this function.

    Yields:
        Whatever ``update_ui`` and ``mineru_parse_pdf`` yield (UI refreshes).
    """
    from crazy_functions.crazy_utils import mineru_interface
    mineru_handle = mineru_interface()

    # Resolve the environment name once, without mutating the caller's dict.
    # A missing or empty "advanced_arg" falls back to the default name.
    conda_env = plugin_kwargs.get("advanced_arg") or 'MinerU'

    for fp in file_manifest:
        # Case-insensitive match on the real extension (dot included).
        if not fp.lower().endswith('.pdf'):
            chatbot.append(["当前论文无法解析:", fp])
            yield from update_ui(chatbot=chatbot, history=history)
            continue

        chatbot.append(["当前进度:", "正在解析论文,请稍候。"])
        yield from update_ui(chatbot=chatbot, history=history)  # refresh the UI
        md_path, zip_path = yield from mineru_handle.mineru_parse_pdf(fp, chatbot, history, conda_env)
        chatbot.append(("成功啦", '请查收结果...'))
        yield from update_ui(chatbot=chatbot, history=history)
        time.sleep(1)  # short pause before publishing the files (original note: refresh the UI)
        promote_file_to_downloadzone(md_path, rename_file=None, chatbot=chatbot)
        promote_file_to_downloadzone(zip_path, rename_file=None, chatbot=chatbot)

    yield from update_ui(chatbot=chatbot, history=history)  # refresh the UI
113 changes: 113 additions & 0 deletions crazy_functions/crazy_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -632,6 +632,119 @@ def NOUGAT_parse_pdf(self, fp, chatbot, history):
return res[0]


@Singleton
class mineru_interface():
    """Run the MinerU ``magic-pdf`` command line tool inside a separate Conda environment.

    Conversions are serialized with ``self.threadLock`` so that only one
    ``magic-pdf`` process started through a given instance runs at a time.
    NOTE(review): ``Singleton`` is defined elsewhere in this file; presumably
    it makes every ``mineru_interface()`` call return the same instance.
    """

    def __init__(self):
        # Guards the conversion section of `mineru_parse_pdf`.
        self.threadLock = threading.Lock()

    def mineru_with_timeout(self, command, cwd, timeout=3600, env=None):
        """Run ``command`` through a shell and wait for it to finish.

        Args:
            command: Shell command line to execute.
            cwd: Working directory for the child process.
            timeout: Maximum number of seconds to wait.
            env: Environment for the child process; defaults to ``os.environ``.

        Returns:
            bool: True when the process exits with code 0; False when it
            times out or exits with a non-zero code (the reason is logged).
        """
        import platform
        import subprocess
        from toolbox import ProxyNetworkActivate

        # Bash is needed for `source` on Linux/Mac; on Windows let
        # subprocess pick the default shell, since /bin/bash does not exist.
        shell_executable = None if platform.system() == "Windows" else "/bin/bash"

        try:
            with ProxyNetworkActivate("MinerU"):
                process = subprocess.Popen(
                    command,
                    shell=True,
                    cwd=cwd,
                    env=os.environ if env is None else env,
                    executable=shell_executable,
                    # Capture stderr so a failure can actually be reported;
                    # without a pipe `communicate()` returns None for it.
                    stderr=subprocess.PIPE,
                )
                _, stderr = process.communicate(timeout=timeout)

        except subprocess.TimeoutExpired:
            process.kill()
            process.communicate()  # reap the killed process
            logger.error("Process timed out!")
            return False

        # Check the return code.
        if process.returncode != 0:
            error_text = stderr.decode(errors="replace") if stderr else ""
            logger.error(f"Command failed with return code {process.returncode}: {error_text}")
            return False
        return True

    def compress_to_zip(self, dst):
        """Zip the directory ``dst`` and return the path of the archive.

        The archive is written next to ``dst`` as ``<dst name>.zip``.
        """
        import shutil
        from pathlib import Path

        dst_path = Path(dst).resolve()  # absolute path of the directory to pack
        zip_path = dst_path.parent / f"{dst_path.name}.zip"
        # make_archive appends ".zip" to base_name itself.
        shutil.make_archive(base_name=str(dst_path),
                            format='zip',
                            root_dir=str(dst_path))

        return str(zip_path)

    def _find_conda_base(self):
        """Return the root directory of the Conda installation.

        Raises:
            RuntimeError: neither ``CONDA_EXE`` nor ``CONDA_PREFIX`` is set.
        """
        conda_exe = os.environ.get("CONDA_EXE")
        if conda_exe:
            # CONDA_EXE points at <base>/bin/conda (or <base>\Scripts\conda.exe),
            # i.e. two levels below the installation root.
            return os.path.dirname(os.path.dirname(conda_exe))

        conda_prefix = os.environ.get("CONDA_PREFIX")
        if not conda_prefix:
            raise RuntimeError(
                "Cannot locate Conda: neither CONDA_EXE nor CONDA_PREFIX is set."
            )
        conda_prefix = conda_prefix.rstrip("/\\")
        parent_dir = os.path.dirname(conda_prefix)
        # A named environment lives in <base>/envs/<name>; otherwise the
        # active prefix is taken to be the base installation itself.
        if os.path.basename(parent_dir) == "envs":
            return os.path.dirname(parent_dir)
        return conda_prefix

    def _shell_quote(self, arg):
        """Quote one argument for the shell used by ``mineru_with_timeout``."""
        import platform
        if platform.system() == "Windows":
            import subprocess
            # NOTE(review): list2cmdline protects spaces and quotes but does
            # not neutralise cmd.exe metacharacters such as `&` — confirm
            # whether stricter validation is wanted on Windows.
            return subprocess.list2cmdline([arg])
        import shlex
        return shlex.quote(arg)

    def get_conda_activate_command(self):
        """Build the command prefix that activates a Conda environment.

        The environment name must be appended by the caller.

        Returns:
            str: command text ending in ``activate `` (with trailing space).

        Raises:
            RuntimeError: the Conda installation cannot be located.
            FileNotFoundError: the Conda initialization script is missing.
        """
        import platform

        conda_base_path = self._find_conda_base()

        if platform.system() == "Windows":
            # Conda initialization script on Windows.
            conda_init = os.path.join(conda_base_path, "condabin", "conda.bat")
            if not os.path.exists(conda_init):
                raise FileNotFoundError(f"找不到 conda 初始化脚本: {conda_init}")
            return f'call "{conda_init}" activate '

        # Conda initialization script on Linux/Mac.
        conda_init = os.path.join(conda_base_path, "etc", "profile.d", "conda.sh")
        if not os.path.exists(conda_init):
            raise FileNotFoundError(f"找不到 conda 初始化脚本: {conda_init}")
        # Quote the path so installations under directories with spaces work.
        return f"source {self._shell_quote(conda_init)} && conda activate "

    def mineru_parse_pdf(self, fp, chatbot, history, conda_env):
        """Convert one PDF with ``magic-pdf`` (generator; drive it with ``yield from``).

        Args:
            fp: Path of the PDF file.
            chatbot: Chat display handle used for progress messages.
            history: Chat history passed through to the UI refresh calls.
            conda_env: Name of the Conda environment that has MinerU installed.

        Yields:
            Whatever ``update_ui_lastest_msg`` yields (UI refreshes).

        Returns:
            tuple: ``(md_path, zip_path)`` — the first Markdown file produced
            and a zip archive of the output directory.

        Raises:
            RuntimeError: no Markdown output was produced, or Conda cannot be located.
            FileNotFoundError: the Conda initialization script is missing.
        """
        import glob
        from toolbox import update_ui_lastest_msg
        from toolbox import get_log_folder, gen_time_str

        yield from update_ui_lastest_msg("正在解析论文, 请稍候。进度:正在排队, 等待线程锁...",
                                         chatbot=chatbot, history=history, delay=0)
        # `with` guarantees the lock is released on every exit path,
        # including exceptions and the generator being closed mid-way.
        with self.threadLock:
            dst = os.path.join(get_log_folder(plugin_name='mineru'), gen_time_str())
            os.makedirs(dst)

            yield from update_ui_lastest_msg("正在解析论文, 请稍候。进度:正在加载MinerU... ",
                                             chatbot=chatbot, history=history, delay=0)
            command = ['magic-pdf', '-p', os.path.abspath(fp), '-o', os.path.abspath(dst)]
            # Quote every argument (spaces and other special characters).
            safe_command = ' '.join(self._shell_quote(arg) for arg in command)
            yield from update_ui_lastest_msg(f"正在执行命令 {safe_command} 在 Conda 环境 '{conda_env}' 中。",
                                             chatbot=chatbot, history=history, delay=0)
            # The environment name comes from user input, so it is quoted too.
            activate_command = self.get_conda_activate_command()
            activate_command += f"{self._shell_quote(conda_env)} && {safe_command}"
            logger.info(f"正在执行命令 {activate_command}")

            # Pin the child to GPU 0 through its environment instead of a
            # `VAR=value cmd` prefix, which only works in POSIX shells.
            child_env = dict(os.environ, CUDA_VISIBLE_DEVICES="0")
            self.mineru_with_timeout(activate_command, cwd=os.getcwd(), timeout=3600, env=child_env)

            # magic-pdf writes to <dst>/<pdf name without extension>/auto/*.md
            name_without_ext = os.path.splitext(os.path.basename(fp))[0]
            new_dst_dir = os.path.join(dst, name_without_ext)

            # Escape the directory part so PDF names containing glob
            # metacharacters (e.g. "[1]") still match.
            auto_dir = glob.escape(os.path.join(new_dst_dir, 'auto'))
            res = glob.glob(os.path.join(auto_dir, '*.md'))
            if not res:
                raise RuntimeError("MinerU解析论文失败。")

        md_path = res[0]
        zip_path = self.compress_to_zip(new_dst_dir)
        return md_path, zip_path


def try_install_deps(deps, reload_m=[]):
Expand Down