From 8dbfeddfe47c16b52afc8a3ab01199ee659101f0 Mon Sep 17 00:00:00 2001 From: panyy Date: Wed, 24 Jun 2026 10:19:11 +0800 Subject: [PATCH] =?UTF-8?q?feat(=E6=80=9D=E7=BB=B4=E5=AF=BC=E5=9B=BE?= =?UTF-8?q?=E5=8A=A9=E6=89=8B)=EF=BC=9A=E6=80=9D=E7=BB=B4=E5=AF=BC?= =?UTF-8?q?=E5=9B=BE=E5=8A=A9=E6=89=8B=E5=A2=9E=E5=8A=A0=E6=99=BA=E8=83=BD?= =?UTF-8?q?=E6=95=B4=E7=90=86=E6=80=BB=E7=BB=93-=E5=88=86=E6=89=B9?= =?UTF-8?q?=E6=AC=A1=E6=95=B4=E7=90=86?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mineru/cli/fast_api.py | 174 ++++++++++++++++-- web_ui/src/components/ConfigPanel.vue | 82 ++++----- web_ui/src/components/MindMapRenderer.vue | 5 + .../src/composables/useDocumentProcessor.ts | 41 ++++- web_ui/src/views/DocumentProcessor.vue | 20 +- 5 files changed, 257 insertions(+), 65 deletions(-) diff --git a/mineru/cli/fast_api.py b/mineru/cli/fast_api.py index 2a49f65..eeaaee8 100644 --- a/mineru/cli/fast_api.py +++ b/mineru/cli/fast_api.py @@ -388,8 +388,117 @@ DEFAULT_MINDMAP_ORGANIZE_PROMPT = """你是文档结构整理助手。请基于 8. 节点标题尽量简短,正文说明使用短句列表。 9. 只输出 Markdown,不要输出解释、代码块围栏或额外说明。""" +MINDMAP_MERGE_PROMPT = """你是 Markdown 思维导图结构校对助手。下面是多个分块已经整理总结后的局部 Markdown 大纲。 -def _call_mindmap_llm(markdown: str, mode: str = "smart", custom_prompt: Optional[str] = None, task_id: Optional[str] = None) -> str: +任务: +1. 合并这些局部大纲为一份完整 Markdown。 +2. 只检查和调整标题层级结构、顺序、重复标题和父子关系。 +3. 不要重新总结正文,不要扩写内容,不要新增原文没有的信息。 +4. 保留各局部大纲中的源语言:英文保持英文,中文保持中文,多语言分别保留。 +5. 最大层级不超过 4 层。 +6. 每个父节点下最多 8 个子节点,必要时只合并相近标题。 +7. 只输出 Markdown,不要输出解释、代码块围栏或额外说明。""" + + +def _estimate_tokens(text: str) -> int: + ascii_chars = 0 + non_ascii_chars = 0 + for ch in text: + if ch.isspace(): + continue + if ord(ch) < 128: + ascii_chars += 1 + else: + non_ascii_chars += 1 + return max(1, int(ascii_chars / 4) + int(non_ascii_chars * 1.5)) + + +def _get_mindmap_context_budget(prompt: str, reserve_output_tokens: int = 4096) -> tuple[int, int]: + max_context_tokens = int(os.getenv("MINDMAP_LLM_MAX_CONTEXT_TOKENS", "32768")) + prompt_tokens = _estimate_tokens(prompt) + safety_tokens = int(os.getenv("MINDMAP_LLM_SAFETY_TOKENS", "1024")) + input_budget_tokens = max(2048, max_context_tokens - prompt_tokens - reserve_output_tokens - safety_tokens) + logger.info( + "Mindmap context budget max_context_tokens={} prompt_tokens={} reserve_output_tokens={} safety_tokens={} input_budget_tokens={}", + max_context_tokens, prompt_tokens, reserve_output_tokens, safety_tokens, input_budget_tokens + ) + return max_context_tokens, input_budget_tokens + + +def _split_markdown_blocks(markdown: str) -> list[str]: + lines = markdown.splitlines() + blocks: list[str] = [] + current: list[str] = [] + + heading_pattern = re.compile(r"^#{1,6}\s+") + for line in lines: + if heading_pattern.match(line) and current: + blocks.append("\n".join(current).strip()) + current = [line] + else: + current.append(line) + + if current: + blocks.append("\n".join(current).strip()) + + return [block for block in blocks if block] + + +def _split_large_block(block: str, max_tokens: int) -> list[str]: + paragraphs = re.split(r"\n{2,}", block) + chunks: list[str] = [] + current: list[str] = [] + current_tokens = 0 + + for paragraph in paragraphs: + paragraph = paragraph.strip() + if not paragraph: + continue + paragraph_tokens = _estimate_tokens(paragraph) + if current and current_tokens + paragraph_tokens > max_tokens: + chunks.append("\n\n".join(current)) + current = [paragraph] + current_tokens = paragraph_tokens + else: + current.append(paragraph) + current_tokens += paragraph_tokens + + if current: + chunks.append("\n\n".join(current)) + return chunks or [block] + + +def _chunk_markdown_by_headings(markdown: str, max_tokens: int) -> list[str]: + blocks = _split_markdown_blocks(markdown) + chunks: list[str] = [] + current: list[str] = [] + current_tokens = 0 + + for block in blocks: + block_tokens = _estimate_tokens(block) + if block_tokens > max_tokens: + if current: + chunks.append("\n\n".join(current)) + current = [] + current_tokens = 0 + chunks.extend(_split_large_block(block, max_tokens)) + continue + + if current and current_tokens + block_tokens > max_tokens: + chunks.append("\n\n".join(current)) + current = [block] + current_tokens = block_tokens + else: + current.append(block) + current_tokens += block_tokens + + if current: + chunks.append("\n\n".join(current)) + + return chunks or [markdown] + + +def _call_mindmap_llm(markdown: str, mode: str = "smart", custom_prompt: Optional[str] = None, task_id: Optional[str] = None, request_role: str = "organize") -> str: base_url = os.getenv("MINDMAP_LLM_BASE_URL", "").rstrip("/") model = os.getenv("MINDMAP_LLM_MODEL", "gemma-4-26B") api_key = os.getenv("MINDMAP_LLM_API_KEY", "") @@ -399,10 +508,6 @@ def _call_mindmap_llm(markdown: str, mode: str = "smart", custom_prompt: Optiona raise RuntimeError("未配置智能整理模型服务,请设置 MINDMAP_LLM_BASE_URL") compact_markdown = markdown.strip() - max_chars = int(os.getenv("MINDMAP_LLM_MAX_INPUT_CHARS", "30000")) - if len(compact_markdown) > max_chars: - compact_markdown = compact_markdown[:max_chars] + "\n\n...(后续内容已截断)" - prompt_template = (custom_prompt or "").strip() or DEFAULT_MINDMAP_ORGANIZE_PROMPT prompt = f"""{prompt_template} @@ -410,8 +515,8 @@ def _call_mindmap_llm(markdown: str, mode: str = "smart", custom_prompt: Optiona {compact_markdown} """ logger.info( - "Mindmap LLM request start task_id={} model={} base_url={} mode={} input_chars={} prompt_chars={}", - task_id or "-", model, base_url, mode, len(compact_markdown), len(prompt_template) + "Mindmap LLM request start task_id={} role={} model={} base_url={} mode={} input_chars={} input_tokens_est={} prompt_chars={}", + task_id or "-", request_role, model, base_url, mode, len(compact_markdown), _estimate_tokens(compact_markdown), len(prompt_template) ) payload = { @@ -444,12 +549,60 @@ def _call_mindmap_llm(markdown: str, mode: str = "smart", custom_prompt: Optiona if not organized: raise RuntimeError("智能整理模型未返回有效内容") logger.info( - "Mindmap LLM request completed task_id={} output_chars={}", - task_id or "-", len(organized) + "Mindmap LLM request completed task_id={} role={} output_chars={} output_tokens_est={}", + task_id or "-", request_role, len(organized), _estimate_tokens(organized) ) return organized +def _organize_mindmap_markdown(markdown: str, mode: str, custom_prompt: Optional[str], task_id: str) -> str: + prompt_template = (custom_prompt or "").strip() or DEFAULT_MINDMAP_ORGANIZE_PROMPT + _, input_budget_tokens = _get_mindmap_context_budget(prompt_template) + source_tokens = _estimate_tokens(markdown) + logger.info( + "Mindmap organize strategy task_id={} source_chars={} source_tokens_est={} input_budget_tokens={}", + task_id, len(markdown), source_tokens, input_budget_tokens + ) + + if source_tokens <= input_budget_tokens: + _update_task_progress(task_id, 35, "调用智能整理模型") + return _call_mindmap_llm(markdown, mode, prompt_template, task_id, "single") + + chunks = _chunk_markdown_by_headings(markdown, input_budget_tokens) + logger.info("Mindmap large input split task_id={} chunks={}", task_id, len(chunks)) + partial_results: list[str] = [] + for index, chunk in enumerate(chunks, start=1): + progress = 20 + int(index / max(len(chunks), 1) * 55) + _update_task_progress(task_id, progress, f"智能整理分块 {index}/{len(chunks)}") + logger.info( + "Mindmap chunk organize task_id={} chunk={}/{} chars={} tokens_est={}", + task_id, index, len(chunks), len(chunk), _estimate_tokens(chunk) + ) + partial = _call_mindmap_llm(chunk, mode, prompt_template, task_id, f"chunk-{index}") + partial_results.append(partial) + + merged_input = "\n\n".join( + f"\n{partial}" + for index, partial in enumerate(partial_results, start=1) + ) + _, merge_budget_tokens = _get_mindmap_context_budget(MINDMAP_MERGE_PROMPT) + merge_tokens = _estimate_tokens(merged_input) + if merge_tokens > merge_budget_tokens: + logger.warning( + "Mindmap merged outline still exceeds context task_id={} tokens_est={} budget={} chunks={}", + task_id, merge_tokens, merge_budget_tokens, len(partial_results) + ) + merge_chunks = _chunk_markdown_by_headings(merged_input, merge_budget_tokens) + merged_round: list[str] = [] + for index, chunk in enumerate(merge_chunks, start=1): + _update_task_progress(task_id, 78 + int(index / max(len(merge_chunks), 1) * 10), f"合并局部大纲 {index}/{len(merge_chunks)}") + merged_round.append(_call_mindmap_llm(chunk, mode, MINDMAP_MERGE_PROMPT, task_id, f"merge-round-{index}")) + merged_input = "\n\n".join(merged_round) + + _update_task_progress(task_id, 90, "全局整理标题结构") + return _call_mindmap_llm(merged_input, mode, MINDMAP_MERGE_PROMPT, task_id, "merge") + + async def _run_mindmap_organize_task(task_id: str, markdown: str, mode: str, prompt: Optional[str]): try: _store_task_progress(task_id, { @@ -460,8 +613,7 @@ async def _run_mindmap_organize_task(task_id: str, markdown: str, mode: str, pro "file_names": "", "result_md": None, }) - _update_task_progress(task_id, 35, "调用智能整理模型") - organized = await asyncio.to_thread(_call_mindmap_llm, markdown, mode, prompt, task_id) + organized = await asyncio.to_thread(_organize_mindmap_markdown, markdown, mode, prompt, task_id) state = _get_task_progress(task_id) or {} state.update({ "progress": 100, diff --git a/web_ui/src/components/ConfigPanel.vue b/web_ui/src/components/ConfigPanel.vue index 7ca0ae4..f67b3ba 100644 --- a/web_ui/src/components/ConfigPanel.vue +++ b/web_ui/src/components/ConfigPanel.vue @@ -14,29 +14,6 @@ /> - - - - - -
- PNG/JPEG 导出长边上限;默认 8K,导出文件会按该最大长边生效。 -
-
- -
- -
思维导图智能整理
- - - -
- 切换到“智能整理”时发送给后端大模型;默认保留标题结构、总结段落要点,并保持原文语言。 -
-
- -
-
{{ $t('config.recognitionOptions') }}
@@ -144,6 +102,46 @@ + +
+ +
导出与智能整理
+ + + + + + +
+ PNG/JPEG 导出长边上限;默认 8K,导出文件会按该最大长边生效。 +
+
+ + + +
+ 切换到“智能整理”时发送给后端大模型;默认保留标题结构、总结段落要点,并保持原文语言。 +
+
重置 确认 diff --git a/web_ui/src/components/MindMapRenderer.vue b/web_ui/src/components/MindMapRenderer.vue index 55925bd..53b0f06 100644 --- a/web_ui/src/components/MindMapRenderer.vue +++ b/web_ui/src/components/MindMapRenderer.vue @@ -10,6 +10,7 @@ type="primary" size="small" class="action-button primary" + :disabled="!content" >