diff --git a/mineru/cli/fast_api.py b/mineru/cli/fast_api.py
index 2a49f65..eeaaee8 100644
--- a/mineru/cli/fast_api.py
+++ b/mineru/cli/fast_api.py
@@ -388,8 +388,117 @@ DEFAULT_MINDMAP_ORGANIZE_PROMPT = """你是文档结构整理助手。请基于
8. 节点标题尽量简短,正文说明使用短句列表。
9. 只输出 Markdown,不要输出解释、代码块围栏或额外说明。"""
+# Prompt for the merge pass: `_organize_mindmap_markdown` passes it to
+# `_call_mindmap_llm` along with the outlines produced for individual chunks.
+# It instructs the model to combine the partial outlines into one Markdown
+# document while touching only heading levels, ordering, duplicate headings and
+# parent/child relations (no re-summarizing, no added content), to keep each
+# outline's source language, to use at most 4 heading levels and at most
+# 8 children per parent, and to output Markdown only.
+MINDMAP_MERGE_PROMPT = """你是 Markdown 思维导图结构校对助手。下面是多个分块已经整理总结后的局部 Markdown 大纲。
-def _call_mindmap_llm(markdown: str, mode: str = "smart", custom_prompt: Optional[str] = None, task_id: Optional[str] = None) -> str:
+任务:
+1. 合并这些局部大纲为一份完整 Markdown。
+2. 只检查和调整标题层级结构、顺序、重复标题和父子关系。
+3. 不要重新总结正文,不要扩写内容,不要新增原文没有的信息。
+4. 保留各局部大纲中的源语言:英文保持英文,中文保持中文,多语言分别保留。
+5. 最大层级不超过 4 层。
+6. 每个父节点下最多 8 个子节点,必要时只合并相近标题。
+7. 只输出 Markdown,不要输出解释、代码块围栏或额外说明。"""
+
+
+def _estimate_tokens(text: str) -> int:
+    """Return a rough token-count estimate for ``text``.
+
+    Whitespace characters are ignored. Four ASCII characters count as one
+    token and each non-ASCII character counts as 1.5 tokens; both partial
+    sums are truncated to integers before being added.
+
+    Returns:
+        The estimate, never less than 1 (even for empty input).
+    """
+    visible = [ch for ch in text if not ch.isspace()]
+    ascii_count = sum(1 for ch in visible if ord(ch) < 128)
+    wide_count = len(visible) - ascii_count
+    return max(1, int(ascii_count / 4) + int(wide_count * 1.5))
+
+
+def _get_mindmap_context_budget(prompt: str, reserve_output_tokens: int = 4096) -> tuple[int, int]:
+    """Compute how many estimated tokens of input fit next to ``prompt``.
+
+    The context size is read from ``MINDMAP_LLM_MAX_CONTEXT_TOKENS`` (default
+    32768) and a safety margin from ``MINDMAP_LLM_SAFETY_TOKENS`` (default
+    1024). The estimated prompt size, the reserved output tokens and the
+    safety margin are subtracted from the context size.
+
+    Args:
+        prompt: Instruction text whose estimated size is deducted.
+        reserve_output_tokens: Tokens set aside for the model's reply.
+
+    Returns:
+        ``(max_context_tokens, input_budget_tokens)``.
+
+    Raises:
+        ValueError: If either environment variable is not a valid integer.
+    """
+    context_limit = int(os.getenv("MINDMAP_LLM_MAX_CONTEXT_TOKENS", "32768"))
+    prompt_cost = _estimate_tokens(prompt)
+    safety_margin = int(os.getenv("MINDMAP_LLM_SAFETY_TOKENS", "1024"))
+
+    remaining = context_limit - prompt_cost - reserve_output_tokens - safety_margin
+    # The budget never drops below 2048, even when the subtraction yields less.
+    usable = remaining if remaining > 2048 else 2048
+
+    logger.info(
+        "Mindmap context budget max_context_tokens={} prompt_tokens={} reserve_output_tokens={} safety_tokens={} input_budget_tokens={}",
+        context_limit,
+        prompt_cost,
+        reserve_output_tokens,
+        safety_margin,
+        usable,
+    )
+    return context_limit, usable
+
+
+def _split_markdown_blocks(markdown: str) -> list[str]:
+    """Split Markdown into blocks that each begin at an ATX heading.
+
+    A new block starts at every line matching ``#{1,6}`` followed by
+    whitespace. Text before the first heading forms its own block. Lines
+    inside a fenced code block (three or more backticks or tildes) are never
+    treated as headings, so ``# comment`` lines in code samples do not cut a
+    fence in half. An unclosed fence extends to the end of the input.
+
+    Args:
+        markdown: The Markdown source text.
+
+    Returns:
+        The non-empty blocks, each stripped of surrounding whitespace, in
+        document order. Empty input yields an empty list.
+    """
+    heading_pattern = re.compile(r"^#{1,6}\s+")
+    fence_pattern = re.compile(r"^ {0,3}(`{3,}|~{3,})")
+
+    blocks: list[str] = []
+    current: list[str] = []
+    open_fence = ""  # marker run that opened the current fence, "" when outside
+
+    for line in markdown.splitlines():
+        fence_match = fence_pattern.match(line)
+        if fence_match:
+            marker = fence_match.group(1)
+            if not open_fence:
+                open_fence = marker
+            elif (
+                marker[0] == open_fence[0]
+                and len(marker) >= len(open_fence)
+                and not line[fence_match.end():].strip()
+            ):
+                # A closing fence uses the same character, is at least as
+                # long as the opener, and carries no info string.
+                open_fence = ""
+            current.append(line)
+            continue
+
+        if not open_fence and current and heading_pattern.match(line):
+            blocks.append("\n".join(current).strip())
+            current = [line]
+        else:
+            current.append(line)
+
+    if current:
+        blocks.append("\n".join(current).strip())
+
+    return [block for block in blocks if block]
+
+
+def _split_oversized_text(text: str, max_tokens: int) -> list[str]:
+    """Cut ``text`` into pieces whose estimated size fits ``max_tokens``.
+
+    Lines are kept whole and packed together where possible. A single line
+    that is still too large is sliced by characters.
+    """
+    pieces: list[str] = []
+    pending: list[str] = []
+    pending_tokens = 0
+
+    for line in text.splitlines():
+        if not line.strip():
+            continue
+        line_tokens = _estimate_tokens(line)
+
+        if line_tokens > max_tokens:
+            if pending:
+                pieces.append("\n".join(pending))
+                pending = []
+                pending_tokens = 0
+            # Character-level fallback; the weights mirror `_estimate_tokens`
+            # (whitespace free, ASCII 1/4 token, non-ASCII 1.5 tokens).
+            start = 0
+            cost = 0.0
+            for pos, ch in enumerate(line):
+                if ch.isspace():
+                    weight = 0.0
+                elif ord(ch) < 128:
+                    weight = 0.25
+                else:
+                    weight = 1.5
+                # `pos > start` guarantees every slice holds at least one
+                # character, so the loop always makes progress.
+                if cost + weight > max_tokens and pos > start:
+                    pieces.append(line[start:pos])
+                    start = pos
+                    cost = 0.0
+                cost += weight
+            pieces.append(line[start:])
+            continue
+
+        if pending and pending_tokens + line_tokens > max_tokens:
+            pieces.append("\n".join(pending))
+            pending = [line]
+            pending_tokens = line_tokens
+        else:
+            pending.append(line)
+            pending_tokens += line_tokens
+
+    if pending:
+        pieces.append("\n".join(pending))
+    return pieces
+
+
+def _split_large_block(block: str, max_tokens: int) -> list[str]:
+    """Split one oversized block into chunks of at most ``max_tokens``.
+
+    The block is divided at blank lines and consecutive paragraphs are packed
+    greedily into chunks joined by a blank line. A paragraph that alone
+    exceeds ``max_tokens`` is split further by line, then by character, so a
+    long table or unbroken text run cannot produce an over-budget chunk.
+
+    Args:
+        block: Markdown text to split.
+        max_tokens: Upper bound for the estimated size of each chunk.
+
+    Returns:
+        The chunks in document order, or ``[block]`` when the block has no
+        non-empty paragraph.
+    """
+    chunks: list[str] = []
+    current: list[str] = []
+    current_tokens = 0
+
+    for paragraph in re.split(r"\n{2,}", block):
+        paragraph = paragraph.strip()
+        if not paragraph:
+            continue
+        paragraph_tokens = _estimate_tokens(paragraph)
+
+        if paragraph_tokens > max_tokens:
+            # Emit what was collected so far to preserve document order.
+            if current:
+                chunks.append("\n\n".join(current))
+                current = []
+                current_tokens = 0
+            chunks.extend(_split_oversized_text(paragraph, max_tokens))
+            continue
+
+        if current and current_tokens + paragraph_tokens > max_tokens:
+            chunks.append("\n\n".join(current))
+            current = [paragraph]
+            current_tokens = paragraph_tokens
+        else:
+            current.append(paragraph)
+            current_tokens += paragraph_tokens
+
+    if current:
+        chunks.append("\n\n".join(current))
+    return chunks or [block]
+
+
+def _chunk_markdown_by_headings(markdown: str, max_tokens: int) -> list[str]:
+    """Pack heading-delimited blocks of ``markdown`` into sized chunks.
+
+    Blocks from `_split_markdown_blocks` are accumulated in order until adding
+    the next one would push the estimated size past ``max_tokens``; the
+    accumulated blocks are then emitted as one chunk joined by blank lines.
+    A block that is larger than ``max_tokens`` on its own is handed to
+    `_split_large_block` and its pieces are emitted directly.
+
+    Args:
+        markdown: The Markdown source text.
+        max_tokens: Target upper bound for the estimated size of a chunk.
+
+    Returns:
+        The chunks in document order, or ``[markdown]`` when no chunk was
+        produced.
+    """
+    packed: list[str] = []
+    pending: list[str] = []
+    pending_tokens = 0
+
+    def flush() -> None:
+        """Emit the pending blocks as one chunk and reset the accumulator."""
+        nonlocal pending, pending_tokens
+        if pending:
+            packed.append("\n\n".join(pending))
+        pending = []
+        pending_tokens = 0
+
+    for section in _split_markdown_blocks(markdown):
+        section_tokens = _estimate_tokens(section)
+
+        if section_tokens > max_tokens:
+            flush()
+            packed.extend(_split_large_block(section, max_tokens))
+            continue
+
+        if pending and pending_tokens + section_tokens > max_tokens:
+            flush()
+        pending.append(section)
+        pending_tokens += section_tokens
+
+    flush()
+    return packed or [markdown]
+
+
+def _call_mindmap_llm(markdown: str, mode: str = "smart", custom_prompt: Optional[str] = None, task_id: Optional[str] = None, request_role: str = "organize") -> str:
base_url = os.getenv("MINDMAP_LLM_BASE_URL", "").rstrip("/")
model = os.getenv("MINDMAP_LLM_MODEL", "gemma-4-26B")
api_key = os.getenv("MINDMAP_LLM_API_KEY", "")
@@ -399,10 +508,6 @@ def _call_mindmap_llm(markdown: str, mode: str = "smart", custom_prompt: Optiona
raise RuntimeError("未配置智能整理模型服务,请设置 MINDMAP_LLM_BASE_URL")
compact_markdown = markdown.strip()
- max_chars = int(os.getenv("MINDMAP_LLM_MAX_INPUT_CHARS", "30000"))
- if len(compact_markdown) > max_chars:
- compact_markdown = compact_markdown[:max_chars] + "\n\n...(后续内容已截断)"
-
prompt_template = (custom_prompt or "").strip() or DEFAULT_MINDMAP_ORGANIZE_PROMPT
prompt = f"""{prompt_template}
@@ -410,8 +515,8 @@ def _call_mindmap_llm(markdown: str, mode: str = "smart", custom_prompt: Optiona
{compact_markdown}
"""
logger.info(
- "Mindmap LLM request start task_id={} model={} base_url={} mode={} input_chars={} prompt_chars={}",
- task_id or "-", model, base_url, mode, len(compact_markdown), len(prompt_template)
+ "Mindmap LLM request start task_id={} role={} model={} base_url={} mode={} input_chars={} input_tokens_est={} prompt_chars={}",
+ task_id or "-", request_role, model, base_url, mode, len(compact_markdown), _estimate_tokens(compact_markdown), len(prompt_template)
)
payload = {
@@ -444,12 +549,60 @@ def _call_mindmap_llm(markdown: str, mode: str = "smart", custom_prompt: Optiona
if not organized:
raise RuntimeError("智能整理模型未返回有效内容")
logger.info(
- "Mindmap LLM request completed task_id={} output_chars={}",
- task_id or "-", len(organized)
+ "Mindmap LLM request completed task_id={} role={} output_chars={} output_tokens_est={}",
+ task_id or "-", request_role, len(organized), _estimate_tokens(organized)
)
return organized
+def _organize_mindmap_markdown(markdown: str, mode: str, custom_prompt: Optional[str], task_id: str) -> str:
+    """Organize ``markdown`` with the LLM, chunking it when it is too large.
+
+    If the estimated size of ``markdown`` fits the input budget for the
+    organize prompt, a single LLM call is made. Otherwise the text is split
+    into chunks, each chunk is organized separately, and the partial outlines
+    are merged with ``MINDMAP_MERGE_PROMPT``. When the joined outlines exceed
+    the merge budget they are reduced group by group, for a bounded number of
+    rounds, before the final merge call.
+
+    Args:
+        markdown: Source Markdown to organize.
+        mode: Passed through unchanged to `_call_mindmap_llm`.
+        custom_prompt: Optional replacement for the default organize prompt;
+            blank or ``None`` selects ``DEFAULT_MINDMAP_ORGANIZE_PROMPT``.
+        task_id: Identifier used for progress updates and log lines.
+
+    Returns:
+        The organized Markdown returned by the last LLM call.
+
+    Note:
+        If the outlines still exceed the budget after the reduction rounds,
+        the final merge call is attempted anyway.
+    """
+    prompt_template = (custom_prompt or "").strip() or DEFAULT_MINDMAP_ORGANIZE_PROMPT
+    _, input_budget_tokens = _get_mindmap_context_budget(prompt_template)
+    source_tokens = _estimate_tokens(markdown)
+    logger.info(
+        "Mindmap organize strategy task_id={} source_chars={} source_tokens_est={} input_budget_tokens={}",
+        task_id, len(markdown), source_tokens, input_budget_tokens
+    )
+
+    if source_tokens <= input_budget_tokens:
+        _update_task_progress(task_id, 35, "调用智能整理模型")
+        return _call_mindmap_llm(markdown, mode, prompt_template, task_id, "single")
+
+    chunks = _chunk_markdown_by_headings(markdown, input_budget_tokens)
+    logger.info("Mindmap large input split task_id={} chunks={}", task_id, len(chunks))
+    partial_results: list[str] = []
+    for index, chunk in enumerate(chunks, start=1):
+        progress = 20 + int(index / max(len(chunks), 1) * 55)
+        _update_task_progress(task_id, progress, f"智能整理分块 {index}/{len(chunks)}")
+        logger.info(
+            "Mindmap chunk organize task_id={} chunk={}/{} chars={} tokens_est={}",
+            task_id, index, len(chunks), len(chunk), _estimate_tokens(chunk)
+        )
+        partial = _call_mindmap_llm(chunk, mode, prompt_template, task_id, f"chunk-{index}")
+        partial_results.append(partial)
+
+    merged_input = "\n\n".join(partial_results)
+    _, merge_budget_tokens = _get_mindmap_context_budget(MINDMAP_MERGE_PROMPT)
+
+    # One reduction round may not be enough, so repeat it. The loop is bounded
+    # and stops early when a round cannot split the input or fails to shrink it.
+    max_merge_rounds = 3
+    for _ in range(max_merge_rounds):
+        merge_tokens = _estimate_tokens(merged_input)
+        if merge_tokens <= merge_budget_tokens:
+            break
+        logger.warning(
+            "Mindmap merged outline still exceeds context task_id={} tokens_est={} budget={} chunks={}",
+            task_id, merge_tokens, merge_budget_tokens, len(partial_results)
+        )
+        merge_chunks = _chunk_markdown_by_headings(merged_input, merge_budget_tokens)
+        if len(merge_chunks) < 2:
+            # Nothing to reduce group-wise; a per-chunk merge would only
+            # duplicate the final merge call below.
+            break
+        merged_round: list[str] = []
+        for index, chunk in enumerate(merge_chunks, start=1):
+            _update_task_progress(task_id, 78 + int(index / max(len(merge_chunks), 1) * 10), f"合并局部大纲 {index}/{len(merge_chunks)}")
+            merged_round.append(_call_mindmap_llm(chunk, mode, MINDMAP_MERGE_PROMPT, task_id, f"merge-round-{index}"))
+        merged_input = "\n\n".join(merged_round)
+        if _estimate_tokens(merged_input) >= merge_tokens:
+            break
+
+    _update_task_progress(task_id, 90, "全局整理标题结构")
+    return _call_mindmap_llm(merged_input, mode, MINDMAP_MERGE_PROMPT, task_id, "merge")
+
+
async def _run_mindmap_organize_task(task_id: str, markdown: str, mode: str, prompt: Optional[str]):
try:
_store_task_progress(task_id, {
@@ -460,8 +613,7 @@ async def _run_mindmap_organize_task(task_id: str, markdown: str, mode: str, pro
"file_names": "",
"result_md": None,
})
- _update_task_progress(task_id, 35, "调用智能整理模型")
- organized = await asyncio.to_thread(_call_mindmap_llm, markdown, mode, prompt, task_id)
+ organized = await asyncio.to_thread(_organize_mindmap_markdown, markdown, mode, prompt, task_id)
state = _get_task_progress(task_id) or {}
state.update({
"progress": 100,
diff --git a/web_ui/src/components/ConfigPanel.vue b/web_ui/src/components/ConfigPanel.vue
index 7ca0ae4..f67b3ba 100644
--- a/web_ui/src/components/ConfigPanel.vue
+++ b/web_ui/src/components/ConfigPanel.vue
@@ -14,29 +14,6 @@
/>
-