From ac2b198cd72ed3584a74e90372f70452f725bd17 Mon Sep 17 00:00:00 2001 From: panyy Date: Tue, 23 Jun 2026 17:17:57 +0800 Subject: [PATCH] =?UTF-8?q?feat(=E6=80=9D=E7=BB=B4=E5=AF=BC=E5=9B=BE?= =?UTF-8?q?=E5=8A=A9=E6=89=8B)=EF=BC=9A=E6=80=9D=E7=BB=B4=E5=AF=BC?= =?UTF-8?q?=E5=9B=BE=E5=8A=A9=E6=89=8B=E5=A2=9E=E5=8A=A0=E6=99=BA=E8=83=BD?= =?UTF-8?q?=E6=95=B4=E7=90=86=E6=80=BB=E7=BB=93?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mineru/cli/fast_api.py | 144 +++++++++++++++++++++ web_ui/src/api/document.ts | 19 +++ web_ui/src/views/DocumentProcessor.vue | 165 +++++++++++++++++++++++-- 3 files changed, 318 insertions(+), 10 deletions(-) diff --git a/mineru/cli/fast_api.py b/mineru/cli/fast_api.py index 442d466..d30c4c3 100644 --- a/mineru/cli/fast_api.py +++ b/mineru/cli/fast_api.py @@ -10,12 +10,15 @@ import json import uvicorn import click import zipfile +import urllib.request +import urllib.error from pathlib import Path import glob os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True") from fastapi import Depends, FastAPI, HTTPException, UploadFile, File, Form, APIRouter, Header +from pydantic import BaseModel from fastapi.middleware.gzip import GZipMiddleware from fastapi.middleware.cors import CORSMiddleware from fastapi.responses import JSONResponse, FileResponse @@ -358,6 +361,116 @@ def get_infer_result(file_suffix_identifier: str, pdf_name: str, parse_dir: str) api_router = APIRouter(prefix="/api") +class MindmapOrganizeRequest(BaseModel): + markdown: str + mode: str = "smart" + + +def _extract_json_object(text: str) -> str: + content = text.strip() + if content.startswith("```"): + content = re.sub(r"^```(?:json|markdown|md)?\s*", "", content, flags=re.IGNORECASE) + content = re.sub(r"\s*```$", "", content) + return content.strip() + + +def _call_mindmap_llm(markdown: str, mode: str = "smart") -> str: + base_url = os.getenv("MINDMAP_LLM_BASE_URL", "").rstrip("/") + model = os.getenv("MINDMAP_LLM_MODEL", "gemma-4-26B") + api_key = os.getenv("MINDMAP_LLM_API_KEY", "") + timeout = int(os.getenv("MINDMAP_LLM_TIMEOUT", "180")) + + if not base_url: + raise RuntimeError("未配置智能整理模型服务,请设置 MINDMAP_LLM_BASE_URL") + + compact_markdown = markdown.strip() + max_chars = int(os.getenv("MINDMAP_LLM_MAX_INPUT_CHARS", "30000")) + if len(compact_markdown) > max_chars: + compact_markdown = compact_markdown[:max_chars] + "\n\n...(后续内容已截断)" + + style_instruction = "保留原文标题结构,并将段落总结成要点。" if mode == "hybrid" else "重新整理文档结构,提炼主题并合并相近段落。" + prompt = f"""你是文档结构整理助手。请基于用户提供的 Markdown 生成适合思维导图展示的 Markdown。 + +要求: +1. {style_instruction} +2. 不要逐段照抄原文,要归纳、合并、总结。 +3. 不要编造原文没有的信息。 +4. 保留关键数字、公式、专有名词、步骤和结论。 +5. 最大层级不超过 4 层。 +6. 每个父节点下最多 8 个子节点。 +7. 节点标题尽量简短,正文说明使用短句列表。 +8. 只输出 Markdown,不要输出解释、代码块围栏或额外说明。 + +原始 Markdown: +{compact_markdown} +""" + + payload = { + "model": model, + "messages": [ + {"role": "system", "content": "你擅长把长文档整理成结构清晰、层次合理的中文思维导图 Markdown。"}, + {"role": "user", "content": prompt}, + ], + "temperature": float(os.getenv("MINDMAP_LLM_TEMPERATURE", "0.2")), + } + data = json.dumps(payload, ensure_ascii=False).encode("utf-8") + headers = {"Content-Type": "application/json"} + if api_key: + headers["Authorization"] = f"Bearer {api_key}" + + url = f"{base_url}/chat/completions" + req = urllib.request.Request(url, data=data, headers=headers, method="POST") + try: + with urllib.request.urlopen(req, timeout=timeout) as resp: + result = json.loads(resp.read().decode("utf-8")) + except urllib.error.HTTPError as exc: + detail = exc.read().decode("utf-8", errors="ignore") + raise RuntimeError(f"智能整理模型请求失败: HTTP {exc.code} {detail}") from exc + except Exception as exc: + raise RuntimeError(f"智能整理模型请求失败: {exc}") from exc + + message = result.get("choices", [{}])[0].get("message", {}) + content = message.get("content", "") + organized = _extract_json_object(content) + if not organized: + raise RuntimeError("智能整理模型未返回有效内容") + return organized + + +async def _run_mindmap_organize_task(task_id: str, markdown: str, mode: str): + try: + _store_task_progress(task_id, { + "progress": 10, + "stage": "准备智能整理", + "status": "processing", + "error": None, + "file_names": "", + "result_md": None, + }) + _update_task_progress(task_id, 35, "调用智能整理模型") + organized = await asyncio.to_thread(_call_mindmap_llm, markdown, mode) + state = _get_task_progress(task_id) or {} + state.update({ + "progress": 100, + "stage": "智能整理完成", + "status": "completed", + "error": None, + "result_md": organized, + }) + _store_task_progress(task_id, state) + except Exception as exc: + logger.exception(exc) + state = _get_task_progress(task_id) or {} + state.update({ + "progress": 100, + "stage": "智能整理失败", + "status": "failed", + "error": str(exc), + "result_md": None, + }) + _store_task_progress(task_id, state) + + @api_router.post("/parse_tasks/{task_id}", status_code=201) async def create_parse_task(task_id: str): """Register a task before the multipart upload starts.""" @@ -383,6 +496,37 @@ async def get_parse_progress(task_id: str): return state +@api_router.post("/mindmap_tasks/{task_id}", status_code=201) +async def create_mindmap_task(task_id: str, request: MindmapOrganizeRequest): + """Create an async task that organizes Markdown into summarized mindmap Markdown.""" + markdown = request.markdown.strip() + if not markdown: + raise HTTPException(status_code=400, detail="Markdown content is required") + + state = { + "progress": 0, + "stage": "等待智能整理", + "status": "pending", + "error": None, + "file_names": "", + "result_md": None, + } + _store_task_progress(task_id, state) + asyncio.create_task(_run_mindmap_organize_task(task_id, markdown, request.mode)) + logger.info(f"Registered mindmap organize task pid={os.getpid()} task_id={task_id}") + return state + + +@api_router.get("/mindmap_progress/{task_id}") +async def get_mindmap_progress(task_id: str): + """Query async mindmap organization progress and result.""" + state = _get_task_progress(task_id) + if state is None: + logger.warning(f"Mindmap task not found pid={os.getpid()} task_id={task_id}") + raise HTTPException(status_code=404, detail="Task not found") + return state + + @api_router.post(path="/file_parse", dependencies=[Depends(limit_concurrency)]) async def parse_pdf( files: List[UploadFile] = File(..., description="Upload pdf, image, or Word files for parsing"), diff --git a/web_ui/src/api/document.ts b/web_ui/src/api/document.ts index 170a99f..e05c5e7 100644 --- a/web_ui/src/api/document.ts +++ b/web_ui/src/api/document.ts @@ -37,6 +37,10 @@ export interface ParseProgress { file_names: string } +export interface MindmapOrganizeProgress extends ParseProgress { + result_md?: string | null +} + export const documentApi = { /** * 解析文档 @@ -96,5 +100,20 @@ export const documentApi = { return request.get(`/api/parse_progress/${taskId}`).then(result => { return result as unknown as ParseProgress }) + }, + + createMindmapTask(taskId: string, markdown: string, mode = 'smart'): Promise { + return request.post(`/api/mindmap_tasks/${encodeURIComponent(taskId)}`, { + markdown, + mode + }).then(result => { + return result as unknown as MindmapOrganizeProgress + }) + }, + + getMindmapProgress(taskId: string): Promise { + return request.get(`/api/mindmap_progress/${encodeURIComponent(taskId)}`).then(result => { + return result as unknown as MindmapOrganizeProgress + }) } } diff --git a/web_ui/src/views/DocumentProcessor.vue b/web_ui/src/views/DocumentProcessor.vue index d6888f9..0a26b23 100644 --- a/web_ui/src/views/DocumentProcessor.vue +++ b/web_ui/src/views/DocumentProcessor.vue @@ -141,10 +141,26 @@
-
- {{ templateRenderError }} +
+ + + {{ smartOrganizeStage || '智能整理中' }} {{ smartOrganizeProgress }}% + + + {{ smartOrganizeError }} + + + 智能整理结果 + + + {{ templateRenderError }} +
-
('gfm') const sourceViewMode = ref<'markdown' | 'ast'>( loadStoredValue(SOURCE_VIEW_MODE_KEY, 'markdown', isSourceViewMode) ) +const sourceContentMode = ref<'source' | 'smart'>('source') +const sourceContentModeOptions = [ + { label: '源码', value: 'source' }, + { label: '智能整理', value: 'smart' } +] const showReplacementDialog = ref(false) const isMobileDialog = ref(false) const replacementRules = ref([]) const draftReplacementRules = ref([]) +const smartMarkdownContent = ref('') +const smartOrganizeTaskId = ref('') +const isSmartOrganizing = ref(false) +const smartOrganizeProgress = ref(0) +const smartOrganizeStage = ref('') +const smartOrganizeError = ref('') +let smartOrganizeTimer: ReturnType | null = null watch(markdownRenderMode, (value) => storeValue(MARKDOWN_RENDER_MODE_KEY, value)) watch(sourceViewMode, (value) => storeValue(SOURCE_VIEW_MODE_KEY, value)) const manualMarkdownContent = computed(() => results.value?.source || '') -const templateRenderResult = computed(() => renderMarkdownTemplate(manualMarkdownContent.value, replacementRules.value)) +const activeSourceMarkdownContent = computed(() => { + if (sourceContentMode.value === 'smart' && smartMarkdownContent.value) { + return smartMarkdownContent.value + } + return manualMarkdownContent.value +}) +const templateRenderResult = computed(() => renderMarkdownTemplate(activeSourceMarkdownContent.value, replacementRules.value)) const renderedMarkdownContent = computed(() => templateRenderResult.value.markdown) const renderedMindmapContent = computed(() => buildMindmapMarkdown(renderedMarkdownContent.value)) const templateRenderError = computed(() => templateRenderResult.value.error) @@ -337,15 +372,105 @@ const sourcePanelContent = computed({ if (sourceViewMode.value === 'ast') { return markdownToAstString(renderedMarkdownContent.value) } - return manualMarkdownContent.value + return activeSourceMarkdownContent.value }, set: (value: string) => { - if (sourceViewMode.value === 'markdown' && results.value) { + if (sourceViewMode.value !== 'markdown') return + if (sourceContentMode.value === 'smart') { + smartMarkdownContent.value = value + return + } + if (results.value) { results.value.source = value } } }) +const stopSmartOrganizePolling = () => { + if (smartOrganizeTimer) { + clearInterval(smartOrganizeTimer) + smartOrganizeTimer = null + } +} + +const resetSmartOrganizeState = () => { + stopSmartOrganizePolling() + sourceContentMode.value = 'source' + smartMarkdownContent.value = '' + smartOrganizeTaskId.value = '' + isSmartOrganizing.value = false + smartOrganizeProgress.value = 0 + smartOrganizeStage.value = '' + smartOrganizeError.value = '' +} + +const startSmartOrganizePolling = (taskId: string) => { + stopSmartOrganizePolling() + smartOrganizeTimer = setInterval(async () => { + try { + const data = await documentApi.getMindmapProgress(taskId) + smartOrganizeProgress.value = data.progress + smartOrganizeStage.value = data.stage + if (data.status === 'completed') { + stopSmartOrganizePolling() + isSmartOrganizing.value = false + smartMarkdownContent.value = data.result_md || '' + smartOrganizeError.value = '' + sourceContentMode.value = 'smart' + ElMessage.success('智能整理完成') + } else if (data.status === 'failed') { + stopSmartOrganizePolling() + isSmartOrganizing.value = false + sourceContentMode.value = 'source' + smartOrganizeError.value = data.error || '智能整理失败' + ElMessage.error(smartOrganizeError.value) + } + } catch (err) { + stopSmartOrganizePolling() + isSmartOrganizing.value = false + sourceContentMode.value = 'source' + smartOrganizeError.value = (err as Error).message || '智能整理进度查询失败' + ElMessage.error(smartOrganizeError.value) + } + }, 1500) +} + +const startSmartOrganize = async () => { + const markdown = manualMarkdownContent.value.trim() + if (!markdown) { + sourceContentMode.value = 'source' + ElMessage.warning('暂无 Markdown 源码可整理') + return + } + if (smartMarkdownContent.value) { + sourceContentMode.value = 'smart' + return + } + + const taskId = `mindmap-${Date.now()}-${Math.random().toString(36).slice(2, 8)}` + smartOrganizeTaskId.value = taskId + isSmartOrganizing.value = true + smartOrganizeProgress.value = 0 + smartOrganizeStage.value = '创建智能整理任务' + smartOrganizeError.value = '' + + try { + await documentApi.createMindmapTask(taskId, markdown, 'smart') + startSmartOrganizePolling(taskId) + } catch (err) { + isSmartOrganizing.value = false + sourceContentMode.value = 'source' + smartOrganizeError.value = (err as Error).message || '智能整理任务创建失败' + ElMessage.error(smartOrganizeError.value) + } +} + +watch(sourceContentMode, (value) => { + if (value === 'smart') { + startSmartOrganize() + } +}) + const toggleSettings = () => { if (showSettings.value) { const settingsPanel = document.querySelector('.settings-panel') @@ -410,6 +535,7 @@ const applyConfig = (nextConfig: typeof config) => { } const handleProcessDocument = async () => { + resetSmartOrganizeState() await processDocument() if (error.value) { ElMessage.error(error.value) @@ -465,13 +591,18 @@ const saveReplacementRules = () => { const handleMindmapNodeEdit = ({ oldText, newText }: { oldText: string; newText: string }) => { if (!results.value) return - const updated = replaceFirstMindmapText(results.value.source || results.value.markdown, oldText, newText) - if (updated === (results.value.source || results.value.markdown)) { + const currentMarkdown = activeSourceMarkdownContent.value + const updated = replaceFirstMindmapText(currentMarkdown, oldText, newText) + if (updated === currentMarkdown) { ElMessage.warning('未在 Markdown 中定位到该节点文本,可在源码区直接编辑') return } - results.value.source = updated - results.value.markdown = updated + if (sourceContentMode.value === 'smart') { + smartMarkdownContent.value = updated + } else { + results.value.source = updated + results.value.markdown = updated + } ElMessage.success('节点已同步到 Markdown') } @@ -485,6 +616,7 @@ const handleFileInputChange = async (event: Event) => { await handleFileUpload(input.files) input.value = '' if (uploadedFiles.value.length > 0) { + resetSmartOrganizeState() isUploadAreaCollapsed.value = true initializeManualResult() markdownRenderMode.value = 'markdown' @@ -501,6 +633,7 @@ const handleDrop = async (event: DragEvent) => { if (event.dataTransfer?.files) { await handleFileUpload(event.dataTransfer.files) if (uploadedFiles.value.length > 0) { + resetSmartOrganizeState() isUploadAreaCollapsed.value = true initializeManualResult() markdownRenderMode.value = 'markdown' @@ -518,6 +651,7 @@ const removeFile = (index: number) => { } const clearAllFiles = () => { + resetSmartOrganizeState() clearAll() isUploadAreaCollapsed.value = false } @@ -528,6 +662,7 @@ onMounted(() => { }) onUnmounted(() => { + stopSmartOrganizePolling() window.removeEventListener('resize', updateViewportState) }) @@ -957,6 +1092,16 @@ onUnmounted(() => { flex: 1; } +.source-mode-toolbar { + align-items: center; +} + +.smart-organize-status { + font-size: 12px; + color: #86909C; + white-space: nowrap; +} + .content-toolbar-actions { flex-shrink: 0; }