UnisMindMap/web_ui/src/composables/useDocumentProcessor.ts

329 lines
9.5 KiB
TypeScript

import { ref, reactive, watch } from 'vue'
import type { Ref } from 'vue'
import { documentApi, type ParseParams } from '@/api/document'
import { convertWordToPdf, isWordFile } from '@/utils/wordToPdf'
import { buildMindmapMarkdown } from '@/utils/mindmapMarkdown'
/**
 * Auto-fills the next heading level based on the heading above it.
 *
 * All of the work is delegated to `buildMindmapMarkdown`; this function only
 * forwards its argument and hands the result back untouched.
 *
 * @param text - Source text to convert.
 * @returns Whatever `buildMindmapMarkdown` produces for `text`.
 */
export function autoPromoteParagraphsToSubheading(text: string): string {
  const promoted = buildMindmapMarkdown(text)
  return promoted
}
/**
 * User-adjustable parsing options.
 *
 * `useDocumentProcessor` keeps one reactive instance of this shape, seeded from
 * built-in defaults overlaid with values cached in localStorage, and maps the
 * fields onto the parse request in `processDocument`.
 */
export interface DocumentConfig {
// Page limit; the request's `end_page_id` is sent as `maxPages - 1` (with `start_page_id` 0).
maxPages: number
// Parsing backend identifier, sent as `backend`; the UI offers the values listed in `backendOptions`.
backend: string
// Sent as `server_url`, but only when `backend` contains 'http-client' and this value is non-empty.
serverUrl: string
// Sent as `table_enable`.
tableEnable: boolean
// Sent as `formula_enable`.
formulaEnable: boolean
// Language code sent as `lang_list`; the UI offers the values listed in `languageOptions`.
language: string
// When true the request uses `parse_method: 'ocr'`, otherwise 'auto'.
forceOcr: boolean
}
/**
 * Outcome of a document conversion as exposed through the `results` ref of
 * `useDocumentProcessor`.
 */
export interface ProcessResult {
// Markdown content returned by the parse service (`md_content` of the first result).
markdown: string
// Set to the same Markdown string as `markdown` by `processDocument`.
source: string
// Output of `buildMindmapMarkdown` applied to the Markdown content.
mindmap: string
// Optional; NOTE(review): not populated by any code visible in this file — confirm who sets it.
downloadUrl?: string
}
/** localStorage key under which the user's parsing options are cached. */
const CONFIG_STORAGE_KEY = 'mineru.documentProcessor.config'

/**
 * Reads previously cached parsing options from localStorage.
 *
 * The stored value is treated as untrusted: it may have been written by an
 * older version of the app or edited by hand. Only known `DocumentConfig`
 * fields whose runtime type matches the declared type are returned, so a
 * corrupt entry can never override the typed defaults with garbage.
 *
 * @returns The valid subset of the cached options, or `{}` when nothing is
 *   cached, storage is unavailable, or the payload is not a JSON object.
 */
function loadCachedConfig(): Partial<DocumentConfig> {
  let parsed: unknown
  try {
    const raw = window.localStorage.getItem(CONFIG_STORAGE_KEY)
    if (!raw) return {}
    parsed = JSON.parse(raw)
  } catch {
    // Storage disabled/unavailable or malformed JSON: fall back to defaults.
    return {}
  }

  // Strings and arrays are valid JSON but would spread index keys into the config.
  if (typeof parsed !== 'object' || parsed === null || Array.isArray(parsed)) {
    return {}
  }

  const candidate = parsed as Record<string, unknown>
  const restored: Partial<DocumentConfig> = {}

  const maxPages = candidate['maxPages']
  if (typeof maxPages === 'number' && Number.isFinite(maxPages)) {
    restored.maxPages = maxPages
  }

  for (const key of ['backend', 'serverUrl', 'language'] as const) {
    const value = candidate[key]
    if (typeof value === 'string') restored[key] = value
  }

  for (const key of ['tableEnable', 'formulaEnable', 'forceOcr'] as const) {
    const value = candidate[key]
    if (typeof value === 'boolean') restored[key] = value
  }

  return restored
}
/** Largest accepted upload, in bytes (100 MB). */
const MAX_UPLOAD_SIZE_BYTES = 100 * 1024 * 1024

/** Delay between two parse-progress requests, in milliseconds. */
const PROGRESS_POLL_INTERVAL_MS = 1000

/** MIME type reported for .docx files. */
const DOCX_MIME_TYPE =
  'application/vnd.openxmlformats-officedocument.wordprocessingml.document'

/**
 * Extracts a human-readable message from a value caught in a `catch` clause.
 *
 * @returns The message when `thrown` is a non-empty string or an object with a
 *   non-empty string `message`; otherwise `undefined`.
 */
function readThrownMessage(thrown: unknown): string | undefined {
  if (typeof thrown === 'string') return thrown || undefined
  if (typeof thrown === 'object' && thrown !== null && 'message' in thrown) {
    const { message } = thrown as { message: unknown }
    if (typeof message === 'string' && message) return message
  }
  return undefined
}

/**
 * Composable that drives the document upload → parse → mind-map workflow.
 *
 * It owns the selected file, the parsing options (cached in localStorage),
 * the processing/progress state and the final result, and exposes the actions
 * that mutate them.
 *
 * @returns Reactive state (`uploadedFiles`, `config`, `results`, `isUploading`,
 *   `isProcessing`, `processingStage`, `progressPercent`, `progressStage`,
 *   `error`), the select options (`backendOptions`, `languageOptions`) and the
 *   actions (`handleFileUpload`, `initializeManualResult`, `clearAll`,
 *   `processDocument`, `getFormulaLabel`, `getFormulaInfo`).
 */
export function useDocumentProcessor() {
  // ---- File state ----
  const uploadedFiles: Ref<File[]> = ref([])
  const isUploading = ref(false)

  // ---- Configuration ----
  const defaultConfig: DocumentConfig = {
    maxPages: 1000,
    backend: 'hybrid-auto-engine',
    serverUrl: 'http://localhost:30000',
    tableEnable: true,
    formulaEnable: true,
    language: 'ch',
    forceOcr: false
  }

  // Cached values win over the defaults.
  const config = reactive<DocumentConfig>({
    ...defaultConfig,
    ...loadCachedConfig()
  })

  // Persist every change so the options survive a page reload.
  watch(
    config,
    (value) => {
      try {
        window.localStorage.setItem(CONFIG_STORAGE_KEY, JSON.stringify(value))
      } catch {
        // Local storage may be disabled; the page keeps working without the cache.
      }
    },
    { deep: true }
  )

  // ---- Result state ----
  const results = ref<ProcessResult | null>(null)
  const isProcessing = ref(false)
  const processingStage = ref('')
  const error = ref<string | null>(null)

  // ---- Progress state ----
  const progressPercent = ref(0)
  const progressStage = ref('')

  // Backend choices offered in the UI.
  const backendOptions = [
    { value: 'pipeline', label: '传统管道解析' },
    { value: 'vlm-auto-engine', label: 'VLM本地引擎' },
    { value: 'hybrid-auto-engine', label: '混合本地引擎' },
    { value: 'vlm-http-client', label: 'VLM远程客户端' },
    { value: 'hybrid-http-client', label: '混合远程客户端' }
  ]

  // Language choices offered in the UI.
  const languageOptions = [
    { value: 'ch', label: '中文(简体)' },
    { value: 'en', label: '英语' },
    { value: 'korean', label: '韩语' },
    { value: 'japan', label: '日语' },
    { value: 'chinese_cht', label: '中文(繁体)' },
    { value: 'ta', label: '泰米尔语' },
    { value: 'te', label: '泰卢固语' },
    { value: 'ka', label: '卡纳达语' },
    { value: 'th', label: '泰语' },
    { value: 'el', label: '希腊语' },
    { value: 'latin', label: '拉丁语系' },
    { value: 'arabic', label: '阿拉伯语系' },
    { value: 'east_slavic', label: '东斯拉夫语系' },
    { value: 'cyrillic', label: '西里尔语系' },
    { value: 'devanagari', label: '梵文字母语系' }
  ]

  /**
   * Validates the first selected file and stores it as the current upload.
   *
   * PDFs and images are accepted as-is; Word documents are converted to PDF
   * first. Any previous selection is discarded, even when the new file is
   * rejected. Failures are reported through `error`, never thrown.
   *
   * @param files - The file list from the input/drop event; only the first
   *   entry is used.
   */
  const handleFileUpload = async (files: FileList | null): Promise<void> => {
    const file = files?.[0]
    if (!files || !file) return

    uploadedFiles.value = []

    // Start from a clean slate so a message from an earlier attempt does not
    // linger next to a perfectly valid new file. Extra files are only warned about.
    const multiSelectNotice =
      files.length > 1 ? '一次只能选择一个文件,已使用第一个文件' : null
    error.value = multiSelectNotice

    const fileType = file.type
    const fileName = file.name.toLowerCase()

    // Supported inputs: PDF, images and Word documents.
    const isImage = fileType.startsWith('image/')
    const isPdf = fileType === 'application/pdf'
    const isWord =
      fileName.endsWith('.docx') ||
      fileName.endsWith('.doc') ||
      fileType === DOCX_MIME_TYPE

    if (!isImage && !isPdf && !isWord) {
      error.value = '不支持的文件类型'
      return
    }

    if (file.size > MAX_UPLOAD_SIZE_BYTES) {
      error.value = '文件大小超出限制'
      return
    }

    if (!isWord) {
      uploadedFiles.value = [file]
      return
    }

    // Word documents are converted to PDF before being handed to the parser.
    isUploading.value = true
    // `error` doubles as the status line while the conversion is running.
    error.value = '正在将 Word 文档转换为 PDF...'
    try {
      const pdfFile = await convertWordToPdf(file)
      uploadedFiles.value = [pdfFile]
      // Restore the multi-select warning instead of wiping it.
      error.value = multiSelectNotice
    } catch (err: unknown) {
      const reason = readThrownMessage(err) ?? String(err)
      error.value = 'Word 转换为 PDF 失败: ' + reason
      console.error('Conversion failed:', reason)
    } finally {
      isUploading.value = false
    }
  }

  /** Resets the selected file, the result, the error and the stage text. */
  const clearAll = (): void => {
    uploadedFiles.value = []
    results.value = null
    error.value = null
    processingStage.value = ''
  }

  /**
   * Demo mode: creates an empty result so the user can paste Markdown source
   * manually after uploading a file.
   */
  const initializeManualResult = (): void => {
    results.value = {
      markdown: '',
      source: '',
      mindmap: ''
    }
    error.value = null
  }

  let progressTimer: ReturnType<typeof setInterval> | null = null
  // Bumped on every stop so responses belonging to an older polling session
  // can be recognised and dropped.
  let pollSession = 0

  /** Stops progress polling and invalidates any request still in flight. */
  const stopProgressPolling = (): void => {
    pollSession += 1
    if (progressTimer) {
      clearInterval(progressTimer)
      progressTimer = null
    }
  }

  /**
   * Starts polling the parse progress of `taskId` once per interval until the
   * task reports `completed` or `failed`, or polling is stopped.
   */
  const startProgressPolling = (taskId: string): void => {
    stopProgressPolling()
    const session = pollSession
    progressPercent.value = 0
    progressStage.value = '准备中'

    let requestInFlight = false
    progressTimer = setInterval(async () => {
      // Do not stack requests when the service answers slower than the interval.
      if (requestInFlight) return
      requestInFlight = true
      try {
        const data = await documentApi.getParseProgress(taskId)
        // A late answer from a stopped or superseded session must not touch the UI.
        if (session !== pollSession) return
        progressPercent.value = data.progress
        progressStage.value = data.stage
        if (data.status === 'completed' || data.status === 'failed') {
          stopProgressPolling()
        }
      } catch {
        // Progress is best-effort; a failed poll is simply retried on the next tick.
      } finally {
        requestInFlight = false
      }
    }, PROGRESS_POLL_INTERVAL_MS)
  }

  /**
   * Submits the uploaded file to the parse service and stores the resulting
   * Markdown and mind map in `results`.
   *
   * Does nothing while a previous run is still in progress. Failures are
   * reported through `error`, never thrown.
   */
  const processDocument = async (): Promise<void> => {
    // Ignore double submits; a second run would fight over the same state.
    if (isProcessing.value) return

    if (uploadedFiles.value.length === 0) {
      error.value = '请先上传文件'
      return
    }

    isProcessing.value = true
    error.value = null
    processingStage.value = '准备提交解析任务'

    // Client-generated task id shared by the parse request and the progress polling.
    const taskId = `task-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`

    try {
      await documentApi.createParseTask(taskId)
      startProgressPolling(taskId)

      processingStage.value = '提交文档到解析服务'
      const params: ParseParams = {
        files: uploadedFiles.value,
        output_dir: './output',
        lang_list: config.language,
        backend: config.backend,
        parse_method: config.forceOcr ? 'ocr' : 'auto',
        formula_enable: config.formulaEnable,
        table_enable: config.tableEnable,
        start_page_id: 0,
        end_page_id: config.maxPages - 1,
        return_md: true,
        return_middle_json: false,
        response_format_zip: false
      }

      // Only the remote-client backends take a server URL.
      if (config.backend.includes('http-client') && config.serverUrl) {
        params.server_url = config.serverUrl
      }

      processingStage.value = '服务端正在解析文档'
      const response = await documentApi.parseDocument(params, taskId)

      const parsedDocuments = response.results ? Object.values(response.results) : []
      const firstDocument = parsedDocuments[0]
      if (!firstDocument) {
        // Surface an empty/missing payload instead of failing silently or with a TypeError.
        error.value = '解析服务未返回结果'
        return
      }

      processingStage.value = '生成 Markdown 和思维导图'
      const mdContent = firstDocument.md_content || ''
      results.value = {
        markdown: mdContent,
        source: mdContent,
        mindmap: buildMindmapMarkdown(mdContent)
      }
    } catch (err: unknown) {
      error.value = readThrownMessage(err) || '转换失败'
    } finally {
      stopProgressPolling()
      processingStage.value = ''
      isProcessing.value = false
    }
  }

  /** Returns the label of the formula toggle for the given backend. */
  const getFormulaLabel = (backend: string): string => {
    if (backend.startsWith('vlm')) return '启用行间公式识别'
    if (backend.startsWith('hybrid')) return '启用行内公式识别'
    // 'pipeline' and any unknown backend share the generic label.
    return '启用公式识别'
  }

  /** Returns the help text of the formula toggle for the given backend. */
  const getFormulaInfo = (backend: string): string => {
    if (backend.startsWith('vlm')) {
      return '禁用后,行间公式将显示为图片。'
    }
    if (backend === 'pipeline') {
      return '禁用后,行间公式将显示为图片,行内公式将不会被检测或解析。'
    }
    if (backend.startsWith('hybrid')) {
      return '禁用后,行内公式将不会被检测或解析。'
    }
    return ''
  }

  return {
    // State
    uploadedFiles,
    config,
    results,
    isUploading,
    isProcessing,
    processingStage,
    progressPercent,
    progressStage,
    error,
    // Options
    backendOptions,
    languageOptions,
    // Actions
    handleFileUpload,
    initializeManualResult,
    clearAll,
    processDocument,
    getFormulaLabel,
    getFormulaInfo
  }
}