329 lines
9.5 KiB
TypeScript
329 lines
9.5 KiB
TypeScript
import { ref, reactive, watch } from 'vue'
|
|
import type { Ref } from 'vue'
|
|
import { documentApi, type ParseParams } from '@/api/document'
|
|
import { convertWordToPdf, isWordFile } from '@/utils/wordToPdf'
|
|
import { buildMindmapMarkdown } from '@/utils/mindmapMarkdown'
|
|
|
|
/**
 * Converts `text` into mind-map Markdown by delegating to `buildMindmapMarkdown`.
 *
 * The original note on this function describes it as automatically filling in
 * the next heading level based on the parent heading; the actual transformation
 * lives in `@/utils/mindmapMarkdown` (NOTE(review): confirm details there).
 *
 * @param text Markdown source to transform.
 * @returns The value produced by `buildMindmapMarkdown` for `text`.
 */
export function autoPromoteParagraphsToSubheading(text: string): string {
  const promoted = buildMindmapMarkdown(text)
  return promoted
}
|
|
|
|
/** User-adjustable parse settings; cached in localStorage between sessions. */
export interface DocumentConfig {
  /** Page budget; sent to the parse API as `end_page_id = maxPages - 1`. */
  maxPages: number
  /** Parsing backend identifier, e.g. `'pipeline'` or `'hybrid-auto-engine'`. */
  backend: string
  /** Remote server URL; only forwarded when the backend name contains `http-client`. */
  serverUrl: string
  /** Forwarded to the parse API as `table_enable`. */
  tableEnable: boolean
  /** Forwarded to the parse API as `formula_enable`. */
  formulaEnable: boolean
  /** Language code forwarded to the parse API as `lang_list`. */
  language: string
  /** When true the parse method is `'ocr'`, otherwise `'auto'`. */
  forceOcr: boolean
}

/** Outcome of a parse run (or of a manually initialised, empty result). */
export interface ProcessResult {
  /** Markdown content returned by the parse service. */
  markdown: string
  /** Markdown source text; set to the same content as `markdown` after a parse. */
  source: string
  /** Mind-map Markdown derived from the parsed content. */
  mindmap: string
  /** Optional download link; nothing in this file populates it. */
  downloadUrl?: string
}

/** localStorage key under which the serialized `DocumentConfig` is kept. */
const CONFIG_STORAGE_KEY = 'mineru.documentProcessor.config'

// Known config keys grouped by the primitive type a cached value must have.
const NUMBER_CONFIG_FIELDS = ['maxPages'] as const
const STRING_CONFIG_FIELDS = ['backend', 'serverUrl', 'language'] as const
const BOOLEAN_CONFIG_FIELDS = ['tableEnable', 'formulaEnable', 'forceOcr'] as const

/**
 * Reads the cached configuration from localStorage.
 *
 * The stored payload is treated as untrusted: only known `DocumentConfig` keys
 * whose values have the expected primitive type are returned, so stale or
 * hand-edited storage can never inject unknown keys or wrong-typed values.
 *
 * @returns The valid subset of the cached settings; `{}` when nothing is
 *          cached, storage is unavailable, or the payload is not a JSON object.
 */
function loadCachedConfig(): Partial<DocumentConfig> {
  let parsed: unknown
  try {
    const raw = window.localStorage.getItem(CONFIG_STORAGE_KEY)
    if (!raw) return {}
    parsed = JSON.parse(raw)
  } catch {
    // Storage disabled or malformed JSON: fall back to the defaults.
    return {}
  }

  // Strings and arrays would otherwise be spread into index keys ('0', '1', ...).
  if (typeof parsed !== 'object' || parsed === null || Array.isArray(parsed)) {
    return {}
  }

  const cached = parsed as Record<string, unknown>
  const restored: Partial<DocumentConfig> = {}

  for (const field of NUMBER_CONFIG_FIELDS) {
    const value = cached[field]
    // JSON.parse can yield Infinity (e.g. "1e999"), so require a finite number.
    if (typeof value === 'number' && Number.isFinite(value)) restored[field] = value
  }
  for (const field of STRING_CONFIG_FIELDS) {
    const value = cached[field]
    if (typeof value === 'string') restored[field] = value
  }
  for (const field of BOOLEAN_CONFIG_FIELDS) {
    const value = cached[field]
    if (typeof value === 'boolean') restored[field] = value
  }

  return restored
}
|
|
|
|
export function useDocumentProcessor() {
|
|
// Upload state
// True only while a Word document is being converted to PDF during upload.
const isUploading = ref<boolean>(false)
// The file(s) accepted by `handleFileUpload`; at most one entry is ever stored.
const uploadedFiles: Ref<File[]> = ref([])
|
|
|
|
// Configuration
// Built-in defaults; used for every setting the cache does not provide.
const defaultConfig: DocumentConfig = {
  maxPages: 1000,
  backend: 'hybrid-auto-engine',
  serverUrl: 'http://localhost:30000',
  tableEnable: true,
  formulaEnable: true,
  language: 'ch',
  forceOcr: false
}
// Cached values are applied on top of the defaults, so they win on conflict.
const cachedOverrides = loadCachedConfig()
const config = reactive<DocumentConfig>({ ...defaultConfig, ...cachedOverrides })
|
|
|
|
// Persist every configuration change so the settings survive a page reload.
const persistConfig = (snapshot: DocumentConfig): void => {
  try {
    const serialized = JSON.stringify(snapshot)
    window.localStorage.setItem(CONFIG_STORAGE_KEY, serialized)
  } catch {
    // Local storage may be disabled in the browser; the page keeps working without it.
  }
}
watch(config, persistConfig, { deep: true })
|
|
|
|
// Result state
// Latest parse result, or null when there is none.
const results = ref<ProcessResult | null>(null)
// Last user-visible message (errors and some status notices), or null.
const error = ref<string | null>(null)
// True while `processDocument` is running.
const isProcessing = ref<boolean>(false)
// Human-readable description of the current step of `processDocument`.
const processingStage = ref<string>('')
|
|
|
|
// Backend choices offered to the user, as [value, label] pairs in display order.
const backendEntries: Array<[string, string]> = [
  ['pipeline', '传统管道解析'],
  ['vlm-auto-engine', 'VLM本地引擎'],
  ['hybrid-auto-engine', '混合本地引擎'],
  ['vlm-http-client', 'VLM远程客户端'],
  ['hybrid-http-client', '混合远程客户端']
]
const backendOptions = backendEntries.map(([value, label]) => ({ value, label }))
|
|
|
|
// Language choices offered to the user, as [value, label] pairs in display order.
const languageEntries: Array<[string, string]> = [
  ['ch', '中文(简体)'],
  ['en', '英语'],
  ['korean', '韩语'],
  ['japan', '日语'],
  ['chinese_cht', '中文(繁体)'],
  ['ta', '泰米尔语'],
  ['te', '泰卢固语'],
  ['ka', '卡纳达语'],
  ['th', '泰语'],
  ['el', '希腊语'],
  ['latin', '拉丁语系'],
  ['arabic', '阿拉伯语系'],
  ['east_slavic', '东斯拉夫语系'],
  ['cyrillic', '西里尔语系'],
  ['devanagari', '梵文字母语系']
]
const languageOptions = languageEntries.map(([value, label]) => ({ value, label }))
|
|
|
|
// File upload handling
/**
 * Validates the user's selection and stores the single accepted file in
 * `uploadedFiles`.
 *
 * Only the first file of the selection is used. PDFs, images and Word
 * documents up to 100 MB are accepted; Word documents are converted to PDF
 * via `convertWordToPdf` before being stored. Every outcome is reported
 * through the `error` ref, which is reset at the start of each call so a
 * message from an earlier upload never lingers next to a newly accepted file.
 *
 * @param files The selection to process; `null` or an empty list is a no-op.
 */
const handleFileUpload = async (files: FileList | null) => {
  if (!files || files.length === 0) return

  const file = files[0]
  if (!file) return

  // Drop the previous selection and any message left over from it.
  uploadedFiles.value = []
  const multiSelectWarning =
    files.length > 1 ? '一次只能选择一个文件,已使用第一个文件' : null
  error.value = multiSelectWarning

  const fileType = file.type
  const fileName = file.name.toLowerCase()

  // Supported types: PDF, images and Word documents. The extension is checked
  // as well as the MIME type because `File.type` can be an empty string.
  const isImage = fileType.startsWith('image/')
  const isPdf = fileType === 'application/pdf' || fileName.endsWith('.pdf')
  const isWord =
    fileName.endsWith('.docx') ||
    fileName.endsWith('.doc') ||
    fileType === 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'

  if (!isImage && !isPdf && !isWord) {
    error.value = '不支持的文件类型'
    return
  }

  // Size limit: 100 MB.
  if (file.size > 100 * 1024 * 1024) {
    error.value = '文件大小超出限制'
    return
  }

  let acceptedFile = file

  // Word documents are converted to PDF before being handed to the parser.
  if (isWord) {
    isUploading.value = true
    // NOTE(review): the in-progress notice is shown through the `error` ref,
    // as before; a dedicated status ref would be cleaner but changes the UI contract.
    error.value = '正在将 Word 文档转换为 PDF...'
    try {
      acceptedFile = await convertWordToPdf(file)
      // Conversion finished: remove the notice but keep the multi-select warning.
      error.value = multiSelectWarning
    } catch (err: unknown) {
      const reason = err instanceof Error ? err.message : String(err)
      error.value = 'Word 转换为 PDF 失败: ' + reason
      console.error('Conversion failed:', reason)
      return
    } finally {
      isUploading.value = false
    }
  }

  uploadedFiles.value = [acceptedFile]
}
|
|
|
|
/**
 * Resets the composable to its "nothing uploaded" state: removes the uploaded
 * file list, the parse result, the current message and the stage label.
 * Configuration and progress values are left untouched.
 */
const clearAll = (): void => {
  uploadedFiles.value = []
  results.value = null
  error.value = null
  processingStage.value = ''
}
|
|
|
|
/**
 * Demo mode: after uploading a file the user pastes the Markdown source by
 * hand, so this seeds `results` with an empty result and clears any message.
 */
const initializeManualResult = (): void => {
  const blankResult: ProcessResult = { markdown: '', source: '', mindmap: '' }
  results.value = blankResult
  error.value = null
}
|
|
|
|
// Progress state
// Handle of the active polling interval, or null when polling is stopped.
let progressTimer: ReturnType<typeof setInterval> | null = null
// Latest progress value reported by the parse-progress endpoint.
const progressPercent = ref<number>(0)
// Latest stage label reported by the parse-progress endpoint.
const progressStage = ref<string>('')
|
|
|
|
/**
 * Starts polling the parse progress of `taskId` once per second.
 *
 * Any previous poll is stopped first and the progress refs are reset. Polling
 * stops by itself once the task reports `completed` or `failed`. Two guards
 * keep the refs consistent:
 * - a tick is skipped while the previous request is still in flight, so slow
 *   responses cannot pile up or arrive out of order;
 * - a response that arrives after polling was stopped or restarted is
 *   discarded instead of overwriting the current progress.
 *
 * @param taskId Identifier of the parse task to track.
 */
const startProgressPolling = (taskId: string) => {
  stopProgressPolling()
  progressPercent.value = 0
  progressStage.value = '准备中'

  let requestInFlight = false
  const timer = setInterval(async () => {
    if (requestInFlight) return
    requestInFlight = true
    try {
      const data = await documentApi.getParseProgress(taskId)
      // Polling was stopped or replaced while waiting: this response is stale.
      if (progressTimer !== timer) return
      progressPercent.value = data.progress
      progressStage.value = data.stage
      if (data.status === 'completed' || data.status === 'failed') {
        stopProgressPolling()
      }
    } catch {
      // A failed poll is not fatal; the next tick simply tries again.
    } finally {
      requestInFlight = false
    }
  }, 1000)
  progressTimer = timer
}
|
|
|
|
/** Stops progress polling if it is running; does nothing otherwise. */
const stopProgressPolling = (): void => {
  if (!progressTimer) return
  clearInterval(progressTimer)
  progressTimer = null
}
|
|
|
|
/**
 * Sends the uploaded file to the parse service and publishes the outcome.
 *
 * Flow: create a parse task, start progress polling, submit the document with
 * the current configuration, then store the returned Markdown together with
 * the mind-map Markdown derived from it in `results`. Failures are reported
 * through the `error` ref; polling, the stage label and `isProcessing` are
 * always reset when the run ends.
 *
 * The call is ignored while a previous run is still in progress, so a double
 * click cannot start two tasks that fight over the shared progress state.
 */
const processDocument = async () => {
  if (isProcessing.value) return

  if (uploadedFiles.value.length === 0) {
    error.value = '请先上传文件'
    return
  }

  isProcessing.value = true
  error.value = null
  processingStage.value = '准备提交解析任务'

  // Client-generated task id: timestamp plus a short random suffix.
  const taskId = `task-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`

  try {
    await documentApi.createParseTask(taskId)
    startProgressPolling(taskId)
    processingStage.value = '提交文档到解析服务'

    const params: ParseParams = {
      files: uploadedFiles.value,
      output_dir: './output',
      lang_list: config.language,
      backend: config.backend,
      parse_method: config.forceOcr ? 'ocr' : 'auto',
      formula_enable: config.formulaEnable,
      table_enable: config.tableEnable,
      start_page_id: 0,
      end_page_id: config.maxPages - 1,
      return_md: true,
      return_middle_json: false,
      response_format_zip: false
    }

    // Only the remote ("http-client") backends take a server URL.
    if (config.backend.includes('http-client') && config.serverUrl) {
      params.server_url = config.serverUrl
    }

    processingStage.value = '服务端正在解析文档'
    const response = await documentApi.parseDocument(params, taskId)

    // Only the first entry of the results is used (a single file is uploaded).
    const resultData = response.results ? Object.values(response.results)[0] : undefined
    if (!resultData) {
      // Make an empty or missing result visible instead of finishing silently.
      error.value = '解析服务未返回结果'
      return
    }

    processingStage.value = '生成 Markdown 和思维导图'
    const mdContent = resultData.md_content || ''
    results.value = {
      markdown: mdContent,
      source: mdContent,
      mindmap: buildMindmapMarkdown(mdContent)
    }
  } catch (err: unknown) {
    // Accept Error instances as well as plain objects carrying a `message`.
    let reason = ''
    if (typeof err === 'object' && err !== null && 'message' in err) {
      const { message } = err as { message: unknown }
      if (typeof message === 'string') reason = message
    }
    error.value = reason || '转换失败'
  } finally {
    stopProgressPolling()
    processingStage.value = ''
    isProcessing.value = false
  }
}
|
|
|
|
/**
 * Picks the label of the formula-recognition switch for a backend.
 *
 * Backends starting with `vlm` get the display-formula wording, backends
 * starting with `hybrid` get the inline-formula wording, and everything else
 * (including `pipeline`) gets the generic wording.
 *
 * @param backend Backend identifier, e.g. `'vlm-auto-engine'`.
 * @returns The label text to show.
 */
const getFormulaLabel = (backend: string) => {
  if (backend.startsWith('vlm')) return '启用行间公式识别'
  if (backend.startsWith('hybrid')) return '启用行内公式识别'
  return '启用公式识别'
}
|
|
|
|
/**
 * Picks the help text that explains what disabling formula recognition does
 * for a backend.
 *
 * @param backend Backend identifier, e.g. `'pipeline'`.
 * @returns The explanation for `vlm*`, `pipeline` and `hybrid*` backends, or
 *          an empty string for any other backend.
 */
const getFormulaInfo = (backend: string) => {
  if (backend.startsWith('vlm')) return '禁用后,行间公式将显示为图片。'
  if (backend === 'pipeline') return '禁用后,行间公式将显示为图片,行内公式将不会被检测或解析。'
  if (backend.startsWith('hybrid')) return '禁用后,行内公式将不会被检测或解析。'
  return ''
}
|
|
|
|
// Public surface of the composable.
const api = {
  // State
  uploadedFiles,
  config,
  results,
  isUploading,
  isProcessing,
  processingStage,
  progressPercent,
  progressStage,
  error,

  // Option lists
  backendOptions,
  languageOptions,

  // Actions
  handleFileUpload,
  initializeManualResult,
  clearAll,
  processDocument,
  getFormulaLabel,
  getFormulaInfo
}
return api
|
|
}
|