feat(思维导图助手):思维导图助手过程1

develop
panyy 2026-06-23 09:46:12 +08:00
parent 75835051b2
commit 59e2554238
1 changed files with 87 additions and 41 deletions

View File

@ -83,7 +83,8 @@ def _update_task_progress(task_id: Optional[str], progress: int, stage: str):
return
state = _get_task_progress(task_id)
if state is not None:
state["progress"] = min(progress, 100)
current = int(state.get("progress", 0) or 0)
state["progress"] = min(max(progress, current), 100)
state["stage"] = stage
_store_task_progress(task_id, state)
@ -162,7 +163,11 @@ class _StderrProgressCapture:
# tqdm 进度条模式:名称: 百分比|...| 当前/总数
_PATTERNS = [
(re.compile(r'Two Step Extraction:\s*(\d+)%.*?(\d+)/(\d+)'), 'extract'),
(re.compile(r'Layout Preparation:\s*(\d+)%.*?(\d+)/(\d+)'), 'layout_prepare'),
(re.compile(r'Layout Output Parsing:\s*(\d+)%.*?(\d+)/(\d+)'), 'layout_parse'),
(re.compile(r'Extract Preparation:\s*(\d+)%.*?(\d+)/(\d+)'), 'extract_prepare'),
(re.compile(r'Post Processing:\s*(\d+)%.*?(\d+)/(\d+)'), 'post_process'),
(re.compile(r'Two Step Extraction:\s*(\d+)%.*?(\d+)/(\d+)'), 'vlm_predict'),
(re.compile(r'MFD Predict:\s*(\d+)%.*?(\d+)/(\d+)'), 'mfd'),
(re.compile(r'MFR Predict:\s*(\d+)%.*?(\d+)/(\d+)'), 'mfr'),
(re.compile(r'OCR-det:\s*(\d+)%.*?(\d+)/(\d+)'), 'ocr_det'),
@ -170,22 +175,35 @@ class _StderrProgressCapture:
(re.compile(r'Loading safetensors.*?:\s*(\d+)%'), 'load_model'),
(re.compile(r'Capturing CUDA graphs.*?:\s*(\d+)%'), 'cuda_graph'),
]
# Matches a tqdm bar whose name is just "Predict:" at the start of the line.
# The bar itself names no stage, so _parse_line chooses between
# 'layout_predict' and 'extract_predict' from the last "Preparation" line seen.
_GENERIC_PREDICT_PATTERN = re.compile(r'^Predict:\s*(\d+)%.*?(\d+)/(\d+)')
# Progress window [start%, end%] that each stage's own 0-100% bar is mapped into.
# NOTE(review): 'mfd', 'mfr', 'ocr_det' and 'ocr_rec' appear twice below; this
# looks like removed and added diff lines rendered together. In a dict literal
# the later entry wins — confirm which values the real file keeps.
_RANGES = {
'load_model': (12, 18),
'cuda_graph': (33, 37),
'extract': (42, 65),
'mfd': (75, 80),
'mfr': (80, 87),
'ocr_det': (87, 92),
'ocr_rec': (92, 96),
'layout_prepare': (42, 45),
'layout_predict': (45, 68),
'layout_parse': (68, 70),
'extract_prepare': (70, 72),
'extract_predict': (72, 88),
'vlm_predict': (45, 88),
'post_process': (88, 90),
'mfd': (90, 92),
'mfr': (92, 94),
'ocr_det': (94, 96),
'ocr_rec': (96, 97),
}
_STAGE_LABELS = {
'load_model': '加载模型权重',
'cuda_graph': '捕获CUDA计算图',
'extract': 'VLM文档分析',
'layout_prepare': '准备版面分析',
'layout_predict': '版面分析',
'layout_parse': '解析版面结果',
'extract_prepare': '准备内容抽取',
'extract_predict': '内容抽取',
'vlm_predict': 'VLM文档分析',
'post_process': '后处理',
'mfd': '数学公式检测',
'mfr': '数学公式识别',
'ocr_det': '文字区域检测',
@ -195,55 +213,83 @@ class _StderrProgressCapture:
# Create a capture object for one task; nothing is redirected until start() runs.
# NOTE(review): this listing is a rendered diff without +/- markers, so the
# thread field and the buffer fields below may belong to different revisions —
# confirm against the real file.
def __init__(self, task_id: str):
# Task identifier passed to _update_task_progress for every parsed line.
self.task_id = task_id
# While False, write() forwards text but does not parse it.
self._active = False
self._thread: Optional[threading.Thread] = None
# The stream that was installed as sys.stderr before start(); None until then.
self._orig_stderr = None
# Characters received by write() since the last \r or \n.
self._buf = ""
# "layout" or "extract" once a matching "Preparation" line has been seen.
self._last_anchor = ""
# Begin capturing: remember the current sys.stderr and install this object in its place.
# NOTE(review): the reader-thread lines and the sys.stderr swap may come from
# different revisions of this method (diff markers are missing) — confirm.
def start(self):
self._active = True
# Keep the previous stream; write() forwards to it and stop() restores it.
self._orig_stderr = sys.stderr
self._thread = threading.Thread(target=self._reader_loop, daemon=True)
self._thread.start()
# From here on, anything written to sys.stderr reaches write() on this object.
sys.stderr = self
# Stop capturing: wait briefly for the reader thread if one is alive, parse any
# buffered partial line, and put the saved stderr back.
# NOTE(review): thread handling and buffer draining may come from different
# revisions of this method (diff markers are missing) — confirm.
def stop(self):
self._active = False
if self._thread and self._thread.is_alive():
# Bounded wait: join gives up after 2 seconds.
self._thread.join(timeout=2)
self._thread = None
# Text that never received a trailing \r or \n is still parsed once.
if self._buf.strip():
self._parse_line(self._buf.strip())
self._buf = ""
# Restore only when this object is still the installed sys.stderr.
if self._orig_stderr is not None and sys.stderr is self:
sys.stderr = self._orig_stderr
# Thread target: read the saved stderr stream one character at a time and hand
# each completed line to _parse_line.
# NOTE(review): this method is likely the removed side of the diff (write()
# performs the same line splitting) — confirm it still exists in the real file.
def _reader_loop(self):
buf = ""
orig = self._orig_stderr
while self._active:
try:
ch = orig.read(1)
# An empty read ends the loop.
if not ch:
break
buf += ch
# tqdm redraws a bar in place with \r; \n starts a new line.
if ch == '\r' or ch == '\n':
if buf.strip():
self._parse_line(buf.strip())
buf = ""
# Any error while reading or parsing ends the loop silently.
except Exception:
break
def write(self, text):
    """Forward ``text`` to the saved stderr and scan it for progress lines.

    The text is always passed through to the original stream when one is
    saved. While capturing is active, characters accumulate in ``self._buf``
    and every ``\\r`` or ``\\n`` closes a line, which is stripped and handed
    to ``_parse_line`` if anything is left.

    Returns:
        ``len(text)``.
    """
    passthrough = self._orig_stderr
    if passthrough is not None:
        passthrough.write(text)
    written = len(text)
    if self._active:
        for char in text:
            self._buf = self._buf + char
            # tqdm redraws a bar in place with \r; \n starts a new line.
            if char not in ('\r', '\n'):
                continue
            candidate = self._buf.strip()
            if candidate:
                self._parse_line(candidate)
            self._buf = ""
    return written
def flush(self):
    """Flush the saved stderr stream; do nothing when none is saved."""
    target = self._orig_stderr
    if target is None:
        return
    target.flush()
def isatty(self):
    """Report whether the saved stderr is a terminal; False when none is saved."""
    stream = self._orig_stderr
    if not stream:
        return False
    return bool(stream.isatty())
def fileno(self):
    """Return the saved stderr's file descriptor.

    Raises:
        OSError: when no original stderr has been saved.
    """
    stream = self._orig_stderr
    if stream is None:
        raise OSError("stderr is not available")
    return stream.fileno()
def __getattr__(self, name):
    """Delegate attributes this object lacks to the saved stderr stream.

    Python calls this only after normal attribute lookup has failed.

    Raises:
        AttributeError: when no original stderr is saved, or when the saved
            stream does not have ``name`` either.
    """
    # Read the field without going through __getattr__ again: if
    # ``_orig_stderr`` itself is not set yet (instance made with __new__,
    # copied, or unpickled), ``self._orig_stderr`` would re-enter this
    # method and recurse until RecursionError.
    try:
        target = object.__getattribute__(self, "_orig_stderr")
    except AttributeError:
        target = None
    if target is not None:
        return getattr(target, name)
    raise AttributeError(name)
# Inspect one stripped stderr line and, if it is a known tqdm progress bar,
# push the corresponding progress update for this task.
# NOTE(review): diff markers are missing in this listing; the inline update
# inside the loop and the _update_from_match call after it look like the old
# and new versions of the same step — confirm only one runs in the real file.
def _parse_line(self, line: str):
# Remember which phase was announced last; bare "Predict:" bars depend on it.
if "Layout Preparation:" in line:
self._last_anchor = "layout"
elif "Extract Preparation:" in line:
self._last_anchor = "extract"
generic_predict = self._GENERIC_PREDICT_PATTERN.search(line)
if generic_predict:
# Anything other than the "extract" anchor is treated as layout prediction.
stage = "extract_predict" if self._last_anchor == "extract" else "layout_predict"
self._update_from_match(generic_predict, stage)
return
# Named bars: patterns are tried in list order and the first match wins.
for pattern, stage in self._PATTERNS:
m = pattern.search(line)
if m:
pct = int(m.group(1))
# Unknown stages fall back to the full 0-100 range and the raw stage key.
lo, hi = self._RANGES.get(stage, (0, 100))
mapped = lo + int((hi - lo) * pct / 100)
label = self._STAGE_LABELS.get(stage, stage)
if stage == 'extract' and len(m.groups()) >= 3:
cur, total = m.group(2), m.group(3)
label = f"VLM文档分析 ({cur}/{total}页)"
elif stage in ('mfd', 'mfr', 'ocr_det', 'ocr_rec') and len(m.groups()) >= 3:
cur, total = m.group(2), m.group(3)
label = f"{label} ({cur}/{total})"
_update_task_progress(self.task_id, mapped, label)
self._update_from_match(m, stage)
break
def _update_from_match(self, match, stage: str):
    """Turn one matched tqdm progress line into a task-progress update.

    Args:
        match: Regex match whose group 1 is the bar's percentage. When the
            pattern defines at least three groups, groups 2 and 3 are the
            current and total counters shown after the label.
        stage: Stage key used to look up the progress window in ``_RANGES``
            and the display text in ``_STAGE_LABELS``.
    """
    pct = int(match.group(1))
    # Unknown stages fall back to the full 0-100 window and the raw key as label.
    lo, hi = self._RANGES.get(stage, (0, 100))
    mapped = lo + int((hi - lo) * pct / 100)
    label = self._STAGE_LABELS.get(stage, stage)
    if len(match.groups()) >= 3:
        cur, total = match.group(2), match.group(3)
        # Both branches of this conditional used to be "", so the page unit
        # that the earlier inline label showed ("页") was silently dropped.
        # Restore it for the stages selected here; other stages get no unit.
        unit = "页" if stage in ("layout_predict", "vlm_predict") else ""
        label = f"{label} ({cur}/{total}{unit})"
    _update_task_progress(self.task_id, mapped, label)
async def limit_concurrency():
if _request_semaphore is not None: