175 lines
6.4 KiB
Python
175 lines
6.4 KiB
Python
|
|
import hashlib
|
|||
|
|
import logging
|
|||
|
|
import os
|
|||
|
|
from typing import Optional
|
|||
|
|
|
|||
|
|
from config import config
|
|||
|
|
from extractor import MeetingExtraction, extract_meeting_info
|
|||
|
|
from graph_store import graph_store
|
|||
|
|
from meeting_state import MeetingStateStore
|
|||
|
|
from raw_store import raw_meeting_store
|
|||
|
|
from vector_store import meeting_vector_store
|
|||
|
|
|
|||
|
|
# Module-level logger named after this module; used by MeetingProcessor below.
logger = logging.getLogger(__name__)
|
|||
|
|
|
|||
|
|
# Shared meeting-state store, constructed at import time from config.state_path.
# MeetingProcessor uses it for content-hash bookkeeping and for merging
# action items / metrics.
state_store = MeetingStateStore(config.state_path)
|
|||
|
|
|
|||
|
|
|
|||
|
|
class MeetingProcessor:
    """Ingest meeting transcripts and answer queries over the stored results.

    The class holds no instance state; it coordinates the module-level
    collaborators ``state_store``, ``meeting_vector_store``, ``graph_store``
    and ``raw_meeting_store``.
    """

    def process_meeting_file(self, filepath: str, force: bool = False) -> Optional[str]:
        """Read *filepath* as UTF-8 text and process it as a meeting.

        Args:
            filepath: Path of the text file to read.
            force: Forwarded to :meth:`process_meeting_text`.

        Returns:
            Whatever :meth:`process_meeting_text` returns for the file content.
        """
        with open(filepath, "r", encoding="utf-8") as f:
            text = f.read()
        return self.process_meeting_text(text, force=force)

    def process_meeting_text(self, text: str, force: bool = False) -> Optional[str]:
        """Extract, de-duplicate and store one meeting given as raw text.

        Steps, in order: exact-duplicate check by content hash, similar-text
        check against the vector store (interactive skip/overwrite prompt),
        LLM extraction, title/date duplicate handling, raw-text save, merge of
        action items and metrics into the state store, then writes to the
        state store, vector store and graph store.

        Args:
            text: Raw meeting text.
            force: When true, skip the hash and similarity checks and
                overwrite an existing meeting without prompting.

        Returns:
            The value returned by ``raw_meeting_store.save`` (presumably the
            path of the saved raw text -- confirm against raw_store), or
            ``None`` when the text is empty, the meeting was skipped, or
            extraction failed.
        """
        # Empty / whitespace-only input cannot yield a meeting; bail out
        # before hashing it or spending an extraction call on it.
        if not text or not text.strip():
            logger.warning("Empty meeting text; nothing to process")
            return None

        content_hash = self._compute_content_hash(text)

        if not force and state_store.has_content_hash(content_hash):
            print("\n检测到重复内容,已跳过。")
            logger.info("Duplicate content hash skipped: %s", content_hash[:12])
            return None

        if not force:
            similar = meeting_vector_store.find_similar_text(text, threshold=0.92)
            if similar:
                meta = similar["metadata"]
                print(
                    f"\n发现相似会议:{meta.get('title', '')} ({meta.get('date', '')}) "
                    f"相似度 {similar['score']:.2%}"
                )
                if self._prompt_skip_or_overwrite() == "s":
                    logger.info("Skipped similar meeting: %s", meta.get("title", ""))
                    return None
                # The user chose to overwrite: behave like force mode from
                # here on so the title/date duplicate check does not prompt
                # a second time.
                force = True

        meeting_data = self._extract(text)
        if not meeting_data:
            logger.error("Failed to extract meeting information")
            return None

        data_dict = meeting_data.model_dump()
        data_dict["_content_hash"] = content_hash
        data_dict["_graph_meeting_id"] = meeting_vector_store._meeting_id(data_dict)

        should_skip = self._handle_duplicate(data_dict, force)
        if should_skip:
            return None

        meeting_title = data_dict.get("title", "")
        meeting_date = data_dict.get("date", "")
        raw_path = raw_meeting_store.save(text, title=meeting_title, date=meeting_date)

        data_dict["_original_text"] = text
        data_dict["_original_text_path"] = raw_path

        meeting_filename = f"{meeting_vector_store._meeting_id(data_dict)}.md"

        data_dict["action_items"] = state_store.merge_action_items(
            data_dict.get("action_items", []),
            meeting_title,
            meeting_date,
            meeting_filename,
        )
        data_dict["metrics"] = state_store.merge_metrics(
            data_dict.get("metrics", []),
            meeting_title,
            meeting_date,
            meeting_filename,
        )

        # NOTE(review): state is saved before the vector/graph writes, so a
        # failure in either write leaves the content hash recorded -- confirm
        # this ordering is intended.
        state_store.add_content_hash(content_hash, meeting_title, meeting_date, meeting_filename)
        state_store.save()
        meeting_vector_store.add_meeting(data_dict)
        graph_store.upsert_meeting_subgraph(data_dict)

        logger.info("Meeting processed: %s", meeting_title)
        return raw_path

    def _prompt_skip_or_overwrite(self) -> str:
        """Ask on stdin whether to skip or overwrite; return ``"s"`` or ``"o"``.

        An empty answer means skip. The prompt repeats until a valid answer is
        given. If stdin is closed (``EOFError``, e.g. a non-interactive run),
        the default skip is returned instead of crashing.
        """
        while True:
            try:
                answer = input("选择 [s]跳过 / [o]覆盖(默认 s):")
            except EOFError:
                logger.warning("No interactive input available; defaulting to skip")
                return "s"
            choice = answer.strip().lower() or "s"
            if choice in ("s", "o"):
                return choice
            print("请输入 s 或 o。")

    def _handle_duplicate(self, data_dict: dict, force: bool) -> bool:
        """Resolve a clash with an already stored meeting of the same title/date.

        Args:
            data_dict: Extracted meeting data; ``title`` and ``date`` are read.
            force: When true, overwrite the existing meeting without prompting.

        Returns:
            ``True`` when the caller should skip this meeting, ``False`` when
            processing should continue (no duplicate, or the old one was
            removed).
        """
        title = data_dict.get("title", "")
        date = data_dict.get("date", "")
        existing = meeting_vector_store.find_meeting(title, date)

        if not existing:
            return False

        if force:
            logger.info("Duplicate meeting found; overwriting in force mode: %s", title)
            self._remove_old(data_dict, existing)
            return False

        print(f"\n发现重复会议:{title} ({date})")
        if self._prompt_skip_or_overwrite() == "s":
            logger.info("Skipped duplicate meeting: %s", title)
            return True

        self._remove_old(data_dict, existing)
        return False

    def _remove_old(self, data_dict: dict, existing: Optional[dict] = None):
        """Remove the stored artifacts of the meeting that is being replaced.

        Removes the meeting from the vector store and the graph store, then
        drops the new content hash and, if it differs, the existing record's
        ``content_hash`` from the state store.

        Args:
            data_dict: New meeting data; used to derive the meeting id and to
                read ``_content_hash``.
            existing: Previously stored record, if any; its ``content_hash``
                is read.
        """
        meeting_id = meeting_vector_store._meeting_id(data_dict)
        meeting_vector_store.remove_meeting(meeting_id)
        graph_store.remove_meeting_subgraph(meeting_id)

        new_hash = data_dict.get("_content_hash", "")
        if new_hash:
            state_store.remove_content_hash(new_hash)

        if existing:
            old_hash = existing.get("content_hash", "")
            if old_hash and old_hash != new_hash:
                state_store.remove_content_hash(old_hash)

        logger.info("Removed old meeting artifacts: %s", data_dict.get("title", ""))

    def _compute_content_hash(self, text: str) -> str:
        """Return the SHA-256 hex digest of *text* after light normalization.

        Leading/trailing whitespace is stripped and CRLF line endings are
        converted to LF before hashing, so those differences do not change
        the digest.
        """
        normalized = text.strip().replace("\r\n", "\n")
        return hashlib.sha256(normalized.encode("utf-8")).hexdigest()

    def _extract(self, text: str) -> "Optional[MeetingExtraction]":
        """Run ``extract_meeting_info`` on *text*; return ``None`` on any error.

        The broad ``except`` is deliberate: extraction failures of any kind
        are reported to the caller as ``None``. The failure is logged with
        its traceback.
        """
        try:
            return extract_meeting_info(text)
        except Exception as exc:
            logger.exception("LLM extraction failed: %s", exc)
            return None

    def query(self, question: str, top_k: int = 3) -> str:
        """Build a text context for *question* from the vector and graph stores.

        Args:
            question: Query text passed to both stores.
            top_k: Passed as ``top_k`` to the vector store and as ``limit``
                to the graph store.

        Returns:
            The non-empty sections ("Vector Context", "Graph Facts") joined by
            a blank line; an empty string when neither store returns anything.
        """
        vector_context = meeting_vector_store.query_as_context(question, top_k=top_k)
        graph_results = graph_store.search_facts(question, limit=top_k)

        parts = []
        if vector_context:
            parts.append("=== Vector Context ===\n" + vector_context)

        if graph_results:
            graph_lines = []
            for idx, row in enumerate(graph_results, start=1):
                # Fall back to the row's kind, then to a generic label, when
                # the row has no title.
                title = row.get("title", row.get("kind", "graph"))
                body = row.get("text", "")
                date = row.get("date", "")
                suffix = f" ({date})" if date else ""
                graph_lines.append(f"[{idx}] {title}{suffix}\n{body}")
            parts.append("=== Graph Facts ===\n" + "\n\n".join(graph_lines))

        return "\n\n".join(parts)

    def stats(self) -> dict:
        """Return statistics from each store plus the configured storage paths."""
        return {
            "vector_index": meeting_vector_store.get_stats(),
            "graph": graph_store.get_stats(),
            "state": state_store.get_stats(),
            "raw_dir": config.storage.raw_dir,
            "state_path": config.state_path,
        }
|
|||
|
|
|
|||
|
|
|
|||
|
|
# Module-level MeetingProcessor instance, created at import time
# (presumably the shared entry point for importers -- confirm against callers).
meeting_processor = MeetingProcessor()
|