meeting_memory/meeting_processor.py

175 lines
6.4 KiB
Python
Raw Normal View History

2026-06-09 02:38:24 +00:00
import hashlib
import logging
import os
from typing import Optional
from config import config
from extractor import MeetingExtraction, extract_meeting_info
from graph_store import graph_store
from meeting_state import MeetingStateStore
from raw_store import raw_meeting_store
from vector_store import meeting_vector_store
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
# Shared state store built from the configured state path; MeetingProcessor
# uses it for content hashes and for merging action items and metrics.
state_store = MeetingStateStore(config.state_path)
class MeetingProcessor:
    """Ingest meeting transcripts and answer questions over stored meetings.

    Ingestion runs three duplicate checks (exact content hash, vector
    similarity, then title/date lookup), extracts structured data from the
    text, and writes the result to the raw store, the state store, the vector
    index and the graph store.
    """

    # Threshold passed to the vector store's similar-text lookup.
    SIMILARITY_THRESHOLD = 0.92
    # Prompt shown whenever the user must choose between skipping and overwriting.
    _CHOICE_PROMPT = "选择 [s]跳过 / [o]覆盖(默认 s): "

    def process_meeting_file(self, filepath: str, force: bool = False) -> Optional[str]:
        """Read a UTF-8 text file and process its content as a meeting.

        Args:
            filepath: Path of the transcript file to read.
            force: Forwarded unchanged to ``process_meeting_text``.

        Returns:
            Whatever ``process_meeting_text`` returns for the file's content.
        """
        with open(filepath, "r", encoding="utf-8") as f:
            text = f.read()
        return self.process_meeting_text(text, force=force)

    def process_meeting_text(self, text: str, force: bool = False) -> Optional[str]:
        """Extract, de-duplicate and store one meeting transcript.

        Args:
            text: Raw meeting text.
            force: When true, skip the content-hash and similarity checks and
                overwrite an existing meeting with the same title/date without
                prompting.

        Returns:
            The value returned by ``raw_meeting_store.save`` for the stored
            text, or ``None`` when the meeting was skipped or extraction
            failed.
        """
        content_hash = self._compute_content_hash(text)

        if not force:
            if state_store.has_content_hash(content_hash):
                print("\n检测到重复内容,已跳过。")
                logger.info("Duplicate content hash skipped: %s", content_hash[:12])
                return None

            similar = meeting_vector_store.find_similar_text(
                text, threshold=self.SIMILARITY_THRESHOLD
            )
            if similar:
                meta = similar["metadata"]
                print(
                    f"\n发现相似会议:{meta.get('title', '')} ({meta.get('date', '')}) "
                    f"相似度 {similar['score']:.2%}"
                )
                if self._prompt_skip_or_overwrite() == "s":
                    logger.info("Skipped similar meeting: %s", meta.get("title", ""))
                    return None
                # The user chose to overwrite: behave as force mode from here on
                # so the title/date duplicate check does not prompt again.
                force = True

        meeting_data = self._extract(text)
        if not meeting_data:
            logger.error("Failed to extract meeting information")
            return None

        data_dict = meeting_data.model_dump()
        data_dict["_content_hash"] = content_hash
        data_dict["_graph_meeting_id"] = meeting_vector_store._meeting_id(data_dict)

        if self._handle_duplicate(data_dict, force):
            return None

        meeting_title = data_dict.get("title", "")
        meeting_date = data_dict.get("date", "")
        raw_path = raw_meeting_store.save(text, title=meeting_title, date=meeting_date)
        data_dict["_original_text"] = text
        data_dict["_original_text_path"] = raw_path

        meeting_filename = f"{meeting_vector_store._meeting_id(data_dict)}.md"
        data_dict["action_items"] = state_store.merge_action_items(
            data_dict.get("action_items", []),
            meeting_title,
            meeting_date,
            meeting_filename,
        )
        data_dict["metrics"] = state_store.merge_metrics(
            data_dict.get("metrics", []),
            meeting_title,
            meeting_date,
            meeting_filename,
        )

        # Index first and record the content hash last: if either store write
        # raises, no hash is persisted, so a retry is not silently skipped as
        # "duplicate content" for a meeting that was never indexed.
        meeting_vector_store.add_meeting(data_dict)
        graph_store.upsert_meeting_subgraph(data_dict)
        state_store.add_content_hash(content_hash, meeting_title, meeting_date, meeting_filename)
        state_store.save()

        logger.info("Meeting processed: %s", meeting_title)
        return raw_path

    def _prompt_skip_or_overwrite(self, read=None) -> str:
        """Ask the user to skip or overwrite and return ``"s"`` or ``"o"``.

        Empty input selects the default ``"s"``. Any other answer re-prompts.
        If the input stream is exhausted (``EOFError``, e.g. a non-interactive
        run), the default ``"s"`` is returned instead of crashing.

        Args:
            read: Optional callable taking the prompt and returning the answer;
                defaults to the builtin ``input``, resolved at call time.
        """
        reader = input if read is None else read
        while True:
            try:
                choice = reader(self._CHOICE_PROMPT).strip().lower() or "s"
            except EOFError:
                # Nobody can answer: fall back to the advertised default.
                return "s"
            if choice in ("s", "o"):
                return choice
            print("请输入 s 或 o。")

    def _handle_duplicate(self, data_dict: dict, force: bool) -> bool:
        """Resolve a clash with an already stored meeting of the same title/date.

        Args:
            data_dict: Extracted meeting data; ``title`` and ``date`` are used
                for the lookup.
            force: When true, overwrite the existing meeting without prompting.

        Returns:
            ``True`` when the caller should skip this meeting, ``False`` when
            processing should continue (no duplicate, or the old meeting was
            removed).
        """
        title = data_dict.get("title", "")
        date = data_dict.get("date", "")
        existing = meeting_vector_store.find_meeting(title, date)
        if not existing:
            return False

        if force:
            logger.info("Duplicate meeting found; overwriting in force mode: %s", title)
        else:
            print(f"\n发现重复会议:{title} ({date})")
            if self._prompt_skip_or_overwrite() == "s":
                logger.info("Skipped duplicate meeting: %s", title)
                return True

        self._remove_old(data_dict, existing)
        return False

    def _remove_old(self, data_dict: dict, existing: Optional[dict] = None):
        """Delete the stored artifacts of the meeting that is being replaced.

        Removes the meeting from the vector store and the graph store, and
        drops both the new content hash and, if it differs, the hash recorded
        on ``existing`` from the state store.

        Args:
            data_dict: Data of the incoming meeting; its id and
                ``_content_hash`` are used.
            existing: Record of the stored meeting, if available; its
                ``content_hash`` entry is read.
        """
        meeting_id = meeting_vector_store._meeting_id(data_dict)
        meeting_vector_store.remove_meeting(meeting_id)
        graph_store.remove_meeting_subgraph(meeting_id)

        new_hash = data_dict.get("_content_hash", "")
        if new_hash:
            state_store.remove_content_hash(new_hash)
        if existing:
            old_hash = existing.get("content_hash", "")
            if old_hash and old_hash != new_hash:
                state_store.remove_content_hash(old_hash)

        logger.info("Removed old meeting artifacts: %s", data_dict.get("title", ""))

    def _compute_content_hash(self, text: str) -> str:
        """Return the SHA-256 hex digest of *text* after light normalization.

        Leading/trailing whitespace is stripped and ``\\r\\n`` is converted to
        ``\\n`` so that line-ending differences do not change the hash.
        """
        normalized = text.strip().replace("\r\n", "\n")
        return hashlib.sha256(normalized.encode("utf-8")).hexdigest()

    def _extract(self, text: str) -> Optional["MeetingExtraction"]:
        """Run ``extract_meeting_info`` on *text*; return ``None`` on any failure.

        Failures are logged with their traceback and are not propagated, so
        the caller can treat extraction errors as a skipped meeting.
        """
        try:
            return extract_meeting_info(text)
        except Exception as exc:
            # Boundary around the extractor: log with traceback, do not propagate.
            logger.exception("LLM extraction failed: %s", exc)
            return None

    def query(self, question: str, top_k: int = 3) -> str:
        """Build a text context for *question* from the vector and graph stores.

        Args:
            question: The user's question.
            top_k: Passed as ``top_k`` to the vector store and as ``limit`` to
                the graph store.

        Returns:
            The non-empty sections ("Vector Context", "Graph Facts") joined by
            blank lines; an empty string when neither store returns anything.
        """
        vector_context = meeting_vector_store.query_as_context(question, top_k=top_k)
        graph_results = graph_store.search_facts(question, limit=top_k)

        parts = []
        if vector_context:
            parts.append("=== Vector Context ===\n" + vector_context)
        if graph_results:
            graph_lines = []
            for idx, row in enumerate(graph_results, start=1):
                # Fall back to the row's kind, then to a generic label.
                title = row.get("title", row.get("kind", "graph"))
                text = row.get("text", "")
                date = row.get("date", "")
                suffix = f" ({date})" if date else ""
                graph_lines.append(f"[{idx}] {title}{suffix}\n{text}")
            parts.append("=== Graph Facts ===\n" + "\n\n".join(graph_lines))
        return "\n\n".join(parts)

    def stats(self) -> dict:
        """Return the stats of each store plus the configured storage locations."""
        return {
            "vector_index": meeting_vector_store.get_stats(),
            "graph": graph_store.get_stats(),
            "state": state_store.get_stats(),
            "raw_dir": config.storage.raw_dir,
            "state_path": config.state_path,
        }
# Module-level MeetingProcessor instance created at import time
# (presumably the shared entry point for callers — usage is not visible here).
meeting_processor = MeetingProcessor()