diff --git a/README.md b/README.md index 7033482..c1384f9 100644 --- a/README.md +++ b/README.md @@ -143,23 +143,9 @@ python .\main_cli.py --offline 单条离线导入测试: ```powershell -python .\main_cli.py --offline --once "导入 D:\github_project\my_code\meeting_agent\examples\huiyi.txt 知识库 合川分公司" +python .\main_cli.py --offline --once "导入 D:\github_project\my_code\meeting_agent\examples\huiyi.txt" ``` - -## 知识库选择约定 - -当前只是演示 demo,不做团队隔离。`data/` 下每个子文件夹视作一个独立知识库,例如: - -```text -data/ -├─ 合川分公司/ -└─ 另一个知识库/ -``` - -未来 Web 界面可以列出这些文件夹,让用户选择一个知识库后,把文件夹名作为 `knowledge_base_id` 传给工具进行针对性提问。`team_id` 只是旧兼容参数,后续推荐使用 `knowledge_base_id`。 - -在当前这版 Web 里,系统会把每个知识库自动映射成一个可进入的会话入口;选中对应会话后提问,就会优先基于该知识库回答。 ## Agent 工具 - `current_time`:获取当前时间。 @@ -170,11 +156,10 @@ data/ - `execute_shell`:执行 shell 命令做环境检查或调试。 - `run_python`:执行简短 Python 代码做快速处理。 - `tool_trace_query`:按需查询本地保存的工具调用轨迹。 -- `import_meeting_transcript`:导入会议原文,交给会议整理子 Agent 做项目化整理,并更新指定知识库的 Markdown 台账。 -- `query_knowledge`:查询指定知识库周会台账。 -- `list_knowledge_bases`:列出 `data/` 下已有知识库。 +- `store_meeting_memory`:把会议原文写入长期会议记忆,执行抽取、归档、状态合并与图谱索引。 +- `query_meeting_memory`:查询长期会议记忆中的会议上下文、实体关系、行动项和指标信息。 -LLM Agent 会在对话中自动选择这些工具。会议整理本身由 `prompt/zh/meeting_digest.yaml` 驱动的子 Agent 完成,主 Agent 只负责调工具和查询台账。 +LLM Agent 会在对话中自动选择这些工具。当前知识功能已经切换到图谱化的 `meeting_memory` 管线,不再维护旧版 Markdown 台账知识库。 ## 多轮会话机制 @@ -185,7 +170,7 @@ LLM Agent 会在对话中自动选择这些工具。会议整理本身由 `promp - 工具调用的参数、思考、结果会单独保存到本地 `tool_trace.json`。 - 默认不会把大量工具过程长期塞回主上下文;需要时由 Agent 调用 `tool_trace_query` 再查。 -CLI 默认使用会话 `cli_default`。Web 侧每个聊天或知识库会映射到独立的本地会话目录。 +CLI 默认使用会话 `cli_default`。Web 侧每个聊天会映射到独立的本地会话目录。 ## 输出数据 @@ -193,10 +178,8 @@ CLI 默认使用会话 `cli_default`。Web 侧每个聊天或知识库会映射 ```text data/ -└─ 合川分公司/ # 一个知识库示例 - ├─ state.json - ├─ team_ledger.md - └─ meetings/ +├─ meeting_state.json +└─ raw/ ``` `team_ledger.md` 是面向人和 Agent 共同读取的主文档;`state.json` 和 `meetings/*.json` 是可复用的结构化中间结果。 diff --git a/main_cli.py b/main_cli.py index 7aa54be..417f971 100644 --- a/main_cli.py +++ b/main_cli.py @@ -26,8 +26,8 @@ class TerminalRenderer: def build_parser() -> argparse.ArgumentParser: - parser = argparse.ArgumentParser(description="Meeting knowledge agent CLI.") - parser.add_argument("--data-dir", default="data", help="台账和结构化数据保存目录。") + parser = argparse.ArgumentParser(description="Meeting memory agent CLI.") + parser.add_argument("--data-dir", default="data", help="会议记忆与原文归档保存目录。") parser.add_argument("--once", default="", help="只执行一条消息,适合测试。") parser.add_argument("--offline", action="store_true", help="只用于本地演示:强制使用离线规则 provider,不调用大模型。") parser.add_argument("--list-models", action="store_true", help="列出当前 OpenAI-compatible 服务暴露的模型名。") @@ -66,9 +66,9 @@ def main() -> None: renderer.render_event(event) return - print("Meeting Knowledge Interactive Agent") - print(r"示例:导入 D:\github_project\my_code\meeting_agent\examples\huiyi.txt 到知识库 合川分公司") - print("示例:专线护航还有哪些待办?知识库 合川分公司") + print("Meeting Memory Interactive Agent") + print(r"示例:导入 D:\github_project\my_code\meeting_agent\examples\huiyi.txt") + print("示例:最近一次会议提到了哪些待办?") print("输入 exit / quit 退出。") while True: try: diff --git a/meeting_memory/__init__.py b/meeting_memory/__init__.py index c880a34..9a1087c 100644 --- a/meeting_memory/__init__.py +++ b/meeting_memory/__init__.py @@ -1,5 +1,3 @@ -"""Meeting knowledge demo package.""" +from meeting_memory.config import config -from .service import MeetingKnowledgeService - -__all__ = ["MeetingKnowledgeService"] +__all__ = ["config"] diff --git a/meeting_memory/config.py b/meeting_memory/config.py new file mode 100644 index 0000000..8bd9abe --- /dev/null +++ b/meeting_memory/config.py @@ -0,0 +1,46 @@ +import os + +from dotenv import load_dotenv +from pydantic import BaseModel, Field + +load_dotenv() + +PACKAGE_ROOT = os.path.dirname(os.path.abspath(__file__)) +PROJECT_ROOT = os.path.dirname(PACKAGE_ROOT) + + +class LLMConfig(BaseModel): + api_key: str = Field(default=os.getenv("LLM_API_KEY", "")) + base_url: str = Field(default=os.getenv("LLM_BASE_URL", "https://api.deepseek.com/v1")) + model: str = Field(default=os.getenv("LLM_MODEL", "deepseek-chat")) + max_tokens: int = Field(default=64000) + temperature: float = Field(default=0.95) + + +class EmbeddingConfig(BaseModel): + api_key: str = Field(default=os.getenv("EMBEDDING_API_KEY", "")) + api_base: str = Field(default=os.getenv("EMBEDDING_BASE_URL", "https://api.openai.com/v1")) + model: str = Field(default=os.getenv("EMBEDDING_MODEL", "text-embedding-3-small")) + + +class StorageConfig(BaseModel): + data_dir: str = Field(default=os.path.join(PROJECT_ROOT, "data")) + raw_dir: str = Field(default=os.path.join(PROJECT_ROOT, "data", "raw")) + +class Neo4jConfig(BaseModel): + enabled: bool = Field(default=os.getenv("NEO4J_ENABLED", "false").lower() == "true") + uri: str = Field(default=os.getenv("NEO4J_URI", "bolt://localhost:7687")) + user: str = Field(default=os.getenv("NEO4J_USER", "neo4j")) + password: str = Field(default=os.getenv("NEO4J_PASSWORD", "")) + database: str = Field(default=os.getenv("NEO4J_DATABASE", "neo4j")) + + +class ProjectConfig(BaseModel): + llm: LLMConfig = Field(default_factory=LLMConfig) + embedding: EmbeddingConfig = Field(default_factory=EmbeddingConfig) + storage: StorageConfig = Field(default_factory=StorageConfig) + neo4j: Neo4jConfig = Field(default_factory=Neo4jConfig) + state_path: str = Field(default=os.path.join(PROJECT_ROOT, "data", "meeting_state.json")) + + +config = ProjectConfig() diff --git a/meeting_memory/dedup_helpers.py b/meeting_memory/dedup_helpers.py new file mode 100644 index 0000000..b2dd7e7 --- /dev/null +++ b/meeting_memory/dedup_helpers.py @@ -0,0 +1,143 @@ +import hashlib +import logging +import math +import re +from typing import Any, Dict, List, Set, Tuple + +logger = logging.getLogger(__name__) + +_MINHASH_SEEDS = [ + 5483, 7361, 8919, 1103, 4597, 6827, 2341, 9973, + 3851, 6217, 7589, 1447, 5021, 8363, 2711, 9539, + 1973, 6689, 4157, 8831, 3209, 7547, 1627, 5981, + 4363, 8707, 2857, 7193, 1543, 6299, 4729, 9067, +] + +_BAND_SIZE = 4 +_NUM_BANDS = len(_MINHASH_SEEDS) // _BAND_SIZE +_FUZZY_JACCARD_THRESHOLD = 0.9 + + +def normalize_name(name: str) -> str: + return re.sub(r'\s+', ' ', name.strip().lower()) + + +def shannon_entropy(text: str) -> float: + if not text: + return 0.0 + lowered = text.lower() + freq: Dict[str, int] = {} + for c in lowered: + freq[c] = freq.get(c, 0) + 1 + entropy = 0.0 + for count in freq.values(): + p = count / len(lowered) + entropy -= p * math.log2(p) + return entropy + + +def _shingle(s: str, k: int = 3) -> Set[str]: + return {s[i:i + k] for i in range(len(s) - k + 1)} + + +def _minhash_signature(text: str) -> List[int]: + shingles = _shingle(text) + sig = [] + for seed in _MINHASH_SEEDS: + min_hash = min( + int(hashlib.md5(f'{seed}{s}'.encode()).hexdigest()[:8], 16) + for s in shingles + ) + sig.append(min_hash) + return sig + + +def _lsh_bands(signature: List[int]) -> Set[Tuple[int, int]]: + bands = set() + for i in range(_NUM_BANDS): + start = i * _BAND_SIZE + band_hash = hash(tuple(signature[start:start + _BAND_SIZE])) + bands.add((i, band_hash)) + return bands + + +def _jaccard_similarity(sig_a: List[int], sig_b: List[int]) -> float: + if not sig_a or not sig_b: + return 0.0 + intersection = sum(1 for a, b in zip(sig_a, sig_b) if a == b) + return intersection / len(sig_a) + + +def resolve_with_similarity( + extracted: List[Dict[str, Any]], + existing: List[Dict[str, Any]], +) -> Tuple[List[int], List[int]]: + resolved_indices: List[int] = [] + unresolved_indices: List[int] = [] + + existing_norm: Dict[str, Dict] = {} + for c in existing: + norm = normalize_name(c.get('name', '')) + if norm: + existing_norm[norm] = c + + for c in existing: + c_name = c.get('name', '') + if c_name: + entropy = shannon_entropy(c_name) + tokens = normalize_name(c_name).split() + if len(c_name) >= 6 and len(tokens) >= 2 and entropy >= 1.5: + c['_minhash'] = _minhash_signature(c_name) + else: + c['_minhash'] = None + + for idx, entity in enumerate(extracted): + name = entity.get('name', '') + if not name: + unresolved_indices.append(idx) + continue + + norm = normalize_name(name) + + if norm in existing_norm: + match = existing_norm[norm] + entity['_resolved_to'] = match + entity['name'] = match.get('name', entity['name']) + resolved_indices.append(idx) + continue + + tokens = norm.split() + entropy = shannon_entropy(name) + can_fuzzy = len(name) >= 6 and len(tokens) >= 2 and entropy >= 1.5 + + if can_fuzzy: + entity_sig = _minhash_signature(name) + entity_bands = _lsh_bands(entity_sig) + best_match = None + best_score = 0.0 + + for c in existing: + c_sig = c.get('_minhash') + if c_sig is None: + continue + c_bands = _lsh_bands(c_sig) + if not entity_bands & c_bands: + continue + score = _jaccard_similarity(entity_sig, c_sig) + if score >= _FUZZY_JACCARD_THRESHOLD and score > best_score: + best_score = score + best_match = c + + if best_match is not None: + entity['_resolved_to'] = best_match + entity['name'] = best_match.get('name', entity['name']) + entity['_match_score'] = best_score + resolved_indices.append(idx) + continue + + unresolved_indices.append(idx) + + for c in existing: + c.pop('_minhash', None) + + return resolved_indices, unresolved_indices diff --git a/meeting_memory/extractor.py b/meeting_memory/extractor.py index 0ecaf6e..bc047bc 100644 --- a/meeting_memory/extractor.py +++ b/meeting_memory/extractor.py @@ -1,260 +1,633 @@ -from __future__ import annotations - -import hashlib +import json +import logging import re -from pathlib import Path +import sys +from enum import Enum +from typing import Any, List, Optional -from .models import ExtractedMeeting +from openai import OpenAI +from pydantic import BaseModel, Field + +from meeting_memory.config import config +from meeting_memory.prompts import extract_entities as prompt_extract_entities +from meeting_memory.prompts import extract_facts as prompt_extract_facts +from meeting_memory.prompts import resolve_entities as prompt_dedupe_nodes +from meeting_memory.prompts import resolve_facts as prompt_dedupe_edges +from meeting_memory.prompts import summarize_entity as prompt_summarize + +logger = logging.getLogger(__name__) + +client = OpenAI( + api_key=config.llm.api_key or None, + base_url=config.llm.base_url if config.llm.base_url else None, +) -TIME_RE = re.compile(r"^\d{1,2}:\d{2}$") -SPEAKER_RE = re.compile(r"^未知说话人\d+") +class EntityType(str, Enum): + DEPARTMENT = 'Department' + PROJECT = 'Project' + METRIC = 'Metric' + PERSON = 'Person' + SYSTEM = 'System' + DOCUMENT = 'Document' + PARTICIPANT = 'participant' + UNKNOWN = 'Unknown' -KEYWORDS = { - "action": ("要", "需要", "请", "争取", "预计", "本周", "下周", "今天", "月底", "会后", "汇报", "完成", "制定", "上报"), - "risk": ("问题", "风险", "未达标", "靠后", "恶化", "影响", "没有", "没达", "退单", "故障", "不满"), - "done": ("已完成", "完成了", "已经完成", "达标", "解决了", "已提交", "已汇报", "已报备"), + +# Normalization map: legacy LLM output → canonical type +_ENTITY_TYPE_ALIASES = { + '组织': 'Department', + 'organization': 'Department', + '部门': 'Department', + '指标': 'Metric', + 'kpi': 'Metric', + '项目': 'Project', } -METRIC_PATTERNS = ( - re.compile(r"(?P弱光).*?(?P\d+(?:\.\d+)?)"), - re.compile(r"(?P转化率).*?(?P\d+(?:\.\d+)?)"), - re.compile(r"(?P退单率).*?(?P\d+(?:\.\d+)?)"), - re.compile(r"(?P蓝单).*?(?P\d+(?:\.\d+)?)"), - re.compile(r"(?P满意度).*?(?P\d+(?:\.\d+)?)"), - re.compile(r"(?PKPI).*?(?P\d+(?:\.\d+)?)"), -) -TARGET_PATTERNS = ( - re.compile(r"目标(?P\d+(?:\.\d+)?%?)"), - re.compile(r"达到(?P\d+(?:\.\d+)?%?)"), - re.compile(r"低于(?P\d+(?:\.\d+)?%?)"), - re.compile(r"小于(?P\d+(?:\.\d+)?%?)"), -) +def _canonical_entity_type(raw: str) -> str: + normalized = raw.strip() + if normalized in _ENTITY_TYPE_ALIASES: + return _ENTITY_TYPE_ALIASES[normalized] + for member in EntityType: + if member.value.lower() == normalized.lower(): + return member.value + return EntityType.UNKNOWN.value -def extract_meeting(text: str, *, source_path: str | Path, meeting_date: str) -> ExtractedMeeting: - source = str(Path(source_path).resolve()) - meeting_id = _meeting_id(source, text) - utterances = _parse_utterances(text) - meeting = ExtractedMeeting(meeting_id=meeting_id, meeting_date=meeting_date, source_path=source) - - for item in utterances: - content = _clean_sentence(item["content"]) - if len(content) < 8: - continue - evidence = {"time": item["time"], "speaker": item["speaker"], "text": content} - meeting.evidence.append(evidence) - if _contains(content, KEYWORDS["done"]): - meeting.completed_items.append(_item(content, evidence, status="已完成/已处理")) - if _contains(content, KEYWORDS["risk"]): - meeting.risks.append(_risk(content, evidence)) - if _contains(content, KEYWORDS["action"]): - meeting.action_items.append(_action(content, evidence)) - metric = _metric(content, evidence) - if metric: - meeting.metrics.append(metric) - - meeting.summary = _make_summary(meeting) - meeting.followups = _make_followups(meeting) - meeting.metrics = _dedupe_dicts(meeting.metrics, ("name", "value", "evidence")) - meeting.action_items = _dedupe_dicts(meeting.action_items, ("title", "evidence")) - meeting.risks = _dedupe_dicts(meeting.risks, ("title", "evidence")) - meeting.completed_items = _dedupe_dicts(meeting.completed_items, ("title", "evidence")) - return meeting +def _neo4j_labels(entity_type: str) -> list[str]: + canonical = _canonical_entity_type(entity_type) + labels = ['Entity'] + if canonical != EntityType.UNKNOWN.value: + labels.append(canonical) + return labels -def _parse_utterances(text: str) -> list[dict[str, str]]: - lines = [line.strip() for line in text.splitlines() if line.strip()] - utterances: list[dict[str, str]] = [] - speaker = "" - time = "" - pending: list[str] = [] - - def flush() -> None: - nonlocal pending - if pending: - utterances.append({"speaker": speaker, "time": time, "content": " ".join(pending)}) - pending = [] - - for line in lines: - if SPEAKER_RE.match(line): - flush() - speaker = line - time = "" - continue - if TIME_RE.match(line): - flush() - time = line - continue - pending.append(line) - flush() - return utterances +class Entity(BaseModel): + name: str + entity_type: str = EntityType.UNKNOWN.value + description: str = '' -def _clean_sentence(text: str) -> str: - text = re.sub(r"\s+", " ", text).strip() - text = re.sub(r"[啊哈嗯呃]{2,}", "", text) - return text +class Relation(BaseModel): + source_entity_name: str + target_entity_name: str + relation_type: str + fact: str = '' + valid_at: str = '' + invalid_at: str = '' + evidence: str = '' + qualifiers: List[str] = Field(default_factory=list) + confidence: float = 0.0 -def _contains(text: str, words: tuple[str, ...]) -> bool: - return any(word in text for word in words) +class ActionItem(BaseModel): + task: str + assignee: str = '' + deadline: str = '' + status: str = '待办' + priority: str = '中' -def _item(content: str, evidence: dict[str, str], *, status: str) -> dict[str, str]: - return { - "title": _short_title(content), - "status": status, - "owner": _guess_owner(content), - "deadline": _guess_deadline(content), - "evidence": _evidence_ref(evidence), +class Decision(BaseModel): + content: str + proposer: str = '' + status: str = '已决' + + +class MeetingMetric(BaseModel): + metric_name: str + value: str + target: str = '' + owner: str = '' + trend: str = '' + unit: str = '' + + +class DepartmentInfo(BaseModel): + name: str + description: str = '' + projects: List[str] = Field(default_factory=list) + + +class MeetingExtraction(BaseModel): + title: str + date: str = '' + participants: List[str] = Field(default_factory=list) + agenda: List[str] = Field(default_factory=list) + entities: List[Entity] = Field(default_factory=list) + relations: List[Relation] = Field(default_factory=list) + action_items: List[ActionItem] = Field(default_factory=list) + decisions: List[Decision] = Field(default_factory=list) + metrics: List[MeetingMetric] = Field(default_factory=list) + departments: List[DepartmentInfo] = Field(default_factory=list) + summary: str = '' + + +def _call_llm( + messages: list[dict], + response_model: type | None = None, + stream: bool = False, + max_tokens: int | None = None, +) -> Any: + kwargs = { + 'model': config.llm.model, + 'messages': messages, + 'max_tokens': max_tokens or config.llm.max_tokens, + 'temperature': config.llm.temperature, } + if response_model is not None: + kwargs['response_format'] = {'type': 'json_object'} + if stream: + kwargs['stream'] = True + if not stream: + response = client.chat.completions.create(**kwargs) + content = response.choices[0].message.content + if content is None: + raise ValueError('LLM returned empty response') + return content -def _action(content: str, evidence: dict[str, str]) -> dict[str, str]: - return { - "title": _short_title(content), - "owner": _guess_owner(content), - "deadline": _guess_deadline(content), - "next_step": _short_title(content, limit=80), - "status": "待跟进", - "evidence": _evidence_ref(evidence), - } - - -def _risk(content: str, evidence: dict[str, str]) -> dict[str, str]: - return { - "title": _short_title(content), - "impact": _guess_impact(content), - "current_handling": _short_title(content, limit=90), - "evidence": _evidence_ref(evidence), - } - - -def _metric(content: str, evidence: dict[str, str]) -> dict[str, str] | None: - for pattern in METRIC_PATTERNS: - match = pattern.search(content) - if not match: + kwargs['stream'] = True + response = client.chat.completions.create(**kwargs) + chunks: List[str] = [] + print('\n[LLM] 开始流式输出:') + for event in response: + if not event.choices: continue - name = match.group("name") - value = match.group("value") - target = "" - for target_pattern in TARGET_PATTERNS: - target_match = target_pattern.search(content) - if target_match: - target = target_match.group("target") - break - status = "待判断" - if "达标" in content and "未达标" not in content: - status = "达标" - elif "没达" in content or "未达标" in content or "靠后" in content: - status = "未达标/靠后" - elif "向好" in content or "提升" in content or "改善" in content: - status = "改善中" - return { - "name": name, - "value": value, - "target": target, - "status": status, - "trend": _guess_trend(content), - "evidence": _evidence_ref(evidence), - } - return None + delta = event.choices[0].delta.content + if not delta: + continue + chunks.append(delta) + sys.stdout.write(delta) + sys.stdout.flush() + print('\n[LLM] 输出结束') + return ''.join(chunks) -def _make_summary(meeting: ExtractedMeeting) -> list[str]: - parts = [] - if meeting.metrics: - parts.append(f"抽取到 {len(meeting.metrics)} 条指标状态。") - if meeting.action_items: - parts.append(f"抽取到 {len(meeting.action_items)} 条待跟进事项。") - if meeting.risks: - parts.append(f"抽取到 {len(meeting.risks)} 条问题/风险线索。") - if meeting.completed_items: - parts.append(f"抽取到 {len(meeting.completed_items)} 条已完成或已处理事项。") - return parts or ["本次会议已导入,未抽取到明显结构化事项。"] - - -def _make_followups(meeting: ExtractedMeeting) -> list[dict[str, str]]: - followups = [] - for item in meeting.action_items[:20]: - if item["deadline"] or any(word in item["title"] for word in ("汇报", "完成", "上报", "回复")): - followups.append( - { - "title": item["title"], - "owner": item["owner"], - "deadline": item["deadline"], - "evidence": item["evidence"], - } - ) - return followups - - -def _short_title(text: str, *, limit: int = 44) -> str: - text = re.sub(r"[。!?].*", "", text).strip(" ,,。") - return text if len(text) <= limit else text[:limit].rstrip() + "..." - - -def _guess_owner(text: str) -> str: - for owner in ("运维", "商务部", "市场部", "综合部", "政企部", "施工", "客服", "分局", "四公司", "市公司"): - if owner in text: - return owner - return "待确认" - - -def _guess_deadline(text: str) -> str: - patterns = ( - r"今天", - r"本周内?", - r"下周", - r"月底", - r"\d+月\d+号?之前", - r"\d+月底", - r"\d+周之内", - r"会后", - ) - for pattern in patterns: - match = re.search(pattern, text) +def _try_parse_json(content: str) -> dict | list: + try: + return json.loads(content) + except json.JSONDecodeError: + logger.warning('JSON parsing failed; trying to repair extracted block') + match = re.search(r'\{.*\}|\[.*\]', content, re.DOTALL) if match: - return match.group(0) - return "" + try: + return json.loads(match.group()) + except json.JSONDecodeError as exc: + logger.error('Repaired JSON still failed to parse: %s', exc) + raise -def _guess_impact(text: str) -> str: - for word in ("指标", "考核", "转化率", "退单率", "满意度", "专线", "客户", "故障"): - if word in text: - return f"影响{word}相关工作" - return "影响待确认" +def _normalize_string(name: str) -> str: + return re.sub(r'[\s]+', ' ', name.strip().lower()) -def _guess_trend(text: str) -> str: - if "恶化" in text or "下降" in text: - return "变差/下降" - if "向好" in text or "提升" in text or "改善" in text: - return "向好/改善" - if "平稳" in text: - return "平稳" - return "" +def _format_episodes_for_context(episodes: list[dict] | None) -> str: + if not episodes: + return '' + return '\n'.join( + f'[Episode {i}] {ep.get("content", "")}' + for i, ep in enumerate(episodes) + ) -def _evidence_ref(evidence: dict[str, str]) -> str: - return f"{evidence.get('time', '')} {evidence.get('speaker', '')}".strip() +# ===== Step 1: 实体节点抽取 ===== + +def extract_entities_from_text( + text: str, + previous_episodes: list[dict] | None = None, + entity_types: list[dict] | None = None, + stream: bool = False, +) -> list[dict]: + context = { + 'episode_content': text, + 'previous_episodes': previous_episodes or [], + 'entity_types': entity_types or [], + } + messages = prompt_extract_entities(context) + content = _call_llm(messages, stream=stream) + try: + data = _try_parse_json(content) + except Exception as exc: + logger.error('Failed to parse entity extraction result: %s', exc) + return [] + if isinstance(data, dict): + data = data.get('entities', data.get('extracted_entities', [])) + if not isinstance(data, list): + return [] + result = [] + for item in data: + if isinstance(item, dict) and item.get('name', '').strip(): + result.append({ + 'name': item['name'].strip(), + 'entity_type': item.get('entity_type', 'Entity'), + 'description': item.get('description', ''), + 'evidence': item.get('evidence', ''), + }) + return result -def _dedupe_dicts(items: list[dict[str, str]], keys: tuple[str, ...]) -> list[dict[str, str]]: - seen: set[tuple[str, ...]] = set() - output: list[dict[str, str]] = [] - for item in items: - marker = tuple(str(item.get(key, "")) for key in keys) - if marker in seen: +# ===== Step 2: 实体去重 ===== + +def resolve_entities_against_graph( + extracted: list[dict], + existing: list[dict], + episode_content: str = '', +) -> list[dict]: + if not existing: + return extracted + + context = { + 'extracted_entities': extracted, + 'existing_entities': existing, + 'episode_content': episode_content, + } + messages = prompt_dedupe_nodes(context) + content = _call_llm(messages) + try: + data = _try_parse_json(content) + except Exception as exc: + logger.warning('LLM dedup failed, keeping all extracted: %s', exc) + return extracted + + if isinstance(data, dict): + data = data.get('entity_resolutions', data.get('resolutions', [])) + + extracted_by_id = {i: e for i, e in enumerate(extracted)} + existing_by_id = {c.get('candidate_id'): c for c in existing} + + for resolution in (data if isinstance(data, list) else []): + if not isinstance(resolution, dict): continue - seen.add(marker) - output.append(item) - return output + rid = resolution.get('id') + dup_id = resolution.get('duplicate_candidate_id', -1) + if rid is None or rid not in extracted_by_id: + continue + if dup_id >= 0 and dup_id in existing_by_id: + extracted_by_id[rid]['_resolved_to'] = existing_by_id[dup_id] + extracted_by_id[rid]['name'] = resolution.get('name', extracted_by_id[rid]['name']) + + return [e for e in extracted_by_id.values() if '_resolved_to' not in e] -def _meeting_id(source: str, text: str) -> str: - digest = hashlib.sha1((source + "\n" + text[:4000]).encode("utf-8", errors="ignore")).hexdigest()[:10] - return f"meeting_{digest}" +# ===== Step 3: 事实关系抽取 ===== +def extract_facts_from_text( + text: str, + entities: list[dict], + reference_time: str = '', + previous_episodes: list[dict] | None = None, + stream: bool = False, +) -> list[dict]: + if len(entities) < 2: + return [] + + context = { + 'episode_content': text, + 'entities': entities, + 'reference_time': reference_time, + 'previous_episodes': previous_episodes or [], + } + messages = prompt_extract_facts(context) + content = _call_llm(messages, stream=stream) + try: + data = _try_parse_json(content) + except Exception as exc: + logger.error('Failed to parse fact extraction result: %s', exc) + return [] + + if isinstance(data, dict): + data = data.get('edges', data.get('facts', data.get('relations', []))) + + if not isinstance(data, list): + return [] + + entity_names = {_normalize_string(e.get('name', '')) for e in entities} + result = [] + for item in data: + if not isinstance(item, dict): + continue + src = _normalize_string(item.get('source_entity_name', '')) + tgt = _normalize_string(item.get('target_entity_name', '')) + if src not in entity_names or tgt not in entity_names: + continue + if src == tgt: + continue + result.append({ + 'source_entity_name': item['source_entity_name'], + 'target_entity_name': item['target_entity_name'], + 'relation_type': item.get('relation_type', '关联'), + 'fact': item.get('fact', ''), + 'valid_at': item.get('valid_at', ''), + 'invalid_at': item.get('invalid_at', ''), + 'evidence': item.get('evidence', ''), + 'qualifiers': item.get('qualifiers', []), + 'confidence': item.get('confidence', 0.0), + }) + return result + + +# ===== Step 4: 事实去重/矛盾检测 ===== + +def resolve_facts_against_graph( + new_fact: dict, + existing_facts: list[dict], + invalidation_candidates: list[dict], +) -> dict: + if not existing_facts: + return {'is_duplicate': False, 'is_contradicted': False, 'resolved': new_fact} + + context = { + 'new_fact': new_fact.get('fact', ''), + 'existing_facts': existing_facts, + 'invalidation_candidates': invalidation_candidates, + } + messages = prompt_dedupe_edges(context) + content = _call_llm(messages) + try: + data = _try_parse_json(content) + except Exception as exc: + logger.warning('Fact dedup failed, treating as new: %s', exc) + return {'is_duplicate': False, 'is_contradicted': False, 'resolved': new_fact} + + if not isinstance(data, dict): + return {'is_duplicate': False, 'is_contradicted': False, 'resolved': new_fact} + return { + 'is_duplicate': len(data.get('duplicate_facts', [])) > 0, + 'is_contradicted': len(data.get('contradicted_facts', [])) > 0, + 'resolved': new_fact, + 'duplicate_facts': data.get('duplicate_facts', []), + 'contradicted_facts': data.get('contradicted_facts', []), + } + + +# ===== Step 5: 实体摘要 ===== + +def extract_entity_summary( + entity_name: str, + episodes: list[str], + existing_summary: str = '', + previous_episodes: list[dict] | None = None, +) -> str: + context = { + 'entity_name': entity_name, + 'episodes': episodes, + 'existing_summary': existing_summary, + 'previous_episodes': previous_episodes or [], + } + messages = prompt_summarize(context) + content = _call_llm(messages, max_tokens=1024) + try: + data = _try_parse_json(content) + except Exception: + logger.warning('Failed to parse summary, using empty') + return '' + if isinstance(data, dict): + return data.get('summary', '') + return '' + + +# ===== 统一入口(兼容原有接口) ===== + +EXTRACTION_SYSTEM_PROMPT = """ +你是一个专业的会议知识抽取助手。你的任务是从中文会议记录中抽取结构化事实,尤其要抽出更细粒度、更有语义深度的关系。 + +输出要求: +1. 只输出一个 JSON 对象,不要输出解释文字。 +2. 关系抽取不要停留在"部门汇报了工作"这种浅层描述,要尽可能向下细化到: + - 责任归属 + - 目标值 / 当前值 / 趋势 + - 约束条件 + - 因果 / 影响 + - 时间要求 + - 依赖关系 + - 部署 / 决策 / 要求 / 风险 / 支撑关系 +3. 每条关系尽量同时给出: + - subject / predicate / object + - fact: 一句自然语言事实表述 + - qualifiers: 限定条件、范围、状态、数值、约束等 + - evidence: 原文中的关键短句或压缩证据 + - confidence: 0 到 1 之间 + - valid_at / invalid_at: 如果文中明确提到时间,可填写;否则留空 +4. 如果原文存在多个事实,不要只抽象概括,要拆成多条关系。 +5. 避免空泛关系词,优先使用更具体的谓词,例如: + - 负责 / 汇报 / 目标值 / 当前值 / 低于 / 高于 / 要求 / 督导 / 推进 / 影响 / 支撑 / 依赖 / 计划 / 完成 / 截止于 +""" + + +def extract_meeting_info(text: str, stream: bool = False) -> MeetingExtraction: + user_prompt = f""" +请从下面会议记录中提取结构化信息,并重点做"深层关系抽取"和"层次结构识别"。 + +输出 JSON 字段: +- title +- date +- participants +- agenda +- entities: name, entity_type, description + - entity_type 请使用: Department(部门)、Project(项目)、Metric(指标)、Person(人物)、System(系统)、Document(文档) +- relations: + - source_entity_name: 源实体名称 + - target_entity_name: 目标实体名称 + - relation_type: 关系类型(如 HAS_PROJECT、HAS_METRIC、负责、汇报、目标值、推进、依赖) + - fact: 一句自然语言事实描述 + - valid_at(可选) + - invalid_at(可选) + - evidence: 原文证据 + - qualifiers: 限定条件列表 + - confidence: 0~1 +- action_items: task, assignee, deadline, status, priority +- decisions: content, proposer, status +- metrics: metric_name, value, target, owner, trend, unit +- departments: [{{"name": "部门名称", "description": "", "projects": ["项目名1", "项目名2"]}}] +- summary + +层次关系规则: +1. Department 管辖 Project → relation_type 用 HAS_PROJECT +2. Project 拥有 Metric → relation_type 用 HAS_METRIC +3. 其他事实关系(负责、汇报、目标值等)直接用 relation_type 表达 + +关系抽取规则: +1. 不要只抽"汇报了工作"这种会议动作,要尽量继续下钻出具体事实。 +2. 如果一句话里同时包含"主体 + 指标 + 当前值 + 目标值 + 负责人 + 趋势",应拆成多条关系或在 qualifiers 中保留这些细节。 +3. 对于"要求、部署、负责、依赖、影响、约束、目标、风险"类信息优先保留。 +4. fact 必须是一句完整、自然、可检索的事实描述。 +5. qualifiers 用于补充数值、范围、状态、条件、截止时间、优先级等信息。 +6. evidence 用原文中的关键词短句,不要太长。 +7. confidence 取值 0 到 1。 + +会议记录如下: +{text} +""" + content = _call_llm([ + {'role': 'system', 'content': EXTRACTION_SYSTEM_PROMPT}, + {'role': 'user', 'content': user_prompt}, + ], stream=stream) + data = _try_parse_json(content) + data = _normalize_meeting_data(data) + return MeetingExtraction(**data) + + +def _normalize_meeting_data(data: dict) -> dict: + if not isinstance(data, dict): + return {} + return { + 'title': _as_str(data.get('title')), + 'date': _as_str(data.get('date')), + 'participants': _as_str_list(data.get('participants')), + 'agenda': _as_str_list(data.get('agenda')), + 'entities': _normalize_entities(data.get('entities')), + 'relations': _normalize_relations(data.get('relations')), + 'action_items': _normalize_action_items(data.get('action_items')), + 'decisions': _normalize_decisions(data.get('decisions')), + 'metrics': _normalize_metrics(data.get('metrics')), + 'departments': _normalize_departments(data.get('departments')), + 'summary': _as_str(data.get('summary')), + } + + +def _as_str(value) -> str: + if value is None: + return '' + if isinstance(value, str): + return value + return str(value) + + +def _as_float(value) -> float: + if value is None or value == '': + return 0.0 + try: + numeric = float(value) + return max(0.0, min(1.0, numeric)) + except (TypeError, ValueError): + return 0.0 + + +def _as_str_list(value) -> List[str]: + if isinstance(value, dict): + items = [] + for key, item in value.items(): + key_text = _as_str(key) + value_text = _as_str(item) + if key_text and value_text: + items.append(f'{key_text}: {value_text}') + elif key_text: + items.append(key_text) + elif value_text: + items.append(value_text) + return items + if not isinstance(value, list): + return [] + return [_as_str(item) for item in value if item is not None] + + +def _normalize_entities(value) -> List[dict]: + if not isinstance(value, list): + return [] + items = [] + for entity in value: + if not isinstance(entity, dict): + continue + items.append({ + 'name': _as_str(entity.get('name')), + 'entity_type': _as_str(entity.get('entity_type')), + 'description': _as_str(entity.get('description')), + }) + return items + + +def _normalize_relations(value) -> List[dict]: + if not isinstance(value, list): + return [] + items = [] + for relation in value: + if not isinstance(relation, dict): + continue + source = _as_str(relation.get('source_entity_name') or relation.get('subject', '')) + target = _as_str(relation.get('target_entity_name') or relation.get('object', '')) + rtype = _as_str(relation.get('relation_type') or relation.get('predicate', '')) + fact = _as_str(relation.get('fact')) + if not fact and source and rtype and target: + fact = f'{source} {rtype} {target}' + items.append({ + 'source_entity_name': source, + 'target_entity_name': target, + 'relation_type': rtype, + 'fact': fact, + 'qualifiers': _as_str_list(relation.get('qualifiers')), + 'evidence': _as_str(relation.get('evidence')), + 'confidence': _as_float(relation.get('confidence')), + 'valid_at': _as_str(relation.get('valid_at')), + 'invalid_at': _as_str(relation.get('invalid_at')), + }) + return items + + +def _normalize_action_items(value) -> List[dict]: + if not isinstance(value, list): + return [] + items = [] + for action in value: + if not isinstance(action, dict): + continue + items.append({ + 'task': _as_str(action.get('task')), + 'assignee': _as_str(action.get('assignee')), + 'deadline': _as_str(action.get('deadline')), + 'status': _as_str(action.get('status')) or '待办', + 'priority': _as_str(action.get('priority')) or '中', + }) + return items + + +def _normalize_decisions(value) -> List[dict]: + if not isinstance(value, list): + return [] + items = [] + for decision in value: + if not isinstance(decision, dict): + continue + items.append({ + 'content': _as_str(decision.get('content')), + 'proposer': _as_str(decision.get('proposer')), + 'status': _as_str(decision.get('status')) or '已决', + }) + return items + + +def _normalize_metrics(value) -> List[dict]: + if not isinstance(value, list): + return [] + items = [] + for metric in value: + if not isinstance(metric, dict): + continue + items.append({ + 'metric_name': _as_str(metric.get('metric_name')), + 'value': _as_str(metric.get('value')), + 'target': _as_str(metric.get('target')), + 'owner': _as_str(metric.get('owner')), + 'trend': _as_str(metric.get('trend')), + 'unit': _as_str(metric.get('unit')), + }) + return items + + +def _normalize_departments(value) -> List[dict]: + if not isinstance(value, list): + return [] + items = [] + for dept in value: + if not isinstance(dept, dict): + continue + name = _as_str(dept.get('name')) + if not name: + continue + items.append({ + 'name': name, + 'description': _as_str(dept.get('description')), + 'projects': _as_str_list(dept.get('projects')), + }) + return items diff --git a/meeting_memory/graph_store.py b/meeting_memory/graph_store.py new file mode 100644 index 0000000..e01577a --- /dev/null +++ b/meeting_memory/graph_store.py @@ -0,0 +1,1127 @@ +import hashlib +import json +import logging +import time +from datetime import datetime, timezone +from enum import Enum +from typing import Any, Dict, List, Optional + +from meeting_memory.config import config +from meeting_memory.dedup_helpers import normalize_name, shannon_entropy +from meeting_memory.search_utils import ( + reciprocal_rank_fusion, + _cosine_similarity, + _keyword_score, + _keyword_terms, + search_bfs, +) +from meeting_memory.services.embedding_service import embedding_service + +logger = logging.getLogger(__name__) + + +class _EntityType(str, Enum): + DEPARTMENT = 'Department' + PROJECT = 'Project' + METRIC = 'Metric' + PERSON = 'Person' + SYSTEM = 'System' + DOCUMENT = 'Document' + PARTICIPANT = 'participant' + UNKNOWN = 'Unknown' + + +_ENTITY_TYPE_ALIASES = { + '组织': 'Department', + 'organization': 'Department', + '部门': 'Department', + '指标': 'Metric', + 'kpi': 'Metric', + '项目': 'Project', +} + + +def _canonical_entity_type(raw: str) -> str: + normalized = raw.strip() + if normalized in _ENTITY_TYPE_ALIASES: + return _ENTITY_TYPE_ALIASES[normalized] + for member in _EntityType: + if member.value.lower() == normalized.lower(): + return member.value + return _EntityType.UNKNOWN.value + + +def _neo4j_labels(entity_type: str) -> list[str]: + canonical = _canonical_entity_type(entity_type) + labels = ['Entity'] + if canonical != _EntityType.UNKNOWN.value: + labels.append(canonical) + return labels + + +class Neo4jGraphStore: + def __init__(self): + self._driver = None + self._enabled = False + self._uri = config.neo4j.uri + self._last_failure_at = 0.0 + self._retry_cooldown_seconds = 10.0 + self._connect() + + def _connect(self): + if not config.neo4j.enabled: + logger.info('Neo4j graph store disabled') + return + try: + from neo4j import GraphDatabase + except ImportError: + logger.warning('neo4j package is not installed') + return + if not config.neo4j.password: + logger.warning('Neo4j is enabled but NEO4J_PASSWORD is empty') + return + + tried_uris = [self._uri] + if self._uri.startswith('neo4j://'): + tried_uris.append('bolt://' + self._uri[len('neo4j://'):]) + + for uri in tried_uris: + driver = None + try: + driver = GraphDatabase.driver( + uri, + auth=(config.neo4j.user, config.neo4j.password), + ) + driver.verify_connectivity() + self._driver = driver + self._uri = uri + self._enabled = True + self._last_failure_at = 0.0 + if uri != config.neo4j.uri: + logger.warning('Neo4j routing URI unavailable; fell back to %s', uri) + return + except Exception as exc: + logger.warning('Neo4j connection failed for %s: %s', uri, exc) + try: + driver.close() + except Exception: + pass + self._mark_unavailable('Neo4j is currently unreachable') + + @property + def enabled(self) -> bool: + if not self._enabled and self._should_retry_connect(): + self._connect() + return self._enabled and self._driver is not None + + def _should_retry_connect(self) -> bool: + return (time.time() - self._last_failure_at) >= self._retry_cooldown_seconds + + def _mark_unavailable(self, reason: str = '') -> None: + if reason: + logger.warning('Neo4j temporarily disabled: %s', reason) + self._enabled = False + self._last_failure_at = time.time() + if self._driver is not None: + try: + self._driver.close() + except Exception: + pass + self._driver = None + + @staticmethod + def meeting_id(meeting_data: dict) -> str: + title = meeting_data.get('title', '') + date = meeting_data.get('date', '') + raw = f'{date}_{title}' + return f'meeting_{hashlib.md5(raw.encode("utf-8")).hexdigest()[:12]}' + + def close(self): + if self._driver is not None: + self._driver.close() + + def run_query(self, query: str, **params) -> List[Dict[str, Any]]: + if not self.enabled: + return [] + try: + with self._driver.session(database=config.neo4j.database) as session: + result = session.run(query, **params) + return [record.data() for record in result] + except Exception as exc: + logger.warning('Neo4j query failed: %s', exc) + self._mark_unavailable(str(exc)) + return [] + + def _try_query(self, query: str, **params) -> List[Dict[str, Any]]: + if not self._enabled or self._driver is None: + return [] + try: + with self._driver.session(database=config.neo4j.database) as session: + result = session.run(query, **params) + return [record.data() for record in result] + except Exception as exc: + logger.debug('Optional query failed (non-fatal): %s', exc) + return [] + + def initialize_schema(self): + if not self.enabled: + return + statements = [ + 'CREATE CONSTRAINT meeting_id IF NOT EXISTS FOR (m:Meeting) REQUIRE m.meeting_id IS UNIQUE', + 'CREATE CONSTRAINT episode_id IF NOT EXISTS FOR (e:Episode) REQUIRE e.episode_id IS UNIQUE', + 'CREATE CONSTRAINT entity_name IF NOT EXISTS FOR (e:Entity) REQUIRE e.name IS UNIQUE', + 'CREATE INDEX meeting_title IF NOT EXISTS FOR (m:Meeting) ON (m.title)', + 'CREATE INDEX episode_title IF NOT EXISTS FOR (e:Episode) ON (e.title)', + 'CREATE INDEX entity_type IF NOT EXISTS FOR (e:Entity) ON (e.entity_type)', + 'CREATE INDEX relates_to_name IF NOT EXISTS FOR ()-[r:RELATES_TO]-() ON (r.name)', + 'CREATE INDEX relates_to_fact IF NOT EXISTS FOR ()-[r:RELATES_TO]-() ON (r.fact)', + ] + for statement in statements: + self.run_query(statement) + self._create_fulltext_indexes() + + def _create_fulltext_indexes(self): + queries = [ + 'CREATE FULLTEXT INDEX entity_fulltext IF NOT EXISTS FOR (n:Entity) ON EACH [n.name, n.summary]', + 'CREATE FULLTEXT INDEX episode_fulltext IF NOT EXISTS FOR (n:Episode) ON EACH [n.title, n.content, n.summary]', + 'CREATE FULLTEXT INDEX meeting_fulltext IF NOT EXISTS FOR (n:Meeting) ON EACH [n.title, n.summary]', + ] + for q in queries: + try: + self.run_query(q) + except Exception as exc: + logger.debug('Fulltext index may already exist: %s', exc) + + def get_stats(self) -> Dict[str, Any]: + if not self.enabled: + return {'enabled': False} + rows = self.run_query(''' + CALL { MATCH (m:Meeting) RETURN count(m) AS meetings } + CALL { MATCH (ep:Episode) RETURN count(ep) AS episodes } + CALL { MATCH (e:Entity) RETURN count(e) AS entities } + CALL { MATCH ()-[r:RELATES_TO]->() RETURN count(r) AS relations } + CALL { MATCH (d:Department) RETURN count(d) AS departments } + CALL { MATCH (p:Project) RETURN count(p) AS projects } + CALL { MATCH (m:Metric) RETURN count(m) AS metrics } + RETURN meetings, episodes, entities, relations, departments, projects, metrics + ''') + if not rows: + return {'enabled': False} + return {'enabled': True, **rows[0]} + + # ==================== Entity Dedup (from Graphiti) ==================== + + def find_similar_entities( + self, name: str, threshold: float = 0.6, limit: int = 15 + ) -> List[Dict[str, Any]]: + if not self.enabled or not name.strip(): + return [] + query_embedding = embedding_service.embed_text(name) + rows = self.run_query(''' + MATCH (e:Entity) + RETURN e.name AS name, + e.entity_type AS entity_type, + e.summary AS summary, + e.description AS description, + e.name_embedding AS name_embedding + ''') + scored = [] + for row in rows: + score = _cosine_similarity(query_embedding, row.get('name_embedding', [])) + if score >= threshold: + scored.append({ + 'candidate_id': len(scored), + 'name': row.get('name', ''), + 'entity_type': row.get('entity_type', ''), + 'summary': row.get('summary', '') or row.get('description', ''), + 'score': score, + }) + scored.sort(key=lambda r: r['score'], reverse=True) + return scored[:limit] + + def get_entities_map(self) -> Dict[str, Dict[str, Any]]: + rows = self.run_query(''' + MATCH (e:Entity) + RETURN e.name AS name, + e.entity_type AS entity_type, + e.summary AS summary, + e.description AS description + ''') + return {r['name']: r for r in rows if r.get('name')} + + # ==================== Edge Dedup / Resolution (from Graphiti) ==================== + + def get_facts_between(self, source_name: str, target_name: str) -> List[Dict[str, Any]]: + return self.run_query(''' + MATCH (s:Entity {name: $source_name})-[r:RELATES_TO]->(t:Entity {name: $target_name}) + RETURN r.name AS relation_type, + r.fact AS fact, + r.qualifiers AS qualifiers, + r.confidence AS confidence, + r.valid_at AS valid_at, + r.invalid_at AS invalid_at, + r.expired_at AS expired_at, + r.meeting_id AS meeting_id + ORDER BY coalesce(r.valid_at, '') DESC + ''', source_name=source_name, target_name=target_name) + + def search_related_facts( + self, fact_text: str, group_id: str = '', limit: int = 10 + ) -> List[Dict[str, Any]]: + if not fact_text.strip(): + return [] + query_embedding = embedding_service.embed_text(fact_text) + rows = self.run_query(''' + MATCH (s:Entity)-[r:RELATES_TO]->(t:Entity) + RETURN r.fact AS fact, + r.name AS relation_type, + r.fact_embedding AS fact_embedding, + r.valid_at AS valid_at, + r.invalid_at AS invalid_at, + r.expired_at AS expired_at, + s.name AS source_name, + t.name AS target_name + ''') + scored = [] + for row in rows: + score = _cosine_similarity(query_embedding, row.get('fact_embedding', [])) + if score > 0.3: + scored.append({ + 'fact': row.get('fact', ''), + 'relation_type': row.get('relation_type', ''), + 'source_name': row.get('source_name', ''), + 'target_name': row.get('target_name', ''), + 'valid_at': row.get('valid_at', ''), + 'invalid_at': row.get('invalid_at', ''), + 'expired_at': row.get('expired_at', ''), + 'score': score, + }) + scored.sort(key=lambda r: r['score'], reverse=True) + return scored[:limit] + + # ==================== BM25 Fulltext Search (from Graphiti) ==================== + + def _bm25_fact_search(self, search_text: str, limit: int = 10) -> List[Dict[str, Any]]: + if not self.enabled or not search_text.strip(): + return [] + rows = self._try_query(''' + CALL db.index.fulltext.queryNodes('entity_fulltext', $q) + YIELD node, score + MATCH (node)-[r:RELATES_TO]->(t:Entity) + RETURN r.fact AS fact, + r.name AS relation_type, + r.fact_embedding AS fact_embedding, + node.name AS source_name, + t.name AS target_name, + r.valid_at AS valid_at, + r.invalid_at AS invalid_at, + r.expired_at AS expired_at, + score AS bm25_score + ORDER BY bm25_score DESC + LIMIT $max_results + ''', q=search_text, max_results=limit) + return [ + {**row, 'bm25_score': round(row.get('bm25_score', 0), 4)} + for row in rows + ] + + def _bm25_entity_search(self, search_text: str, limit: int = 10) -> List[Dict[str, Any]]: + if not self.enabled or not search_text.strip(): + return [] + rows = self._try_query(''' + CALL db.index.fulltext.queryNodes('entity_fulltext', $q) + YIELD node, score + RETURN node.name AS title, + coalesce(node.summary, node.description, '') AS text, + score AS bm25_score, + node.name_embedding AS embedding + ORDER BY bm25_score DESC + LIMIT $max_results + ''', q=search_text, max_results=limit) + return [ + {**row, 'bm25_score': round(row.get('bm25_score', 0), 4), 'kind': 'entity'} + for row in rows + ] + + def _bm25_episode_search(self, search_text: str, limit: int = 5) -> List[Dict[str, Any]]: + if not self.enabled or not search_text.strip(): + return [] + rows = self._try_query(''' + CALL db.index.fulltext.queryNodes('episode_fulltext', $q) + YIELD node, score + MATCH (m:Meeting)-[:HAS_EPISODE]->(node) + RETURN m.title AS title, + coalesce(node.summary, node.content, '') AS text, + node.date AS date, + m.title AS meeting_title, + score AS bm25_score, + node.content_embedding AS embedding + ORDER BY bm25_score DESC + LIMIT $max_results + ''', q=search_text, max_results=limit) + return [ + {**row, 'bm25_score': round(row.get('bm25_score', 0), 4), 'kind': 'episode'} + for row in rows + ] + + # ==================== BFS Graph Traversal (from Graphiti) ==================== + + def get_edges_for_entity(self, entity_name: str) -> List[Dict[str, Any]]: + if not self.enabled or not entity_name: + return [] + return self.run_query(''' + MATCH (s:Entity {name: $name})-[r:RELATES_TO]->(t:Entity) + RETURN s.name AS source_name, + t.name AS target_name, + r.name AS relation_type, + r.fact AS fact, + r.confidence AS confidence + UNION + MATCH (s:Entity)-[r:RELATES_TO]->(t:Entity {name: $name}) + RETURN s.name AS source_name, + t.name AS target_name, + r.name AS relation_type, + r.fact AS fact, + r.confidence AS confidence + ''', name=entity_name) + + def _bfs_search(self, question: str, max_depth: int = 2, limit: int = 20) -> List[Dict[str, Any]]: + if not self.enabled or not question.strip(): + return [] + seed_rows = self._try_query(''' + CALL db.index.fulltext.queryNodes('entity_fulltext', $q) + YIELD node, score + RETURN node.name AS name + ORDER BY score DESC + LIMIT 5 + ''', q=question) + if not seed_rows: + seed_rows = self._try_query(''' + MATCH (e:Entity) + WHERE toLower(e.name) CONTAINS toLower($q) + OR toLower(coalesce(e.summary, '')) CONTAINS toLower($q) + RETURN e.name AS name + LIMIT 5 + ''', q=question) + + seed_names = [r['name'] for r in seed_rows if r.get('name')] + if not seed_names: + return [] + + result = search_bfs( + seed_names, + get_edges_fn=self.get_edges_for_entity, + max_depth=max_depth, + max_nodes=min(limit * 2, 50), + ) + + items = [] + for node_name in result['nodes']: + items.append({ + 'kind': 'entity_bfs', + 'title': node_name, + 'text': node_name, + 'date': '', + 'meeting_title': '', + 'score': 1.0 / (1 + result['nodes'].index(node_name)), + }) + for edge in result['edges'][:limit]: + fact = edge.get('fact', '') + items.append({ + 'kind': 'fact_bfs', + 'title': f"{edge.get('source_name', '')} -[{edge.get('relation_type', '')}]-> {edge.get('target_name', '')}", + 'text': fact, + 'date': '', + 'meeting_title': '', + 'score': 0.5, + }) + return items + + # ==================== Hybrid Edge Candidate Search (from Graphiti) ==================== + + def search_related_facts_hybrid( + self, fact_text: str, limit: int = 10 + ) -> List[Dict[str, Any]]: + if not fact_text.strip(): + return [] + + bm25_results = self._bm25_fact_search(fact_text, limit=limit) + vector_results = self.search_related_facts(fact_text, limit=limit) + + for r in bm25_results: + r['_source'] = 'bm25' + for r in vector_results: + r['_source'] = 'vector' + + fused = reciprocal_rank_fusion( + [bm25_results, vector_results], + k=60, + key='fact', + ) + return fused[:limit] + + def mark_relation_expired(self, source_name: str, target_name: str, relation_type: str, expired_at: str | None = None): + if not expired_at: + expired_at = datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ') + self.run_query(''' + MATCH (s:Entity {name: $source_name})-[r:RELATES_TO {name: $relation_type}]->(t:Entity {name: $target_name}) + SET r.expired_at = $expired_at, + r.invalid_at = $expired_at, + r.updated_at = datetime() + ''', source_name=source_name, target_name=target_name, relation_type=relation_type, expired_at=expired_at) + + # ==================== Core Write Operations ==================== + + def upsert_meeting_subgraph(self, meeting_data: dict) -> None: + if not self.enabled: + return + meeting_id = meeting_data.get('_graph_meeting_id') or self.meeting_id(meeting_data) + episode_text = self._build_episode_text(meeting_data) + episode_embedding = embedding_service.embed_text(episode_text) + + self.initialize_schema() + self.run_query(''' + MERGE (m:Meeting {meeting_id: $meeting_id}) + SET m.title = $title, + m.date = $date, + m.summary = $summary, + m.content_hash = $content_hash, + m.raw_path = $raw_path, + m.updated_at = datetime() + MERGE (ep:Episode {episode_id: $meeting_id}) + SET ep.title = $title, + ep.date = $date, + ep.summary = $summary, + ep.content = $content, + ep.content_hash = $content_hash, + ep.raw_path = $raw_path, + ep.participants = $participants, + ep.content_embedding = $content_embedding, + ep.updated_at = datetime() + MERGE (m)-[:HAS_EPISODE]->(ep) + ''', + meeting_id=meeting_id, + title=meeting_data.get('title', ''), + date=meeting_data.get('date', ''), + summary=meeting_data.get('summary', ''), + content_hash=meeting_data.get('_content_hash', ''), + raw_path=meeting_data.get('_original_text_path', ''), + content=episode_text, + participants=meeting_data.get('participants', []), + content_embedding=episode_embedding, + ) + + for entity in meeting_data.get('entities', []): + self._upsert_entity(meeting_id, entity) + + for participant in meeting_data.get('participants', []): + self._upsert_entity( + meeting_id, + {'name': participant, 'entity_type': 'participant', 'description': ''}, + ) + + for relation in meeting_data.get('relations', []): + self._upsert_direct_edge(meeting_id, relation, meeting_data.get('date', '')) + + self._upsert_hierarchy(meeting_id, meeting_data) + + for metric in meeting_data.get('metrics', []): + self._upsert_metric_node(meeting_id, metric, meeting_data.get('date', '')) + + def _upsert_entity(self, meeting_id: str, entity: dict) -> None: + name = entity.get('name', '').strip() + if not name: + return + raw_type = entity.get('entity_type', '').strip() + labels = _neo4j_labels(raw_type) + summary = self._entity_summary(entity) + name_embedding = embedding_service.embed_text(summary or name) + set_labels = ' SET ' + ', '.join(f'e:{label}' for label in labels[1:]) if len(labels) > 1 else '' + self.run_query(f''' + MATCH (:Meeting {{meeting_id: $meeting_id}})-[:HAS_EPISODE]->(ep:Episode {{episode_id: $meeting_id}}) + MERGE (e:Entity {{name: $name}}) + {set_labels} + SET e.entity_type = CASE + WHEN $type <> '' THEN $type + ELSE coalesce(e.entity_type, '') + END, + e.description = CASE + WHEN $description <> '' THEN $description + ELSE coalesce(e.description, '') + END, + e.summary = CASE + WHEN $summary <> '' THEN $summary + ELSE coalesce(e.summary, '') + END, + e.name_embedding = CASE + WHEN size($name_embedding) > 0 THEN $name_embedding + ELSE coalesce(e.name_embedding, []) + END, + e.updated_at = datetime() + MERGE (ep)-[:MENTIONS]->(e) + ''', + meeting_id=meeting_id, + name=name, + type=raw_type, + description=entity.get('description', ''), + summary=summary, + name_embedding=name_embedding, + ) + + def _upsert_direct_edge(self, meeting_id: str, relation: dict, meeting_date: str) -> None: + source = relation.get('source_entity_name', '').strip() + target = relation.get('target_entity_name', '').strip() + rtype = relation.get('relation_type', '').strip() + if not source or not target or not rtype: + return + + self._upsert_entity( + meeting_id, + {'name': source, 'entity_type': '', 'description': ''}, + ) + self._upsert_entity( + meeting_id, + {'name': target, 'entity_type': '', 'description': ''}, + ) + + fact_text = self._relation_text(relation) + fact_embedding = embedding_service.embed_text(fact_text) + + self.run_query(''' + MATCH (s:Entity {name: $source}) + MATCH (t:Entity {name: $target}) + MERGE (s)-[r:RELATES_TO {name: $rtype}]->(t) + SET r.fact = $fact, + r.fact_embedding = $fact_embedding, + r.evidence = $evidence, + r.qualifiers = $qualifiers, + r.confidence = $confidence, + r.valid_at = $valid_at, + r.invalid_at = $invalid_at, + r.meeting_id = $meeting_id, + r.meeting_date = $meeting_date, + r.updated_at = datetime() + ''', + meeting_id=meeting_id, + source=source, + target=target, + rtype=rtype, + fact=fact_text, + fact_embedding=fact_embedding, + evidence=relation.get('evidence', ''), + qualifiers=relation.get('qualifiers', []), + confidence=relation.get('confidence', 0.0), + valid_at=relation.get('valid_at', ''), + invalid_at=relation.get('invalid_at', ''), + meeting_date=meeting_date, + ) + + def _upsert_hierarchy(self, meeting_id: str, meeting_data: dict) -> None: + entities_map = {e['name']: e for e in meeting_data.get('entities', []) if e.get('name')} + + for rel in meeting_data.get('relations', []): + rtype = rel.get('relation_type', '') + if rtype not in ('HAS_PROJECT', 'HAS_METRIC', 'PART_OF'): + continue + source = rel.get('source_entity_name', '') + target = rel.get('target_entity_name', '') + if not source or not target: + continue + + if rtype == 'HAS_PROJECT' or rtype == 'PART_OF': + self.run_query(''' + MATCH (s:Entity {name: $source}) + MATCH (t:Entity {name: $target}) + MERGE (s)-[r:HAS_PROJECT]->(t) + SET r.updated_at = datetime(), + r.meeting_id = $meeting_id + ''', source=source, target=target, meeting_id=meeting_id) + elif rtype == 'HAS_METRIC': + self.run_query(''' + MATCH (s:Entity {name: $source}) + MATCH (t:Entity {name: $target}) + MERGE (s)-[r:HAS_METRIC]->(t) + SET r.updated_at = datetime(), + r.meeting_id = $meeting_id + ''', source=source, target=target, meeting_id=meeting_id) + + departments = meeting_data.get('departments', []) + for dept in departments: + dept_name = dept.get('name', '').strip() + if not dept_name or dept_name not in entities_map: + continue + for proj_name in dept.get('projects', []): + if proj_name in entities_map: + self.run_query(''' + MATCH (s:Entity {name: $source}) + MATCH (t:Entity {name: $target}) + MERGE (s)-[r:HAS_PROJECT]->(t) + SET r.updated_at = datetime(), + r.meeting_id = $meeting_id + ''', source=dept_name, target=proj_name, meeting_id=meeting_id) + + def _upsert_metric_node(self, meeting_id: str, metric: dict, meeting_date: str) -> None: + name = metric.get('metric_name', '').strip() + if not name: + return + entity = { + 'name': name, + 'entity_type': 'Metric', + 'description': f"{metric.get('value', '')} ({metric.get('unit', '')})" if metric.get('unit') else metric.get('value', ''), + } + self._upsert_entity(meeting_id, entity) + + self.run_query(''' + MATCH (e:Entity {name: $name}) + SET e.current_value = $value, + e.target = $target, + e.trend = $trend, + e.unit = $unit, + e.owner = $owner, + e.updated_at = datetime() + ''', + name=name, + value=metric.get('value', ''), + target=metric.get('target', ''), + trend=metric.get('trend', ''), + unit=metric.get('unit', ''), + owner=metric.get('owner', ''), + ) + + def remove_meeting_subgraph(self, meeting_id: str) -> None: + if not self.enabled: + return + # Phase 1: detach all entities mentioned by this episode + self.run_query(''' + MATCH (m:Meeting {meeting_id: $meeting_id})-[:HAS_EPISODE]->(ep:Episode) + OPTIONAL MATCH (ep)-[mention:MENTIONS]->(entity:Entity) + OPTIONAL MATCH (entity)-[er]-() + DELETE mention, er + ''', meeting_id=meeting_id) + # Phase 2: delete orphan entities no longer mentioned by any episode + self.run_query(''' + MATCH (entity:Entity) + WHERE NOT (entity)<-[:MENTIONS]-(:Episode) + DETACH DELETE entity + ''') + # Phase 3: delete episode and meeting + self.run_query(''' + MATCH (m:Meeting {meeting_id: $meeting_id})-[:HAS_EPISODE]->(ep:Episode) + DETACH DELETE ep, m + ''', meeting_id=meeting_id) + + # ==================== Retrieval ==================== + + def get_meeting(self, title: str, date: str = '') -> Optional[Dict[str, Any]]: + if not self.enabled: + return None + rows = self.run_query(''' + MATCH (m:Meeting) + WHERE m.title = $title AND ($date = '' OR m.date = $date) + RETURN m.meeting_id AS meeting_id, m.title AS title, m.date AS date, + m.summary AS summary, m.content_hash AS content_hash + LIMIT 1 + ''', title=title, date=date) + return rows[0] if rows else None + + def find_similar_episode(self, text: str, threshold: float = 0.92) -> Optional[Dict[str, Any]]: + if not self.enabled or not text.strip(): + return None + query_embedding = embedding_service.embed_text(text) + rows = self.run_query(''' + MATCH (m:Meeting)-[:HAS_EPISODE]->(ep:Episode) + RETURN m.meeting_id AS meeting_id, m.title AS title, m.date AS date, + m.content_hash AS content_hash, ep.content_embedding AS content_embedding + ''') + best_match = None + for row in rows: + score = _cosine_similarity(query_embedding, row.get('content_embedding', [])) + if score >= threshold and (best_match is None or score > best_match['score']): + best_match = { + 'metadata': { + 'meeting_id': row.get('meeting_id', ''), + 'title': row.get('title', ''), + 'date': row.get('date', ''), + 'content_hash': row.get('content_hash', ''), + }, + 'score': score, + } + return best_match + + def hybrid_search(self, question: str, limit: int = 5) -> List[Dict[str, Any]]: + if not self.enabled or not question.strip(): + return [] + query_embedding = embedding_service.embed_text(question) + + all_results: List[Dict[str, Any]] = [] + sources = [] + + for fact_row in self._load_fact_candidates(): + fact_row['_semantic'] = _cosine_similarity(query_embedding, fact_row.get('embedding', [])) + all_results.append(fact_row) + sources.append(self._rank_fact_candidates(question, all_results)) + + entity_candidates = [] + for ent_row in self._load_entity_candidates(): + ent_row['_semantic'] = _cosine_similarity(query_embedding, ent_row.get('embedding', [])) + entity_candidates.append(ent_row) + sources.append(self._rank_entity_candidates(question, entity_candidates)) + + episode_candidates = [] + for ep_row in self._load_episode_candidates(): + ep_row['_semantic'] = _cosine_similarity(query_embedding, ep_row.get('embedding', [])) + episode_candidates.append(ep_row) + sources.append(self._rank_episode_candidates(question, episode_candidates)) + + bm25_facts = self._bm25_fact_search(question, limit=limit) + if bm25_facts: + sources.append(bm25_facts) + bm25_entities = self._bm25_entity_search(question, limit=limit) + if bm25_entities: + sources.append(bm25_entities) + bm25_episodes = self._bm25_episode_search(question, limit=limit) + if bm25_episodes: + sources.append(bm25_episodes) + + bfs_results = self._bfs_search(question, max_depth=2, limit=limit) + if bfs_results: + sources.append(bfs_results) + + fused = reciprocal_rank_fusion([s for s in sources if s], k=60, key='text') + + for item in fused: + combined_text = ' '.join([ + str(item.get('title') or ''), + str(item.get('text') or ''), + str(item.get('meeting_title') or ''), + str(item.get('date') or ''), + ]) + semantic = item.get('_semantic', _cosine_similarity(query_embedding, item.get('embedding', []))) + lexical = _keyword_score(combined_text, question) + item['semantic_score'] = round(semantic, 4) + item['keyword_score'] = round(lexical, 4) + + return fused[:limit] + + def _rank_fact_candidates(self, question: str, candidates: List[Dict]) -> List[Dict]: + scored = [] + for item in candidates: + combined_text = ' '.join(filter(None, [item.get('title', ''), item.get('text', ''), item.get('meeting_title', ''), item.get('date', '')])) + semantic = item.get('_semantic', 0) + lexical = _keyword_score(combined_text, question) + score = semantic * 0.7 + lexical * 0.2 + 0.1 + if score > 0: + item['_score'] = round(score, 4) + scored.append(item) + scored.sort(key=lambda r: r['_score'], reverse=True) + return scored + + def _rank_entity_candidates(self, question: str, candidates: List[Dict]) -> List[Dict]: + scored = [] + for item in candidates: + combined_text = ' '.join(filter(None, [item.get('title', ''), item.get('text', '')])) + semantic = item.get('_semantic', 0) + lexical = _keyword_score(combined_text, question) + score = semantic * 0.7 + lexical * 0.2 + 0.05 + if score > 0: + item['_score'] = round(score, 4) + scored.append(item) + scored.sort(key=lambda r: r['_score'], reverse=True) + return scored + + def _rank_episode_candidates(self, question: str, candidates: List[Dict]) -> List[Dict]: + scored = [] + for item in candidates: + combined_text = ' '.join(filter(None, [item.get('title', ''), item.get('text', ''), item.get('meeting_title', '')])) + semantic = item.get('_semantic', 0) + lexical = _keyword_score(combined_text, question) + score = semantic * 0.7 + lexical * 0.2 + 0.05 + if score > 0: + item['_score'] = round(score, 4) + scored.append(item) + scored.sort(key=lambda r: r['_score'], reverse=True) + return scored + + def search_facts(self, question: str, limit: int = 5) -> List[Dict[str, Any]]: + return self.hybrid_search(question, limit=limit) + + def get_graph_kinds(self) -> List[Dict[str, Any]]: + if not self.enabled: + return [] + return self.run_query(''' + MATCH (n) + WHERE n:Meeting OR n:Episode OR n:Entity OR n:Department OR n:Project OR n:Metric + WITH [lbl IN labels(n) WHERE lbl IN ['Meeting','Episode','Entity','Department','Project','Metric']][0] AS kind + RETURN kind, count(*) AS count ORDER BY count DESC + ''') + + def get_entity_types(self) -> List[Dict[str, Any]]: + if not self.enabled: + return [] + rows = self.run_query(''' + MATCH (e:Entity) + WHERE coalesce(e.entity_type, '') <> '' + RETURN e.entity_type AS entity_type, count(*) AS count ORDER BY count DESC + ''') + return [ + {'entity_type': _canonical_entity_type(r.get('entity_type', '')), 'count': r.get('count', 0)} + for r in rows + ] + + def get_graph_snapshot( + self, + query: str = '', + entity_types: Optional[List[str]] = None, + kinds: Optional[List[str]] = None, + limit_nodes: int = 80, + limit_edges: int = 160, + ) -> Dict[str, Any]: + if not self.enabled: + return {'nodes': [], 'edges': [], 'stats': {'enabled': False}} + + keyword_terms = _keyword_terms(query) if query else [] + raw_nodes = self.run_query(f''' + MATCH (n) + WHERE (n:Meeting OR n:Episode OR n:Entity) + AND ($kinds = [] OR any(lbl IN labels(n) WHERE lbl IN $kinds)) + AND ($terms = [] + OR (n:Meeting AND any(t IN $terms WHERE toLower(coalesce(n.title,'')) CONTAINS t OR toLower(coalesce(n.summary,'')) CONTAINS t)) + OR (n:Episode AND any(t IN $terms WHERE toLower(coalesce(n.title,'')) CONTAINS t OR toLower(coalesce(n.content,'')) CONTAINS t)) + OR (n:Entity AND any(t IN $terms WHERE toLower(coalesce(n.name,'')) CONTAINS t OR toLower(coalesce(n.summary,'')) CONTAINS t OR toLower(coalesce(n.description,'')) CONTAINS t)) + ) + AND ($types = [] OR NOT n:Entity OR coalesce(n.entity_type, '') IN $types) + OPTIONAL MATCH (n)-[r]-() + RETURN n.meeting_id AS meeting_id, + n.episode_id AS episode_id, + n.name AS entity_name, + n.title AS title, + n.summary AS summary, + n.date AS date, + n.entity_type AS entity_type, + n.description AS description, + n.meeting_date AS meeting_date, + n.current_value AS current_value, + n.target AS target_value, + n.trend AS trend, + CASE + WHEN n:Meeting THEN 'Meeting' + WHEN n:Episode THEN 'Episode' + WHEN n:Department THEN 'Department' + WHEN n:Project THEN 'Project' + WHEN n:Metric THEN 'Metric' + ELSE 'Entity' + END AS kind, + count(DISTINCT r) AS degree + ORDER BY degree DESC, coalesce(n.title, n.name) ASC + LIMIT $limit_nodes + ''', + terms=keyword_terms, + types=entity_types or [], + kinds=kinds or [], + limit_nodes=limit_nodes, + ) + if not raw_nodes: + return {'nodes': [], 'edges': [], 'stats': self.get_stats()} + + all_raw_ids = set() + nodes = [] + for row in raw_nodes: + kind = row.get('kind', '') + if kind == 'Meeting': + raw_id = row.get('meeting_id', '') + label = row.get('title', '') or raw_id + elif kind == 'Episode': + raw_id = row.get('episode_id', '') + label = row.get('title', '') or raw_id + elif kind in ('Entity', 'Department', 'Project', 'Metric'): + raw_id = row.get('entity_name', '') + label = raw_id + else: + continue + if not raw_id: + continue + nid = f'{kind}:{raw_id}' + all_raw_ids.add(raw_id) + node = { + 'id': nid, + 'label': label, + 'kind': kind, + 'entity_type': row.get('entity_type', '') if kind in ('Entity', 'Department', 'Project', 'Metric') else '', + 'description': row.get('description', '') or row.get('summary', '') or '', + 'date': row.get('date', '') or row.get('meeting_date', '') or '', + 'degree': row.get('degree', 0), + 'summary': row.get('summary', '') or '', + } + if kind == 'Metric': + node['current_value'] = row.get('current_value', '') + node['target'] = row.get('target_value', '') + node['trend'] = row.get('trend', '') + nodes.append(node) + + if not nodes: + return {'nodes': [], 'edges': [], 'stats': self.get_stats()} + + ids_list = list(all_raw_ids) + edges_raw = self.run_query(''' + MATCH (s)-[r]->(t) + WHERE type(r) IN ['HAS_EPISODE','MENTIONS','RELATES_TO','HAS_PROJECT','HAS_METRIC'] + AND ( + (s:Meeting AND s.meeting_id IN $ids) + OR (s:Episode AND s.episode_id IN $ids) + OR (s:Entity AND s.name IN $ids) + ) + AND ( + (t:Meeting AND t.meeting_id IN $ids) + OR (t:Episode AND t.episode_id IN $ids) + OR (t:Entity AND t.name IN $ids) + ) + RETURN type(r) AS predicate, + r.name AS relation_name, + r.fact AS relation_fact, + r.confidence AS relation_confidence, + r.meeting_date AS relation_date, + r.meeting_id AS relation_meeting_id, + CASE WHEN s:Meeting THEN s.meeting_id + WHEN s:Episode THEN s.episode_id + WHEN s:Entity THEN s.name END AS source_raw, + CASE WHEN t:Meeting THEN t.meeting_id + WHEN t:Episode THEN t.episode_id + WHEN t:Entity THEN t.name END AS target_raw, + CASE WHEN s:Meeting THEN 'Meeting' WHEN s:Episode THEN 'Episode' + WHEN s:Department THEN 'Department' + WHEN s:Project THEN 'Project' + WHEN s:Metric THEN 'Metric' + WHEN s:Entity THEN 'Entity' END AS source_kind, + CASE WHEN t:Meeting THEN 'Meeting' WHEN t:Episode THEN 'Episode' + WHEN t:Department THEN 'Department' + WHEN t:Project THEN 'Project' + WHEN t:Metric THEN 'Metric' + WHEN t:Entity THEN 'Entity' END AS target_kind + LIMIT $limit_edges + ''', ids=list(all_raw_ids), limit_edges=limit_edges) + + degree_map: Dict[str, int] = {} + for row in edges_raw: + src = row.get('source_raw', '') + tgt = row.get('target_raw', '') + sk = row.get('source_kind', '') + tk = row.get('target_kind', '') + if sk and src: + degree_map[f'{sk}:{src}'] = degree_map.get(f'{sk}:{src}', 0) + 1 + if tk and tgt: + degree_map[f'{tk}:{tgt}'] = degree_map.get(f'{tk}:{tgt}', 0) + 1 + for node in nodes: + node['degree'] = degree_map.get(node['id'], node.get('degree', 0)) + + edges = [] + for idx, row in enumerate(edges_raw, start=1): + sk = row.get('source_kind', '') + tk = row.get('target_kind', '') + edges.append({ + 'id': f'edge_{idx}', + 'source': f'{sk}:{row["source_raw"]}' if sk and row.get('source_raw') else '', + 'target': f'{tk}:{row["target_raw"]}' if tk and row.get('target_raw') else '', + 'predicate': row.get('predicate', ''), + 'relation_name': row.get('relation_name', ''), + 'fact': row.get('relation_fact', '') or '', + 'confidence': row.get('relation_confidence', 0.0), + 'date': row.get('relation_date', '') or '', + 'meeting_id': row.get('relation_meeting_id', '') or '', + }) + + return {'nodes': nodes, 'edges': edges, 'stats': self.get_stats(), 'query': query} + + def format_search_context(self, question: str, top_k: int = 5) -> str: + results = self.hybrid_search(question, limit=top_k) + if not results: + return '' + lines = [] + for idx, row in enumerate(results, start=1): + date = row.get('date', '') + meeting_title = row.get('meeting_title', '') + title = row.get('title', row.get('kind', 'item')) + suffix = f' ({date})' if date else '' + source = f' | 来源会议: {meeting_title}' if meeting_title else '' + lines.append( + f'[{idx}] {title}{suffix}{source}\n' + f'{row.get("text", "")}\n' + f'score={row.get("score", 0):.4f}, semantic={row.get("semantic_score", 0):.4f}, keyword={row.get("keyword_score", 0):.4f}' + ) + return '\n\n'.join(lines) + + def _load_fact_candidates(self) -> List[Dict[str, Any]]: + return self.run_query(''' + MATCH (s:Entity)-[r:RELATES_TO]->(t:Entity) + OPTIONAL MATCH (ep:Episode)-[:MENTIONS]->(s) + WITH s, r, t, collect(DISTINCT ep.date) AS dates, collect(DISTINCT ep.title) AS titles + RETURN 'fact' AS kind, + s.name + ' -[' + r.name + ']-> ' + t.name AS title, + coalesce(r.fact, '') AS text, + head(dates) AS date, + head(titles) AS meeting_title, + r.fact_embedding AS embedding + ''') + + def _load_entity_candidates(self) -> List[Dict[str, Any]]: + return self.run_query(''' + MATCH (e:Entity) + OPTIONAL MATCH (ep:Episode)-[:MENTIONS]->(e) + RETURN 'entity' AS kind, + e.name AS title, + coalesce(e.summary, e.description, '') AS text, + max(ep.date) AS date, + head(collect(DISTINCT ep.title)) AS meeting_title, + e.name_embedding AS embedding + ''') + + def _load_episode_candidates(self) -> List[Dict[str, Any]]: + return self.run_query(''' + MATCH (m:Meeting)-[:HAS_EPISODE]->(ep:Episode) + RETURN 'episode' AS kind, + m.title AS title, + coalesce(ep.summary, ep.content, '') AS text, + ep.date AS date, + m.title AS meeting_title, + ep.content_embedding AS embedding + ''') + + @staticmethod + def _entity_summary(entity: dict) -> str: + entity_type = _canonical_entity_type(entity.get('entity_type', '').strip()) + name = entity.get('name', '').strip() + description = entity.get('description', '').strip() + parts = [part for part in [entity_type, name, description] if part] + return ' | '.join(parts) + + @staticmethod + def _relation_text(relation: dict) -> str: + source = relation.get('source_entity_name', '').strip() + rtype = relation.get('relation_type', '').strip() + target = relation.get('target_entity_name', '').strip() + fact = relation.get('fact', '').strip() or f'{source} {rtype} {target}'.strip() + qualifiers = relation.get('qualifiers', []) + qualifier_text = '; '.join(item for item in qualifiers if item) + parts = [fact] + if qualifier_text: + parts.append(qualifier_text) + return '. '.join(parts) + + @staticmethod + def _build_episode_text(meeting_data: dict) -> str: + payload = { + 'title': meeting_data.get('title', ''), + 'date': meeting_data.get('date', ''), + 'participants': meeting_data.get('participants', []), + 'summary': meeting_data.get('summary', ''), + 'entities': meeting_data.get('entities', []), + 'relations': meeting_data.get('relations', []), + 'action_items': meeting_data.get('action_items', []), + 'metrics': meeting_data.get('metrics', []), + 'decisions': meeting_data.get('decisions', []), + 'departments': meeting_data.get('departments', []), + 'original_text': meeting_data.get('_original_text', ''), + } + return json.dumps(payload, ensure_ascii=False) + + +graph_store = Neo4jGraphStore() diff --git a/meeting_memory/io_utils.py b/meeting_memory/io_utils.py deleted file mode 100644 index 958079f..0000000 --- a/meeting_memory/io_utils.py +++ /dev/null @@ -1,29 +0,0 @@ -from __future__ import annotations - -from pathlib import Path - - -def read_text_auto(path: str | Path) -> str: - """Read common Chinese transcript encodings with a small fallback chain.""" - file_path = Path(path).expanduser().resolve() - encodings = ("utf-8-sig", "utf-8", "gb18030", "gbk") - last_error: Exception | None = None - for encoding in encodings: - try: - return file_path.read_text(encoding=encoding) - except UnicodeDecodeError as exc: - last_error = exc - raise UnicodeDecodeError( - "unknown", - b"", - 0, - 1, - f"Unable to decode {file_path}: {last_error}", - ) - - -def ensure_dir(path: str | Path) -> Path: - directory = Path(path) - directory.mkdir(parents=True, exist_ok=True) - return directory - diff --git a/meeting_memory/ledger.py b/meeting_memory/ledger.py deleted file mode 100644 index 459fdf4..0000000 --- a/meeting_memory/ledger.py +++ /dev/null @@ -1,640 +0,0 @@ -from __future__ import annotations - -import json -import re -from datetime import date -from pathlib import Path -from typing import Any - -from .io_utils import ensure_dir -from .models import ExtractedMeeting - - -class LedgerStore: - def __init__(self, root: str | Path = "data") -> None: - self.root = Path(root).resolve() - ensure_dir(self.root) - - def import_meeting(self, team_id: str, meeting: ExtractedMeeting) -> dict[str, Any]: - team_dir = ensure_dir(self.root / team_id) - state_path = team_dir / "state.json" - meetings_dir = ensure_dir(team_dir / "meetings") - state = self._load_state(state_path, team_id) - meeting_dict = meeting.to_dict() - - existing_ids = {item.get("meeting_id") for item in state.get("meetings", [])} - if meeting.meeting_id in existing_ids: - state["meetings"] = [ - meeting_dict if item.get("meeting_id") == meeting.meeting_id else item - for item in state.get("meetings", []) - ] - imported = False - else: - state["meetings"].append(meeting_dict) - imported = True - - state["last_meeting_date"] = meeting.meeting_date - state["last_updated"] = date.today().isoformat() - state["version"] = int(state.get("version", 0)) + (1 if imported else 0) - state = self._compact_state(state) - - state_path.write_text(json.dumps(state, ensure_ascii=False, indent=2), encoding="utf-8") - (meetings_dir / f"{meeting.meeting_id}.json").write_text( - json.dumps(meeting_dict, ensure_ascii=False, indent=2), - encoding="utf-8", - ) - ledger_path = team_dir / "team_ledger.md" - ledger_path.write_text(self.render_markdown(state), encoding="utf-8") - return { - "team_id": team_id, - "knowledge_base_id": team_id, - "meeting_id": meeting.meeting_id, - "ledger_path": str(ledger_path), - "state_path": str(state_path), - "summary": meeting.summary, - "imported": imported, - "counts": { - "projects": len(meeting.projects), - "metrics": len(meeting.metrics), - "actions": len(meeting.action_items), - "risks": len(meeting.risks), - "completed": len(meeting.completed_items), - "followups": len(meeting.followups), - }, - } - - def query(self, team_id: str, query: str, *, limit: int = 8) -> dict[str, Any]: - resolved_team = self.resolve_team_id(team_id) - if resolved_team is None: - teams = self.list_team_ids() - if teams: - return { - "answer": f"未找到知识库 `{team_id}` 的台账。当前已有知识库:" + "、".join(teams) + "。请在问题里指定知识库名。", - "matches": [], - "available_knowledge_bases": teams, - } - return {"answer": "当前 data 目录下还没有任何知识库台账,请先导入会议。", "matches": [], "available_knowledge_bases": []} - - team_dir = self.root / resolved_team - state = json.loads((team_dir / "state.json").read_text(encoding="utf-8")) - ledger_path = team_dir / "team_ledger.md" - - if _looks_like_recent_meeting_query(query): - return self._recent_meeting_answer(state, ledger_path) - - haystacks = self._flatten_state(state) - keywords = [word for word in _tokenize_query(query) if word] - scored: list[tuple[int, dict[str, str]]] = [] - for item in haystacks: - score = sum(item["search_text"].count(word) for word in keywords) - if score: - scored.append((score, item)) - scored.sort(key=lambda pair: pair[0], reverse=True) - matches = [item for _, item in scored[:limit]] - - if matches: - answer = "根据当前会议台账,找到这些相关内容:\n" + "\n".join( - f"- [{item['type']}] {item['display_text']}" for item in matches - ) - else: - answer = "台账中没有命中明显相关内容。可以换成项目名、负责人、指标名、风险关键词或时间要求再问。" - return { - "answer": answer, - "matches": matches, - "ledger_path": str(ledger_path), - "team_id": resolved_team, - "knowledge_base_id": resolved_team, - } - - def list_team_ids(self) -> list[str]: - if not self.root.exists(): - return [] - return sorted(path.name for path in self.root.iterdir() if path.is_dir() and (path / "state.json").exists()) - - def resolve_team_id(self, team_id: str | None) -> str | None: - teams = self.list_team_ids() - if team_id and team_id != "default_team" and team_id in teams: - return team_id - if team_id and team_id != "default_team" and (self.root / team_id / "state.json").exists(): - return team_id - if (team_id == "default_team" or not team_id) and len(teams) == 1: - return teams[0] - return None - - def build_agent_context(self, team_id: str) -> str: - resolved = self.resolve_team_id(team_id) - if resolved is None: - return "" - state = json.loads((self.root / resolved / "state.json").read_text(encoding="utf-8")) - lines = [] - for project in state.get("projects", [])[:20]: - lines.append( - f"- 项目: {project.get('name', '')} | 状态: {project.get('status', '待确认')} | " - f"最近更新: {project.get('latest_update', '')} | 最近提及: {project.get('last_meeting_date', '')}" - ) - recent_meetings = state.get("meetings", [])[-5:] - if recent_meetings: - lines.append("") - lines.append("最近会议摘要:") - for meeting in recent_meetings: - title = meeting.get("title") or Path(meeting.get("source_path", "")).stem - summary = ";".join(meeting.get("summary", [])[:2]) - lines.append(f"- {meeting.get('meeting_date', '')} | {title} | {summary}") - return "\n".join(lines) - - def render_markdown(self, state: dict[str, Any]) -> str: - latest = state["meetings"][-1] if state.get("meetings") else {} - knowledge_base_id = state.get("knowledge_base_id") or state["team_id"] - lines = [ - "---", - f"knowledge_base_id: {knowledge_base_id}", - f"team_id: {state['team_id']}", - "doc_type: meeting_project_ledger", - f"last_meeting_date: {state.get('last_meeting_date', '')}", - f"last_updated: {state.get('last_updated', '')}", - f"version: {state.get('version', 0)}", - f"source_meeting_id: {latest.get('meeting_id', '')}", - "---", - "", - f"# {knowledge_base_id} 会议知识台账", - "", - "## 1. 当前总览", - "", - f"- 当前维护项目数:{len(state.get('projects', []))}", - f"- 最近一次会议:{state.get('last_meeting_date', '')}", - ] - if latest.get("summary"): - lines.append(f"- 最近会议摘要:{';'.join(latest.get('summary', [])[:3])}") - - lines.extend(["", "## 2. 项目索引", "", "| 项目 | 当前状态 | 最近提及 | 最新更新 |", "|---|---|---|---|"]) - projects = state.get("projects", []) - if projects: - for project in projects: - lines.append( - "| " - + " | ".join( - _escape_md(str(value)) - for value in ( - project.get("name", ""), - project.get("status", "待确认"), - project.get("last_meeting_date", ""), - project.get("latest_update", ""), - ) - ) - + " |" - ) - else: - lines.append("| 暂无 | | | |") - - lines.extend(["", "## 3. 项目详情", ""]) - if not projects: - lines.append("暂无项目记录。") - for index, project in enumerate(projects, start=1): - lines.extend( - [ - f"### 3.{index} {project.get('name', '')}", - "", - f"- 当前状态:{project.get('status', '待确认')}", - f"- 最近提及:{project.get('last_meeting_date', '')}", - f"- 最新更新:{project.get('latest_update', '') or project.get('summary', '') or '待确认'}", - ] - ) - if project.get("summary"): - lines.append(f"- 项目说明:{project.get('summary')}") - if project.get("keywords"): - lines.append(f"- 检索关键词:{'、'.join(project.get('keywords', []))}") - - lines.extend(["", "#### 指标", "", "| 指标 | 当前值 | 目标 | 状态 | 趋势 | 备注 |", "|---|---|---|---|---|---|"]) - metrics = project.get("metrics", []) - if metrics: - for metric in metrics: - lines.append( - "| " - + " | ".join( - _escape_md(str(metric.get(key, ""))) - for key in ("name", "value", "target", "status", "trend", "note") - ) - + " |" - ) - else: - lines.append("| 暂无 | | | | | |") - - lines.extend(["", "#### 待办", "", "| 事项 | 责任对象 | 截止时间 | 状态 | 下一步 |", "|---|---|---|---|---|"]) - actions = project.get("actions", []) - if actions: - for action in actions: - lines.append( - "| " - + " | ".join( - _escape_md(str(action.get(key, ""))) - for key in ("title", "owner", "deadline", "status", "next_step") - ) - + " |" - ) - else: - lines.append("| 暂无 | | | | |") - - lines.extend(["", "#### 风险", "", "| 风险/问题 | 当前状态 | 影响 | 责任对象 | 处理动作 |", "|---|---|---|---|---|"]) - risks = project.get("risks", []) - if risks: - for risk in risks: - lines.append( - "| " - + " | ".join( - _escape_md(str(risk.get(key, ""))) - for key in ("title", "status", "impact", "owner", "next_step") - ) - + " |" - ) - else: - lines.append("| 暂无 | | | | |") - - lines.extend(["", "#### 已完成", ""]) - completed = project.get("completed", []) - if completed: - for item in completed: - lines.append(f"- {item.get('title', '')}({item.get('owner', '待确认')},{item.get('status', '已完成')})") - else: - lines.append("- 暂无。") - - lines.extend(["", "#### 决策", ""]) - decisions = project.get("decisions", []) - if decisions: - lines.extend(f"- {item}" for item in decisions) - else: - lines.append("- 暂无。") - - lines.extend(["", "#### 下一步", ""]) - next_steps = project.get("next_steps", []) - if next_steps: - lines.extend(f"- {item}" for item in next_steps) - else: - lines.append("- 暂无。") - - lines.extend(["", "#### 更新历史", ""]) - history = project.get("history", [])[-6:] - if history: - for item in reversed(history): - lines.append( - f"- {item.get('meeting_date', '')} | {item.get('status', '待确认')} | {item.get('update', '')}" - ) - else: - lines.append("- 暂无。") - lines.append("") - - lines.extend(["## 4. 会议导入记录", "", "| 日期 | 标题 | meeting_id | 来源 |", "|---|---|---|---|"]) - for meeting in state.get("meetings", [])[-20:]: - title = meeting.get("title") or Path(meeting.get("source_path", "")).stem - lines.append( - "| " - + " | ".join( - _escape_md(str(value)) - for value in ( - meeting.get("meeting_date", ""), - title, - meeting.get("meeting_id", ""), - meeting.get("source_path", ""), - ) - ) - + " |" - ) - lines.append("") - return "\n".join(lines) - - def _recent_meeting_answer(self, state: dict[str, Any], ledger_path: Path) -> dict[str, Any]: - meetings = state.get("meetings", []) - if not meetings: - return { - "answer": "该知识库台账存在,但还没有会议导入记录。", - "matches": [], - "ledger_path": str(ledger_path), - "team_id": state.get("team_id", ""), - "knowledge_base_id": state.get("knowledge_base_id") or state.get("team_id", ""), - } - - latest = meetings[-1] - title = latest.get("title") or Path(latest.get("source_path", "")).stem - parts = [ - f"最近一次会议:{latest.get('meeting_date', '')}", - f"会议标题:{title}", - f"来源文件:{latest.get('source_path', '')}", - "", - "会议摘要:", - ] - parts.extend(f"- {item}" for item in latest.get("summary", [])[:6]) - - projects = latest.get("projects", []) - if projects: - parts.extend(["", "本次涉及项目:"]) - for project in projects[:8]: - parts.append( - f"- {project.get('name', '')}:状态={project.get('status', '待确认')};" - f"更新={project.get('summary', '') or project.get('progress', '') or '待确认'}" - ) - return { - "answer": "\n".join(parts), - "matches": [], - "ledger_path": str(ledger_path), - "team_id": state.get("team_id", ""), - "knowledge_base_id": state.get("knowledge_base_id") or state.get("team_id", ""), - "meeting_id": latest.get("meeting_id", ""), - } - - def _flatten_state(self, state: dict[str, Any]) -> list[dict[str, str]]: - result: list[dict[str, str]] = [] - for project in state.get("projects", []): - project_name = str(project.get("name", "")) - result.append( - _search_item( - item_type="项目", - display_text=f"{project_name}|状态:{project.get('status', '待确认')}|更新:{project.get('latest_update', '')}", - project=project_name, - raw_values=[ - project_name, - project.get("status", ""), - project.get("summary", ""), - project.get("latest_update", ""), - " ".join(project.get("keywords", [])), - ], - ) - ) - for metric in project.get("metrics", []): - result.append( - _search_item( - item_type="指标", - display_text=( - f"{project_name}|{metric.get('name', '')}={metric.get('value', '')}" - f"|目标:{metric.get('target', '')}|状态:{metric.get('status', '')}" - ), - project=project_name, - raw_values=list(metric.values()), - ) - ) - for action in project.get("actions", []): - result.append( - _search_item( - item_type="待办", - display_text=( - f"{project_name}|{action.get('title', '')}|责任:{action.get('owner', '待确认')}" - f"|截止:{action.get('deadline', '')}|状态:{action.get('status', '')}" - ), - project=project_name, - raw_values=list(action.values()), - ) - ) - for risk in project.get("risks", []): - result.append( - _search_item( - item_type="风险", - display_text=( - f"{project_name}|{risk.get('title', '')}|状态:{risk.get('status', '')}" - f"|影响:{risk.get('impact', '')}" - ), - project=project_name, - raw_values=list(risk.values()), - ) - ) - for item in project.get("completed", []): - result.append( - _search_item( - item_type="完成", - display_text=f"{project_name}|{item.get('title', '')}|责任:{item.get('owner', '待确认')}", - project=project_name, - raw_values=list(item.values()), - ) - ) - for item in project.get("decisions", []): - result.append( - _search_item( - item_type="决策", - display_text=f"{project_name}|{item}", - project=project_name, - raw_values=[project_name, item], - ) - ) - for item in project.get("next_steps", []): - result.append( - _search_item( - item_type="下一步", - display_text=f"{project_name}|{item}", - project=project_name, - raw_values=[project_name, item], - ) - ) - for meeting in state.get("meetings", []): - for item in meeting.get("summary", []): - result.append( - _search_item( - item_type="会议摘要", - display_text=f"{meeting.get('meeting_date', '')}|{item}", - raw_values=[meeting.get("meeting_date", ""), item], - ) - ) - return result - - def _load_state(self, state_path: Path, team_id: str) -> dict[str, Any]: - if state_path.exists(): - state = json.loads(state_path.read_text(encoding="utf-8")) - else: - state = { - "team_id": team_id, - "knowledge_base_id": team_id, - "version": 0, - "last_updated": "", - "last_meeting_date": "", - "meetings": [], - } - state.setdefault("projects", []) - state.setdefault("current_metrics", []) - state.setdefault("open_actions", []) - state.setdefault("risks", []) - state.setdefault("completed_items", []) - state.setdefault("followups", []) - return state - - def _compact_state(self, state: dict[str, Any]) -> dict[str, Any]: - project_index: dict[str, dict[str, Any]] = {} - for meeting in state.get("meetings", []): - for raw_project in meeting.get("projects", []): - project = _normalize_project_state(raw_project) - marker = _project_key(project.get("name", ""), project.get("aliases", [])) - if not marker: - continue - existing = project_index.get(marker) - if existing is None: - existing = { - "name": project.get("name", ""), - "aliases": project.get("aliases", []), - "status": project.get("status", "待确认"), - "summary": project.get("summary", ""), - "latest_update": project.get("summary", "") or project.get("progress", ""), - "last_meeting_date": meeting.get("meeting_date", ""), - "last_meeting_id": meeting.get("meeting_id", ""), - "keywords": project.get("keywords", []), - "decisions": [], - "next_steps": [], - "metrics": [], - "actions": [], - "risks": [], - "completed": [], - "history": [], - } - project_index[marker] = existing - _merge_project(existing, project, meeting) - - projects = sorted( - project_index.values(), - key=lambda item: (item.get("last_meeting_date", ""), item.get("name", "")), - reverse=True, - ) - state["projects"] = projects - state["current_metrics"] = _limit_items(_collect_from_projects(projects, "metrics"), 60) - state["open_actions"] = _limit_items([item for item in _collect_from_projects(projects, "actions") if item.get("status") != "已完成"], 120) - state["risks"] = _limit_items(_collect_from_projects(projects, "risks"), 100) - state["completed_items"] = _limit_items(_collect_from_projects(projects, "completed"), 120) - state["followups"] = _limit_items( - [ - { - "title": item.get("title", ""), - "owner": item.get("owner", ""), - "deadline": item.get("deadline", ""), - "project": item.get("project", ""), - "status": item.get("status", ""), - } - for item in state["open_actions"] - ], - 80, - ) - return state - - -def _normalize_project_state(project: dict[str, Any]) -> dict[str, Any]: - return { - "name": str(project.get("name", "")).strip(), - "aliases": _unique_text(project.get("aliases", [])), - "status": str(project.get("status", "")).strip() or "待确认", - "summary": str(project.get("summary", "")).strip(), - "progress": str(project.get("progress", "")).strip(), - "keywords": _unique_text(project.get("keywords", [])), - "decisions": _unique_text(project.get("decisions", [])), - "next_steps": _unique_text(project.get("next_steps", [])), - "metrics": [dict(item) for item in project.get("metrics", []) if item.get("name")], - "actions": [dict(item) for item in project.get("actions", []) if item.get("title")], - "risks": [dict(item) for item in project.get("risks", []) if item.get("title")], - "completed": [dict(item) for item in project.get("completed", []) if item.get("title")], - } - - -def _merge_project(target: dict[str, Any], incoming: dict[str, Any], meeting: dict[str, Any]) -> None: - target["aliases"] = _unique_text(target.get("aliases", []) + incoming.get("aliases", []) + [incoming.get("name", "")]) - target["status"] = incoming.get("status") or target.get("status", "待确认") - target["summary"] = incoming.get("summary") or target.get("summary", "") - target["latest_update"] = incoming.get("summary") or incoming.get("progress") or target.get("latest_update", "") - target["last_meeting_date"] = meeting.get("meeting_date", "") or target.get("last_meeting_date", "") - target["last_meeting_id"] = meeting.get("meeting_id", "") or target.get("last_meeting_id", "") - target["keywords"] = _unique_text(target.get("keywords", []) + incoming.get("keywords", [])) - target["decisions"] = _unique_text(target.get("decisions", []) + incoming.get("decisions", [])) - target["next_steps"] = _unique_text(target.get("next_steps", []) + incoming.get("next_steps", []) + [item.get("title", "") for item in incoming.get("actions", []) if item.get("title")]) - target["metrics"] = _merge_named_entries(target.get("metrics", []), incoming.get("metrics", []), "name", incoming.get("name", "")) - target["actions"] = _merge_named_entries(target.get("actions", []), incoming.get("actions", []), "title", incoming.get("name", "")) - target["risks"] = _merge_named_entries(target.get("risks", []), incoming.get("risks", []), "title", incoming.get("name", "")) - target["completed"] = _merge_named_entries(target.get("completed", []), incoming.get("completed", []), "title", incoming.get("name", "")) - update = incoming.get("summary") or incoming.get("progress") or "本次会议有更新" - history_item = { - "meeting_date": meeting.get("meeting_date", ""), - "meeting_id": meeting.get("meeting_id", ""), - "status": incoming.get("status", "待确认"), - "update": update, - } - target["history"] = _append_unique_history(target.get("history", []), history_item) - - -def _merge_named_entries(current: list[dict[str, Any]], incoming: list[dict[str, Any]], key: str, project_name: str) -> list[dict[str, Any]]: - merged: dict[str, dict[str, Any]] = {} - for item in current + incoming: - marker = _normalize_name(str(item.get(key, ""))) - if not marker: - continue - merged[marker] = {**merged.get(marker, {}), **item, "project": project_name} - return list(merged.values()) - - -def _append_unique_history(history: list[dict[str, Any]], item: dict[str, Any]) -> list[dict[str, Any]]: - marker = (item.get("meeting_id", ""), item.get("update", "")) - output = [entry for entry in history if (entry.get("meeting_id", ""), entry.get("update", "")) != marker] - output.append(item) - return output[-12:] - - -def _collect_from_projects(projects: list[dict[str, Any]], key: str) -> list[dict[str, Any]]: - items: list[dict[str, Any]] = [] - for project in projects: - for item in project.get(key, []): - items.append({**item, "project": project.get("name", "")}) - return items - - -def _limit_items(items: list[dict[str, Any]], size: int) -> list[dict[str, Any]]: - return items[-size:] - - -def _search_item(*, item_type: str, display_text: str, raw_values: list[Any], project: str = "") -> dict[str, str]: - search_text = ";".join(str(value) for value in raw_values if value) - if project: - search_text = f"{project};{search_text}" - return { - "type": item_type, - "project": project, - "display_text": display_text, - "search_text": search_text, - } - - -def _project_key(name: str, aliases: list[str]) -> str: - normalized = _normalize_name(name) - if normalized: - return normalized - for alias in aliases: - normalized = _normalize_name(alias) - if normalized: - return normalized - return "" - - -def _normalize_name(text: str) -> str: - return re.sub(r"[\W_]+", "", text).lower() - - -def _unique_text(values: list[Any]) -> list[str]: - seen: set[str] = set() - output: list[str] = [] - for raw in values: - text = str(raw).strip() - if not text or text in seen: - continue - seen.add(text) - output.append(text) - return output - - -def _escape_md(text: str) -> str: - return text.replace("|", "\\|").replace("\n", " ") - - -def _tokenize_query(query: str) -> list[str]: - stop_words = {"查询", "团队", "team", "team_id", "哪些", "还有", "一下", "帮我", "关于", "知识库", "项目"} - words = [part for part in re.split(r"[\s,。;、,.!??::=]+", query) if len(part) >= 2] - tokens: list[str] = [] - for word in words: - if word in stop_words: - continue - tokens.append(word) - if re.search(r"[\u4e00-\u9fff]", word) and len(word) > 4: - for size in range(2, min(6, len(word)) + 1): - for index in range(0, len(word) - size + 1): - tokens.append(word[index : index + size]) - return list(dict.fromkeys(tokens)) - - -def _looks_like_recent_meeting_query(query: str) -> bool: - return any(word in query for word in ("最近", "最新", "上次")) and any(word in query for word in ("会议", "内容", "纪要", "摘要")) diff --git a/meeting_memory/meeting_digest_agent.py b/meeting_memory/meeting_digest_agent.py deleted file mode 100644 index d64991f..0000000 --- a/meeting_memory/meeting_digest_agent.py +++ /dev/null @@ -1,393 +0,0 @@ -from __future__ import annotations - -import json -import re -from pathlib import Path -from typing import Any, Callable - -from core_agent.config import apply_compat_env_aliases, build_core_agent_config, load_core_agent_env -from core_agent.exceptions import AgentCancelled -from providers.openai_compatible import OpenAICompatibleProvider -from providers.prompt_loader import load_prompt - -from .models import ExtractedMeeting - -SubagentProgressCallback = Callable[[dict[str, Any]], None] - - -class MeetingDigestAgent: - def __init__(self, workspace: str | Path) -> None: - self.workspace = Path(workspace).resolve() - prompt = load_prompt("meeting_digest") - self.system_prompt = prompt["system"] - self.instructions = prompt.get("instructions", "") - self.provider = self._build_provider() - - def analyze( - self, - *, - transcript_text: str, - source_path: str | Path, - meeting_date: str, - knowledge_base_id: str, - existing_context: str = "", - progress_callback: SubagentProgressCallback | None = None, - should_cancel: Callable[[], bool] | None = None, - ) -> ExtractedMeeting: - source = str(Path(source_path).resolve()) - meeting_id = _meeting_id(source, transcript_text) - - if self.provider is None: - self._emit( - progress_callback, - kind="subagent_status", - phase="fallback", - text="Agent 未检测到可用模型,切换到降级整理。", - ) - return _fallback_meeting( - meeting_id=meeting_id, - meeting_date=meeting_date, - source_path=source, - transcript_text=transcript_text, - ) - - self._emit( - progress_callback, - kind="subagent_status", - phase="prepare", - text="Agent 正在加载历史台账并构造整理提示。", - ) - self._raise_if_cancelled(should_cancel) - user_message = self._build_user_message( - transcript_text=transcript_text, - source_path=source, - meeting_date=meeting_date, - knowledge_base_id=knowledge_base_id, - existing_context=existing_context, - ) - self._emit( - progress_callback, - kind="subagent_status", - phase="call_model", - text="Agent 正在调用模型整理项目、待办、风险和指标。", - ) - - turn = None - for event in self.provider.stream_generate( - [ - {"role": "system", "content": self.system_prompt}, - {"role": "user", "content": user_message}, - ], - [], - should_cancel=should_cancel, - ): - self._raise_if_cancelled(should_cancel) - if event.type == "reasoning" and event.delta: - self._emit( - progress_callback, - kind="subagent_reasoning", - phase="stream", - text=event.delta, - ) - elif event.type == "content" and event.delta: - self._emit( - progress_callback, - kind="subagent_output", - phase="stream", - text=event.delta, - ) - elif event.type == "turn": - turn = event.turn - - if turn is None: - raise RuntimeError("Meeting digest agent stream ended without a final turn") - self._raise_if_cancelled(should_cancel) - - self._emit( - progress_callback, - kind="subagent_status", - phase="parse", - text="Agent 已完成分析,正在解析结构化结果。", - ) - payload = _parse_json_object(turn.content or "") - self._emit( - progress_callback, - kind="subagent_status", - phase="parse_done", - text="Agent 已完成结构化解析,准备写入项目台账。", - ) - return _meeting_from_payload( - payload, - meeting_id=meeting_id, - meeting_date=meeting_date, - source_path=source, - ) - - def _build_provider(self) -> OpenAICompatibleProvider | None: - load_core_agent_env(self.workspace) - apply_compat_env_aliases() - config = build_core_agent_config() - if not config.api_key or not config.base_url or not config.model: - return None - return OpenAICompatibleProvider( - model=config.model, - api_key=config.api_key, - base_url=config.base_url, - timeout=config.timeout, - temperature=min(config.temperature, 0.1), - ) - - def _build_user_message( - self, - *, - transcript_text: str, - source_path: str, - meeting_date: str, - knowledge_base_id: str, - existing_context: str, - ) -> str: - context = existing_context.strip() or "暂无历史台账。请直接从本次会议中识别项目并建立初始状态。" - parts: list[str] = [] - if self.instructions: - parts.append(self.instructions) - parts.append( - f"知识库: {knowledge_base_id}\n" - f"会议日期: {meeting_date}\n" - f"来源文件: {source_path}\n\n" - "当前历史台账摘要:\n" - f"{context}\n\n" - "会议原文:\n" - f"{transcript_text}" - ) - return "\n\n---\n\n".join(parts) - - def _emit( - self, - callback: SubagentProgressCallback | None, - *, - kind: str, - phase: str, - text: str, - ) -> None: - if callback is None: - return - callback( - { - "channel": "meeting_digest", - "kind": kind, - "phase": phase, - "text": text, - } - ) - - def _raise_if_cancelled(self, should_cancel: Callable[[], bool] | None) -> None: - if should_cancel and should_cancel(): - raise AgentCancelled("Agent run cancelled by user.") - - -def _meeting_from_payload( - payload: dict[str, Any], - *, - meeting_id: str, - meeting_date: str, - source_path: str, -) -> ExtractedMeeting: - title = _as_text(payload.get("meeting_title")) - overview = _as_text(payload.get("overview")) - summary = _normalize_text_list(payload.get("summary")) - if overview and overview not in summary: - summary.insert(0, overview) - projects = [_normalize_project(item) for item in _as_list(payload.get("projects"))] - projects = [item for item in projects if item["name"]] - - metrics: list[dict[str, Any]] = [] - actions: list[dict[str, Any]] = [] - risks: list[dict[str, Any]] = [] - completed_items: list[dict[str, Any]] = [] - followups: list[dict[str, Any]] = [] - evidence: list[dict[str, str]] = [] - - for project in projects: - project_name = project["name"] - for metric in project["metrics"]: - metrics.append({**metric, "project": project_name}) - for action in project["actions"]: - entry = {**action, "project": project_name} - actions.append(entry) - if entry.get("status") != "已完成": - followups.append(entry) - for risk in project["risks"]: - risks.append({**risk, "project": project_name}) - for item in project["completed"]: - completed_items.append({**item, "project": project_name}) - for snippet in project["evidence"]: - evidence.append({"project": project_name, "text": snippet}) - - if not summary: - if projects: - summary = [f"本次会议识别到 {len(projects)} 个项目更新。"] - else: - summary = ["本次会议已导入,但未识别出明确项目。"] - - return ExtractedMeeting( - meeting_id=meeting_id, - meeting_date=meeting_date, - source_path=source_path, - title=title, - overview=overview, - summary=summary, - projects=projects, - metrics=metrics, - action_items=actions, - risks=risks, - completed_items=completed_items, - followups=followups, - evidence=evidence, - ) - - -def _normalize_project(raw: Any) -> dict[str, Any]: - item = raw if isinstance(raw, dict) else {} - actions = [_normalize_action(value) for value in _as_list(item.get("actions"))] - risks = [_normalize_risk(value) for value in _as_list(item.get("risks"))] - completed = [_normalize_completed(value) for value in _as_list(item.get("completed"))] - metrics = [_normalize_metric(value) for value in _as_list(item.get("metrics"))] - decisions = _normalize_text_list(item.get("decisions")) - next_steps = _normalize_text_list(item.get("next_steps")) - evidence = _normalize_text_list(item.get("evidence")) - aliases = _normalize_text_list(item.get("aliases")) - keywords = _normalize_text_list(item.get("keywords")) - return { - "name": _as_text(item.get("name")), - "status": _as_text(item.get("status")) or "待确认", - "summary": _as_text(item.get("summary")), - "progress": _as_text(item.get("progress")), - "decisions": decisions, - "next_steps": next_steps, - "metrics": [value for value in metrics if value["name"]], - "actions": [value for value in actions if value["title"]], - "risks": [value for value in risks if value["title"]], - "completed": [value for value in completed if value["title"]], - "keywords": keywords, - "aliases": aliases, - "evidence": evidence, - } - - -def _normalize_metric(raw: Any) -> dict[str, str]: - item = raw if isinstance(raw, dict) else {} - return { - "name": _as_text(item.get("name")), - "value": _as_text(item.get("value")), - "target": _as_text(item.get("target")), - "status": _as_text(item.get("status")), - "trend": _as_text(item.get("trend")), - "note": _as_text(item.get("note")), - "evidence": _as_text(item.get("evidence")), - } - - -def _normalize_action(raw: Any) -> dict[str, str]: - item = raw if isinstance(raw, dict) else {} - return { - "title": _as_text(item.get("title")), - "owner": _as_text(item.get("owner")), - "deadline": _as_text(item.get("deadline")), - "status": _as_text(item.get("status")) or "待跟进", - "next_step": _as_text(item.get("next_step")), - "evidence": _as_text(item.get("evidence")), - } - - -def _normalize_risk(raw: Any) -> dict[str, str]: - item = raw if isinstance(raw, dict) else {} - return { - "title": _as_text(item.get("title")), - "status": _as_text(item.get("status")) or "关注中", - "impact": _as_text(item.get("impact")), - "owner": _as_text(item.get("owner")), - "next_step": _as_text(item.get("next_step")), - "deadline": _as_text(item.get("deadline")), - "evidence": _as_text(item.get("evidence")), - } - - -def _normalize_completed(raw: Any) -> dict[str, str]: - item = raw if isinstance(raw, dict) else {} - return { - "title": _as_text(item.get("title")), - "status": _as_text(item.get("status")) or "已完成", - "owner": _as_text(item.get("owner")), - "evidence": _as_text(item.get("evidence")), - } - - -def _parse_json_object(text: str) -> dict[str, Any]: - cleaned = text.strip() - fenced = re.search(r"```(?:json)?\s*(\{.*\})\s*```", cleaned, flags=re.S) - if fenced: - cleaned = fenced.group(1) - if cleaned.startswith("{") and cleaned.endswith("}"): - return json.loads(cleaned) - match = re.search(r"\{.*\}", cleaned, flags=re.S) - if match: - return json.loads(match.group(0)) - raise ValueError("Meeting digest agent did not return valid JSON") - - -def _fallback_meeting( - *, - meeting_id: str, - meeting_date: str, - source_path: str, - transcript_text: str, -) -> ExtractedMeeting: - lines = [line.strip("-* \t") for line in transcript_text.splitlines() if line.strip()] - important = [line for line in lines if len(line) >= 12][:12] - summary = important[:4] or ["本次会议已导入,当前环境未配置模型,暂未完成智能整理。"] - project = { - "name": Path(source_path).stem[:48] or "未命名项目", - "status": "待确认", - "summary": summary[0], - "progress": "", - "decisions": [], - "next_steps": summary[1:3], - "metrics": [], - "actions": [], - "risks": [], - "completed": [], - "keywords": [], - "aliases": [], - "evidence": important[:6], - } - return ExtractedMeeting( - meeting_id=meeting_id, - meeting_date=meeting_date, - source_path=source_path, - title=Path(source_path).stem, - overview=summary[0], - summary=summary, - projects=[project], - evidence=[{"text": line} for line in important[:6]], - ) - - -def _normalize_text_list(value: Any) -> list[str]: - return [text for text in (_as_text(item) for item in _as_list(value)) if text] - - -def _as_list(value: Any) -> list[Any]: - return value if isinstance(value, list) else [] - - -def _as_text(value: Any) -> str: - if value is None: - return "" - return str(value).strip() - - -def _meeting_id(source: str, text: str) -> str: - import hashlib - - digest = hashlib.sha1((source + "\n" + text[:4000]).encode("utf-8", errors="ignore")).hexdigest()[:10] - return f"meeting_{digest}" diff --git a/meeting_memory/meeting_processor.py b/meeting_memory/meeting_processor.py new file mode 100644 index 0000000..edd32bb --- /dev/null +++ b/meeting_memory/meeting_processor.py @@ -0,0 +1,427 @@ +import hashlib +import logging +from typing import Callable, List, Optional + +from meeting_memory.config import config +from meeting_memory.dedup_helpers import resolve_with_similarity +from meeting_memory.extractor import ( + MeetingExtraction, + extract_entities_from_text, + extract_facts_from_text, + extract_meeting_info as monolithic_extract, +) +from meeting_memory.extractor import ( + resolve_entities_against_graph, + resolve_facts_against_graph, +) +from meeting_memory.graph_store import graph_store +from meeting_memory.meeting_state import MeetingStateStore +from meeting_memory.raw_store import raw_meeting_store + +logger = logging.getLogger(__name__) + +state_store = MeetingStateStore(config.state_path) +ProgressCallback = Callable[[int, int, str], None] + + +class MeetingProcessor: + + def process_meeting_file(self, filepath: str, force: bool = False) -> Optional[str]: + with open(filepath, 'r', encoding='utf-8') as file_obj: + text = file_obj.read() + return self.process_meeting_text(text, force=force) + + def process_meeting_text( + self, + text: str, + force: bool = False, + interactive: bool = True, + progress_callback: Optional[ProgressCallback] = None, + use_multistep_extraction: bool = True, + ) -> Optional[str]: + def report(step: int, total: int, message: str) -> None: + if progress_callback: + progress_callback(step, total, message) + print(f'[{step}/{total}] {message}') + + if use_multistep_extraction: + return self._process_multistep(text, force, interactive, report) + else: + return self._process_monolithic(text, force, interactive, report) + + def _process_monolithic( + self, text: str, force: bool, interactive: bool, + report: Callable, + ) -> Optional[str]: + total_steps = 7 + report(1, total_steps, '计算内容哈希') + content_hash = self._compute_content_hash(text) + + if not force and state_store.has_content_hash(content_hash): + logger.info('Duplicate content hash skipped: %s', content_hash[:12]) + return None + + if not force: + report(2, total_steps, 'Neo4j 语义相似去重检索') + similar = graph_store.find_similar_episode(text, threshold=0.92) + if similar: + meta = similar['metadata'] + if not interactive: + logger.info('Skipped similar meeting: %s', meta.get('title', '')) + return None + print(f'\n发现相似会议:{meta.get("title", "")} ({meta.get("date", "")}) 相似度 {similar["score"]:.2%}') + while True: + choice = input('选择 [s]跳过 / [o]覆盖(默认 s):').strip().lower() or 's' + if choice == 's': + logger.info('Skipped similar meeting: %s', meta.get('title', '')) + return None + if choice == 'o': + force = True + break + print('请输入 s 或 o。') + else: + report(2, total_steps, '跳过语义去重,按覆盖模式继续') + + report(3, total_steps, '调用大模型抽取结构化信息(单步模式)') + meeting_data = self._extract_monolithic(text) + if not meeting_data: + logger.error('Failed to extract meeting information') + return None + + data_dict = meeting_data.model_dump() + return self._finish_pipeline(data_dict, content_hash, text, force, interactive, report, total_steps) + + def _process_multistep( + self, text: str, force: bool, interactive: bool, + report: Callable, + ) -> Optional[str]: + total_steps = 10 + report(1, total_steps, '计算内容哈希') + content_hash = self._compute_content_hash(text) + + if not force and state_store.has_content_hash(content_hash): + logger.info('Duplicate content hash skipped: %s', content_hash[:12]) + return None + + if not force: + report(2, total_steps, 'Neo4j 语义相似去重检索') + similar = graph_store.find_similar_episode(text, threshold=0.92) + if similar: + meta = similar['metadata'] + if not interactive: + logger.info('Skipped similar meeting: %s', meta.get('title', '')) + return None + print(f'\n发现相似会议:{meta.get("title", "")} ({meta.get("date", "")}) 相似度 {similar["score"]:.2%}') + while True: + choice = input('选择 [s]跳过 / [o]覆盖(默认 s):').strip().lower() or 's' + if choice == 's': + logger.info('Skipped similar meeting: %s', meta.get('title', '')) + return None + if choice == 'o': + force = True + break + print('请输入 s 或 o。') + else: + report(2, total_steps, '跳过语义去重,按覆盖模式继续') + + # Step 3: 提取标题、日期、参与人等元信息 + report(3, total_steps, '抽取会议元信息(标题、日期、参与者等)') + meta_info = self._extract_monolithic(text, stream=interactive) + if not meta_info: + logger.error('Failed to extract meeting metadata') + return None + data_dict = meta_info.model_dump() + data_dict['_content_hash'] = content_hash + data_dict['_graph_meeting_id'] = graph_store.meeting_id(data_dict) + data_dict['_original_text'] = text + + # Step 4: 抽取实体节点(LLM 调用 1) + report(4, total_steps, '第 1 步实体抽取:识别会议中提及的实体') + use_stream = interactive + previous_episodes = self._get_previous_episodes_context(data_dict) + extracted_entities = extract_entities_from_text( + text, previous_episodes=previous_episodes, stream=use_stream + ) + logger.info('Extracted %d entities from meeting', len(extracted_entities)) + if not extracted_entities: + logger.warning('No entities extracted, aborting') + return None + + # Step 5: 实体去重(与已有图谱对比 + LLM 裁决) + report(5, total_steps, '实体去重:与图谱中已有实体对比') + resolved_entities = self._dedup_entities(extracted_entities, text) + data_dict['entities'] = resolved_entities + logger.info('After dedup: %d entities remain', len(resolved_entities)) + + # Step 6: 抽取事实关系(LLM 调用 2) + report(6, total_steps, '事实抽取:提取实体间的结构化关系') + reference_time = data_dict.get('date', '') + extracted_facts = extract_facts_from_text( + text, resolved_entities, + reference_time=reference_time, + previous_episodes=previous_episodes, + stream=use_stream, + ) + logger.info('Extracted %d facts from meeting', len(extracted_facts)) + + # Step 7: 事实去重与矛盾检测 + report(7, total_steps, '事实解析:去重与矛盾检测') + resolved_facts = self._dedup_facts(extracted_facts, data_dict) + data_dict['relations'] = resolved_facts + logger.info('After dedup: %d facts remain', len(resolved_facts)) + + # Step 8: 检查标题和日期重复 + report(8, total_steps, '检查标题和日期重复') + should_skip = self._handle_duplicate(data_dict, force=force, interactive=interactive) + if should_skip: + return None + + meeting_title = data_dict.get('title', '') + meeting_date = data_dict.get('date', '') + + # Step 9: 归档 + 合并行动项/指标 + report(9, total_steps, '归档和状态合并') + raw_path = raw_meeting_store.save(text, title=meeting_title, date=meeting_date) + data_dict['_original_text_path'] = raw_path + + meeting_filename = f'{graph_store.meeting_id(data_dict)}.md' + data_dict['action_items'] = state_store.merge_action_items( + data_dict.get('action_items', []), + meeting_title, meeting_date, meeting_filename, + ) + data_dict['metrics'] = state_store.merge_metrics( + data_dict.get('metrics', []), + meeting_title, meeting_date, meeting_filename, + ) + state_store.add_content_hash(content_hash, meeting_title, meeting_date, meeting_filename) + state_store.save() + + # Step 10: 写入 Neo4j + report(10, total_steps, '写入 Neo4j 图谱') + graph_store.upsert_meeting_subgraph(data_dict) + + logger.info('Meeting processed (multi-step): %s', meeting_title) + return raw_path + + def _get_previous_episodes_context(self, data_dict: dict) -> list: + meeting_title = data_dict.get('title', '') + meeting_date = data_dict.get('date', '') + series_info = state_store.get_series_info(meeting_title) + if not series_info: + return [] + processed = series_info.get('processed_titles', []) + if not processed: + return [] + rows = graph_store.run_query(''' + MATCH (m:Meeting) + WHERE m.title IN $titles + OPTIONAL MATCH (m)-[:HAS_EPISODE]->(ep:Episode) + RETURN m.title AS title, m.date AS date, ep.summary AS summary, ep.content AS content + ORDER BY m.date DESC + LIMIT 3 + ''', titles=processed[-3:]) + return [{'content': r.get('content', r.get('summary', '')), 'timestamp': r.get('date', '')} for r in rows] + + def _dedup_entities(self, extracted: list, text: str) -> list: + try: + existing = graph_store.get_entities_map() + if not existing: + return extracted + existing_list = [ + { + 'candidate_id': i, + 'name': v['name'], + 'entity_type': v.get('entity_type', ''), + 'summary': v.get('summary', '') or v.get('description', ''), + } + for i, v in enumerate(existing.values()) + ] + + resolved_indices, unresolved_indices = resolve_with_similarity( + extracted, existing_list + ) + logger.info( + 'Deterministic dedup: %d resolved to existing, %d need LLM', + len(resolved_indices), len(unresolved_indices), + ) + + resolved_set = set(resolved_indices) + unresolved_set = set(unresolved_indices) + + kept = [ + e for i, e in enumerate(extracted) + if '_resolved_to' not in e + ] + + if unresolved_indices: + unresolved = [extracted[i] for i in unresolved_indices] + llm_resolved = resolve_entities_against_graph( + unresolved, existing_list, episode_content=text, + ) + kept.extend(llm_resolved) + + return kept + except Exception as exc: + logger.warning('Entity dedup failed, keeping all extracted: %s', exc) + return extracted + + def _dedup_facts(self, facts: list, data_dict: dict) -> list: + resolved = [] + for fact in facts: + try: + source = fact.get('source_entity_name', '') + target = fact.get('target_entity_name', '') + fact_text = fact.get('fact', '') + + exact_existing = graph_store.get_facts_between(source, target) + hybrid_candidates = graph_store.search_related_facts_hybrid( + fact_text, limit=5 + ) if fact_text else [] + + invalidation_candidates = [ + c for c in hybrid_candidates + if c.get('fact', '') not in {e.get('fact', '') for e in exact_existing} + ] + + all_existing = list(exact_existing) + seen_facts = {e.get('fact', '') for e in exact_existing} + for c in hybrid_candidates: + cf = c.get('fact', '') + if cf and cf not in seen_facts: + all_existing.append(c) + seen_facts.add(cf) + + if not all_existing: + resolved.append(fact) + continue + + result = resolve_facts_against_graph( + fact, all_existing, invalidation_candidates + ) + if not isinstance(result, dict): + resolved.append(fact) + continue + + if result.get('is_duplicate'): + logger.debug('Skipped duplicate fact: %s', fact.get('fact', '')) + continue + + if result.get('is_contradicted'): + contradicted_indices = result.get('contradicted_facts', []) + for idx in contradicted_indices: + if idx < len(all_existing): + target_fact = all_existing[idx] + graph_store.mark_relation_expired( + source_name=target_fact.get('source_name', source), + target_name=target_fact.get('target_name', target), + relation_type=target_fact.get('relation_type', ''), + ) + logger.info( + 'Invalidated %d contradicted facts for: %s', + len(contradicted_indices), fact.get('fact', ''), + ) + + resolved.append(fact) + except Exception as exc: + logger.warning('Fact dedup failed, keeping: %s', exc) + resolved.append(fact) + return resolved + + def _finish_pipeline( + self, data_dict: dict, content_hash: str, text: str, + force: bool, interactive: bool, report: Callable, total_steps: int, + ) -> Optional[str]: + data_dict['_content_hash'] = content_hash + data_dict['_graph_meeting_id'] = graph_store.meeting_id(data_dict) + + report(4, total_steps, '检查标题和日期重复') + should_skip = self._handle_duplicate(data_dict, force=force, interactive=interactive) + if should_skip: + return None + + meeting_title = data_dict.get('title', '') + meeting_date = data_dict.get('date', '') + + report(5, total_steps, '归档原始会议文本') + raw_path = raw_meeting_store.save(text, title=meeting_title, date=meeting_date) + data_dict['_original_text'] = text + data_dict['_original_text_path'] = raw_path + + meeting_filename = f'{graph_store.meeting_id(data_dict)}.md' + + report(6, total_steps, '合并行动项和指标状态') + data_dict['action_items'] = state_store.merge_action_items( + data_dict.get('action_items', []), meeting_title, meeting_date, meeting_filename, + ) + data_dict['metrics'] = state_store.merge_metrics( + data_dict.get('metrics', []), meeting_title, meeting_date, meeting_filename, + ) + state_store.add_content_hash(content_hash, meeting_title, meeting_date, meeting_filename) + state_store.save() + + report(7, total_steps, '写入 Neo4j 图谱和检索数据') + graph_store.upsert_meeting_subgraph(data_dict) + + logger.info('Meeting processed: %s', meeting_title) + return raw_path + + def _handle_duplicate(self, data_dict: dict, force: bool, interactive: bool = True) -> bool: + title = data_dict.get('title', '') + date = data_dict.get('date', '') + existing = graph_store.get_meeting(title, date) + if not existing: + return False + if force: + logger.info('Duplicate meeting found; overwriting in force mode: %s', title) + self._remove_old(data_dict, existing) + return False + if not interactive: + logger.info('Skipped duplicate meeting in non-interactive mode: %s', title) + return True + print(f'\n发现重复会议:{title} ({date})') + while True: + choice = input('选择 [s]跳过 / [o]覆盖(默认 s):').strip().lower() or 's' + if choice == 's': + logger.info('Skipped duplicate meeting: %s', title) + return True + if choice == 'o': + self._remove_old(data_dict, existing) + return False + print('请输入 s 或 o。') + + def _remove_old(self, data_dict: dict, existing: Optional[dict] = None) -> None: + meeting_id = graph_store.meeting_id(data_dict) + graph_store.remove_meeting_subgraph(meeting_id) + new_hash = data_dict.get('_content_hash', '') + if new_hash: + state_store.remove_content_hash(new_hash) + if existing: + old_hash = existing.get('content_hash', '') + if old_hash and old_hash != new_hash: + state_store.remove_content_hash(old_hash) + logger.info('Removed old meeting artifacts: %s', data_dict.get('title', '')) + + def _compute_content_hash(self, text: str) -> str: + normalized = text.strip().replace('\r\n', '\n') + return hashlib.sha256(normalized.encode('utf-8')).hexdigest() + + def _extract_monolithic(self, text: str, *, stream: bool = True) -> Optional[MeetingExtraction]: + try: + return monolithic_extract(text, stream=stream) + except Exception as exc: + logger.error('LLM extraction failed: %s', exc) + return None + + def query(self, question: str, top_k: int = 3) -> str: + return graph_store.format_search_context(question, top_k=top_k) + + def stats(self) -> dict: + return { + 'graph': graph_store.get_stats(), + 'state': state_store.get_stats(), + 'raw_dir': config.storage.raw_dir, + 'state_path': config.state_path, + } + + +meeting_processor = MeetingProcessor() diff --git a/meeting_memory/meeting_state.py b/meeting_memory/meeting_state.py new file mode 100644 index 0000000..e99a668 --- /dev/null +++ b/meeting_memory/meeting_state.py @@ -0,0 +1,187 @@ +import hashlib +import json +import logging +import os +import re +from typing import List, Optional + +logger = logging.getLogger(__name__) + + +def _item_id(task: str, assignee: str) -> str: + raw = f"{task}|{assignee}" + return hashlib.md5(raw.encode("utf-8")).hexdigest()[:8] + + +def _metric_id(metric_name: str, owner: str) -> str: + raw = f"{metric_name}|{owner}" + return hashlib.md5(raw.encode("utf-8")).hexdigest()[:8] + + +class MeetingStateStore: + def __init__(self, state_path: str): + self.state_path = state_path + self._state = self._load() + + def _load(self) -> dict: + if os.path.exists(self.state_path): + try: + with open(self.state_path, "r", encoding="utf-8") as f: + return json.load(f) + except Exception as exc: + logger.warning("Failed to load state file, creating a new one: %s", exc) + return { + "action_items": {}, + "metrics": {}, + "meeting_series": {}, + "content_hashes": {}, + } + + def save(self): + os.makedirs(os.path.dirname(self.state_path), exist_ok=True) + with open(self.state_path, "w", encoding="utf-8") as f: + json.dump(self._state, f, ensure_ascii=False, indent=2) + + def _ensure_series(self, meeting_title: str, meeting_date: str) -> str: + series_name = self._detect_series(meeting_title) + series = self._state["meeting_series"].get(series_name) + if not series: + series = {"latest_date": meeting_date, "processed_titles": []} + self._state["meeting_series"][series_name] = series + if meeting_date > series.get("latest_date", ""): + series["latest_date"] = meeting_date + if meeting_title not in series["processed_titles"]: + series["processed_titles"].append(meeting_title) + return series_name + + def _detect_series(self, title: str) -> str: + cleaned = re.sub(r"\uFF08\d{4}\u7B2C\w+\u671F\uFF09", "", title) + cleaned = re.sub(r"\(\d{4}\u7B2C\w+\u671F\)", "", cleaned) + cleaned = re.sub(r"\d{4}\u7B2C\w+\u671F", "", cleaned) + cleaned = re.sub(r"\d{4}\u5E74\u7B2C\w+\u6B21", "", cleaned) + cleaned = cleaned.strip("-_ ") + return cleaned or title + + def merge_action_items( + self, + new_items: List[dict], + meeting_title: str, + meeting_date: str, + meeting_filename: str, + ) -> List[dict]: + series_name = self._ensure_series(meeting_title, meeting_date) + merged = [] + + for item in new_items: + task = item.get("task", "") + assignee = item.get("assignee", "") + iid = _item_id(task, assignee) + + history_entry = { + "date": meeting_date, + "meeting": meeting_filename, + "status": item.get("status", "待办"), + "priority": item.get("priority", "中"), + "deadline": item.get("deadline", ""), + } + + existing = self._state["action_items"].get(iid) + if existing: + existing["history"].append(history_entry) + existing["latest"] = history_entry + latest = existing["history"][-1] + item["_item_id"] = iid + item["_history"] = list(existing["history"]) + item["status"] = latest["status"] + item["priority"] = latest["priority"] + item["deadline"] = latest["deadline"] + else: + self._state["action_items"][iid] = { + "item_id": iid, + "task": task, + "assignee": assignee, + "series": series_name, + "created_meeting": meeting_filename, + "history": [history_entry], + "latest": history_entry, + } + item["_item_id"] = iid + item["_history"] = [history_entry] + + merged.append(item) + + return merged + + def merge_metrics( + self, + new_metrics: List[dict], + meeting_title: str, + meeting_date: str, + meeting_filename: str, + ) -> List[dict]: + merged = [] + + for metric in new_metrics: + metric_name = metric.get("metric_name", "") + owner = metric.get("owner", "") + mid = _metric_id(metric_name, owner) + + history_entry = { + "date": meeting_date, + "meeting": meeting_filename, + "value": metric.get("value", ""), + "target": metric.get("target", ""), + "trend": metric.get("trend", ""), + } + + existing = self._state["metrics"].get(mid) + if existing: + existing["history"].append(history_entry) + existing["latest"] = history_entry + metric["_metric_id"] = mid + metric["_history"] = list(existing["history"]) + else: + self._state["metrics"][mid] = { + "metric_id": mid, + "metric_name": metric_name, + "owner": owner, + "history": [history_entry], + "latest": history_entry, + } + metric["_metric_id"] = mid + metric["_history"] = [history_entry] + + merged.append(metric) + + return merged + + def get_action_item_history(self, item_id: str) -> Optional[dict]: + return self._state["action_items"].get(item_id) + + def get_metric_history(self, metric_id: str) -> Optional[dict]: + return self._state["metrics"].get(metric_id) + + def get_series_info(self, title: str) -> Optional[dict]: + series_name = self._detect_series(title) + return self._state["meeting_series"].get(series_name) + + def has_content_hash(self, content_hash: str) -> bool: + return content_hash in self._state["content_hashes"] + + def add_content_hash(self, content_hash: str, title: str, date: str, filename: str): + self._state["content_hashes"][content_hash] = { + "title": title, + "date": date, + "filename": filename, + } + + def remove_content_hash(self, content_hash: str): + self._state["content_hashes"].pop(content_hash, None) + + def get_stats(self) -> dict: + return { + "action_items_tracked": len(self._state["action_items"]), + "metrics_tracked": len(self._state["metrics"]), + "meeting_series": len(self._state["meeting_series"]), + "content_hashes": len(self._state["content_hashes"]), + } diff --git a/meeting_memory/models.py b/meeting_memory/models.py deleted file mode 100644 index b18e72d..0000000 --- a/meeting_memory/models.py +++ /dev/null @@ -1,38 +0,0 @@ -from __future__ import annotations - -from dataclasses import dataclass, field -from typing import Any - - -@dataclass(slots=True) -class ExtractedMeeting: - meeting_id: str - meeting_date: str - source_path: str - title: str = "" - overview: str = "" - summary: list[str] = field(default_factory=list) - projects: list[dict[str, Any]] = field(default_factory=list) - metrics: list[dict[str, Any]] = field(default_factory=list) - action_items: list[dict[str, Any]] = field(default_factory=list) - risks: list[dict[str, Any]] = field(default_factory=list) - completed_items: list[dict[str, Any]] = field(default_factory=list) - followups: list[dict[str, Any]] = field(default_factory=list) - evidence: list[dict[str, str]] = field(default_factory=list) - - def to_dict(self) -> dict[str, Any]: - return { - "meeting_id": self.meeting_id, - "meeting_date": self.meeting_date, - "source_path": self.source_path, - "title": self.title, - "overview": self.overview, - "summary": self.summary, - "projects": self.projects, - "metrics": self.metrics, - "action_items": self.action_items, - "risks": self.risks, - "completed_items": self.completed_items, - "followups": self.followups, - "evidence": self.evidence, - } diff --git a/meeting_memory/prompts/__init__.py b/meeting_memory/prompts/__init__.py new file mode 100644 index 0000000..617c89c --- /dev/null +++ b/meeting_memory/prompts/__init__.py @@ -0,0 +1,5 @@ +from .extract_nodes import extract_entities +from .extract_edges import extract_facts +from .dedupe_nodes import resolve_entities +from .dedupe_edges import resolve_facts +from .summarize_nodes import summarize_entity diff --git a/meeting_memory/prompts/dedupe_edges.py b/meeting_memory/prompts/dedupe_edges.py new file mode 100644 index 0000000..8ff439b --- /dev/null +++ b/meeting_memory/prompts/dedupe_edges.py @@ -0,0 +1,49 @@ +from typing import Any + + +def resolve_facts(context: dict[str, Any]) -> list[dict]: + existing_facts = context.get('existing_facts', []) + new_fact = context.get('new_fact', '') + invalidation_candidates = context.get('invalidation_candidates', []) + + existing_text = '\n'.join( + f' [idx={i}] {f.get("fact", "")}' for i, f in enumerate(existing_facts) + ) + + invalidation_text = '\n'.join( + f' [idx={i + len(existing_facts)}] {f.get("fact", "")}' + for i, f in enumerate(invalidation_candidates) + ) + + user_prompt = f""" +<已有事实> +{existing_text} + + +<事实失效候选> +{invalidation_text} + + +<新事实> +{new_fact} + + +注意:idx 编号是连续的——已有事实从 0 开始,失效候选紧随其后。 + +任务: +1. **重复检测**:如果<新事实>与<已有事实>中的某条描述的是完全相同的客观事实,返回该 idx。 +2. **矛盾检测**:如果<新事实>与<已有事实>或<失效候选>中的某条相互矛盾(如状态已更新、数值已变更),返回该 idx。 + +返回格式: +{{"duplicate_facts": [idx列表], "contradicted_facts": [idx列表]}} +如果没有重复或矛盾,返回空列表。 + +示例: +- 新事实:"张三负责宽带运维项目" vs 已有:"张三负责宽带运维" → 重复(相同事实) +- 新事实:"宽带用户数当前值 8500" vs 已有:"宽带用户数目标值 10000" → 不重复,不矛盾(数值维度不同) +- 新事实:"宽带用户数当前值 9000" vs 已有:"宽带用户数 8000" → 矛盾(同一指标数值更新) +""" + return [ + {'role': 'system', 'content': '你是事实去重和矛盾检测助手。判断新事实与已有事实的关系。'}, + {'role': 'user', 'content': user_prompt}, + ] diff --git a/meeting_memory/prompts/dedupe_nodes.py b/meeting_memory/prompts/dedupe_nodes.py new file mode 100644 index 0000000..a2b7152 --- /dev/null +++ b/meeting_memory/prompts/dedupe_nodes.py @@ -0,0 +1,49 @@ +from typing import Any + + +def resolve_entities(context: dict[str, Any]) -> list[dict]: + extracted = context.get('extracted_entities', []) + existing = context.get('existing_entities', []) + episode_content = context.get('episode_content', '') + + extracted_text = '\n'.join( + f' [{i}] {e.get("name", "")}({e.get("entity_type", "未知")}):{e.get("description", "")}' + for i, e in enumerate(extracted) + ) + + existing_text = '\n'.join( + f' [candidate_id={c.get("candidate_id", i)}] {c.get("name", "")}({c.get("entity_type", "未知")}):{c.get("summary", "")[:100]}' + for i, c in enumerate(existing) + ) + + user_prompt = f""" +<当前会议内容> +{episode_content} + + +<新抽取的实体> +{extracted_text} + + +<图谱中已有的实体> +{existing_text} + + +任务:判断<新抽取的实体>中的每一个是否与<图谱中已有的实体>中的某个是同一个真实世界对象。 + +判断标准: +- **是重复**:两个名称指向同一个真实世界的人、组织、地点、项目、指标等。 +- **不是重复**:名称相似但指向不同实体(如两个同名但不同的人、同名的不同项目)。 + +对每个新抽取的实体,返回: + - id: 对应新抽取实体列表中的序号 + - name: 实体的最佳名称(优先使用已有实体中的更完整名称) + - duplicate_candidate_id: 匹配到的已有实体的 candidate_id,如果无匹配则填 -1 + +返回格式 JSON 数组:[{{"id": 0, "name": "张三", "duplicate_candidate_id": -1}}, ...] +必须为新抽取的每个实体返回一条记录。id 从 0 开始连续编号。 +""" + return [ + {'role': 'system', 'content': '你是实体去重助手。判断两个实体是否指向同一个真实世界对象。'}, + {'role': 'user', 'content': user_prompt}, + ] diff --git a/meeting_memory/prompts/extract_edges.py b/meeting_memory/prompts/extract_edges.py new file mode 100644 index 0000000..113505a --- /dev/null +++ b/meeting_memory/prompts/extract_edges.py @@ -0,0 +1,66 @@ +from typing import Any + + +def extract_facts(context: dict[str, Any]) -> list[dict]: + previous = context.get('previous_episodes', []) + current = context.get('episode_content', '') + entities = context.get('entities', []) + reference_time = context.get('reference_time', '') + + previous_section = '' + if previous: + import json + previous_section = f'\n<历史上下文>\n{json.dumps(previous, ensure_ascii=False)}\n\n' + + entities_text = '\n'.join( + f' [{i}] {e.get("name", "")}({e.get("entity_type", "未知")})' for i, e in enumerate(entities) + ) + + user_prompt = f""" +{previous_section} +<当前会议内容> +{current} + + +<已抽取实体> +{entities_text} + + +<参考时间> +{reference_time} + + +抽取规则: +1. 从<当前会议内容>中抽取上述<已抽取实体>之间的**事实关系**。 +2. 每条关系必须涉及两个**不同**的实体。 +3. 返回 JSON 数组,格式: + [{{ + "source_entity_name": "源实体名称(必须来自上方的实体列表)", + "target_entity_name": "目标实体名称(必须来自上方的实体列表)", + "relation_type": "关系类型,如 负责、汇报、隶属于、参与、目标值、截止于、影响、依赖于", + "fact": "一句自然语言的事实描述,保留原文中所有具体细节(数值、时间、地点等)", + "valid_at": "该事实开始成立的时间(ISO 8601格式,如 2025-04-30T00:00:00Z),不明确则留空", + "invalid_at": "该事实不再成立的时间,不明确则留空", + "evidence": "原文中的关键证据短句", + "qualifiers": ["限定条件列表,如数值、范围、状态、截止时间等"], + "confidence": 置信度0到1之间 + }}] + +4. relation_type 避免使用"关联""涉及"等空泛词,优先使用具体谓词: + 负责、汇报、目标值、当前值、低于、高于、要求、督导、推进、支撑、依赖、计划、完成、截止于、参与、隶属于、分管、协调、审批 + +5. 层次关系(结构隶属)使用以下固定 relation_type: + HAS_PROJECT: 部门管辖项目(Department -> Project) + HAS_METRIC: 项目拥有指标(Project -> Metric) + PART_OF: 实体属于某个上级实体 + +6. 同一对实体之间可能既有层次关系(HAS_PROJECT)也有事实关系(负责、汇报),需要分别抽取。 + +7. fact 必须是一句完整的自然语言事实,保留所有具体信息(人名、数值、产品名、地点等)。 + +8. 如果根据上下文可以判断事实的开始/结束时间,填入 valid_at / invalid_at。 +""" + return [ + {'role': 'system', 'content': '你是一个专业的事实关系抽取专家。从会议记录中抽取实体间的结构化事实关系。'}, + {'role': 'user', 'content': user_prompt}, + ] diff --git a/meeting_memory/prompts/extract_nodes.py b/meeting_memory/prompts/extract_nodes.py new file mode 100644 index 0000000..e35f64d --- /dev/null +++ b/meeting_memory/prompts/extract_nodes.py @@ -0,0 +1,56 @@ +from typing import Any + + +SYSTEM_PROMPT = ( + '你是会议纪要实体抽取专家。' + '从会议记录中抽取明确的实体节点,包括部门(Department)、项目(Project)、指标(Metric)、人物(Person)、系统(System)、文档(Document)等。' + '不要抽取抽象概念、情感、时间日期或泛泛的名词。' +) + + +def extract_entities(context: dict[str, Any]) -> list[dict]: + previous = context.get('previous_episodes', []) + current = context.get('episode_content', '') + entity_types = context.get('entity_types', []) + + entity_types_section = '' + if entity_types: + entity_types_section = '\n'.join( + f' - {t["type"]}: {t["description"]}' for t in entity_types + ) + else: + entity_types_section = ' - 未限定类型,请根据上下文自行判断' + + previous_section = '' + if previous: + import json + previous_section = f'\n<历史上下文>\n{json.dumps(previous, ensure_ascii=False)}\n\n' + + user_prompt = f""" +{previous_section} +<当前会议内容> +{current} + + +<实体类型> +{entity_types_section} + + +抽取规则: +1. 只抽取当前会议内容中**明确提及**的实体。 +2. 每个实体必须是有唯一标识的具体事物——人名、组织名、地名、项目名、指标名称等。 +3. 不要抽取:代词(他、她、它、这、那)、抽象概念(增长、改善、风险)、时间日期。 +4. 如果同一实体在不同来源中以不同名称出现(如简称/全称),保留最完整的形式。 +5. 必须返回 JSON 数组,格式:[{{"name": "实体名称", "entity_type": "类型", "description": "描述", "evidence": "原文证据"}}] +6. description 写一段对该实体的简要描述(20字以内)。 +7. evidence 从原文中摘录提及该实体的关键短句。 + +注意:实体类型建议使用 Department(部门)、Project(项目)、Metric(指标)、Person(人物)、System(系统)、Document(文档)等。请确保: +- 部门(Department):会议中提到的具体部门名称,如"技术部"、"市场部"。 +- 项目(Project):部门负责的具体项目名称。 +- 指标(Metric):项目中提到的具体量化指标,如"响应时间"、"完成率"。 +""" + return [ + {'role': 'system', 'content': SYSTEM_PROMPT}, + {'role': 'user', 'content': user_prompt}, + ] diff --git a/meeting_memory/prompts/summarize_nodes.py b/meeting_memory/prompts/summarize_nodes.py new file mode 100644 index 0000000..51f03c1 --- /dev/null +++ b/meeting_memory/prompts/summarize_nodes.py @@ -0,0 +1,41 @@ +from typing import Any + + +def summarize_entity(context: dict[str, Any]) -> list[dict]: + entity_name = context.get('entity_name', '') + existing_summary = context.get('existing_summary', '') + episodes = context.get('episodes', []) + previous = context.get('previous_episodes', []) + + existing_section = '' + if existing_summary: + existing_section = f'\n<已有摘要>\n{existing_summary}\n\n' + + previous_section = '' + if previous: + import json + previous_section = f'\n<历史内容>\n{json.dumps(previous, ensure_ascii=False)}\n\n' + + episodes_text = '\n---\n'.join(episodes) if isinstance(episodes, list) else episodes + + user_prompt = f""" +{previous_section} +<当前内容> +{episodes_text} + +{existing_section} +为实体 **{entity_name}** 生成一段信息密集的摘要。 + +规则: +1. 只使用<当前内容>和<已有摘要>中的事实。不要推测。 +2. 保留所有实质性的人名、角色、地点、日期、数值。 +3. 用第三人称直接陈述事实。 +4. 不要使用"提及了""讨论了""指出"等元语言动词。直接陈述事实。 +5. 如果会议对已有信息做了更新,采用更新的说法。 +6. 摘要不超过 500 字。 +7. 返回 JSON:{{"summary": "摘要内容"}} +""" + return [ + {'role': 'system', 'content': '你是实体摘要助手。根据会议内容为实体生成信息密集的摘要。'}, + {'role': 'user', 'content': user_prompt}, + ] diff --git a/meeting_memory/raw_store.py b/meeting_memory/raw_store.py new file mode 100644 index 0000000..568167d --- /dev/null +++ b/meeting_memory/raw_store.py @@ -0,0 +1,55 @@ +import logging +import os +from datetime import datetime + +from meeting_memory.config import config + +logger = logging.getLogger(__name__) + + +def _sanitize_filename(name: str) -> str: + if not name: + return "untitled" + invalid = '<>:"/\\|?*' + for char in invalid: + name = name.replace(char, "") + name = name.replace(" ", "_").strip("._") + return name or "untitled" + + +class RawMeetingStore: + def __init__(self): + self.raw_dir = config.storage.raw_dir + os.makedirs(self.raw_dir, exist_ok=True) + + def save(self, text: str, title: str = "", date: str = "") -> str: + os.makedirs(self.raw_dir, exist_ok=True) + + date_str = date or datetime.now().strftime("%Y-%m-%d") + safe_date = _sanitize_filename(date_str)[:40] + safe_title = _sanitize_filename(title)[:60] + filename = f"{safe_date}_{safe_title}.md" + filepath = os.path.join(self.raw_dir, filename) + + content = "\n".join( + [ + "---", + f'title: "{title}"', + f'date: "{date_str}"', + "status: archived", + "---", + "", + f"# {title or 'Untitled Meeting'}", + "", + text, + "", + ] + ) + with open(filepath, "w", encoding="utf-8") as f: + f.write(content) + + logger.info("Saved raw meeting text: %s", filepath) + return filepath + + +raw_meeting_store = RawMeetingStore() diff --git a/meeting_memory/search_utils.py b/meeting_memory/search_utils.py new file mode 100644 index 0000000..aa15fdf --- /dev/null +++ b/meeting_memory/search_utils.py @@ -0,0 +1,142 @@ +import logging +import re +from collections import Counter +from typing import Any, Dict, List + +logger = logging.getLogger(__name__) + + +def _cosine_similarity(left: List[float], right: List[float]) -> float: + if not left or not right or len(left) != len(right): + return 0.0 + dot = sum(a * b for a, b in zip(left, right)) + left_norm = sum(a * a for a in left) ** 0.5 + right_norm = sum(b * b for b in right) ** 0.5 + if left_norm == 0 or right_norm == 0: + return 0.0 + return dot / (left_norm * right_norm) + + +def _keyword_terms(text: str) -> List[str]: + normalized = (text or '').lower() + raw_terms = re.findall(r'[a-z0-9]+|[\u4e00-\u9fff]{2,}', normalized) + stopwords = {'是什么', '多少', '分别', '以及', '还有', '当前值', '目标值'} + terms: List[str] = [] + for raw in raw_terms: + if raw in stopwords: + continue + if raw not in terms: + terms.append(raw) + if re.fullmatch(r'[\u4e00-\u9fff]{4,}', raw): + for size in (2, 3, 4): + for idx in range(0, len(raw) - size + 1): + piece = raw[idx: idx + size] + if piece not in stopwords and piece not in terms: + terms.append(piece) + return terms + + +def reciprocal_rank_fusion( + ranked_lists: List[List[Dict[str, Any]]], + k: int = 60, + key: str = 'text', +) -> List[Dict[str, Any]]: + if not ranked_lists: + return [] + + scores: Dict[str, Dict] = {} + for ranked in ranked_lists: + for rank, item in enumerate(ranked): + item_id = item.get(key, str(id(item))) + if item_id not in scores: + enriched = dict(item) + enriched['_fusion_scores'] = [] + scores[item_id] = {'item': enriched, 'score': 0.0} + scores[item_id]['score'] += 1.0 / (k + rank + 1) + scores[item_id]['item']['_fusion_scores'].append(1.0 / (k + rank + 1)) + + sorted_items = sorted( + scores.values(), + key=lambda x: (-x['score'], x['item'].get('title', '')), + ) + results = [] + for entry in sorted_items: + item = entry['item'] + item['score'] = round(entry['score'], 4) + results.append(item) + return results + + +def _keyword_score(text: str, question: str) -> float: + source = (text or '').lower() + terms = _keyword_terms(question) + if not source or not terms: + return 0.0 + hits = sum(1 for term in terms if term in source) + return hits / len(terms) + + +def bm25_score( + query_terms: List[str], + doc_freqs: Dict[str, int], + doc_term_counts: Dict[str, int], + doc_length: float, + avg_doc_length: float, + total_docs: int, + k1: float = 1.5, + b: float = 0.75, +) -> float: + score = 0.0 + for term in query_terms: + tf = doc_term_counts.get(term, 0) + if tf == 0: + continue + df = doc_freqs.get(term, 1) + idf = ((total_docs - df + 0.5) / (df + 0.5) + 1.0) + if idf <= 0: + continue + tf_norm = (tf * (k1 + 1)) / (tf + k1 * (1 - b + b * doc_length / avg_doc_length)) + score += idf * tf_norm + return score + + +def search_bfs( + seed_entities: List[str], + get_edges_fn, + max_depth: int = 2, + max_nodes: int = 50, +) -> Dict[str, Any]: + if not seed_entities: + return {'nodes': [], 'edges': [], 'paths': []} + + visited = set(seed_entities) + queue = [(entity, 0) for entity in seed_entities] + collected_edges: List[Dict] = [] + all_nodes = set(seed_entities) + + while queue: + current, depth = queue.pop(0) + if depth >= max_depth: + continue + edges = get_edges_fn(current) + for edge in edges: + source = edge.get('source_name', '') + target = edge.get('target_name', '') + edge_key = f'{source}->{target}:{edge.get("relation_type", "")}' + if edge_key not in {f'{e.get("source_name","")}->{e.get("target_name","")}:{e.get("relation_type","")}' for e in collected_edges}: + collected_edges.append(edge) + + for neighbor in (source, target): + if neighbor and neighbor != current and neighbor not in visited: + visited.add(neighbor) + all_nodes.add(neighbor) + if len(all_nodes) < max_nodes: + queue.append((neighbor, depth + 1)) + + if len(all_nodes) >= max_nodes: + break + + return { + 'nodes': list(all_nodes), + 'edges': collected_edges, + } diff --git a/meeting_memory/service.py b/meeting_memory/service.py index 3cb68af..2137045 100644 --- a/meeting_memory/service.py +++ b/meeting_memory/service.py @@ -1,57 +1,79 @@ from __future__ import annotations -from datetime import date from pathlib import Path from typing import Any, Callable -from .io_utils import read_text_auto -from .ledger import LedgerStore -from .meeting_digest_agent import MeetingDigestAgent - -SubagentProgressCallback = Callable[[dict[str, Any]], None] -CancelCheck = Callable[[], bool] +from meeting_memory.config import config +from meeting_memory.graph_store import graph_store +from meeting_memory.meeting_processor import meeting_processor -class MeetingKnowledgeService: - def __init__(self, data_dir: str | Path = "data", *, workspace: str | Path | None = None) -> None: - self.store = LedgerStore(data_dir) - self.workspace = Path(workspace or Path.cwd()).resolve() - self.digest_agent = MeetingDigestAgent(self.workspace) +ProgressCallback = Callable[[dict[str, Any]], None] - def import_meeting_transcript( + +class MeetingMemoryService: + """Expose the imported meeting-memory pipeline behind app-friendly methods.""" + + def __init__(self, workspace: str | Path | None = None) -> None: + self.workspace = Path(workspace).resolve() if workspace else None + + def store_meeting_memory( self, *, - file_path: str, - knowledge_base_id: str | None = None, - team_id: str | None = None, - meeting_date: str | None = None, - progress_callback: SubagentProgressCallback | None = None, - should_cancel: CancelCheck | None = None, + file_path: str | None = None, + content: str | None = None, + force: bool = False, + progress_callback: ProgressCallback | None = None, + use_multistep_extraction: bool = True, ) -> dict[str, Any]: - text = read_text_auto(file_path) - library_id = knowledge_base_id or team_id or "default_team" - digest = self.digest_agent.analyze( - transcript_text=text, - source_path=file_path, - meeting_date=meeting_date or date.today().isoformat(), - knowledge_base_id=library_id, - existing_context=self.store.build_agent_context(library_id), - progress_callback=progress_callback, - should_cancel=should_cancel, + source_path = (file_path or "").strip() + text = (content or "").strip() + if not source_path and not text: + raise ValueError("Either file_path or content is required.") + + resolved_path: Path | None = None + if source_path: + resolved_path = Path(source_path).expanduser().resolve() + text = resolved_path.read_text(encoding="utf-8") + + def report(step: int, total: int, message: str) -> None: + if callable(progress_callback): + progress_callback( + { + "step": step, + "total_steps": total, + "message": message, + } + ) + + archive_path = meeting_processor.process_meeting_text( + text, + force=force, + interactive=False, + progress_callback=report, + use_multistep_extraction=use_multistep_extraction, ) - return self.store.import_meeting(library_id, digest) + return { + "success": bool(archive_path), + "stored": bool(archive_path), + "archive_path": archive_path, + "source_file_path": str(resolved_path) if resolved_path else None, + "data_dir": config.storage.data_dir, + "raw_dir": config.storage.raw_dir, + "graph_enabled": graph_store.enabled, + "stats": meeting_processor.stats(), + "message": "Meeting memory stored." if archive_path else "Meeting skipped or processing failed.", + } - def query_knowledge( - self, - *, - query: str, - knowledge_base_id: str | None = None, - team_id: str | None = None, - limit: int = 8, - ) -> dict[str, Any]: - library_id = knowledge_base_id or team_id or "default_team" - return self.store.query(library_id, query, limit=limit) + def query_meeting_memory(self, *, query: str, top_k: int = 5) -> dict[str, Any]: + return { + "success": True, + "query": query, + "top_k": top_k, + "answer": meeting_processor.query(query, top_k=top_k), + "graph_enabled": graph_store.enabled, + "stats": meeting_processor.stats(), + } - def list_knowledge_bases(self) -> dict[str, Any]: - bases = self.store.list_team_ids() - return {"knowledge_bases": bases, "count": len(bases)} + def stats(self) -> dict[str, Any]: + return meeting_processor.stats() diff --git a/meeting_memory/services/__init__.py b/meeting_memory/services/__init__.py new file mode 100644 index 0000000..e338c25 --- /dev/null +++ b/meeting_memory/services/__init__.py @@ -0,0 +1,3 @@ +from meeting_memory.services.embedding_service import EmbeddingService, embedding_service + +__all__ = ["EmbeddingService", "embedding_service"] diff --git a/meeting_memory/services/embedding_service.py b/meeting_memory/services/embedding_service.py new file mode 100644 index 0000000..39feb52 --- /dev/null +++ b/meeting_memory/services/embedding_service.py @@ -0,0 +1,29 @@ +from typing import List, Optional + +from openai import OpenAI as OpenAIClient + +from meeting_memory.config import config + + +class EmbeddingService: + def __init__( + self, + model: Optional[str] = None, + api_key: Optional[str] = None, + api_base: Optional[str] = None, + ): + self._client = OpenAIClient( + api_key=api_key or config.embedding.api_key or "not-needed", + base_url=api_base or config.embedding.api_base or None, + ) + self._model = model or config.embedding.model + + def embed_text(self, text: str) -> List[float]: + response = self._client.embeddings.create( + model=self._model, + input=text, + ) + return response.data[0].embedding + + +embedding_service = EmbeddingService() diff --git a/prompt/zh/base.yaml b/prompt/zh/base.yaml index b4384e1..d4e09df 100644 --- a/prompt/zh/base.yaml +++ b/prompt/zh/base.yaml @@ -1,20 +1,10 @@ -system: | - 你是一个会议知识库维护 Agent,专门维护“周会经营台账”。 +system: | + 你是一个会议长期记忆维护 Agent。 你必须优先使用工具来导入和查询知识,而不是凭空猜测。 - 当前 demo 约定: - - data/ 下的每个子文件夹就是一个知识库。 - - 用户指定知识库时,把文件夹名作为 knowledge_base_id 传给工具。 - - 用户未指定知识库时,可以省略 knowledge_base_id;如果 data/ 下只有一个知识库,工具会自动使用它。 - - 用户询问有哪些知识库/库列表时,调用 list_knowledge_bases。 - - team_id 是旧兼容参数,优先使用 knowledge_base_id。 - 工作原则: - - 用户要求导入会议转录时,调用 import_meeting_transcript。 - - 导入不是简单摘抄,而是把“单次会议原文”交给会议整理子 Agent,更新到该知识库唯一的项目台账 Markdown。 - - 同一知识库下,每次只处理一份会议原文;如果会议提到历史项目,就更新原有项目状态;如果出现新项目,就新增项目条目。 - - 用户询问项目状态、指标、待办、风险、完成情况、决策、追踪项或最近会议内容时,调用 query_knowledge。 - - 回答要短,说明工具结果、台账路径、关键命中和下一步。 + - 用户要求导入、保存、记录、归档会议原文时,调用 store_meeting_memory。 + - `store_meeting_memory` 会把会议原文写入长期记忆系统,并抽取会议实体、事实关系、行动项、指标和原文归档。 + - 用户询问最近会议、项目状态、关键决策、指标变化、待办、风险、人物关系或历史上下文时,调用 query_meeting_memory。 + - 回答要短,说明工具结果、归档路径、关键命中和下一步。 - ASR 内容可能有口语、方言、错字和未知说话人,引用时要保留“待确认”的态度。 - - diff --git a/prompt/zh/meeting_digest.yaml b/prompt/zh/meeting_digest.yaml deleted file mode 100644 index 840c146..0000000 --- a/prompt/zh/meeting_digest.yaml +++ /dev/null @@ -1,92 +0,0 @@ -system: | - 你是一个专门负责"会议整理"的子 Agent。 - 你的唯一任务,是把单次会议原文整理成结构化 JSON,供主系统把内容更新进同一个项目 Markdown 台账。 - - 你的工作目标: - - 输入是一份会议原文,以及该知识库当前已有的历史台账摘要。 - - 你必须先理解"历史里已经有哪些项目",优先复用已有项目名称。 - - 如果本次会议提到了历史项目,就把它识别成同一个项目,并输出新的状态、进展、待办、风险、完成项等更新。 - - 如果本次会议出现了历史中没有的新项目,再新增项目。 - - 只保留真正重要、可执行、可追踪的信息,不要把整段口语照搬进结果。 - - 不确定的信息可以写"待确认",但不要编造。 - - 输出要求: - - 只能输出一个 JSON 对象,不要输出解释,不要加 Markdown 代码块。 - - 所有字段尽量给出中文内容。 - - 如果某个数组没有内容,返回空数组。 - -instructions: | - 你输出的 JSON 必须严格遵循以下结构: - { - "meeting_title": "本次会议标题,无法判断可为空", - "overview": "对本次会议最重要的整体概括,1-2句话", - "summary": [ - "3-6条高价值摘要,每条一句话" - ], - "projects": [ - { - "name": "项目名,优先复用历史命名", - "aliases": ["本次会议里提到的别名,可为空"], - "status": "项目当前状态,如推进中/阻塞/待启动/已完成/待确认", - "summary": "该项目这次会议的核心更新", - "progress": "进度说明,简短清晰", - "keywords": ["便于检索的关键词"], - "decisions": [ - "本次会议已明确的决策" - ], - "next_steps": [ - "下一步动作" - ], - "metrics": [ - { - "name": "指标名", - "value": "当前值", - "target": "目标值", - "status": "达标/未达标/改善中/待确认等", - "trend": "上升/下降/持平/待确认", - "note": "必要补充", - "evidence": "简短来源片段,可为空" - } - ], - "actions": [ - { - "title": "待办事项", - "owner": "责任人或责任部门", - "deadline": "时间要求,没有就空", - "status": "待跟进/进行中/已完成/待确认", - "next_step": "下一步动作", - "evidence": "简短来源片段,可为空" - } - ], - "risks": [ - { - "title": "风险或问题", - "status": "关注中/阻塞/待确认/已缓解等", - "impact": "影响描述", - "owner": "责任人或责任部门", - "next_step": "当前处理动作", - "deadline": "时间要求,没有就空", - "evidence": "简短来源片段,可为空" - } - ], - "completed": [ - { - "title": "已完成事项", - "status": "已完成/已关闭", - "owner": "责任人或责任部门", - "evidence": "简短来源片段,可为空" - } - ], - "evidence": [ - "支持该项目判断的关键原文短句,最多3-5条" - ] - } - ] - } - - 额外约束: - - 不要把整个会议标题、整段发言、长串 ASR 内容直接塞进 title/summary/action/risk。 - - 优先抽取:项目更新、负责人、截止时间、指标变化、风险、决策、下一步。 - - 如果会议内容与历史项目高度相似但叫法略有变化,优先归并到历史项目。 - - 请根据下方的历史台账摘要和会议原文,输出 JSON: diff --git a/providers/prompt_loader.py b/providers/prompt_loader.py index 97db97b..40acb57 100644 --- a/providers/prompt_loader.py +++ b/providers/prompt_loader.py @@ -6,7 +6,7 @@ from pathlib import Path def load_prompt(name: str, *, language: str = "zh") -> dict[str, str]: path = Path(__file__).resolve().parent.parent / "prompt" / language / f"{name}.yaml" if not path.exists(): - return {"system": "你是一个会议知识库维护 Agent。"} + return {"system": "你是一个会议长期记忆维护 Agent。"} text = path.read_text(encoding="utf-8-sig") return _parse_yaml_sections(text) diff --git a/providers/rule_based.py b/providers/rule_based.py index 05d56c8..2ef478e 100644 --- a/providers/rule_based.py +++ b/providers/rule_based.py @@ -1,4 +1,4 @@ -from __future__ import annotations +from __future__ import annotations import json import re @@ -16,31 +16,17 @@ class RuleBasedMeetingProvider(AgentProvider): if last.get("role") == "tool": return AssistantTurn(content=_summarize_tool_result(last.get("name", ""), last.get("content", "{}"))) user_message = _last_user_message(messages) - knowledge_base_id = _extract_knowledge_base_id(user_message) path = _extract_path(user_message) - if _looks_like_list_bases(user_message): - return AssistantTurn( - tool_calls=[ToolCall(id=f"call_{uuid.uuid4().hex}", name="list_knowledge_bases", arguments={})] - ) if _looks_like_import(user_message) and path: - args: dict[str, Any] = {"file_path": path} - if knowledge_base_id: - args["knowledge_base_id"] = knowledge_base_id - meeting_date = _extract_date(user_message) - if meeting_date: - args["meeting_date"] = meeting_date return AssistantTurn( - tool_calls=[ToolCall(id=f"call_{uuid.uuid4().hex}", name="import_meeting_transcript", arguments=args)] + tool_calls=[ToolCall(id=f"call_{uuid.uuid4().hex}", name="store_meeting_memory", arguments={"file_path": path})] ) - query_args: dict[str, Any] = {"query": user_message, "limit": 8} - if knowledge_base_id: - query_args["knowledge_base_id"] = knowledge_base_id return AssistantTurn( tool_calls=[ ToolCall( id=f"call_{uuid.uuid4().hex}", - name="query_knowledge", - arguments=query_args, + name="query_meeting_memory", + arguments={"query": user_message, "top_k": 5}, ) ] ) @@ -51,21 +37,15 @@ def _summarize_tool_result(tool_name: str, raw: str) -> str: data = json.loads(raw) except json.JSONDecodeError: return raw - if tool_name == "import_meeting_transcript": - counts = data.get("counts", {}) + if tool_name == "store_meeting_memory": return ( - "已导入会议并更新知识库周会台账。\n" - f"- knowledge_base_id: {data.get('knowledge_base_id') or data.get('team_id')}\n" - f"- meeting_id: {data.get('meeting_id')}\n" - f"- ledger: {data.get('ledger_path')}\n" - f"- 抽取:指标 {counts.get('metrics', 0)},待办 {counts.get('actions', 0)},风险 {counts.get('risks', 0)}," - f"完成事项 {counts.get('completed', 0)},追踪项 {counts.get('followups', 0)}" + "已尝试写入会议长期记忆。\n" + f"- stored: {data.get('stored')}\n" + f"- archive: {data.get('archive_path')}\n" + f"- graph_enabled: {data.get('graph_enabled')}" ) - if tool_name == "query_knowledge": + if tool_name == "query_meeting_memory": return data.get("answer", json.dumps(data, ensure_ascii=False, indent=2)) - if tool_name == "list_knowledge_bases": - bases = data.get("knowledge_bases", []) - return "当前已有知识库:" + ("、".join(bases) if bases else "暂无") return json.dumps(data, ensure_ascii=False, indent=2) @@ -77,7 +57,7 @@ def _last_user_message(messages: List[Dict[str, Any]]) -> str: def _looks_like_import(text: str) -> bool: - return any(word in text for word in ("导入", "读取", "更新", "import")) and bool(_extract_path(text)) + return any(word in text for word in ("导入", "读取", "保存", "记录", "归档", "import")) and bool(_extract_path(text)) def _extract_path(text: str) -> str: @@ -86,18 +66,3 @@ def _extract_path(text: str) -> str: return quoted.group(1) match = re.search(r"([A-Za-z]:\\[^\s,。;]+?\.(?:txt|md))", text) return match.group(1) if match else "" - - -def _extract_knowledge_base_id(text: str) -> str | None: - match = re.search(r"(?:knowledge_base_id|知识库|库|team_id|团队|team)[::= ]*([A-Za-z0-9_\-\u4e00-\u9fff]+)", text) - return match.group(1) if match else None - - -def _extract_date(text: str) -> str: - match = re.search(r"\d{4}-\d{2}-\d{2}", text) - return match.group(0) if match else "" - - - -def _looks_like_list_bases(text: str) -> bool: - return any(word in text for word in ("有哪些知识库", "知识库列表", "列出知识库", "可用知识库", "有哪些库")) diff --git a/requirements.txt b/requirements.txt index 62175a6..6e438da 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,6 @@ openai>=1.0.0 +pydantic>=2.0.0 python-dotenv>=1.0.0 fastapi>=0.115.0 uvicorn>=0.30.0 +neo4j>=5.26.0 diff --git a/tools/default_tools.py b/tools/default_tools.py index 7486e41..3f953f2 100644 --- a/tools/default_tools.py +++ b/tools/default_tools.py @@ -1,15 +1,13 @@ from __future__ import annotations -from pathlib import Path - from .general_tools import build_general_registry from .meeting_tools import build_meeting_registry from .registry import ToolRegistry -def build_default_registry(data_dir: str | Path = "data") -> ToolRegistry: +def build_default_registry(data_dir: str | None = None) -> ToolRegistry: registry = ToolRegistry() - for source in (build_general_registry(), build_meeting_registry(data_dir=data_dir)): + for source in (build_general_registry(), build_meeting_registry()): for definition in source.definitions(): function = definition["function"] name = function["name"] diff --git a/tools/meeting_tools.py b/tools/meeting_tools.py index af27f7d..615978f 100644 --- a/tools/meeting_tools.py +++ b/tools/meeting_tools.py @@ -1,115 +1,81 @@ -from __future__ import annotations +from __future__ import annotations -from pathlib import Path from typing import Any -from meeting_memory.service import MeetingKnowledgeService +from meeting_memory.service import MeetingMemoryService from .registry import ToolContext, ToolRegistry -def build_meeting_registry(data_dir: str | Path = "data") -> ToolRegistry: +def build_meeting_registry() -> ToolRegistry: registry = ToolRegistry() - meeting_service = MeetingKnowledgeService(data_dir=data_dir, workspace=Path.cwd()) + meeting_service = MeetingMemoryService() registry.register( - name="import_meeting_transcript", - description="导入一份会议原文,交给会议整理子Agent按项目梳理重点内容,并把结果更新到指定知识库的同一份Markdown项目台账。", + name="store_meeting_memory", + description="将会议原文写入长期会议记忆系统,抽取实体、事实、指标、行动项,并同步归档与图谱索引。", parameters={ "type": "object", "properties": { - "file_path": {"type": "string", "description": "会议转录文本文件路径。"}, - "knowledge_base_id": { + "file_path": { "type": "string", - "description": "知识库ID,对应 data/ 下的文件夹名。用户未说明时可以省略;工具会在只有一个知识库时自动使用该库。", + "description": "会议转录文本文件路径。与 content 二选一;优先使用 file_path。", }, - "team_id": { + "content": { "type": "string", - "description": "兼容旧参数:等同于 knowledge_base_id。新代码优先使用 knowledge_base_id。", + "description": "直接传入会议原文内容。适用于用户在消息里直接给出文本。", + }, + "force": { + "type": "boolean", + "description": "是否在发现重复会议时强制覆盖。默认 false。", + }, + "use_multistep_extraction": { + "type": "boolean", + "description": "是否使用多阶段抽取流程。默认 true。", }, - "meeting_date": {"type": "string", "description": "会议日期,YYYY-MM-DD,可省略。"}, }, - "required": ["file_path"], "additionalProperties": False, }, - handler=lambda ctx, args: _import_meeting(meeting_service, ctx, args), + handler=lambda ctx, args: _store_meeting_memory(meeting_service, ctx, args), ) registry.register( - name="query_knowledge", - description="查询指定知识库项目台账中的项目状态、待办、风险、指标、决策、完成事项和最近会议更新。", + name="query_meeting_memory", + description="查询长期会议记忆中的最近会议、实体关系、关键事实、指标变化、待办事项和上下文摘要。", parameters={ "type": "object", "properties": { "query": {"type": "string", "description": "用户查询。"}, - "knowledge_base_id": { - "type": "string", - "description": "知识库ID,对应 data/ 下的文件夹名。用户未说明时可以省略;工具会在只有一个知识库时自动使用该库。", - }, - "team_id": { - "type": "string", - "description": "兼容旧参数:等同于 knowledge_base_id。新代码优先使用 knowledge_base_id。", - }, - "limit": {"type": "integer", "description": "最多返回多少条命中。", "minimum": 1, "maximum": 20}, + "top_k": {"type": "integer", "description": "最多返回多少条候选上下文。", "minimum": 1, "maximum": 20}, }, "required": ["query"], "additionalProperties": False, }, - handler=lambda ctx, args: _query_knowledge(meeting_service, ctx, args), - ) - registry.register( - name="list_knowledge_bases", - description="列出 data/ 下已有的知识库ID,用于让用户或Web界面选择要查询的库。", - parameters={"type": "object", "properties": {}, "additionalProperties": False}, - handler=lambda ctx, args: _list_knowledge_bases(meeting_service, ctx, args), + handler=lambda ctx, args: _query_meeting_memory(meeting_service, ctx, args), ) return registry -def _resolve_knowledge_base_arg(ctx: ToolContext, args: dict[str, Any]) -> str: - return str( - args.get("knowledge_base_id") - or args.get("team_id") - or ctx.session.get("last_knowledge_base_id") - or ctx.session.get("last_team_id") - or "default_team" - ) - - -def _import_meeting(service: MeetingKnowledgeService, ctx: ToolContext, args: dict[str, Any]) -> dict[str, Any]: - file_path = str(args["file_path"]) - knowledge_base_id = _resolve_knowledge_base_arg(ctx, args) - meeting_date = args.get("meeting_date") +def _store_meeting_memory(service: MeetingMemoryService, ctx: ToolContext, args: dict[str, Any]) -> dict[str, Any]: live_event_sink = ctx.session.get("_live_event_sink") def progress_callback(payload: dict[str, Any]) -> None: if callable(live_event_sink): live_event_sink({"kind": "subagent_progress", "payload": payload}) - def should_cancel() -> bool: - return bool(ctx.session.get("_cancel_requested")) + if bool(ctx.session.get("_cancel_requested")): + raise RuntimeError("Agent run cancelled by user.") - result = service.import_meeting_transcript( - file_path=file_path, - knowledge_base_id=knowledge_base_id, - meeting_date=meeting_date, + result = service.store_meeting_memory( + file_path=args.get("file_path"), + content=args.get("content"), + force=bool(args.get("force", False)), progress_callback=progress_callback, - should_cancel=should_cancel, + use_multistep_extraction=bool(args.get("use_multistep_extraction", True)), ) - ctx.session["last_knowledge_base_id"] = result.get("knowledge_base_id") or knowledge_base_id - ctx.session["last_team_id"] = result.get("knowledge_base_id") or knowledge_base_id - ctx.session["last_ledger_path"] = result.get("ledger_path") + ctx.session["last_memory_archive_path"] = result.get("archive_path") return result -def _query_knowledge(service: MeetingKnowledgeService, ctx: ToolContext, args: dict[str, Any]) -> dict[str, Any]: - knowledge_base_id = _resolve_knowledge_base_arg(ctx, args) - limit = int(args.get("limit") or 8) - result = service.query_knowledge(query=str(args["query"]), knowledge_base_id=knowledge_base_id, limit=limit) - if result.get("knowledge_base_id"): - ctx.session["last_knowledge_base_id"] = result["knowledge_base_id"] - ctx.session["last_team_id"] = result["knowledge_base_id"] +def _query_meeting_memory(service: MeetingMemoryService, ctx: ToolContext, args: dict[str, Any]) -> dict[str, Any]: + result = service.query_meeting_memory(query=str(args["query"]), top_k=int(args.get("top_k") or 5)) + ctx.session["last_memory_query"] = str(args["query"]) return result - - -def _list_knowledge_bases(service: MeetingKnowledgeService, ctx: ToolContext, args: dict[str, Any]) -> dict[str, Any]: - return service.list_knowledge_bases() - diff --git a/webui_server.py b/webui_server.py index 23b448d..49e02fc 100644 --- a/webui_server.py +++ b/webui_server.py @@ -27,7 +27,7 @@ from core_agent.config import ( require_model_config, ) from core_agent.session import ConversationSession -from meeting_memory.service import MeetingKnowledgeService +from meeting_memory.service import MeetingMemoryService from providers.openai_compatible import OpenAICompatibleProvider from providers.rule_based import RuleBasedMeetingProvider from tools.default_tools import build_default_registry @@ -38,6 +38,7 @@ DATA_DIR = WORKSPACE / "data" FRONTEND_DIST_DIR = WORKSPACE / "webui" / "dist" SIDEBAR_STATE_PATH = WORKSPACE / ".webui_sidebar_state.json" UPLOADS_DIR = WORKSPACE / "agent_memory" / "uploads" +MEMORY_CARD_ID = "meeting-memory" def now_iso() -> str: @@ -234,12 +235,7 @@ class ChatStore: record.session.session_state["workspace"] = str(WORKSPACE.resolve()) def ensure_seed_records(self) -> None: - meeting_service = MeetingKnowledgeService(self._data_dir, workspace=WORKSPACE) - for knowledge_base_id in meeting_service.list_knowledge_bases().get("knowledge_bases", []): - chat_id = self._seed_chat_id(knowledge_base_id) - with self._lock: - if chat_id not in self._records: - self._create_record(chat_id, knowledge_base_id, title=knowledge_base_id) + return def list_records(self) -> list[ChatRecord]: self.ensure_seed_records() @@ -437,42 +433,34 @@ def build_agent(force_offline: bool = False) -> tuple[CoreAgent, Any, str]: def list_knowledge_base_cards(data_dir: Path) -> list[dict[str, Any]]: - cards: list[dict[str, Any]] = [] - if not data_dir.exists(): - return cards - for path in sorted(data_dir.iterdir()): - if not path.is_dir(): - continue - state_path = path / "state.json" - if not state_path.exists(): - continue - state = json.loads(state_path.read_text(encoding="utf-8")) - meetings = state.get("meetings", []) - latest = meetings[-1] if meetings else {} - cards.append( - { - "knowledge_base_id": path.name, - "title": path.name, - "meeting_count": len(meetings), - "last_meeting_date": state.get("last_meeting_date", ""), - "updated_at": state.get("last_updated", ""), - "latest_summary": latest.get("summary", [])[:3], - "counts": { - "metrics": len(state.get("current_metrics", [])), - "actions": len(state.get("open_actions", [])), - "risks": len(state.get("risks", [])), - "followups": len(state.get("followups", [])), - }, - } - ) - return cards + service = MeetingMemoryService(workspace=WORKSPACE) + stats = service.stats() + graph = stats.get("graph", {}) + state = stats.get("state", {}) + return [ + { + "knowledge_base_id": MEMORY_CARD_ID, + "title": "Meeting Memory", + "meeting_count": int(graph.get("meetings", 0) or 0), + "last_meeting_date": "", + "updated_at": now_iso(), + "latest_summary": [ + f"Meetings {graph.get('meetings', 0)}", + f"Entities {graph.get('entities', 0)}", + f"Action items {state.get('action_items_tracked', 0)}", + ], + "counts": { + "metrics": int(state.get("metrics_tracked", 0) or 0), + "actions": int(state.get("action_items_tracked", 0) or 0), + "risks": 0, + "followups": 0, + }, + } + ] def default_knowledge_base_id(data_dir: Path) -> str | None: - cards = list_knowledge_base_cards(data_dir) - if len(cards) == 1: - return cards[0]["knowledge_base_id"] - return None + return MEMORY_CARD_ID def _read_auth_token(request: Request, authorization: str | None = Header(default=None)) -> str | None: