2023-10-09 11:03:41 +00:00
|
|
|
|
# coding=utf-8
|
|
|
|
|
|
"""
|
|
|
|
|
|
@project: maxkb
|
|
|
|
|
|
@Author:虎
|
|
|
|
|
|
@file: data_set.py
|
|
|
|
|
|
@date:2023/9/21 9:35
|
|
|
|
|
|
@desc: 数据集
|
|
|
|
|
|
"""
|
|
|
|
|
|
import uuid
|
2024-11-26 04:08:13 +00:00
|
|
|
|
from enum import Enum
|
2023-10-09 11:03:41 +00:00
|
|
|
|
|
|
|
|
|
|
from django.db import models
|
2024-11-13 02:45:20 +00:00
|
|
|
|
from django.db.models.signals import pre_delete
|
|
|
|
|
|
from django.dispatch import receiver
|
2023-10-09 11:03:41 +00:00
|
|
|
|
|
2024-07-05 11:02:20 +00:00
|
|
|
|
from common.db.sql_execute import select_one
|
2023-10-09 11:03:41 +00:00
|
|
|
|
from common.mixins.app_model_mixin import AppModelMixin
|
2024-07-15 08:26:54 +00:00
|
|
|
|
from setting.models import Model
|
2023-10-09 11:03:41 +00:00
|
|
|
|
from users.models import User
|
|
|
|
|
|
|
|
|
|
|
|
|
2024-11-26 04:08:13 +00:00
|
|
|
|
class TaskType(Enum):
|
|
|
|
|
|
# 向量
|
|
|
|
|
|
EMBEDDING = 1
|
|
|
|
|
|
# 生成问题
|
|
|
|
|
|
GENERATE_PROBLEM = 2
|
|
|
|
|
|
# 同步
|
|
|
|
|
|
SYNC = 3
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class State(Enum):
|
|
|
|
|
|
# 等待
|
|
|
|
|
|
PENDING = '0'
|
|
|
|
|
|
# 执行中
|
|
|
|
|
|
STARTED = '1'
|
|
|
|
|
|
# 成功
|
|
|
|
|
|
SUCCESS = '2'
|
|
|
|
|
|
# 失败
|
|
|
|
|
|
FAILURE = '3'
|
|
|
|
|
|
# 取消任务
|
|
|
|
|
|
REVOKE = '4'
|
|
|
|
|
|
# 取消成功
|
|
|
|
|
|
REVOKED = '5'
|
|
|
|
|
|
# 忽略
|
|
|
|
|
|
IGNORED = 'n'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class Status:
|
|
|
|
|
|
type_cls = TaskType
|
|
|
|
|
|
state_cls = State
|
|
|
|
|
|
|
|
|
|
|
|
def __init__(self, status: str = None):
|
|
|
|
|
|
self.task_status = {}
|
|
|
|
|
|
status_list = list(status[::-1] if status is not None else '')
|
|
|
|
|
|
for _type in self.type_cls:
|
|
|
|
|
|
index = _type.value - 1
|
|
|
|
|
|
_state = self.state_cls(status_list[index] if len(status_list) > index else 'n')
|
|
|
|
|
|
self.task_status[_type] = _state
|
|
|
|
|
|
|
|
|
|
|
|
@staticmethod
|
|
|
|
|
|
def of(status: str):
|
|
|
|
|
|
return Status(status)
|
|
|
|
|
|
|
|
|
|
|
|
def __str__(self):
|
|
|
|
|
|
result = []
|
|
|
|
|
|
for _type in sorted(self.type_cls, key=lambda item: item.value, reverse=True):
|
|
|
|
|
|
result.insert(len(self.type_cls) - _type.value, self.task_status[_type].value)
|
|
|
|
|
|
return ''.join(result)
|
|
|
|
|
|
|
|
|
|
|
|
def __setitem__(self, key, value):
|
|
|
|
|
|
self.task_status[key] = value
|
|
|
|
|
|
|
|
|
|
|
|
def __getitem__(self, item):
|
|
|
|
|
|
return self.task_status[item]
|
|
|
|
|
|
|
|
|
|
|
|
def update_status(self, task_type: TaskType, state: State):
|
|
|
|
|
|
self.task_status[task_type] = state
|
2023-10-24 12:24:32 +00:00
|
|
|
|
|
|
|
|
|
|
|
2023-12-29 10:02:23 +00:00
|
|
|
|
class Type(models.TextChoices):
|
|
|
|
|
|
base = 0, '通用类型'
|
|
|
|
|
|
|
|
|
|
|
|
web = 1, 'web站点类型'
|
|
|
|
|
|
|
2025-03-17 02:33:35 +00:00
|
|
|
|
lark = 2, '飞书类型'
|
|
|
|
|
|
yuque = 3, '语雀类型'
|
|
|
|
|
|
|
2023-12-29 10:02:23 +00:00
|
|
|
|
|
2024-04-25 02:44:14 +00:00
|
|
|
|
class HitHandlingMethod(models.TextChoices):
|
|
|
|
|
|
optimization = 'optimization', '模型优化'
|
|
|
|
|
|
directly_return = 'directly_return', '直接返回'
|
|
|
|
|
|
|
|
|
|
|
|
|
2024-07-15 08:26:54 +00:00
|
|
|
|
def default_model():
|
|
|
|
|
|
return uuid.UUID('42f63a3d-427e-11ef-b3ec-a8a1595801ab')
|
|
|
|
|
|
|
|
|
|
|
|
|
2024-11-26 04:08:13 +00:00
|
|
|
|
def default_status_meta():
|
|
|
|
|
|
return {"state_time": {}}
|
|
|
|
|
|
|
|
|
|
|
|
|
2023-10-09 11:03:41 +00:00
|
|
|
|
class DataSet(AppModelMixin):
|
|
|
|
|
|
"""
|
|
|
|
|
|
数据集表
|
|
|
|
|
|
"""
|
|
|
|
|
|
id = models.UUIDField(primary_key=True, max_length=128, default=uuid.uuid1, editable=False, verbose_name="主键id")
|
|
|
|
|
|
name = models.CharField(max_length=150, verbose_name="数据集名称")
|
|
|
|
|
|
desc = models.CharField(max_length=256, verbose_name="数据库描述")
|
|
|
|
|
|
user = models.ForeignKey(User, on_delete=models.DO_NOTHING, verbose_name="所属用户")
|
2023-12-29 10:02:23 +00:00
|
|
|
|
type = models.CharField(verbose_name='类型', max_length=1, choices=Type.choices,
|
|
|
|
|
|
default=Type.base)
|
2024-07-17 09:01:57 +00:00
|
|
|
|
embedding_mode = models.ForeignKey(Model, on_delete=models.DO_NOTHING, verbose_name="向量模型",
|
|
|
|
|
|
default=default_model)
|
2023-12-29 10:02:23 +00:00
|
|
|
|
meta = models.JSONField(verbose_name="元数据", default=dict)
|
2023-10-09 11:03:41 +00:00
|
|
|
|
|
|
|
|
|
|
class Meta:
|
|
|
|
|
|
db_table = "dataset"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class Document(AppModelMixin):
|
|
|
|
|
|
"""
|
|
|
|
|
|
文档表
|
|
|
|
|
|
"""
|
|
|
|
|
|
id = models.UUIDField(primary_key=True, max_length=128, default=uuid.uuid1, editable=False, verbose_name="主键id")
|
|
|
|
|
|
dataset = models.ForeignKey(DataSet, on_delete=models.DO_NOTHING)
|
|
|
|
|
|
name = models.CharField(max_length=150, verbose_name="文档名称")
|
|
|
|
|
|
char_length = models.IntegerField(verbose_name="文档字符数 冗余字段")
|
2024-11-26 04:08:13 +00:00
|
|
|
|
status = models.CharField(verbose_name='状态', max_length=20, default=Status('').__str__)
|
|
|
|
|
|
status_meta = models.JSONField(verbose_name="状态统计数据", default=default_status_meta)
|
2023-10-09 11:03:41 +00:00
|
|
|
|
is_active = models.BooleanField(default=True)
|
|
|
|
|
|
|
2023-12-29 10:02:23 +00:00
|
|
|
|
type = models.CharField(verbose_name='类型', max_length=1, choices=Type.choices,
|
|
|
|
|
|
default=Type.base)
|
2024-04-25 02:44:14 +00:00
|
|
|
|
hit_handling_method = models.CharField(verbose_name='命中处理方式', max_length=20,
|
|
|
|
|
|
choices=HitHandlingMethod.choices,
|
|
|
|
|
|
default=HitHandlingMethod.optimization)
|
2024-05-08 09:31:56 +00:00
|
|
|
|
directly_return_similarity = models.FloatField(verbose_name='直接回答相似度', default=0.9)
|
2023-12-29 10:02:23 +00:00
|
|
|
|
|
|
|
|
|
|
meta = models.JSONField(verbose_name="元数据", default=dict)
|
|
|
|
|
|
|
2023-10-09 11:03:41 +00:00
|
|
|
|
class Meta:
|
|
|
|
|
|
db_table = "document"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class Paragraph(AppModelMixin):
|
|
|
|
|
|
"""
|
|
|
|
|
|
段落表
|
|
|
|
|
|
"""
|
|
|
|
|
|
id = models.UUIDField(primary_key=True, max_length=128, default=uuid.uuid1, editable=False, verbose_name="主键id")
|
2023-10-24 12:24:32 +00:00
|
|
|
|
document = models.ForeignKey(Document, on_delete=models.DO_NOTHING, db_constraint=False)
|
|
|
|
|
|
dataset = models.ForeignKey(DataSet, on_delete=models.DO_NOTHING)
|
2024-07-24 07:21:43 +00:00
|
|
|
|
content = models.CharField(max_length=102400, verbose_name="段落内容")
|
2023-10-24 12:24:32 +00:00
|
|
|
|
title = models.CharField(max_length=256, verbose_name="标题", default="")
|
2024-11-26 04:08:13 +00:00
|
|
|
|
status = models.CharField(verbose_name='状态', max_length=20, default=Status('').__str__)
|
|
|
|
|
|
status_meta = models.JSONField(verbose_name="状态数据", default=default_status_meta)
|
2024-03-11 09:28:05 +00:00
|
|
|
|
hit_num = models.IntegerField(verbose_name="命中次数", default=0)
|
2023-10-09 11:03:41 +00:00
|
|
|
|
is_active = models.BooleanField(default=True)
|
|
|
|
|
|
|
|
|
|
|
|
class Meta:
|
|
|
|
|
|
db_table = "paragraph"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class Problem(AppModelMixin):
|
|
|
|
|
|
"""
|
|
|
|
|
|
问题表
|
|
|
|
|
|
"""
|
|
|
|
|
|
id = models.UUIDField(primary_key=True, max_length=128, default=uuid.uuid1, editable=False, verbose_name="主键id")
|
2023-10-24 12:24:32 +00:00
|
|
|
|
dataset = models.ForeignKey(DataSet, on_delete=models.DO_NOTHING, db_constraint=False)
|
2023-10-09 11:03:41 +00:00
|
|
|
|
content = models.CharField(max_length=256, verbose_name="问题内容")
|
2024-03-11 09:28:05 +00:00
|
|
|
|
hit_num = models.IntegerField(verbose_name="命中次数", default=0)
|
2023-10-09 11:03:41 +00:00
|
|
|
|
|
|
|
|
|
|
class Meta:
|
2023-10-24 12:24:32 +00:00
|
|
|
|
db_table = "problem"
|
2024-03-11 09:28:05 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class ProblemParagraphMapping(AppModelMixin):
|
|
|
|
|
|
id = models.UUIDField(primary_key=True, max_length=128, default=uuid.uuid1, editable=False, verbose_name="主键id")
|
|
|
|
|
|
dataset = models.ForeignKey(DataSet, on_delete=models.DO_NOTHING, db_constraint=False)
|
|
|
|
|
|
document = models.ForeignKey(Document, on_delete=models.DO_NOTHING)
|
|
|
|
|
|
problem = models.ForeignKey(Problem, on_delete=models.DO_NOTHING, db_constraint=False)
|
|
|
|
|
|
paragraph = models.ForeignKey(Paragraph, on_delete=models.DO_NOTHING, db_constraint=False)
|
|
|
|
|
|
|
|
|
|
|
|
class Meta:
|
|
|
|
|
|
db_table = "problem_paragraph_mapping"
|
2024-04-23 11:03:34 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class Image(AppModelMixin):
|
|
|
|
|
|
id = models.UUIDField(primary_key=True, max_length=128, default=uuid.uuid1, editable=False, verbose_name="主键id")
|
|
|
|
|
|
image = models.BinaryField(verbose_name="图片数据")
|
|
|
|
|
|
image_name = models.CharField(max_length=256, verbose_name="图片名称", default="")
|
|
|
|
|
|
|
|
|
|
|
|
class Meta:
|
|
|
|
|
|
db_table = "image"
|
2024-07-05 11:02:20 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class File(AppModelMixin):
|
|
|
|
|
|
id = models.UUIDField(primary_key=True, max_length=128, default=uuid.uuid1, editable=False, verbose_name="主键id")
|
|
|
|
|
|
|
|
|
|
|
|
file_name = models.CharField(max_length=256, verbose_name="文件名称", default="")
|
|
|
|
|
|
|
|
|
|
|
|
loid = models.IntegerField(verbose_name="loid")
|
|
|
|
|
|
|
2024-11-11 07:48:56 +00:00
|
|
|
|
meta = models.JSONField(verbose_name="文件关联数据", default=dict)
|
|
|
|
|
|
|
2024-07-05 11:02:20 +00:00
|
|
|
|
class Meta:
|
|
|
|
|
|
db_table = "file"
|
|
|
|
|
|
|
|
|
|
|
|
def save(
|
|
|
|
|
|
self, bytea=None, force_insert=False, force_update=False, using=None, update_fields=None
|
|
|
|
|
|
):
|
|
|
|
|
|
result = select_one("SELECT lo_from_bytea(%s, %s::bytea) as loid", [0, bytea])
|
|
|
|
|
|
self.loid = result['loid']
|
|
|
|
|
|
super().save()
|
|
|
|
|
|
|
|
|
|
|
|
def get_byte(self):
|
|
|
|
|
|
result = select_one(f'SELECT lo_get({self.loid}) as "data"', [])
|
|
|
|
|
|
return result['data']
|
2024-11-13 02:45:20 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@receiver(pre_delete, sender=File)
|
|
|
|
|
|
def on_delete_file(sender, instance, **kwargs):
|
|
|
|
|
|
select_one(f'SELECT lo_unlink({instance.loid})', [])
|