2024-04-22 03:21:24 +00:00
|
|
|
|
# coding=utf-8
|
|
|
|
|
|
"""
|
|
|
|
|
|
@project: maxkb
|
|
|
|
|
|
@Author:虎
|
|
|
|
|
|
@file: ts_vecto_util.py
|
|
|
|
|
|
@date:2024/4/16 15:26
|
|
|
|
|
|
@desc:
|
|
|
|
|
|
"""
|
|
|
|
|
|
import re
|
|
|
|
|
|
import uuid
|
|
|
|
|
|
from typing import List
|
|
|
|
|
|
|
|
|
|
|
|
import jieba
|
2024-05-07 04:18:03 +00:00
|
|
|
|
import jieba.posseg
|
2024-04-22 03:21:24 +00:00
|
|
|
|
from jieba import analyse
|
|
|
|
|
|
|
|
|
|
|
|
from common.util.split_model import group_by
|
|
|
|
|
|
|
|
|
|
|
|
jieba_word_list_cache = [chr(item) for item in range(38, 84)]
|
|
|
|
|
|
|
|
|
|
|
|
for jieba_word in jieba_word_list_cache:
|
|
|
|
|
|
jieba.add_word('#' + jieba_word + '#')
|
|
|
|
|
|
# r"(?i)\b(?:https?|ftp|tcp|file)://[^\s]+\b",
|
|
|
|
|
|
# 某些不分词数据
|
|
|
|
|
|
# r'"([^"]*)"'
|
|
|
|
|
|
word_pattern_list = [r"v\d+.\d+.\d+",
|
|
|
|
|
|
r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}"]
|
|
|
|
|
|
|
2024-05-07 04:18:03 +00:00
|
|
|
|
remove_chars = '\n , :\'<>!@#¥%……&*()!@#$%^&*(): ;,/"./'
|
|
|
|
|
|
|
|
|
|
|
|
jieba_remove_flag_list = ['x', 'w']
|
2024-04-22 03:21:24 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_word_list(text: str):
|
|
|
|
|
|
result = []
|
|
|
|
|
|
for pattern in word_pattern_list:
|
|
|
|
|
|
word_list = re.findall(pattern, text)
|
|
|
|
|
|
for child_list in word_list:
|
|
|
|
|
|
for word in child_list if isinstance(child_list, tuple) else [child_list]:
|
|
|
|
|
|
# 不能有: 所以再使用: 进行分割
|
|
|
|
|
|
if word.__contains__(':'):
|
|
|
|
|
|
item_list = word.split(":")
|
|
|
|
|
|
for w in item_list:
|
|
|
|
|
|
result.append(w)
|
|
|
|
|
|
else:
|
|
|
|
|
|
result.append(word)
|
|
|
|
|
|
return result
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def replace_word(word_dict, text: str):
|
|
|
|
|
|
for key in word_dict:
|
2024-06-27 01:51:14 +00:00
|
|
|
|
pattern = '(?<!#)' + re.escape(word_dict[key]) + '(?!#)'
|
|
|
|
|
|
text = re.sub(pattern, key, text)
|
2024-04-22 03:21:24 +00:00
|
|
|
|
return text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_word_key(text: str, use_word_list):
|
2024-06-11 10:25:12 +00:00
|
|
|
|
j_word = next((j for j in jieba_word_list_cache if j not in text and all(j not in used for used in use_word_list)),
|
|
|
|
|
|
None)
|
|
|
|
|
|
if j_word:
|
|
|
|
|
|
return j_word
|
2024-04-22 03:21:24 +00:00
|
|
|
|
j_word = str(uuid.uuid1())
|
|
|
|
|
|
jieba.add_word(j_word)
|
|
|
|
|
|
return j_word
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def to_word_dict(word_list: List, text: str):
|
|
|
|
|
|
word_dict = {}
|
|
|
|
|
|
for word in word_list:
|
|
|
|
|
|
key = get_word_key(text, set(word_dict))
|
|
|
|
|
|
word_dict['#' + key + '#'] = word
|
|
|
|
|
|
return word_dict
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_key_by_word_dict(key, word_dict):
|
|
|
|
|
|
v = word_dict.get(key)
|
|
|
|
|
|
if v is None:
|
|
|
|
|
|
return key
|
|
|
|
|
|
return v
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def to_ts_vector(text: str):
|
|
|
|
|
|
# 获取不分词的数据
|
|
|
|
|
|
word_list = get_word_list(text)
|
|
|
|
|
|
# 获取关键词关系
|
|
|
|
|
|
word_dict = to_word_dict(word_list, text)
|
|
|
|
|
|
# 替换字符串
|
|
|
|
|
|
text = replace_word(word_dict, text)
|
|
|
|
|
|
# 分词
|
2024-05-09 07:55:35 +00:00
|
|
|
|
filter_word = jieba.analyse.extract_tags(text, topK=100)
|
|
|
|
|
|
result = jieba.lcut(text, HMM=True, use_paddle=True)
|
2024-05-07 04:18:03 +00:00
|
|
|
|
# 过滤标点符号
|
2024-05-09 07:55:35 +00:00
|
|
|
|
result = [item for item in result if filter_word.__contains__(item) and len(item) < 10]
|
|
|
|
|
|
result_ = [{'word': get_key_by_word_dict(result[index], word_dict), 'index': index} for index in
|
2024-05-07 04:18:03 +00:00
|
|
|
|
range(len(result))]
|
2024-04-22 03:21:24 +00:00
|
|
|
|
result_group = group_by(result_, lambda r: r['word'])
|
|
|
|
|
|
return " ".join(
|
|
|
|
|
|
[f"{key.lower()}:{','.join([str(item['index'] + 1) for item in result_group[key]][:20])}" for key in
|
|
|
|
|
|
result_group if
|
|
|
|
|
|
not remove_chars.__contains__(key) and len(key.strip()) >= 0])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def to_query(text: str):
|
|
|
|
|
|
# 获取不分词的数据
|
|
|
|
|
|
word_list = get_word_list(text)
|
|
|
|
|
|
# 获取关键词关系
|
|
|
|
|
|
word_dict = to_word_dict(word_list, text)
|
|
|
|
|
|
# 替换字符串
|
|
|
|
|
|
text = replace_word(word_dict, text)
|
|
|
|
|
|
extract_tags = analyse.extract_tags(text, topK=5, withWeight=True, allowPOS=('ns', 'n', 'vn', 'v', 'eng'))
|
|
|
|
|
|
result = " ".join([get_key_by_word_dict(word, word_dict) for word, score in extract_tags if
|
|
|
|
|
|
not remove_chars.__contains__(word)])
|
|
|
|
|
|
# 删除词库
|
|
|
|
|
|
for word in word_list:
|
|
|
|
|
|
jieba.del_word(word)
|
|
|
|
|
|
return result
|