# coding=utf-8
"""
@project: MaxKB
@Author:虎
@file: mark_chunk_handle.py
@date:2024/7/23 16:52
@desc: Re-splits text chunks at punctuation and whitespace delimiters,
       using windows of at most max_chunk_len characters.
"""
|
|||
|
|
import re
|
|||
|
|
from typing import List
|
|||
|
|
|
|||
|
|
from common.chunk.i_chunk_handle import IChunkHandle
|
|||
|
|
|
|||
|
|
# Maximum number of characters a single pattern window may consume.
max_chunk_len = 256

# A run of 1..max_chunk_len arbitrary characters.
max_chunk_pattern = r'.{1,%d}' % max_chunk_len

# The same run followed by exactly one delimiter character.
# NOTE: the `|` characters sit inside a character class, so they are literal
# members of it (not alternation); `\\` likewise adds a literal backslash.
# The effective delimiters are therefore: 。 | space \ . ! ; ; ! and newline.
split_chunk_pattern = max_chunk_pattern + r'[。| |\\.|!|;|;|!|\n]'
|
|||
|
|
|
|||
|
|
|
|||
|
|
class MarkChunkHandle(IChunkHandle):
    """Chunk handler that re-splits text at delimiter characters.

    Each input chunk is scanned with ``split_chunk_pattern`` (a window of up
    to ``max_chunk_len`` characters ending in one delimiter). Text that the
    pattern does not cover is kept as well, cut into ``max_chunk_len``-sized
    windows when it is too long.
    """

    def handle(self, chunk_list: List[str]) -> List[str]:
        """Split every chunk in ``chunk_list`` into smaller pieces.

        :param chunk_list: the text chunks to split.
        :return: a flat list of the stripped, non-empty pieces of all chunks,
                 in the order they appear in the input text.
        """
        result = []
        for chunk in chunk_list:
            # A single pass yields both the delimiter-terminated matches and
            # the uncovered gaps between them, so pieces stay in text order.
            cursor = 0
            for match in re.finditer(split_chunk_pattern, chunk, flags=re.DOTALL):
                result.extend(self._split_fragment(chunk[cursor:match.start()]))
                piece = match.group().strip()
                if piece:
                    result.append(piece)
                cursor = match.end()
            # Whatever follows the last match (or the whole chunk if nothing matched).
            result.extend(self._split_fragment(chunk[cursor:]))
        return result

    @staticmethod
    def _split_fragment(fragment: str) -> List[str]:
        """Return the stripped, non-empty pieces of text no match covered.

        Fragments shorter than ``max_chunk_len`` are kept whole; longer ones
        are cut into consecutive windows with ``max_chunk_pattern``.
        """
        if len(fragment) < max_chunk_len:
            windows = [fragment]
        else:
            windows = re.findall(max_chunk_pattern, fragment, flags=re.DOTALL)
        stripped = (window.strip() for window in windows)
        return [piece for piece in stripped if piece]
|