feat(llm): 知识库增加分块重叠参数并优化相关逻辑
- 在 KnowledgeBaseDO、KnowledgeBaseSaveReqVO 和 KnowledgeRagEmbedReqVO 中添加分块重叠字段
- 优化知识库更新逻辑,增加参数校验和错误处理
- 调整文档处理流程,支持分块大小和重叠参数
- 新增错误码常量,用于处理分块参数相关的错误
This commit is contained in:
parent
8923987afb
commit
0018c535a7
@ -103,6 +103,12 @@ public interface ErrorCodeConstants {
|
||||
|
||||
ErrorCode KNOWLEDGE_BASE_NAME_NOT_EXISTS = new ErrorCode(10040, "知识库名称已存在");
|
||||
|
||||
// Chunk-parameter validation errors. In Java an underscore inside a numeric literal is only a
// digit separator, so 10040_1 / 10040_2 / 10040_3 are the int values 100401 / 100402 / 100403.
// Chunk size was rejected because it is less than 1.
ErrorCode CHUNK_SIZE_MUST_BE_GREATER_THAN_ZERO = new ErrorCode(10040_1, "分块大小必须大于 0");
|
||||
|
||||
// Chunk overlap was rejected because it is negative.
ErrorCode CHUNK_OVERLAP_MUST_BE_GREATER_THAN_OR_EQUAL_TO_ZERO = new ErrorCode(10040_2, "分块重叠必须大于或等于 0");
|
||||
|
||||
// Chunk overlap was rejected because it is not strictly smaller than the chunk size.
ErrorCode CHUNK_OVERLAP_MUST_BE_LESS_THAN_CHUNK_SIZE = new ErrorCode(10040_3, "分块重叠必须小于分块大小");
|
||||
|
||||
ErrorCode APPLICATION_NAME_NOT_EXISTS = new ErrorCode(10041, "应用中心名称已存在");
|
||||
|
||||
ErrorCode MODEL_SERVIC_ENAME_NOT_EXISTS = new ErrorCode(10043, "模型名称已存在");
|
||||
|
@ -35,8 +35,15 @@ public class KnowledgeBaseSaveReqVO {
|
||||
/**
|
||||
* Chunk size; on update it is validated and then forwarded with the embedding request.
|
||||
*/
|
||||
@Schema(description = "分块大小")
|
||||
private Integer chunkSize;
|
||||
|
||||
/**
|
||||
* 分块重叠
|
||||
*/
|
||||
@Schema(description = "分块重叠,")
|
||||
private Integer chunkOverlap;
|
||||
|
||||
@Schema(description = "文件引用")
|
||||
private String knowledgeFile;
|
||||
|
||||
|
@ -4,6 +4,7 @@ import cn.iocoder.yudao.framework.mybatis.core.dataobject.BaseDO;
|
||||
import com.baomidou.mybatisplus.annotation.KeySequence;
|
||||
import com.baomidou.mybatisplus.annotation.TableId;
|
||||
import com.baomidou.mybatisplus.annotation.TableName;
|
||||
import io.swagger.v3.oas.annotations.media.Schema;
|
||||
import lombok.*;
|
||||
|
||||
/**
|
||||
@ -55,4 +56,13 @@ public class KnowledgeBaseDO extends BaseDO {
|
||||
*/
|
||||
private String knowledgeFile;
|
||||
|
||||
/**
|
||||
* Chunk size of the knowledge base.
|
||||
*/
|
||||
private Integer chunkSize;
|
||||
|
||||
/**
|
||||
* Chunk overlap of the knowledge base.
|
||||
*/
|
||||
private Integer chunkOverlap;
|
||||
}
|
||||
|
@ -11,15 +11,12 @@ import cn.iocoder.yudao.module.llm.service.http.vo.KnowledgeRagEmbedReqVO;
|
||||
import cn.iocoder.yudao.module.llm.service.http.vo.RegUploadReqVO;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.springframework.scheduling.annotation.Async;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import javax.annotation.Resource;
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.net.URL;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import static cn.iocoder.yudao.framework.common.exception.util.ServiceExceptionUtil.exception;
|
||||
|
||||
@ -38,8 +35,8 @@ public class AsyncKnowledgeBase {
|
||||
|
||||
|
||||
// 向向量知识库创建文件
|
||||
// @Async
|
||||
public void createKnowledgeBase(List<KnowledgeDocumentsDO> knowledgeList, List<Long> ids) {
|
||||
// @Async
|
||||
public void createKnowledgeBase (List<KnowledgeDocumentsDO> knowledgeList, List<Long> ids, Map<String, Integer> knowledgeParameters) {
|
||||
log.info("开始执行 createKnowledgeBase 方法。knowledgeList 大小: {}, ids 大小: {}", knowledgeList.size(), ids.size());
|
||||
|
||||
// 如果提供了 ids,则删除现有的知识库文档
|
||||
@ -80,7 +77,7 @@ public class AsyncKnowledgeBase {
|
||||
if (lastIndex != -1) {
|
||||
String extension = knowledge.getDocumentName().substring(lastIndex + 1).toLowerCase();
|
||||
log.info("文档扩展名: {}", extension);
|
||||
knowledgeEmbed(knowledge, knowledge.getKnowledgeBaseId());
|
||||
knowledgeEmbed(knowledge, knowledge.getKnowledgeBaseId(), knowledgeParameters);
|
||||
} else {
|
||||
log.warn("文档无扩展名,跳过处理,文档 ID: {}", knowledge.getId());
|
||||
}
|
||||
@ -119,13 +116,15 @@ public class AsyncKnowledgeBase {
|
||||
* @param knowledge 文件
|
||||
* @param id 知识库id
|
||||
*/
|
||||
public void knowledgeEmbed (KnowledgeDocumentsDO knowledge, Long id) {
|
||||
public void knowledgeEmbed (KnowledgeDocumentsDO knowledge, Long id, Map<String, Integer> knowledgeParameters) {
|
||||
|
||||
// 创建知识向量
|
||||
KnowledgeRagEmbedReqVO ragEmbedReqVo = new KnowledgeRagEmbedReqVO()
|
||||
.setFileId(String.valueOf(knowledge.getId()))
|
||||
.setFileName(knowledge.getDocumentName())
|
||||
.setFileUrl(knowledge.getFileUrl());
|
||||
.setFileUrl(knowledge.getFileUrl())
|
||||
.setChunkSize(knowledgeParameters.get("chunkSize"))
|
||||
.setChunkOverlap(knowledgeParameters.get("chunkOverlap"));
|
||||
|
||||
try {
|
||||
ragHttpService.knowledgeEmbed(ragEmbedReqVo, id);
|
||||
|
@ -390,9 +390,10 @@ public class RagHttpService {
|
||||
String fileName = reqVO.getFileName();
|
||||
String fileUrl = reqVO.getFileUrl();
|
||||
Integer chunkSize = Optional.ofNullable(reqVO.getChunkSize()).orElse(1500);
|
||||
Integer chunkOverlap = Optional.ofNullable(reqVO.getChunkOverlap()).orElse(300);
|
||||
String mediaType = getMediaType(fileName);
|
||||
|
||||
log.info("文件ID: {}, 文件名: {}, 文件URL: {}, 文件类型: {}, 分块大小:{}", fileId, fileName, fileUrl, mediaType,chunkSize);
|
||||
log.info("文件ID: {}, 文件名: {}, 文件URL: {}, 文件类型: {}, 分块大小:{}, 分块重叠:{}", fileId, fileName, fileUrl, mediaType,chunkSize,chunkOverlap);
|
||||
|
||||
// 获取知识库文档
|
||||
log.info("开始获取知识库文档,知识库ID: {}, 文件ID: {}", id, fileId);
|
||||
@ -450,6 +451,7 @@ public class RagHttpService {
|
||||
.setType(MultipartBody.FORM)
|
||||
.addFormDataPart("file_id", fileId)
|
||||
.addFormDataPart("chunk_size", String.valueOf(chunkSize))
|
||||
.addFormDataPart("chunk_overlap", String.valueOf(chunkOverlap))
|
||||
.addFormDataPart("file", fileName,
|
||||
RequestBody.create(tempFilePath.toFile(), MediaType.parse(mediaType))
|
||||
)
|
||||
|
@ -46,4 +46,9 @@ public class KnowledgeRagEmbedReqVO {
|
||||
* 分块大小
|
||||
*/
|
||||
private Integer chunkSize;
|
||||
|
||||
/**
|
||||
* Chunk overlap; RagHttpService sends it as the "chunk_overlap" form field and uses 300 when it is null.
|
||||
*/
|
||||
private Integer chunkOverlap;
|
||||
}
|
||||
|
@ -17,24 +17,17 @@ import cn.iocoder.yudao.module.llm.dal.mysql.knowledgedocuments.KnowledgeDocumen
|
||||
import cn.iocoder.yudao.module.llm.service.application.ApplicationService;
|
||||
import cn.iocoder.yudao.module.llm.service.async.AsyncKnowledgeBase;
|
||||
import com.baomidou.mybatisplus.core.conditions.query.LambdaQueryWrapper;
|
||||
import kong.unirest.Unirest;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.stereotype.Service;
|
||||
import org.springframework.transaction.annotation.Transactional;
|
||||
import org.springframework.validation.annotation.Validated;
|
||||
|
||||
import javax.annotation.Resource;
|
||||
import javax.annotation.Tainted;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Objects;
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import static cn.iocoder.yudao.framework.common.exception.util.ServiceExceptionUtil.exception;
|
||||
import static cn.iocoder.yudao.module.llm.enums.ErrorCodeConstants.KNOWLEDGE_BASE_NAME_NOT_EXISTS;
|
||||
import static cn.iocoder.yudao.module.llm.enums.ErrorCodeConstants.KNOWLEDGE_BASE_NOT_EXISTS;
|
||||
import static cn.iocoder.yudao.module.llm.enums.ErrorCodeConstants.*;
|
||||
|
||||
/**
|
||||
* 知识库 Service 实现类
|
||||
@ -66,68 +59,137 @@ public class KnowledgeBaseServiceImpl implements KnowledgeBaseService {
|
||||
}
|
||||
|
||||
/**
 * Updates a knowledge base: validates the request, updates the main record by id,
 * then reconciles the knowledge document rows that belong to it.
 *
 * NOTE(review): the @Transactional annotation below is commented out, so nothing in this
 * method makes the steps atomic — confirm whether a transaction is applied elsewhere.
 *
 * @param updateReqVO update request
 */
@Override
|
||||
// @Transactional(rollbackFor = Exception.class)
|
||||
// @Transactional(rollbackFor = Exception.class)
|
||||
public void updateKnowledgeBase (KnowledgeBaseSaveReqVO updateReqVO) {
|
||||
// 1. Validate the request (existence, name uniqueness, chunk parameters)
|
||||
validateKnowledgeParam(updateReqVO);
|
||||
|
||||
// 2. Update the basic fields of the knowledge base main table
|
||||
KnowledgeBaseDO updateObj = BeanUtils.toBean(updateReqVO, KnowledgeBaseDO.class);
|
||||
knowledgeBaseMapper.updateById(updateObj);
|
||||
|
||||
// 3. Process the child-table (knowledge document) data
|
||||
handleKnowledgeDocuments(updateReqVO, updateObj);
|
||||
}
|
||||
|
||||
/**
|
||||
* Validates the parameters of a knowledge base update request.
|
||||
*
|
||||
* @param updateReqVO knowledge base update request
|
||||
*/
|
||||
private void validateKnowledgeParam (KnowledgeBaseSaveReqVO updateReqVO) {
|
||||
// 1. Check that the knowledge base exists
|
||||
validateKnowledgeBaseExists(updateReqVO.getId());
|
||||
|
||||
// 2. Check that the knowledge base name is not duplicated
|
||||
validateKnowledgeBaseNameExists(updateReqVO);
|
||||
|
||||
// 3. Update the knowledge base main table
|
||||
KnowledgeBaseDO updateObj = BeanUtils.toBean(updateReqVO, KnowledgeBaseDO.class);
|
||||
knowledgeBaseMapper.updateById(updateObj);
|
||||
// 3. Check that the chunk size and chunk overlap are valid
|
||||
validateChunkParameters(updateReqVO.getChunkSize(), updateReqVO.getChunkOverlap());
|
||||
}
|
||||
|
||||
// Unirest.config().reset();
|
||||
// Unirest.config()
|
||||
// .socketTimeout(86400000)
|
||||
// .connectTimeout(100000)
|
||||
// .concurrency(10, 5)
|
||||
// .setDefaultHeader("Accept", "application/json");
|
||||
/**
|
||||
* 校验分块大小和分块重叠是否合法
|
||||
*
|
||||
* @param chunkSize 分块大小
|
||||
* @param chunkOverlap 分块重叠
|
||||
* @throws IllegalArgumentException 如果校验不通过
|
||||
*/
|
||||
private void validateChunkParameters (int chunkSize, int chunkOverlap) {
|
||||
if (chunkSize < 1) {
|
||||
throw exception(CHUNK_SIZE_MUST_BE_GREATER_THAN_ZERO);
|
||||
}
|
||||
if (chunkOverlap < 0) {
|
||||
throw exception(CHUNK_OVERLAP_MUST_BE_GREATER_THAN_OR_EQUAL_TO_ZERO);
|
||||
}
|
||||
if (chunkOverlap >= chunkSize) {
|
||||
throw exception(CHUNK_OVERLAP_MUST_BE_LESS_THAN_CHUNK_SIZE);
|
||||
}
|
||||
}
|
||||
|
||||
// 4. 处理附表(知识文档)数据
|
||||
if (!CollectionUtils.isAnyEmpty(updateReqVO.getKnowledgeDocuments())) {
|
||||
// 4.1 获取需要保留的文档 ID
|
||||
List<Long> retainedIds = updateReqVO.getKnowledgeDocuments().stream()
|
||||
.map(KnowledgeDocumentsSaveReqVO::getId)
|
||||
.filter(Objects::nonNull)
|
||||
.collect(Collectors.toList());
|
||||
|
||||
// 4.2 删除不需要保留的文档
|
||||
LambdaQueryWrapperX<KnowledgeDocumentsDO> deleteWrapper = new LambdaQueryWrapperX<KnowledgeDocumentsDO>()
|
||||
.eq(KnowledgeDocumentsDO::getKnowledgeBaseId, updateReqVO.getId());
|
||||
if (!CollectionUtils.isAnyEmpty(retainedIds)) {
|
||||
deleteWrapper.notIn(KnowledgeDocumentsDO::getId, retainedIds);
|
||||
}
|
||||
knowledgeDocumentsMapper.delete(deleteWrapper);
|
||||
|
||||
// 4.3 更新或插入文档数据
|
||||
List<KnowledgeDocumentsDO> newDocuments = new ArrayList<>();
|
||||
updateReqVO.getKnowledgeDocuments().forEach(doc -> {
|
||||
KnowledgeDocumentsDO docDO = BeanUtils.toBean(doc, KnowledgeDocumentsDO.class);
|
||||
docDO.setKnowledgeBaseId(updateReqVO.getId());
|
||||
docDO.setChunkSize(updateObj.getKnowledgeLength());
|
||||
if (doc.getId() == null) {
|
||||
newDocuments.add(docDO); // 收集新增文档
|
||||
}
|
||||
knowledgeDocumentsMapper.insertOrUpdate(docDO); // 更新或插入文档
|
||||
});
|
||||
|
||||
// 4.4 异步处理新增文档和删除的文档
|
||||
List<Long> deleteIds = knowledgeDocumentsMapper.selectDeleteIds(updateReqVO.getId());
|
||||
asyncKnowledgeBase.createKnowledgeBase(newDocuments, deleteIds);
|
||||
} else {
|
||||
// 5. 如果传入的文档列表为空,则删除所有关联文档
|
||||
knowledgeDocumentsMapper.delete(new LambdaQueryWrapperX<KnowledgeDocumentsDO>()
|
||||
.eq(KnowledgeDocumentsDO::getKnowledgeBaseId, updateReqVO.getId()));
|
||||
|
||||
// 5.1 异步处理删除的文档
|
||||
List<Long> deleteIds = knowledgeDocumentsMapper.selectDeleteIds(updateReqVO.getId());
|
||||
if (!CollectionUtils.isAnyEmpty(deleteIds)) {
|
||||
asyncKnowledgeBase.createKnowledgeBase(null, deleteIds);
|
||||
}
|
||||
/**
|
||||
* 处理知识文档数据
|
||||
*
|
||||
* @param updateReqVO 更新知识库参数
|
||||
* @param updateObj 更新知识库对象
|
||||
*/
|
||||
private void handleKnowledgeDocuments (KnowledgeBaseSaveReqVO updateReqVO, KnowledgeBaseDO updateObj) {
|
||||
List<KnowledgeDocumentsSaveReqVO> documents = updateReqVO.getKnowledgeDocuments();
|
||||
if (CollectionUtils.isAnyEmpty(documents)) {
|
||||
// 如果传入的文档列表为空,则删除所有关联文档
|
||||
deleteAllDocuments(updateReqVO.getId());
|
||||
return;
|
||||
}
|
||||
|
||||
// 获取需要保留的文档 ID
|
||||
List<Long> retainedIds = documents.stream()
|
||||
.map(KnowledgeDocumentsSaveReqVO::getId)
|
||||
.filter(Objects::nonNull)
|
||||
.collect(Collectors.toList());
|
||||
|
||||
// 删除不需要保留的文档
|
||||
deleteUnretainedDocuments(updateReqVO.getId(), retainedIds);
|
||||
|
||||
// 更新或插入文档数据
|
||||
List<KnowledgeDocumentsDO> newDocuments = updateOrInsertDocuments(documents, updateReqVO.getId(), updateObj.getKnowledgeLength());
|
||||
|
||||
Map<String,Integer> knowledgeParameters = new HashMap<>();
|
||||
knowledgeParameters.put("chunkSize",updateReqVO.getChunkSize());
|
||||
knowledgeParameters.put("chunkOverlap",updateReqVO.getChunkOverlap());
|
||||
|
||||
// 异步处理新增文档和删除的文档
|
||||
List<Long> deleteIds = knowledgeDocumentsMapper.selectDeleteIds(updateReqVO.getId());
|
||||
asyncKnowledgeBase.createKnowledgeBase(newDocuments, deleteIds,knowledgeParameters);
|
||||
}
|
||||
|
||||
/**
|
||||
* 删除所有关联的文档
|
||||
*
|
||||
* @param knowledgeBaseId 知识库 ID
|
||||
*/
|
||||
private void deleteAllDocuments (Long knowledgeBaseId) {
|
||||
knowledgeDocumentsMapper.delete(new LambdaQueryWrapperX<KnowledgeDocumentsDO>()
|
||||
.eq(KnowledgeDocumentsDO::getKnowledgeBaseId, knowledgeBaseId));
|
||||
|
||||
// 异步处理删除的文档
|
||||
List<Long> deleteIds = knowledgeDocumentsMapper.selectDeleteIds(knowledgeBaseId);
|
||||
if (!CollectionUtils.isAnyEmpty(deleteIds)) {
|
||||
asyncKnowledgeBase.createKnowledgeBase(new ArrayList<>(), deleteIds,new HashMap<>());
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 删除不需要保留的文档
|
||||
* @param knowledgeBaseId 知识库 ID
|
||||
* @param retainedIds 需要保留的文档 ID
|
||||
*/
|
||||
private void deleteUnretainedDocuments (Long knowledgeBaseId, List<Long> retainedIds) {
|
||||
LambdaQueryWrapperX<KnowledgeDocumentsDO> deleteWrapper = new LambdaQueryWrapperX<KnowledgeDocumentsDO>()
|
||||
.eq(KnowledgeDocumentsDO::getKnowledgeBaseId, knowledgeBaseId);
|
||||
if (!CollectionUtils.isAnyEmpty(retainedIds)) {
|
||||
deleteWrapper.notIn(KnowledgeDocumentsDO::getId, retainedIds);
|
||||
}
|
||||
knowledgeDocumentsMapper.delete(deleteWrapper);
|
||||
}
|
||||
|
||||
/**
|
||||
* 更新或插入文档数据
|
||||
* @param documents 需要更新的文档数据
|
||||
* @param knowledgeBaseId 知识库 ID
|
||||
* @param chunkSize
|
||||
* @return 更新或插入的文档数据
|
||||
*/
|
||||
private List<KnowledgeDocumentsDO> updateOrInsertDocuments (List<KnowledgeDocumentsSaveReqVO> documents, Long knowledgeBaseId, Integer chunkSize) {
|
||||
List<KnowledgeDocumentsDO> newDocuments = new ArrayList<>();
|
||||
documents.forEach(doc -> {
|
||||
KnowledgeDocumentsDO docDO = BeanUtils.toBean(doc, KnowledgeDocumentsDO.class);
|
||||
docDO.setKnowledgeBaseId(knowledgeBaseId);
|
||||
if (doc.getId() == null) {
|
||||
newDocuments.add(docDO); // 收集新增文档
|
||||
}
|
||||
knowledgeDocumentsMapper.insertOrUpdate(docDO); // 更新或插入文档
|
||||
});
|
||||
return newDocuments;
|
||||
}
|
||||
|
||||
@Override
|
||||
|
Loading…
x
Reference in New Issue
Block a user