diff --git a/yudao-module-llm/yudao-module-llm-api/src/main/java/cn/iocoder/yudao/module/llm/enums/ErrorCodeConstants.java b/yudao-module-llm/yudao-module-llm-api/src/main/java/cn/iocoder/yudao/module/llm/enums/ErrorCodeConstants.java
index 035b54fdb..fb4a79563 100644
--- a/yudao-module-llm/yudao-module-llm-api/src/main/java/cn/iocoder/yudao/module/llm/enums/ErrorCodeConstants.java
+++ b/yudao-module-llm/yudao-module-llm-api/src/main/java/cn/iocoder/yudao/module/llm/enums/ErrorCodeConstants.java
@@ -103,6 +103,13 @@ public interface ErrorCodeConstants {
 
     ErrorCode KNOWLEDGE_BASE_NAME_NOT_EXISTS = new ErrorCode(10040, "知识库名称已存在");
 
+    // Chunk parameter validation errors. The underscore is only a digit separator: 10040_1 is the int 100401.
+    ErrorCode CHUNK_SIZE_MUST_BE_GREATER_THAN_ZERO = new ErrorCode(10040_1, "分块大小必须大于 0");
+
+    ErrorCode CHUNK_OVERLAP_MUST_BE_GREATER_THAN_OR_EQUAL_TO_ZERO = new ErrorCode(10040_2, "分块重叠必须大于或等于 0");
+
+    ErrorCode CHUNK_OVERLAP_MUST_BE_LESS_THAN_CHUNK_SIZE = new ErrorCode(10040_3, "分块重叠必须小于分块大小");
+
     ErrorCode APPLICATION_NAME_NOT_EXISTS = new ErrorCode(10041, "应用中心名称已存在");
 
     ErrorCode MODEL_SERVIC_ENAME_NOT_EXISTS = new ErrorCode(10043, "模型名称已存在");
diff --git a/yudao-module-llm/yudao-module-llm-biz/src/main/java/cn/iocoder/yudao/module/llm/controller/admin/knowledgebase/vo/KnowledgeBaseSaveReqVO.java b/yudao-module-llm/yudao-module-llm-biz/src/main/java/cn/iocoder/yudao/module/llm/controller/admin/knowledgebase/vo/KnowledgeBaseSaveReqVO.java
index 0c5d43d69..c0a9be386 100644
--- a/yudao-module-llm/yudao-module-llm-biz/src/main/java/cn/iocoder/yudao/module/llm/controller/admin/knowledgebase/vo/KnowledgeBaseSaveReqVO.java
+++ b/yudao-module-llm/yudao-module-llm-biz/src/main/java/cn/iocoder/yudao/module/llm/controller/admin/knowledgebase/vo/KnowledgeBaseSaveReqVO.java
@@ -35,8 +35,15 @@ public class KnowledgeBaseSaveReqVO {
     /**
      * 分块大小
      */
+    @Schema(description = "分块大小")
     private Integer chunkSize;
 
+    /**
+     * Chunk overlap.
+     */
+    @Schema(description = "分块重叠")
+    private Integer chunkOverlap;
+
     @Schema(description = "文件引用")
     private String knowledgeFile;
 
diff
--git a/yudao-module-llm/yudao-module-llm-biz/src/main/java/cn/iocoder/yudao/module/llm/dal/dataobject/knowledgebase/KnowledgeBaseDO.java b/yudao-module-llm/yudao-module-llm-biz/src/main/java/cn/iocoder/yudao/module/llm/dal/dataobject/knowledgebase/KnowledgeBaseDO.java
index 6f1b957b3..79aefc497 100644
--- a/yudao-module-llm/yudao-module-llm-biz/src/main/java/cn/iocoder/yudao/module/llm/dal/dataobject/knowledgebase/KnowledgeBaseDO.java
+++ b/yudao-module-llm/yudao-module-llm-biz/src/main/java/cn/iocoder/yudao/module/llm/dal/dataobject/knowledgebase/KnowledgeBaseDO.java
@@ -55,4 +55,13 @@ public class KnowledgeBaseDO extends BaseDO {
      */
     private String knowledgeFile;
 
+    /**
+     * Chunk size.
+     */
+    private Integer chunkSize;
+
+    /**
+     * Chunk overlap.
+     */
+    private Integer chunkOverlap;
 }
diff --git a/yudao-module-llm/yudao-module-llm-biz/src/main/java/cn/iocoder/yudao/module/llm/service/async/AsyncKnowledgeBase.java b/yudao-module-llm/yudao-module-llm-biz/src/main/java/cn/iocoder/yudao/module/llm/service/async/AsyncKnowledgeBase.java
index 2823f33a9..b6e8213c6 100644
--- a/yudao-module-llm/yudao-module-llm-biz/src/main/java/cn/iocoder/yudao/module/llm/service/async/AsyncKnowledgeBase.java
+++ b/yudao-module-llm/yudao-module-llm-biz/src/main/java/cn/iocoder/yudao/module/llm/service/async/AsyncKnowledgeBase.java
@@ -11,15 +11,12 @@ import cn.iocoder.yudao.module.llm.service.http.vo.KnowledgeRagEmbedReqVO;
 import cn.iocoder.yudao.module.llm.service.http.vo.RegUploadReqVO;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
-import org.springframework.scheduling.annotation.Async;
 import org.springframework.stereotype.Service;
 
 import javax.annotation.Resource;
-import java.io.ByteArrayOutputStream;
 import java.io.IOException;
-import java.io.InputStream;
-import java.net.URL;
 import java.util.List;
+import java.util.Map;
 
 import static cn.iocoder.yudao.framework.common.exception.util.ServiceExceptionUtil.exception;
 
@@ -38,8 +35,8 @@ public class AsyncKnowledgeBase {
 
 
     // 向向量知识库创建文件
-//    @Async
-    public void createKnowledgeBase(List knowledgeList, List ids) {
+    // @Async
+    public void createKnowledgeBase (List knowledgeList, List ids, Map knowledgeParameters) {
         log.info("开始执行 createKnowledgeBase 方法。knowledgeList 大小: {}, ids 大小: {}", knowledgeList.size(), ids.size());
 
         // 如果提供了 ids,则删除现有的知识库文档
@@ -80,7 +77,7 @@ public class AsyncKnowledgeBase {
                 if (lastIndex != -1) {
                     String extension = knowledge.getDocumentName().substring(lastIndex + 1).toLowerCase();
                     log.info("文档扩展名: {}", extension);
-                    knowledgeEmbed(knowledge, knowledge.getKnowledgeBaseId());
+                    knowledgeEmbed(knowledge, knowledge.getKnowledgeBaseId(), knowledgeParameters);
                 } else {
                     log.warn("文档无扩展名,跳过处理,文档 ID: {}", knowledge.getId());
                 }
@@ -119,13 +116,17 @@ public class AsyncKnowledgeBase {
      * @param knowledge 文件
      * @param id 知识库id
+     * @param knowledgeParameters chunk parameters keyed by "chunkSize" and "chunkOverlap"; may be null or incomplete
      */
-    public void knowledgeEmbed (KnowledgeDocumentsDO knowledge, Long id) {
+    public void knowledgeEmbed (KnowledgeDocumentsDO knowledge, Long id, Map knowledgeParameters) {
 
         // 创建知识向量
+        // A missing map or key leaves the VO field null; RagHttpService then applies its own fallback value
         KnowledgeRagEmbedReqVO ragEmbedReqVo = new KnowledgeRagEmbedReqVO()
                 .setFileId(String.valueOf(knowledge.getId()))
                 .setFileName(knowledge.getDocumentName())
-                .setFileUrl(knowledge.getFileUrl());
+                .setFileUrl(knowledge.getFileUrl())
+                .setChunkSize(knowledgeParameters == null ? null : knowledgeParameters.get("chunkSize"))
+                .setChunkOverlap(knowledgeParameters == null ? null : knowledgeParameters.get("chunkOverlap"));
 
         try {
             ragHttpService.knowledgeEmbed(ragEmbedReqVo, id);
diff --git a/yudao-module-llm/yudao-module-llm-biz/src/main/java/cn/iocoder/yudao/module/llm/service/http/RagHttpService.java b/yudao-module-llm/yudao-module-llm-biz/src/main/java/cn/iocoder/yudao/module/llm/service/http/RagHttpService.java
index 4380fdb3f..5ebbebaef 100644
---
a/yudao-module-llm/yudao-module-llm-biz/src/main/java/cn/iocoder/yudao/module/llm/service/http/RagHttpService.java +++ b/yudao-module-llm/yudao-module-llm-biz/src/main/java/cn/iocoder/yudao/module/llm/service/http/RagHttpService.java @@ -390,9 +390,10 @@ public class RagHttpService { String fileName = reqVO.getFileName(); String fileUrl = reqVO.getFileUrl(); Integer chunkSize = Optional.ofNullable(reqVO.getChunkSize()).orElse(1500); + Integer chunkOverlap = Optional.ofNullable(reqVO.getChunkOverlap()).orElse(300); String mediaType = getMediaType(fileName); - log.info("文件ID: {}, 文件名: {}, 文件URL: {}, 文件类型: {}, 分块大小:{}", fileId, fileName, fileUrl, mediaType,chunkSize); + log.info("文件ID: {}, 文件名: {}, 文件URL: {}, 文件类型: {}, 分块大小:{}, 分块重叠:{}", fileId, fileName, fileUrl, mediaType,chunkSize,chunkOverlap); // 获取知识库文档 log.info("开始获取知识库文档,知识库ID: {}, 文件ID: {}", id, fileId); @@ -450,6 +451,7 @@ public class RagHttpService { .setType(MultipartBody.FORM) .addFormDataPart("file_id", fileId) .addFormDataPart("chunk_size", String.valueOf(chunkSize)) + .addFormDataPart("chunk_overlap", String.valueOf(chunkOverlap)) .addFormDataPart("file", fileName, RequestBody.create(tempFilePath.toFile(), MediaType.parse(mediaType)) ) diff --git a/yudao-module-llm/yudao-module-llm-biz/src/main/java/cn/iocoder/yudao/module/llm/service/http/vo/KnowledgeRagEmbedReqVO.java b/yudao-module-llm/yudao-module-llm-biz/src/main/java/cn/iocoder/yudao/module/llm/service/http/vo/KnowledgeRagEmbedReqVO.java index 646c3a2bb..5cf945d91 100644 --- a/yudao-module-llm/yudao-module-llm-biz/src/main/java/cn/iocoder/yudao/module/llm/service/http/vo/KnowledgeRagEmbedReqVO.java +++ b/yudao-module-llm/yudao-module-llm-biz/src/main/java/cn/iocoder/yudao/module/llm/service/http/vo/KnowledgeRagEmbedReqVO.java @@ -46,4 +46,9 @@ public class KnowledgeRagEmbedReqVO { * 分块大小 */ private Integer chunkSize; + + /** + * Chunk overlap; RagHttpService sends it as the chunk_overlap form field and falls back to 300 when it is null. + */ + private Integer chunkOverlap; } diff --git
a/yudao-module-llm/yudao-module-llm-biz/src/main/java/cn/iocoder/yudao/module/llm/service/knowledgebase/KnowledgeBaseServiceImpl.java b/yudao-module-llm/yudao-module-llm-biz/src/main/java/cn/iocoder/yudao/module/llm/service/knowledgebase/KnowledgeBaseServiceImpl.java
index 3229b0bfb..2b0fc74d9 100644
--- a/yudao-module-llm/yudao-module-llm-biz/src/main/java/cn/iocoder/yudao/module/llm/service/knowledgebase/KnowledgeBaseServiceImpl.java
+++ b/yudao-module-llm/yudao-module-llm-biz/src/main/java/cn/iocoder/yudao/module/llm/service/knowledgebase/KnowledgeBaseServiceImpl.java
@@ -17,24 +17,17 @@ import cn.iocoder.yudao.module.llm.dal.mysql.knowledgedocuments.KnowledgeDocumen
 import cn.iocoder.yudao.module.llm.service.application.ApplicationService;
 import cn.iocoder.yudao.module.llm.service.async.AsyncKnowledgeBase;
 import com.baomidou.mybatisplus.core.conditions.query.LambdaQueryWrapper;
-import kong.unirest.Unirest;
 import lombok.extern.slf4j.Slf4j;
 import org.springframework.beans.factory.annotation.Autowired;
 import org.springframework.stereotype.Service;
-import org.springframework.transaction.annotation.Transactional;
 import org.springframework.validation.annotation.Validated;
 
 import javax.annotation.Resource;
-import javax.annotation.Tainted;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.Map;
-import java.util.Objects;
+import java.util.*;
 import java.util.stream.Collectors;
 
 import static cn.iocoder.yudao.framework.common.exception.util.ServiceExceptionUtil.exception;
-import static cn.iocoder.yudao.module.llm.enums.ErrorCodeConstants.KNOWLEDGE_BASE_NAME_NOT_EXISTS;
-import static cn.iocoder.yudao.module.llm.enums.ErrorCodeConstants.KNOWLEDGE_BASE_NOT_EXISTS;
+import static cn.iocoder.yudao.module.llm.enums.ErrorCodeConstants.*;
 
 /**
  * 知识库 Service 实现类
@@ -66,68 +59,145 @@ public class KnowledgeBaseServiceImpl implements KnowledgeBaseService {
     }
 
     @Override
-//    @Transactional(rollbackFor = Exception.class)
+    // @Transactional(rollbackFor = Exception.class)
     public void updateKnowledgeBase (KnowledgeBaseSaveReqVO updateReqVO) {
+        // 1. Validate the request (knowledge base exists, name is unique, chunk parameters are legal)
+        validateKnowledgeParam(updateReqVO);
+
+        // 2. Update the basic information of the knowledge base main table
+        KnowledgeBaseDO updateObj = BeanUtils.toBean(updateReqVO, KnowledgeBaseDO.class);
+        knowledgeBaseMapper.updateById(updateObj);
+
+        // 3. Process the child table (knowledge documents)
+        handleKnowledgeDocuments(updateReqVO, updateObj);
+    }
+
+    /**
+     * Validates the parameters of a knowledge base update request.
+     *
+     * @param updateReqVO knowledge base update request
+     */
+    private void validateKnowledgeParam (KnowledgeBaseSaveReqVO updateReqVO) {
         // 1. 校验知识库是否存在
         validateKnowledgeBaseExists(updateReqVO.getId());
 
         // 2. 校验知识库名称是否重复
         validateKnowledgeBaseNameExists(updateReqVO);
 
-        // 3. 更新知识库主表
-        KnowledgeBaseDO updateObj = BeanUtils.toBean(updateReqVO, KnowledgeBaseDO.class);
-        knowledgeBaseMapper.updateById(updateObj);
+        // 3. Validate chunk size and chunk overlap
+        validateChunkParameters(updateReqVO.getChunkSize(), updateReqVO.getChunkOverlap());
+    }
 
-//        Unirest.config().reset();
-//        Unirest.config()
-//                .socketTimeout(86400000)
-//                .connectTimeout(100000)
-//                .concurrency(10, 5)
-//                .setDefaultHeader("Accept", "application/json");
+    /**
+     * Validates the effective chunk size and chunk overlap.
+     * <p>
+     * Both request fields are nullable {@code Integer}s. A missing value is replaced by the
+     * fallback that RagHttpService#knowledgeEmbed applies (1500 / 300), so the pair that is
+     * validated is the pair that is forwarded. A rule violation is reported by throwing the
+     * exception built by ServiceExceptionUtil#exception for the matching error code.
+     *
+     * @param chunkSize    chunk size, may be {@code null}
+     * @param chunkOverlap chunk overlap, may be {@code null}
+     */
+    private void validateChunkParameters (Integer chunkSize, Integer chunkOverlap) {
+        // Resolve nulls before comparing: auto-unboxing a null Integer throws NullPointerException
+        int effectiveChunkSize = chunkSize != null ? chunkSize : 1500;
+        int effectiveChunkOverlap = chunkOverlap != null ? chunkOverlap : 300;
+        if (effectiveChunkSize < 1) {
+            throw exception(CHUNK_SIZE_MUST_BE_GREATER_THAN_ZERO);
+        }
+        if (effectiveChunkOverlap < 0) {
+            throw exception(CHUNK_OVERLAP_MUST_BE_GREATER_THAN_OR_EQUAL_TO_ZERO);
+        }
+        if (effectiveChunkOverlap >= effectiveChunkSize) {
+            throw exception(CHUNK_OVERLAP_MUST_BE_LESS_THAN_CHUNK_SIZE);
+        }
+    }
 
-        // 4. 处理附表(知识文档)数据
-        if (!CollectionUtils.isAnyEmpty(updateReqVO.getKnowledgeDocuments())) {
-            // 4.1 获取需要保留的文档 ID
-            List retainedIds = updateReqVO.getKnowledgeDocuments().stream()
-                    .map(KnowledgeDocumentsSaveReqVO::getId)
-                    .filter(Objects::nonNull)
-                    .collect(Collectors.toList());
-
-            // 4.2 删除不需要保留的文档
-            LambdaQueryWrapperX deleteWrapper = new LambdaQueryWrapperX()
-                    .eq(KnowledgeDocumentsDO::getKnowledgeBaseId, updateReqVO.getId());
-            if (!CollectionUtils.isAnyEmpty(retainedIds)) {
-                deleteWrapper.notIn(KnowledgeDocumentsDO::getId, retainedIds);
-            }
-            knowledgeDocumentsMapper.delete(deleteWrapper);
-
-            // 4.3 更新或插入文档数据
-            List newDocuments = new ArrayList<>();
-            updateReqVO.getKnowledgeDocuments().forEach(doc -> {
-                KnowledgeDocumentsDO docDO = BeanUtils.toBean(doc, KnowledgeDocumentsDO.class);
-                docDO.setKnowledgeBaseId(updateReqVO.getId());
-                docDO.setChunkSize(updateObj.getKnowledgeLength());
-                if (doc.getId() == null) {
-                    newDocuments.add(docDO); // 收集新增文档
-                }
-                knowledgeDocumentsMapper.insertOrUpdate(docDO); // 更新或插入文档
-            });
-
-            // 4.4 异步处理新增文档和删除的文档
-            List deleteIds = knowledgeDocumentsMapper.selectDeleteIds(updateReqVO.getId());
-            asyncKnowledgeBase.createKnowledgeBase(newDocuments, deleteIds);
-        } else {
-            // 5. 如果传入的文档列表为空,则删除所有关联文档
-            knowledgeDocumentsMapper.delete(new LambdaQueryWrapperX()
-                    .eq(KnowledgeDocumentsDO::getKnowledgeBaseId, updateReqVO.getId()));
-
-            // 5.1 异步处理删除的文档
-            List deleteIds = knowledgeDocumentsMapper.selectDeleteIds(updateReqVO.getId());
-            if (!CollectionUtils.isAnyEmpty(deleteIds)) {
-                asyncKnowledgeBase.createKnowledgeBase(null, deleteIds);
-            }
+    /**
+     * Synchronizes the knowledge documents of a knowledge base with the update request.
+     *
+     * @param updateReqVO knowledge base update request
+     * @param updateObj   knowledge base object that was just written by the caller
+     */
+    private void handleKnowledgeDocuments (KnowledgeBaseSaveReqVO updateReqVO, KnowledgeBaseDO updateObj) {
+        List documents = updateReqVO.getKnowledgeDocuments();
+        if (CollectionUtils.isAnyEmpty(documents)) {
+            // An empty document list means: delete every associated document
+            deleteAllDocuments(updateReqVO.getId());
+            return;
         }
 
+        // Collect the IDs of the documents to retain
+        List retainedIds = documents.stream()
+                .map(KnowledgeDocumentsSaveReqVO::getId)
+                .filter(Objects::nonNull)
+                .collect(Collectors.toList());
+
+        // Delete the documents that are not retained
+        deleteUnretainedDocuments(updateReqVO.getId(), retainedIds);
+
+        // Update or insert the document rows
+        List newDocuments = updateOrInsertDocuments(documents, updateReqVO.getId(), updateObj.getKnowledgeLength());
+
+        Map knowledgeParameters = new HashMap<>();
+        knowledgeParameters.put("chunkSize", updateReqVO.getChunkSize());
+        knowledgeParameters.put("chunkOverlap", updateReqVO.getChunkOverlap());
+
+        // Hand the new documents and the deleted document IDs to AsyncKnowledgeBase
+        List deleteIds = knowledgeDocumentsMapper.selectDeleteIds(updateReqVO.getId());
+        asyncKnowledgeBase.createKnowledgeBase(newDocuments, deleteIds, knowledgeParameters);
+    }
+
+    /**
+     * Deletes all documents associated with a knowledge base.
+     *
+     * @param knowledgeBaseId knowledge base ID
+     */
+    private void deleteAllDocuments (Long knowledgeBaseId) {
+        knowledgeDocumentsMapper.delete(new LambdaQueryWrapperX()
+                .eq(KnowledgeDocumentsDO::getKnowledgeBaseId, knowledgeBaseId));
+
+        // Pass the deleted document IDs on; no new documents and no chunk parameters are needed here
+        List deleteIds = knowledgeDocumentsMapper.selectDeleteIds(knowledgeBaseId);
+        if (!CollectionUtils.isAnyEmpty(deleteIds)) {
+            asyncKnowledgeBase.createKnowledgeBase(new ArrayList<>(), deleteIds, new HashMap<>());
+        }
+    }
+
+    /**
+     * Deletes the documents of a knowledge base that are not in the retained list.
+     * @param knowledgeBaseId knowledge base ID
+     * @param retainedIds     IDs of the documents to keep; when empty, every document of the knowledge base is deleted
+     */
+    private void deleteUnretainedDocuments (Long knowledgeBaseId, List retainedIds) {
+        LambdaQueryWrapperX deleteWrapper = new LambdaQueryWrapperX()
+                .eq(KnowledgeDocumentsDO::getKnowledgeBaseId, knowledgeBaseId);
+        if (!CollectionUtils.isAnyEmpty(retainedIds)) {
+            deleteWrapper.notIn(KnowledgeDocumentsDO::getId, retainedIds);
+        }
+        knowledgeDocumentsMapper.delete(deleteWrapper);
+    }
+
+    /**
+     * Updates or inserts the document rows of a knowledge base.
+     * @param documents       documents from the update request
+     * @param knowledgeBaseId knowledge base ID
+     * @param chunkSize       value stored on every document row (the caller passes updateObj.getKnowledgeLength())
+     * @return the documents that had no ID in the request, i.e. the newly added ones
+     */
+    private List updateOrInsertDocuments (List documents, Long knowledgeBaseId, Integer chunkSize) {
+        List newDocuments = new ArrayList<>();
+        documents.forEach(doc -> {
+            KnowledgeDocumentsDO docDO = BeanUtils.toBean(doc, KnowledgeDocumentsDO.class);
+            docDO.setKnowledgeBaseId(knowledgeBaseId);
+            docDO.setChunkSize(chunkSize); // restored: the pre-refactoring code set this on every row
+            if (doc.getId() == null) {
+                newDocuments.add(docDO); // collect newly added documents
+            }
+            knowledgeDocumentsMapper.insertOrUpdate(docDO); // update or insert the document
+        });
+        return newDocuments;
     }
 
     @Override