diff --git a/yudao-module-llm/yudao-module-llm-biz/pom.xml b/yudao-module-llm/yudao-module-llm-biz/pom.xml index 6474d8cab..e38fda93e 100644 --- a/yudao-module-llm/yudao-module-llm-biz/pom.xml +++ b/yudao-module-llm/yudao-module-llm-biz/pom.xml @@ -152,6 +152,12 @@ org.springframework spring-webflux + + + cn.iocoder.boot + yudao-module-mdpf-biz + 2.3.0-jdk8-SNAPSHOT + diff --git a/yudao-module-llm/yudao-module-llm-biz/src/main/java/cn/iocoder/yudao/module/llm/service/async/AsyncDataSetService.java b/yudao-module-llm/yudao-module-llm-biz/src/main/java/cn/iocoder/yudao/module/llm/service/async/AsyncDataSetService.java index e350fa28f..e57b6cf56 100644 --- a/yudao-module-llm/yudao-module-llm-biz/src/main/java/cn/iocoder/yudao/module/llm/service/async/AsyncDataSetService.java +++ b/yudao-module-llm/yudao-module-llm-biz/src/main/java/cn/iocoder/yudao/module/llm/service/async/AsyncDataSetService.java @@ -12,6 +12,8 @@ import cn.iocoder.yudao.module.llm.service.dataset.DatasetFilesService; import cn.iocoder.yudao.module.llm.service.dataset.vo.AigcDatasetVo; import cn.iocoder.yudao.module.llm.service.http.TrainHttpService; import cn.iocoder.yudao.module.llm.service.http.vo.AigcDatasetFileRespV0; +import cn.iocoder.yudao.module.mdpf.controller.dataset.vo.PlatformDatasetQuestionRespVO; +import cn.iocoder.yudao.module.mdpf.dal.dataobject.dataset.DataSetMiddleDO; import com.alibaba.fastjson.JSONObject; import com.baomidou.mybatisplus.core.toolkit.StringUtils; import com.baomidou.mybatisplus.core.toolkit.Wrappers; @@ -184,4 +186,94 @@ public class AsyncDataSetService { } } + + + public String JsonFileWriteFineMiddle (String hostUrl, DataSetMiddleDO datasetDO, List datasetQuestionList) { + try { + log.info("开始生成 JSON 文件并上传,数据集ID: {}", datasetDO.getId()); + + // 构建 AigcDatasetVo 列表 + log.debug("正在构建 AigcDatasetVo 列表..."); + List aigcDatasetVoList = new ArrayList<>(); + for (PlatformDatasetQuestionRespVO dataSource : datasetQuestionList) { + AigcDatasetVo aigcDatasetVo = new 
AigcDatasetVo(); + aigcDatasetVo.setInstruction(StringUtils.isNotBlank(dataSource.getSystem()) ? dataSource.getSystem() : ""); + aigcDatasetVo.setInput(StringUtils.isNotBlank(dataSource.getQuestion()) ? dataSource.getQuestion() : ""); + + // 检查答案列表是否为空 + if (!CollectionUtils.isAnyEmpty(dataSource.getDatasetAnswerRespVO())) { + aigcDatasetVo.setOutput(StringUtils.isNotBlank(dataSource.getDatasetAnswerRespVO().get(0).getAnswer()) ? + dataSource.getDatasetAnswerRespVO().get(0).getAnswer() : ""); + } else { + aigcDatasetVo.setOutput(""); + } + aigcDatasetVoList.add(aigcDatasetVo); + } + log.debug("AigcDatasetVo 列表构建完成。记录数量: {}", aigcDatasetVoList.size()); + + // 将 AigcDatasetVo 列表转换为 JSON 字符串 + log.debug("正在将 AigcDatasetVo 列表转换为 JSON 字符串..."); + ObjectMapper mapper = new ObjectMapper(); + StringBuilder sb = new StringBuilder(); + for (AigcDatasetVo aigcDatasetVo : aigcDatasetVoList) { + String json = mapper.writeValueAsString(aigcDatasetVo); + sb.append(json).append("\n"); + } + + // 将 JSON 字符串转换为输入流 + log.debug("正在将 JSON 字符串转换为输入流..."); + InputStream inputStream = new ByteArrayInputStream(sb.toString().getBytes()); + + // 上传文件 + log.info("正在上传 JSON 文件..."); + String fileName = datasetDO.getDatasetName() + "new" + datasetDO.getId() + ".json"; + AigcDatasetFileRespV0 aigcDatasetFileRespV0 = trainHttpService.AigcUploadFile(new HashMap<>(), hostUrl, inputStream, fileName); + + if (aigcDatasetFileRespV0 != null) { + log.debug("文件上传成功。文件ID: {}", aigcDatasetFileRespV0.getFileId()); + + // 更新数据集的 Job ID + log.debug("正在更新数据集的 Job ID..."); + datasetMapper.setJobid(datasetDO.getId(), aigcDatasetFileRespV0.getFileId()); + + log.info("hostUrl:{}", hostUrl); + // 更新数据集的 URL + String s3Url = aigcDatasetFileRespV0.getS3Url(); + log.info("s3Url:{}", s3Url); + + // int lastIndex = s3Url.lastIndexOf("/storage"); + // String url = s3Url.substring(lastIndex + 1); + // log.info("url:{}", url); + // 找到 "/uploads" 的位置 + int uploadsIndex = s3Url.indexOf("/uploads"); + if (uploadsIndex == -1) 
{ + log.error("s3Url 中未找到 '/uploads' 路径"); + return ""; + } + + // 提取 "/uploads" 及之后的部分 + String uploadsPath = s3Url.substring(uploadsIndex); + log.info("uploadsPath: {}", uploadsPath); + + // 构建新的完整 URL + String newUrl = hostUrl + uploadsPath; + log.info("newUrl: {}", newUrl); + datasetMapper.setUrl(datasetDO.getId(), newUrl); + + // 返回结果 + String result = newUrl.substring(hostUrl.length()); + log.info("JSON 文件生成并上传成功。返回结果: {}", result); + + return result; + } else { + log.error("文件上传失败。数据集ID: {}", datasetDO.getId()); + return ""; + } + + } catch (IOException e) { + log.error("生成或上传 JSON 文件时发生异常。数据集ID: {}", datasetDO.getId(), e); + return ""; + } + } + } diff --git a/yudao-module-llm/yudao-module-llm-biz/src/main/java/cn/iocoder/yudao/module/llm/service/async/AsyncFineTuningTaskService.java b/yudao-module-llm/yudao-module-llm-biz/src/main/java/cn/iocoder/yudao/module/llm/service/async/AsyncFineTuningTaskService.java index 588db41cc..807e4335f 100644 --- a/yudao-module-llm/yudao-module-llm-biz/src/main/java/cn/iocoder/yudao/module/llm/service/async/AsyncFineTuningTaskService.java +++ b/yudao-module-llm/yudao-module-llm-biz/src/main/java/cn/iocoder/yudao/module/llm/service/async/AsyncFineTuningTaskService.java @@ -17,8 +17,13 @@ import cn.iocoder.yudao.module.llm.service.http.FineTuningTaskHttpService; import cn.iocoder.yudao.module.llm.service.http.TrainHttpService; import cn.iocoder.yudao.module.llm.service.http.vo.AigcFineTuningCreateReqVO; import cn.iocoder.yudao.module.llm.service.http.vo.AigcFineTuningCreateRespVO; +import cn.iocoder.yudao.module.mdpf.controller.dataset.vo.PlatformDatasetQuestionRespVO; +import cn.iocoder.yudao.module.mdpf.dal.dataobject.dataset.DataSetMiddleDO; +import cn.iocoder.yudao.module.mdpf.service.dataset.DataSetMiddleService; +import cn.iocoder.yudao.module.mdpf.service.dataset.PlatformDatasetQuestionService; import lombok.extern.slf4j.Slf4j; import org.jetbrains.annotations.Nullable; +import 
org.springframework.beans.factory.annotation.Autowired; import org.springframework.beans.factory.annotation.Value; import org.springframework.context.annotation.Lazy; import org.springframework.scheduling.annotation.Async; @@ -58,12 +63,121 @@ public class AsyncFineTuningTaskService { @Value("${spring.profiles.active}") private String active; + @Autowired + @Lazy + private DataSetMiddleService dataSetMiddleService; + + @Autowired + @Lazy + private PlatformDatasetQuestionService platformDatasetQuestionService; + //大模型平台创建调优任务 @Async public void createTuning (FineTuningTaskDO fineTuningTask) { // 记录开始创建任务的日志 log.info("异步创建。 开始创建微调任务,请求参数: {}", fineTuningTask); + try { + log.info("开始创建微调任务,任务ID: {}", fineTuningTask.getId()); + + ServerNameDO serverNameDO = getServerNameDO(fineTuningTask); + if (serverNameDO == null) { + return; + } + + // 构建微调任务请求对象 + log.debug("正在构建微调任务请求对象..."); + AigcFineTuningCreateReqVO req = getAigcFineTuningCreateReqVO(fineTuningTask); + + // 查询基础模型信息 + log.debug("正在查询基础模型信息,模型ID: {}", fineTuningTask.getBaseModelId()); + BaseModelDO baseModelDO = baseModelMapper.selectById(fineTuningTask.getBaseModelId()); + if (baseModelDO != null) { + req.setModel(baseModelDO.getAigcModelName()); + log.debug("基础模型信息设置成功。模型名称: {}", baseModelDO.getAigcModelName()); + } else { + log.warn("未找到基础模型信息,模型ID: {}", fineTuningTask.getBaseModelId()); + } + + // 查询数据集信息 + Long datasetId = fineTuningTask.getDataset(); + log.debug("正在查询数据集信息,数据集ID: {}", datasetId); +// DatasetRespVO dataset = datasetService.getDataset(datasetId); + DataSetMiddleDO dataset = dataSetMiddleService.getOne(datasetId); + if (dataset == null) { + log.error("未找到数据集信息,数据集ID: {}", datasetId); + throw new RuntimeException("数据集信息不存在"); + } + log.debug("数据集信息查询成功。数据集名称: {}", dataset.getDatasetName()); + + // 查询数据集问题列表 + log.debug("正在查询数据集问题列表,数据集ID: {}", dataset.getId()); + List datasetQuestionList = platformDatasetQuestionService.getDatasetQuestionList(dataset.getId()); + log.debug("数据集问题列表查询成功。问题数量: 
{}", datasetQuestionList.size()); + + // 将数据集信息转换为 DO 对象 + log.debug("正在转换数据集信息为 DO 对象..."); + DatasetDO datasetDO = BeanUtils.toBean(dataset, DatasetDO.class); + + // 生成 JSON 文件并获取文件 URL + log.debug("正在生成 JSON 文件并获取文件 URL..."); + String fileUrl = dataSetService.JsonFileWriteFineMiddle(serverNameDO.getHost(), dataset, datasetQuestionList); + req.setDataset(fileUrl); + log.info("JSON 文件生成成功。文件 URL: {}", fileUrl); + + // 设置部署次数 + int newDeployCount = Optional.ofNullable(fineTuningTask.getDeployCount()) + .orElse(0) + 1; + fineTuningTask.setDeployCount(newDeployCount); + + // 设置后缀 + req.setSuffix(active + "-" + fineTuningTask.getId() + "-" + newDeployCount); + log.info("请求参数设置完成。后缀: {}", req.getSuffix()); + + // 调用模型服务创建微调任务 + log.info("正在调用模型服务创建微调任务..."); + AigcFineTuningCreateRespVO resp=null; + String modelType = baseModelDO.getModelType(); + if("1".equals(modelType)){ + resp = fineTuningTaskHttpService.finetuningCreate(new HashMap<>(), serverNameDO.getHost(), req); + }else{ + resp = fineTuningTaskHttpService.finetuningCreateModal(new HashMap<>(), serverNameDO.getHost(), req); + } + + + // 更新任务状态 + FineTuningTaskDO updateObj = new FineTuningTaskDO(); + updateObj.setId(fineTuningTask.getId()); + updateObj.setDeployCount(newDeployCount); + if (resp != null && resp.getId() != 0) { + updateObj.setJobId(resp.getJobId()); + updateObj.setStatus(FinetuningTaskStatusEnum.WAITING.getStatus()); + updateObj.setJobModelName(resp.getFineTunedModel()); + updateObj.setTrainLog(resp.getTrainLog()); + updateObj.setMergeLogPath(resp.getMergeLogPath()); + log.info("微调任务创建成功。任务ID: {}, 任务模型名称: {} , 任务状态: {}", fineTuningTask.getId(), resp.getFineTunedModel(), FinetuningTaskStatusEnum.WAITING.getStatus()); + } else { + updateObj.setStatus(FinetuningTaskStatusEnum.FAILED.getStatus()); + log.error("微调任务创建失败。任务ID: {}", fineTuningTask.getId()); + } + + // 更新数据库 + log.debug("正在更新数据库中的任务状态..."); + fineTuningTaskMapper.updateById(updateObj); + log.info("数据库更新完成。任务ID: {}", 
fineTuningTask.getId()); + + } catch (Exception e) { + log.error("创建微调任务时发生异常。任务ID: {}", fineTuningTask.getId(), e); + throw e; + } + } + + + @Async + public void createTuningMiddleData (FineTuningTaskDO fineTuningTask) { + // 记录开始创建任务的日志 + log.info("异步创建。 开始创建微调任务,请求参数: {}", fineTuningTask); + try { log.info("开始创建微调任务,任务ID: {}", fineTuningTask.getId()); @@ -113,7 +227,7 @@ public class AsyncFineTuningTaskService { // 设置部署次数 int newDeployCount = Optional.ofNullable(fineTuningTask.getDeployCount()) - .orElse(0) + 1; + .orElse(0) + 1; fineTuningTask.setDeployCount(newDeployCount); // 设置后缀 @@ -125,7 +239,7 @@ public class AsyncFineTuningTaskService { AigcFineTuningCreateRespVO resp=null; String modelType = baseModelDO.getModelType(); if("1".equals(modelType)){ - resp = fineTuningTaskHttpService.finetuningCreate(new HashMap<>(), serverNameDO.getHost(), req); + resp = fineTuningTaskHttpService.finetuningCreate(new HashMap<>(), serverNameDO.getHost(), req); }else{ resp = fineTuningTaskHttpService.finetuningCreateModal(new HashMap<>(), serverNameDO.getHost(), req); } diff --git a/yudao-module-llm/yudao-module-llm-biz/src/main/java/cn/iocoder/yudao/module/llm/service/finetuningtask/FineTuningTaskServiceImpl.java b/yudao-module-llm/yudao-module-llm-biz/src/main/java/cn/iocoder/yudao/module/llm/service/finetuningtask/FineTuningTaskServiceImpl.java index 1df57a5d9..2f2f469ee 100644 --- a/yudao-module-llm/yudao-module-llm-biz/src/main/java/cn/iocoder/yudao/module/llm/service/finetuningtask/FineTuningTaskServiceImpl.java +++ b/yudao-module-llm/yudao-module-llm-biz/src/main/java/cn/iocoder/yudao/module/llm/service/finetuningtask/FineTuningTaskServiceImpl.java @@ -96,7 +96,8 @@ public class FineTuningTaskServiceImpl implements FineTuningTaskService { // 异步调用模型服务,创建调优任务 log.debug("正在异步调用模型服务,创建微调任务..."); - asyncFineTuningTaskService.createTuning(fineTuningTask); +// asyncFineTuningTaskService.createTuning(fineTuningTask); + 
asyncFineTuningTaskService.createTuningMiddleData(fineTuningTask); log.info("已成功发起异步微调任务创建。任务ID: {}", fineTuningTask.getId()); // 返回任务ID diff --git a/yudao-module-mdpf/yudao-module-mdpf-api/src/main/java/cn/iocoder/module/mdpf/enums/DatasetStatusMiddleEnum.java b/yudao-module-mdpf/yudao-module-mdpf-api/src/main/java/cn/iocoder/module/mdpf/enums/DatasetStatusMiddleEnum.java new file mode 100644 index 000000000..4cf27b555 --- /dev/null +++ b/yudao-module-mdpf/yudao-module-mdpf-api/src/main/java/cn/iocoder/module/mdpf/enums/DatasetStatusMiddleEnum.java @@ -0,0 +1,27 @@ +package cn.iocoder.module.mdpf.enums; + +import cn.hutool.core.util.ObjUtil; +import lombok.AllArgsConstructor; +import lombok.Getter; + +import java.util.Arrays; + +@Getter +@AllArgsConstructor +public enum DatasetStatusMiddleEnum { + NOPENDING("未标注",0), + RUNNING("进行中",1), + SUCCESS("已完成",2); + private final String name; + private final Integer status; + public static final int[] ARRAYS = Arrays.stream(values()).mapToInt(DatasetStatusMiddleEnum::getStatus).toArray(); + + public static String getStatusByName(Integer status) { + for (DatasetStatusMiddleEnum name : values()) { + if (ObjUtil.equal(name.getStatus(), status)) { + return name.getName(); + } + } + return null; // 如果未找到对应的 name,返回 null + } +} diff --git a/yudao-module-mdpf/yudao-module-mdpf-api/src/main/java/cn/iocoder/module/mdpf/enums/ErrorCodeConstants.java b/yudao-module-mdpf/yudao-module-mdpf-api/src/main/java/cn/iocoder/module/mdpf/enums/ErrorCodeConstants.java new file mode 100644 index 000000000..3ce342384 --- /dev/null +++ b/yudao-module-mdpf/yudao-module-mdpf-api/src/main/java/cn/iocoder/module/mdpf/enums/ErrorCodeConstants.java @@ -0,0 +1,129 @@ +package cn.iocoder.module.mdpf.enums; + +import cn.iocoder.yudao.framework.common.exception.ErrorCode; + +public interface ErrorCodeConstants { + ErrorCode KNOWLEDGE_BASE_NOT_EXISTS = new ErrorCode(10001, "知识库不存在"); + + ErrorCode DATASET_NOT_EXISTS = new ErrorCode(10002, "数据集不存在"); + + 
ErrorCode MODEL_SERVICE_NOT_EXISTS = new ErrorCode(10003, "模型服务不存在"); + + ErrorCode LABEL_NOT_EXISTS = new ErrorCode(10004,"标签不存在"); + + ErrorCode FINE_TUNING_TASK_NOT_EXISTS = new ErrorCode(10005, "微调任务不存在"); + + ErrorCode APPLICATION_NOT_EXISTS = new ErrorCode(10006, "应用名称服务不存在"); + + ErrorCode DATA_REFLUX_DATA_NOT_EXISTS = new ErrorCode(10007, "数据回流 —— 数据不存在"); + + ErrorCode DATA_REFLUX_CONFIG_NOT_EXISTS = new ErrorCode(10008, "数据回流不存在"); + + ErrorCode MODEL_ASSESS_TASK_MANUAL_NOT_EXISTS = new ErrorCode(10009, "人工评估不存在"); + + ErrorCode MODEL_ASSESS_DIMENSION_NOT_EXISTS = new ErrorCode(10010, "评估维度不存在"); + + ErrorCode MODEL_ASSESS_TASK_DIMENSION_NOT_EXISTS = new ErrorCode(10011, "人工评估维度不存在"); + + ErrorCode MODEL_ASSESS_TASK_AUTO_NOT_EXISTS = new ErrorCode(10012, "自动评估维度不存在"); + + ErrorCode MODEL_ASSESS_TASK_STOPLIST_NOT_EXISTS = new ErrorCode(10013, "自动评估维度不存在"); + + ErrorCode MODEL_ASSESS_STOPLIST_NOT_EXISTS = new ErrorCode(10014, "自动评估维度不存在"); + ErrorCode THE_AUTO_EVALUATE_DIMENSION_IS_IN_USE = new ErrorCode(10014_1, "自动评估维度正在使用中不可删除"); + + ErrorCode LEARNING_RESOURCES_NOT_EXISTS = new ErrorCode(10015, "学习资源不存在"); + + ErrorCode LEARNING_RESOURCES_FILE_URL_NOT_NULL = new ErrorCode(10016, "文件地址不能为空"); + ErrorCode VIDEO_COVER_IMAGE_EMPTY = new ErrorCode(10016_1, "视频封面图不能为空"); +/* + ErrorCode DATASET_FILES_NOT_EXISTS = new ErrorCode(10016, "数据集文件资源不存在"); + ErrorCode DATASET_QUESTION_NOT_EXISTS = new ErrorCode(10017, "数据集标准问题不存在"); + ErrorCode DATASET_ANSWER_NOT_EXISTS = new ErrorCode(10018, "数据集标准问题答案不存在"); +*/ + + ErrorCode PROMPT_TEMPLATES_NOT_EXISTS = new ErrorCode(100_1000, "模板信息不存在"); + + ErrorCode PROMPT_TEMPLATES_EXISTS = new ErrorCode(100_1001, "模板信息已存在"); + + ErrorCode PROMPT_TEMPLATESBACKUP_EXISTS = new ErrorCode(100_1002, "模板信息已备份"); + + ErrorCode PROMPT_TEMPLATES_BACKUP_NOT_EXISTS = new ErrorCode(101_1000, "Prompt模板备份不存在"); + + ErrorCode PROMPT_TEMPLATES_APPLICATIONS_NOT_EXISTS = new ErrorCode(10017, "模板信息不存在"); + + ErrorCode 
PROMPT_TEMPLATES_TAGS_NOT_EXISTS = new ErrorCode(10017, "模板信息不存在"); + + ErrorCode FINE_TUNING_LOSS_NOT_EXISTS = new ErrorCode(10018, "损失记录不存在"); + + ErrorCode FINE_TUNING_NOT_EXISTS = new ErrorCode(10019, "模型微调不存在"); + + ErrorCode DATA_PROCESS_TASK_NOT_EXISTS = new ErrorCode(10020, "数据处理任务不存在"); + + ErrorCode CONVERSATION_NOT_EXISTS = new ErrorCode(10021, "大模型对话记录不存在"); + + ErrorCode BASE_MODEL_NOT_EXISTS = new ErrorCode(10022, "基座模型不存在"); + + ErrorCode DATASET_ANSWER_NOT_EXISTS = new ErrorCode(10023, "数据集数据问题标注内容不存在"); + + ErrorCode DATASET_FILES_NOT_EXISTS = new ErrorCode(10024, "数据集数据文件不存在"); + + ErrorCode DATASET_QUESTION_NOT_EXISTS = new ErrorCode(10025, "数据集数据问题不存在"); + + ErrorCode KNOWLEDGE_DOCUMENTS_NOT_EXISTS = new ErrorCode(10026, "知识库文档不存在"); + + ErrorCode KNOWLEDGE_DOCUMENTS_CHUNKS_NOT_EXISTS = new ErrorCode(10027, "知识库文档块不存在"); + + ErrorCode KNOWLEDGE_DOCUMENTS_CHUNKS_VECTORIZED_NOT_EXISTS = new ErrorCode(10028, "向量化存储不存在"); + + ErrorCode TRAINING_NOT_EXISTS = new ErrorCode(10029, "训练不存在"); + + ErrorCode MODEL_COMPLETIONS_ERROR = new ErrorCode(10030, "模型推理失败"); + + ErrorCode MANUAL_MODEL_ANSWER_NOT_EXISTS = new ErrorCode(10031, "模型评估人工评估信息不存在"); + + ErrorCode MANUAL_MODEL_ANNO_NOT_EXISTS = new ErrorCode(10032, "模型评估人工评估标注信息不存在"); + + ErrorCode MODEL_ASSESS_TASK_MANUAL_BACKUP_NOT_EXISTS = new ErrorCode(10033, "人工评估备份不存在"); + + ErrorCode MODEL_ASSESS_TASK_MANUAL_BACKUP_EXISTS = new ErrorCode(100_1002, "人工评估已备份"); + + ErrorCode DATASET_NAME_EXISTS = new ErrorCode(10034, "数据集名称重复"); + + ErrorCode MODEL_ASSESS_TASK_MANUAL_NAME_EXISTS = new ErrorCode(10035, "模型评估任务人工评估名称重复"); + + ErrorCode PROMPT_TEMPLATES_APPLICATIONS_BACKUP_NOT_EXISTS = new ErrorCode(10036, "模板信息不存在"); + + ErrorCode PROMPT_TEMPLATES_TAGS_BACKUP_NOT_EXISTS = new ErrorCode(10037, "模板信息不存在"); + + ErrorCode DATA_PROCESS_TASK_NAME_NOT_EXISTS = new ErrorCode(10038, "数据处理任务名称已存在"); + + ErrorCode FINE_TUNING_TASK_NAME_NOT_EXISTS = new ErrorCode(10039, "模型调优任务名称已存在"); + + ErrorCode 
LEARNING_RESOURCES_NAME_NOT_EXISTS = new ErrorCode(10040, "学习资源标题名称已存在"); + + ErrorCode KNOWLEDGE_BASE_NAME_NOT_EXISTS = new ErrorCode(10040, "知识库名称已存在"); + + ErrorCode CHUNK_SIZE_MUST_BE_GREATER_THAN_ZERO = new ErrorCode(10040_1, "分块大小必须大于 0"); + + ErrorCode CHUNK_OVERLAP_MUST_BE_GREATER_THAN_OR_EQUAL_TO_ZERO = new ErrorCode(10040_2, "分块重叠必须大于或等于 0"); + + ErrorCode CHUNK_OVERLAP_MUST_BE_LESS_THAN_CHUNK_SIZE = new ErrorCode(10040_3, "分块重叠必须小于分块大小"); + + ErrorCode APPLICATION_NAME_NOT_EXISTS = new ErrorCode(10041, "应用中心名称已存在"); + + ErrorCode MODEL_SERVIC_ENAME_NOT_EXISTS = new ErrorCode(10043, "模型名称已存在"); + + ErrorCode OPTIMIZE_PROMPT_NOT_EXISTS = new ErrorCode(10044, "优化后信息不存在"); + ErrorCode LABEL_NAME_EXISTS = new ErrorCode(10045, "标签名称重复"); + + + ErrorCode PARSE_CSV_ERROR = new ErrorCode(10034, "请正确上传csv格式得数据!!!"); + + ErrorCode QUESTION_NOT_EXISTS = new ErrorCode(10046, "数据集信息不完整,无法进行评估"); + ErrorCode BASE_MODEL_NAME_EXISTS = new ErrorCode(10047, "基座模型名称重复"); + ErrorCode SERVER_NAME_NOT_EXISTS = new ErrorCode(10048, "服务器主机名称不存在"); + ErrorCode SERVER_NAME_URL_ERROR = new ErrorCode(10049, "主机地址URL格式不正确"); + ErrorCode SET_FILE_MIDDLE_NOT_EXISTS = new ErrorCode(10050, "文件不能为空"); + ErrorCode SET_MIDDLE_NOT_EXISTS = new ErrorCode(10051, "数据集ID不能为空"); +} diff --git a/yudao-module-mdpf/yudao-module-mdpf-biz/pom.xml b/yudao-module-mdpf/yudao-module-mdpf-biz/pom.xml index 4a1923214..6c53247ee 100644 --- a/yudao-module-mdpf/yudao-module-mdpf-biz/pom.xml +++ b/yudao-module-mdpf/yudao-module-mdpf-biz/pom.xml @@ -54,5 +54,37 @@ cn.iocoder.boot yudao-spring-boot-starter-excel + + + org.springframework.boot + spring-boot-starter-data-mongodb + + + com.github.houbb + opencc4j + 1.8.1 + + + com.github.houbb + sensitive-word + 0.24.0 + compile + + + org.apache.opennlp + opennlp-tools + 1.9.3 + + + + com.hankcs + hanlp + portable-1.3.4 + + + cn.iocoder.boot + yudao-module-infra-biz + 2.3.0-jdk8-SNAPSHOT + diff --git 
a/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/controller/dataset/DataSetFileMiddleController.java b/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/controller/dataset/DataSetFileMiddleController.java new file mode 100644 index 000000000..8dc010763 --- /dev/null +++ b/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/controller/dataset/DataSetFileMiddleController.java @@ -0,0 +1,98 @@ +package cn.iocoder.yudao.module.mdpf.controller.dataset; + +import cn.iocoder.yudao.module.mdpf.controller.dataset.vo.DataSetFileMiddlePageReqVO; +import cn.iocoder.yudao.module.mdpf.controller.dataset.vo.DataSetFileMiddleRespVO; +import cn.iocoder.yudao.module.mdpf.controller.dataset.vo.DataSetFileMiddleSaveReqVO; +import cn.iocoder.yudao.module.mdpf.dal.dataobject.dataset.DataSetFileMiddleDO; +import cn.iocoder.yudao.module.mdpf.service.dataset.DataSetFileMiddleService; +import org.springframework.web.bind.annotation.*; +import org.springframework.validation.annotation.Validated; +import org.springframework.security.access.prepost.PreAuthorize; +import io.swagger.v3.oas.annotations.tags.Tag; +import io.swagger.v3.oas.annotations.Parameter; +import io.swagger.v3.oas.annotations.Operation; + +import java.util.*; +import java.io.IOException; + +import cn.iocoder.yudao.framework.common.pojo.PageParam; +import cn.iocoder.yudao.framework.common.pojo.PageResult; +import cn.iocoder.yudao.framework.common.pojo.CommonResult; +import cn.iocoder.yudao.framework.common.util.object.BeanUtils; +import static cn.iocoder.yudao.framework.common.pojo.CommonResult.success; + +import cn.iocoder.yudao.framework.excel.core.util.ExcelUtils; + +import cn.iocoder.yudao.framework.apilog.core.annotation.ApiAccessLog; + +import javax.annotation.Resource; +import javax.servlet.http.HttpServletResponse; +import javax.validation.Valid; + +import static 
cn.iocoder.yudao.framework.apilog.core.enums.OperateTypeEnum.*; + + +@Tag(name = "管理后台 - 数据集对应的文件地址") +@RestController +@RequestMapping("/data/set-file-middle") +@Validated +public class DataSetFileMiddleController { + + @Resource + private DataSetFileMiddleService setFileMiddleService; + + @PostMapping("/create") + @Operation(summary = "创建数据集对应的文件地址") + @PreAuthorize("@ss.hasPermission('data:set-file-middle:create')") + public CommonResult createSetFileMiddle(@Valid @RequestBody DataSetFileMiddleSaveReqVO createReqVO) { + return success(setFileMiddleService.createSetFileMiddle(createReqVO)); + } + + @PutMapping("/update") + @Operation(summary = "更新数据集对应的文件地址") + @PreAuthorize("@ss.hasPermission('data:set-file-middle:update')") + public CommonResult updateSetFileMiddle(@Valid @RequestBody DataSetFileMiddleSaveReqVO updateReqVO) { + setFileMiddleService.updateSetFileMiddle(updateReqVO); + return success(true); + } + + @DeleteMapping("/delete") + @Operation(summary = "删除数据集对应的文件地址") + @Parameter(name = "id", description = "编号", required = true) + @PreAuthorize("@ss.hasPermission('data:set-file-middle:delete')") + public CommonResult deleteSetFileMiddle(@RequestParam("id") Long id) { + setFileMiddleService.deleteSetFileMiddle(id); + return success(true); + } + + @GetMapping("/get") + @Operation(summary = "获得数据集对应的文件地址") + @Parameter(name = "id", description = "编号", required = true, example = "1024") + @PreAuthorize("@ss.hasPermission('data:set-file-middle:query')") + public CommonResult getSetFileMiddle(@RequestParam("id") Long id) { + DataSetFileMiddleDO setFileMiddle = setFileMiddleService.getSetFileMiddle(id); + return success(BeanUtils.toBean(setFileMiddle, DataSetFileMiddleRespVO.class)); + } + + @GetMapping("/page") + @Operation(summary = "获得数据集对应的文件地址分页") + @PreAuthorize("@ss.hasPermission('data:set-file-middle:query')") + public CommonResult> getSetFileMiddlePage(@Valid DataSetFileMiddlePageReqVO pageReqVO) { + PageResult pageResult = 
setFileMiddleService.getSetFileMiddlePage(pageReqVO); + return success(BeanUtils.toBean(pageResult, DataSetFileMiddleRespVO.class)); + } + + @GetMapping("/export-excel") + @Operation(summary = "导出数据集对应的文件地址 Excel") + @PreAuthorize("@ss.hasPermission('data:set-file-middle:export')") + @ApiAccessLog(operateType = EXPORT) + public void exportSetFileMiddleExcel(@Valid DataSetFileMiddlePageReqVO pageReqVO, + HttpServletResponse response) throws IOException { + pageReqVO.setPageSize(PageParam.PAGE_SIZE_NONE); + List list = setFileMiddleService.getSetFileMiddlePage(pageReqVO).getList(); + // 导出 Excel + ExcelUtils.write(response, "数据集对应的文件地址.xls", "数据", DataSetFileMiddleRespVO.class, + BeanUtils.toBean(list, DataSetFileMiddleRespVO.class)); + } + +} \ No newline at end of file diff --git a/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/controller/dataset/DataSetMiddleController.java b/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/controller/dataset/DataSetMiddleController.java new file mode 100644 index 000000000..96b3662af --- /dev/null +++ b/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/controller/dataset/DataSetMiddleController.java @@ -0,0 +1,121 @@ +package cn.iocoder.yudao.module.mdpf.controller.dataset; + +import cn.iocoder.yudao.module.mdpf.controller.dataset.vo.*; +import cn.iocoder.yudao.module.mdpf.dal.dataobject.dataset.DataSetMiddleDO; +import cn.iocoder.yudao.module.mdpf.service.dataset.DataSetMiddleService; +import org.springframework.web.bind.annotation.*; +import org.springframework.validation.annotation.Validated; +import org.springframework.security.access.prepost.PreAuthorize; +import io.swagger.v3.oas.annotations.tags.Tag; +import io.swagger.v3.oas.annotations.Parameter; +import io.swagger.v3.oas.annotations.Operation; + +import java.util.*; +import java.io.IOException; + +import cn.iocoder.yudao.framework.common.pojo.PageParam; +import 
cn.iocoder.yudao.framework.common.pojo.PageResult; +import cn.iocoder.yudao.framework.common.pojo.CommonResult; +import cn.iocoder.yudao.framework.common.util.object.BeanUtils; +import static cn.iocoder.yudao.framework.common.pojo.CommonResult.success; + +import cn.iocoder.yudao.framework.excel.core.util.ExcelUtils; + +import cn.iocoder.yudao.framework.apilog.core.annotation.ApiAccessLog; +import static cn.iocoder.yudao.framework.apilog.core.enums.OperateTypeEnum.*; + + +import javax.annotation.Resource; +import javax.annotation.security.PermitAll; +import javax.validation.Valid; + +@Tag(name = "管理后台 - 中台中的数据集") +@RestController +@RequestMapping("/data/data-set-middle") +@Validated +public class DataSetMiddleController { + + @Resource + private DataSetMiddleService setMiddleService; + + @PostMapping("/create") + @Operation(summary = "创建中台中的数据集") +// @PreAuthorize("@ss.hasPermission('data:set-middle:create')") + public CommonResult createSetMiddle(@Valid @RequestBody DataSetMiddleSaveReqVO createReqVO) { + return success(setMiddleService.createSetMiddle(createReqVO)); + } + + @PutMapping("/update") + @Operation(summary = "更新中台中的数据集") +// @PreAuthorize("@ss.hasPermission('data:set-middle:update')") + public CommonResult updateSetMiddle(@Valid @RequestBody DataSetMiddleSaveReqVO updateReqVO) { + setMiddleService.updateSetMiddle(updateReqVO); + return success(true); + } + + @DeleteMapping("/delete") + @Operation(summary = "删除中台中的数据集") + @Parameter(name = "id", description = "编号", required = true) +// @PreAuthorize("@ss.hasPermission('data:set-middle:delete')") + public CommonResult deleteSetMiddle(@RequestParam("id") Long id) { + setMiddleService.deleteSetMiddle(id); + return success(true); + } + + @GetMapping("/get") + @Operation(summary = "获得中台中的数据集") + @Parameter(name = "id", description = "编号", required = true, example = "1024") +// @PreAuthorize("@ss.hasPermission('data:set-middle:query')") + public CommonResult getSetMiddle(@RequestParam("id") Long id) { + 
DataSetMiddleDO setMiddle = setMiddleService.getSetMiddle(id); + return success(BeanUtils.toBean(setMiddle, DataSetMiddleRespVO.class)); + } + + @GetMapping("/getOneInfo") + @Operation(summary = "获得中台中的数据集") + @Parameter(name = "id", description = "编号", required = true, example = "1024") +// @PreAuthorize("@ss.hasPermission('data:set-middle:query')") + public CommonResult getOneInfo(@RequestParam("id") Long id) { + DataSetMiddleRespVO oneInfo = setMiddleService.getOneInfo(id); + return success(BeanUtils.toBean(oneInfo, DataSetMiddleRespVO.class)); + } + + @GetMapping("/page") + @Operation(summary = "获得中台中的数据集分页") +// @PreAuthorize("@ss.hasPermission('data:set-middle:query')") + public CommonResult> getSetMiddlePage(@Valid DataSetMiddlePageReqVO pageReqVO) { + PageResult pageResult = setMiddleService.getSetMiddlePage(pageReqVO); + return success(BeanUtils.toBean(pageResult, DataSetMiddleRespVO.class)); + } + @GetMapping("/getAllList") + @Operation(summary = "获得中台中的数据集") + public CommonResult> getDataSetMiddleList(DataSetMiddlePageReqVO pageReqVO){ + List dataSetMiddleList = setMiddleService.getDataSetMiddleList(pageReqVO.getDatasetParentType()); + return success(dataSetMiddleList); + + } + + + +// @GetMapping("getdataseturl") +// @PermitAll +// public CommonResult getDataSetUrl(@RequestParam("datasetid") Long id,@RequestParam("hostUrl") String hostUrl){ +// +// String dataSetUrl = setMiddleService.getDataSetUrl(id,hostUrl); +// return success(dataSetUrl); +// } + +// @GetMapping("/export-excel") +// @Operation(summary = "导出中台中的数据集 Excel") +// @PreAuthorize("@ss.hasPermission('data:set-middle:export')") +// @ApiAccessLog(operateType = EXPORT) +// public void exportSetMiddleExcel(@Valid DataSetMiddlePageReqVO pageReqVO, +// HttpServletResponse response) throws IOException { +// pageReqVO.setPageSize(PageParam.PAGE_SIZE_NONE); +// List list = setMiddleService.getSetMiddlePage(pageReqVO).getList(); +// // 导出 Excel +// ExcelUtils.write(response, "中台中的数据集.xls", "数据", 
DataSetMiddleRespVO.class, +// BeanUtils.toBean(list, DataSetMiddleRespVO.class)); +// } + +} \ No newline at end of file diff --git a/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/controller/dataset/PlatformDatasetAnswerController.java b/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/controller/dataset/PlatformDatasetAnswerController.java new file mode 100644 index 000000000..2a94eb3c0 --- /dev/null +++ b/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/controller/dataset/PlatformDatasetAnswerController.java @@ -0,0 +1,92 @@ +package cn.iocoder.yudao.module.mdpf.controller.dataset; + +import cn.iocoder.yudao.framework.apilog.core.annotation.ApiAccessLog; +import cn.iocoder.yudao.framework.common.pojo.CommonResult; +import cn.iocoder.yudao.framework.common.pojo.PageParam; +import cn.iocoder.yudao.framework.common.pojo.PageResult; +import cn.iocoder.yudao.framework.common.util.object.BeanUtils; +import cn.iocoder.yudao.framework.excel.core.util.ExcelUtils; +import cn.iocoder.yudao.module.mdpf.controller.dataset.vo.PlatformDatasetAnswerPageReqVO; +import cn.iocoder.yudao.module.mdpf.controller.dataset.vo.PlatformDatasetAnswerRespVO; +import cn.iocoder.yudao.module.mdpf.controller.dataset.vo.PlatformDatasetAnswerSaveReqVO; +import cn.iocoder.yudao.module.mdpf.dal.dataobject.dataset.PlatformDatasetAnswerDO; +import cn.iocoder.yudao.module.mdpf.service.dataset.PlatformDatasetAnswerService; +import io.swagger.v3.oas.annotations.Operation; +import io.swagger.v3.oas.annotations.Parameter; +import org.springframework.security.access.prepost.PreAuthorize; +import org.springframework.web.bind.annotation.*; + +import javax.annotation.Resource; +import javax.servlet.http.HttpServletResponse; +import javax.validation.Valid; +import java.io.IOException; +import java.util.List; + +import static cn.iocoder.yudao.framework.apilog.core.enums.OperateTypeEnum.EXPORT; +import 
static cn.iocoder.yudao.framework.common.pojo.CommonResult.success; + + +/*@Tag(name = "管理后台 - 数据集数据问题标注内容") +@RestController +@RequestMapping("/llm/dataset-answer") +@Validated*/ +public class PlatformDatasetAnswerController { + + @Resource + private PlatformDatasetAnswerService platformDatasetAnswerService; + + @PostMapping("/create") + @Operation(summary = "创建数据集数据问题标注内容") + @PreAuthorize("@ss.hasPermission('llm:dataset-answer:create')") + public CommonResult createDatasetAnswer(@Valid @RequestBody PlatformDatasetAnswerSaveReqVO createReqVO) { + return success(platformDatasetAnswerService.createDatasetAnswer(createReqVO)); + } + + @PutMapping("/update") + @Operation(summary = "更新数据集数据问题标注内容") + @PreAuthorize("@ss.hasPermission('llm:dataset-answer:update')") + public CommonResult updateDatasetAnswer(@Valid @RequestBody PlatformDatasetAnswerSaveReqVO updateReqVO) { + platformDatasetAnswerService.updateDatasetAnswer(updateReqVO); + return success(true); + } + + @DeleteMapping("/delete") + @Operation(summary = "删除数据集数据问题标注内容") + @Parameter(name = "id", description = "编号", required = true) + @PreAuthorize("@ss.hasPermission('llm:dataset-answer:delete')") + public CommonResult deleteDatasetAnswer(@RequestParam("id") Long id) { + platformDatasetAnswerService.deleteDatasetAnswer(id); + return success(true); + } + + @GetMapping("/get") + @Operation(summary = "获得数据集数据问题标注内容") + @Parameter(name = "id", description = "编号", required = true, example = "1024") + @PreAuthorize("@ss.hasPermission('llm:dataset-answer:query')") + public CommonResult getDatasetAnswer(@RequestParam("id") Long id) { + PlatformDatasetAnswerDO datasetAnswer = platformDatasetAnswerService.getDatasetAnswer(id); + return success(BeanUtils.toBean(datasetAnswer, PlatformDatasetAnswerRespVO.class)); + } + + @GetMapping("/page") + @Operation(summary = "获得数据集数据问题标注内容分页") + @PreAuthorize("@ss.hasPermission('llm:dataset-answer:query')") + public CommonResult> getDatasetAnswerPage(@Valid 
PlatformDatasetAnswerPageReqVO pageReqVO) { + PageResult pageResult = platformDatasetAnswerService.getDatasetAnswerPage(pageReqVO); + return success(BeanUtils.toBean(pageResult, PlatformDatasetAnswerRespVO.class)); + } + + @GetMapping("/export-excel") + @Operation(summary = "导出数据集数据问题标注内容 Excel") + @PreAuthorize("@ss.hasPermission('llm:dataset-answer:export')") + @ApiAccessLog(operateType = EXPORT) + public void exportDatasetAnswerExcel(@Valid PlatformDatasetAnswerPageReqVO pageReqVO, + HttpServletResponse response) throws IOException { + pageReqVO.setPageSize(PageParam.PAGE_SIZE_NONE); + List list = platformDatasetAnswerService.getDatasetAnswerPage(pageReqVO).getList(); + // 导出 Excel + ExcelUtils.write(response, "数据集数据问题标注内容.xls", "数据", PlatformDatasetAnswerRespVO.class, + BeanUtils.toBean(list, PlatformDatasetAnswerRespVO.class)); + } + +} \ No newline at end of file diff --git a/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/controller/dataset/PlatformDatasetFilesController.java b/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/controller/dataset/PlatformDatasetFilesController.java new file mode 100644 index 000000000..f84ad9635 --- /dev/null +++ b/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/controller/dataset/PlatformDatasetFilesController.java @@ -0,0 +1,93 @@ +package cn.iocoder.yudao.module.mdpf.controller.dataset; + +import cn.iocoder.yudao.framework.apilog.core.annotation.ApiAccessLog; +import cn.iocoder.yudao.framework.common.pojo.CommonResult; +import cn.iocoder.yudao.framework.common.pojo.PageParam; +import cn.iocoder.yudao.framework.common.pojo.PageResult; +import cn.iocoder.yudao.framework.common.util.object.BeanUtils; +import cn.iocoder.yudao.framework.excel.core.util.ExcelUtils; +import cn.iocoder.yudao.module.mdpf.controller.dataset.vo.PlatformDatasetFilesPageReqVO; +import 
cn.iocoder.yudao.module.mdpf.controller.dataset.vo.PlatformDatasetFilesRespVO; +import cn.iocoder.yudao.module.mdpf.controller.dataset.vo.PlatformDatasetFilesSaveReqVO; +import cn.iocoder.yudao.module.mdpf.dal.dataobject.dataset.PlatformDatasetFilesDO; +import cn.iocoder.yudao.module.mdpf.service.dataset.PlatformDatasetFilesService; +import io.swagger.v3.oas.annotations.Operation; +import io.swagger.v3.oas.annotations.Parameter; +import org.springframework.security.access.prepost.PreAuthorize; +import org.springframework.web.bind.annotation.*; + +import javax.annotation.Resource; +import javax.servlet.http.HttpServletResponse; +import javax.validation.Valid; +import java.io.IOException; +import java.util.List; + +import static cn.iocoder.yudao.framework.apilog.core.enums.OperateTypeEnum.EXPORT; +import static cn.iocoder.yudao.framework.common.pojo.CommonResult.success; + + +/* +@Tag(name = "管理后台 - 数据集数据文件") +@RestController +@RequestMapping("/llm/dataset-files") +@Validated*/ +public class PlatformDatasetFilesController { + + @Resource + private PlatformDatasetFilesService platformDatasetFilesService; + + @PostMapping("/create") + @Operation(summary = "创建数据集数据文件") + @PreAuthorize("@ss.hasPermission('llm:dataset-files:create')") + public CommonResult createDatasetFiles(@Valid @RequestBody PlatformDatasetFilesSaveReqVO createReqVO) { + return success(platformDatasetFilesService.createDatasetFiles(createReqVO)); + } + + @PutMapping("/update") + @Operation(summary = "更新数据集数据文件") + @PreAuthorize("@ss.hasPermission('llm:dataset-files:update')") + public CommonResult updateDatasetFiles(@Valid @RequestBody PlatformDatasetFilesSaveReqVO updateReqVO) { + platformDatasetFilesService.updateDatasetFiles(updateReqVO); + return success(true); + } + + @DeleteMapping("/delete") + @Operation(summary = "删除数据集数据文件") + @Parameter(name = "id", description = "编号", required = true) + @PreAuthorize("@ss.hasPermission('llm:dataset-files:delete')") + public CommonResult 
deleteDatasetFiles(@RequestParam("id") Long id) { + platformDatasetFilesService.deleteDatasetFiles(id); + return success(true); + } + + @GetMapping("/get") + @Operation(summary = "获得数据集数据文件") + @Parameter(name = "id", description = "编号", required = true, example = "1024") + @PreAuthorize("@ss.hasPermission('llm:dataset-files:query')") + public CommonResult getDatasetFiles(@RequestParam("id") Long id) { + PlatformDatasetFilesDO datasetFiles = platformDatasetFilesService.getDatasetFiles(id); + return success(BeanUtils.toBean(datasetFiles, PlatformDatasetFilesRespVO.class)); + } + + @GetMapping("/page") + @Operation(summary = "获得数据集数据文件分页") + @PreAuthorize("@ss.hasPermission('llm:dataset-files:query')") + public CommonResult> getDatasetFilesPage(@Valid PlatformDatasetFilesPageReqVO pageReqVO) { + PageResult pageResult = platformDatasetFilesService.getDatasetFilesPage(pageReqVO); + return success(BeanUtils.toBean(pageResult, PlatformDatasetFilesRespVO.class)); + } + + @GetMapping("/export-excel") + @Operation(summary = "导出数据集数据文件 Excel") + @PreAuthorize("@ss.hasPermission('llm:dataset-files:export')") + @ApiAccessLog(operateType = EXPORT) + public void exportDatasetFilesExcel(@Valid PlatformDatasetFilesPageReqVO pageReqVO, + HttpServletResponse response) throws IOException { + pageReqVO.setPageSize(PageParam.PAGE_SIZE_NONE); + List list = platformDatasetFilesService.getDatasetFilesPage(pageReqVO).getList(); + // 导出 Excel + ExcelUtils.write(response, "数据集数据文件.xls", "数据", PlatformDatasetFilesRespVO.class, + BeanUtils.toBean(list, PlatformDatasetFilesRespVO.class)); + } + +} \ No newline at end of file diff --git a/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/controller/dataset/PlatformDatasetQuestionController.java b/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/controller/dataset/PlatformDatasetQuestionController.java new file mode 100644 index 000000000..ab4c75a47 --- /dev/null +++ 
b/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/controller/dataset/PlatformDatasetQuestionController.java @@ -0,0 +1,191 @@ +package cn.iocoder.yudao.module.mdpf.controller.dataset; + +import cn.iocoder.yudao.framework.apilog.core.annotation.ApiAccessLog; +import cn.iocoder.yudao.framework.common.pojo.CommonResult; +import cn.iocoder.yudao.framework.common.pojo.PageParam; +import cn.iocoder.yudao.framework.common.pojo.PageResult; +import cn.iocoder.yudao.module.mdpf.controller.dataset.vo.PlatformDatasetAnswerRespVO; +import cn.iocoder.yudao.module.mdpf.controller.dataset.vo.PlatformDatasetQuestionPageReqVO; +import cn.iocoder.yudao.module.mdpf.controller.dataset.vo.PlatformDatasetQuestionRespVO; +import cn.iocoder.yudao.module.mdpf.controller.dataset.vo.PlatformDatasetQuestionSaveReqVO; +import cn.iocoder.yudao.module.mdpf.service.dataset.PlatformDatasetAnswerService; +import cn.iocoder.yudao.module.mdpf.service.dataset.PlatformDatasetQuestionService; +import io.swagger.v3.oas.annotations.Operation; +import io.swagger.v3.oas.annotations.tags.Tag; +import org.apache.poi.hssf.usermodel.HSSFCellStyle; +import org.apache.poi.hssf.usermodel.HSSFRow; +import org.apache.poi.hssf.usermodel.HSSFSheet; +import org.apache.poi.hssf.usermodel.HSSFWorkbook; +import org.apache.poi.ss.usermodel.HorizontalAlignment; +import org.apache.poi.ss.usermodel.VerticalAlignment; +import org.apache.poi.ss.util.CellRangeAddress; +import org.springframework.validation.annotation.Validated; +import org.springframework.web.bind.annotation.*; + +import javax.annotation.Resource; +import javax.servlet.http.HttpServletResponse; +import javax.validation.Valid; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.stream.Collectors; + +import static cn.iocoder.yudao.framework.apilog.core.enums.OperateTypeEnum.EXPORT; +import static cn.iocoder.yudao.framework.common.pojo.CommonResult.success; + + +@Tag(name = "管理后台 - 
数据集数据问题+标注") +@RestController +@RequestMapping("/platform/dataset-question") +@Validated +public class PlatformDatasetQuestionController { + + @Resource + private PlatformDatasetQuestionService platformDatasetQuestionService; + + @Resource + private PlatformDatasetAnswerService platformDatasetAnswerService; + + + @PutMapping("data-anno") + @Operation(summary = "保存标注接口") +// @PreAuthorize("@ss.hasPermission('llm:dataset-question:anno')") + public CommonResult updateDatasetQuestionDataAnno(@Valid @RequestBody List updateReqVOS) { + platformDatasetQuestionService.updateDatasetQuestionDataAnno(updateReqVOS); + return success(true); + } + @GetMapping("/page") + @Operation(summary = "获得数据集数据问题分页") +// @PreAuthorize("@ss.hasPermission('llm:dataset-question:query')") + public CommonResult> getDatasetQuestionPage(@Valid PlatformDatasetQuestionPageReqVO pageReqVO) { + PageResult pageResult = platformDatasetQuestionService.getDatasetQuestionPage(pageReqVO); + return success(pageResult); + } + + @GetMapping("/export-excel") + @Operation(summary = "导出数据集数据文件 Excel") +// @PreAuthorize("@ss.hasPermission('llm:dataset-files:export')") + @ApiAccessLog(operateType = EXPORT) + public void exportDatasetFilesExcel(@Valid PlatformDatasetQuestionPageReqVO pageReqVO, + HttpServletResponse response) throws IOException { +// DatasetRespVO dataset = datasetService.getDataset(pageReqVO.getDatasetId()); +// if(dataset!=null&&dataset.getStatus()!=2){ +// throw new RuntimeException("只有状态为已完成的数据才能导出"); +// } + HSSFWorkbook template = new HSSFWorkbook(); + HSSFSheet sheet = template.createSheet(); + // 创建样式并设置垂直居中 + HSSFCellStyle cellStyle = template.createCellStyle(); + cellStyle.setVerticalAlignment(VerticalAlignment.CENTER); + cellStyle.setAlignment(HorizontalAlignment.CENTER); + int count = 0; + List id = new ArrayList<>(); + HSSFRow row = sheet.createRow(count); + row.createCell(0).setCellValue("system"); + row.getCell(0).setCellStyle(cellStyle); + row.createCell(1).setCellValue("question"); 
+ row.getCell(1).setCellStyle(cellStyle); + row.createCell(2).setCellValue("answer"); + row.getCell(2).setCellStyle(cellStyle); + id.add(count); + count++; + pageReqVO.setPageSize(PageParam.PAGE_SIZE_NONE); + List list = platformDatasetQuestionService.getDatasetQuestionPage(pageReqVO).getList(); + for (PlatformDatasetQuestionRespVO item : list){ + String system = item.getSystem(); + String question = item.getQuestion(); + List datasetAnswerRespVO = item.getDatasetAnswerRespVO(); + if(datasetAnswerRespVO!=null&&datasetAnswerRespVO.size()>0){ + List collect = datasetAnswerRespVO.stream().map(PlatformDatasetAnswerRespVO::getAnswer).collect(Collectors.toList()); + if (collect.size() == 0){ + row = sheet.createRow(count); + row.createCell(0).setCellValue(system); + row.getCell(0).setCellStyle(cellStyle); + row.createCell(1).setCellValue(question); + row.getCell(1).setCellStyle(cellStyle); + row.createCell(2).setCellValue(""); + row.getCell(2).setCellStyle(cellStyle); + id.add(count); + count++; + }else { + for (String s : collect) { + row = sheet.createRow(count); + row.createCell(0).setCellValue(system); + row.getCell(0).setCellStyle(cellStyle); + row.createCell(1).setCellValue(question); + row.getCell(1).setCellStyle(cellStyle); + row.createCell(2).setCellValue(s); + row.getCell(2).setCellStyle(cellStyle); + count++; + } + id.add(count-1); + } + } + } + //合并相同内容的单元格 + for (int i = 0; i < id.size() - 1; i++){ + if (id.get(i+1)-id.get(i)>1) { + sheet.addMergedRegion(new CellRangeAddress(id.get(i)+1, id.get(i + 1), 0, 1)); + } + } + // 导出 Excel + try { + response.setCharacterEncoding("UTF-8"); + response.setContentType("application/vnd.ms-excel"); + template.write(response.getOutputStream()); + response.getOutputStream().close(); + template.close(); + } catch (IOException e) { + e.printStackTrace(); + } + } + +/* @PutMapping("/update") + @Operation(summary = "更新数据集数据问题") + @PreAuthorize("@ss.hasPermission('llm:dataset-question:update')") + public CommonResult 
updateDatasetQuestion(@Valid @RequestBody DatasetQuestionSaveReqVO updateReqVO) { + datasetQuestionService.updateDatasetQuestion(updateReqVO); + return success(true); + }*/ + +/* @DeleteMapping("/delete") + @Operation(summary = "删除数据集数据问题") + @Parameter(name = "id", description = "编号", required = true) + @PreAuthorize("@ss.hasPermission('llm:dataset-question:delete')") + public CommonResult deleteDatasetQuestion(@RequestParam("id") Long id) { + datasetQuestionService.deleteDatasetQuestion(id); + return success(true); + }*/ + +/* @GetMapping("/get") + @Operation(summary = "获得数据集数据问题") + @Parameter(name = "id", description = "编号", required = true, example = "1024") + @PreAuthorize("@ss.hasPermission('llm:dataset-question:query')") + public CommonResult getDatasetQuestion(@RequestParam("id") Long id) { + DatasetQuestionDO datasetQuestion = datasetQuestionService.getDatasetQuestion(id); + return success(BeanUtils.toBean(datasetQuestion, DatasetQuestionRespVO.class)); + }*/ + + + +/* @GetMapping("/export-excel") + @Operation(summary = "导出数据集数据问题 Excel") + @PreAuthorize("@ss.hasPermission('llm:dataset-question:export')") + @ApiAccessLog(operateType = EXPORT) + public void exportDatasetQuestionExcel(@Valid DatasetQuestionPageReqVO pageReqVO, + HttpServletResponse response) throws IOException { + pageReqVO.setPageSize(PageParam.PAGE_SIZE_NONE); + List list = datasetQuestionService.getDatasetQuestionPage(pageReqVO).getList(); + // 导出 Excel + ExcelUtils.write(response, "数据集数据问题.xls", "数据", DatasetQuestionRespVO.class, + BeanUtils.toBean(list, DatasetQuestionRespVO.class)); + }*/ + + /* @PostMapping("/create") + @Operation(summary = "创建数据集数据问题") + @PreAuthorize("@ss.hasPermission('llm:dataset-question:create')") + public CommonResult createDatasetQuestion(@Valid @RequestBody DatasetQuestionSaveReqVO createReqVO) { + return success(datasetQuestionService.createDatasetQuestion(createReqVO)); + }*/ + +} diff --git 
package cn.iocoder.yudao.module.mdpf.controller.dataset.vo;

import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;

/**
 * Plain data holder describing one dataset file: identifiers, name, type,
 * purpose, storage URL and size/line/token counts. Accessors and constructors
 * are generated by Lombok.
 *
 * <p>NOTE(review): the field set looks like the response of a file-upload call
 * (file id plus S3 URL) - presumably a counterpart of the upload response used
 * in the llm module; confirm against the producer of this object.
 */
@Data
@AllArgsConstructor
@NoArgsConstructor
public class AigcDatasetFileMiddleRespV0 {
    private String createdAt;
    private String fileId;
    private String fileType;
    private String filename;
    private Integer lineCount;
    private String purpose;
    private String s3Url;
    // NOTE(review): unit is not visible here - presumably bytes; confirm.
    private Integer size;
    private Integer tenantId;
    private Integer tokenCount;
}
package cn.iocoder.yudao.module.mdpf.controller.dataset.vo;


import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;

/**
 * Plain data holder with the three text fields {@code instruction},
 * {@code input} and {@code output}. Accessors and constructors are generated
 * by Lombok.
 *
 * <p>NOTE(review): presumably one training sample (system prompt, question,
 * answer) serialized to JSON for fine-tuning - confirm against the caller.
 */
@Data
@AllArgsConstructor
@NoArgsConstructor
public class AigcDatasetMiddleVo {
    private String instruction;
    private String input;
    private String output;
}
package cn.iocoder.yudao.module.mdpf.controller.dataset.vo;

import io.swagger.v3.oas.annotations.media.Schema;
import lombok.*;
import java.util.*;
import org.springframework.format.annotation.DateTimeFormat;
import java.time.LocalDateTime;
import com.alibaba.excel.annotation.*;

/**
 * Response VO describing one file that belongs to a dataset.
 * Every field is annotated both for the OpenAPI schema and as an Excel column.
 */
@Schema(description = "管理后台 - 数据集对应的文件地址 Response VO")
@Data
@ExcelIgnoreUnannotated
public class DataSetFileMiddleRespVO {

    /** Primary key. */
    @Schema(description = "主键ID", requiredMode = Schema.RequiredMode.REQUIRED, example = "22402")
    @ExcelProperty("主键ID")
    private Long id;

    /** Id of the dataset this file belongs to. */
    @Schema(description = "对应数据集的id", example = "8353")
    @ExcelProperty("对应数据集的id")
    private Long dataSetId;

    /** Upload URL of the dataset file. */
    @Schema(description = "数据集文件对应的上传地址", example = "https://www.iocoder.cn")
    @ExcelProperty("数据集文件对应的上传地址")
    private String dataSetFileUrl;

    /** File type: 0 = data file, 1 = image, 2 = video. */
    @Schema(description = "数据集文件类型0数据文件1图片2视频", example = "2")
    @ExcelProperty("数据集文件类型0数据文件1图片2视频")
    private String dataSetFileType;

    /** Creation time. */
    @Schema(description = "创建时间", requiredMode = Schema.RequiredMode.REQUIRED)
    @ExcelProperty("创建时间")
    private LocalDateTime createTime;

    /** Name of the dataset file. */
    @Schema(description = "数据集文件名称", example = "赵六")
    @ExcelProperty("数据集文件名称")
    private String datasetFileName;

}
java.util.*; +import io.swagger.v3.oas.annotations.media.Schema; +import cn.iocoder.yudao.framework.common.pojo.PageParam; +import org.springframework.format.annotation.DateTimeFormat; +import java.time.LocalDateTime; + +import static cn.iocoder.yudao.framework.common.util.date.DateUtils.FORMAT_YEAR_MONTH_DAY_HOUR_MINUTE_SECOND; + +@Schema(description = "管理后台 - 中台中的数据集分页 Request VO") +@Data +@EqualsAndHashCode(callSuper = true) +@ToString(callSuper = true) +public class DataSetMiddlePageReqVO extends PageParam { + + @Schema(description = "数据集名称", example = "张三") + private String datasetName; + + @Schema(description = "数据集来源") + private String datasetSource; + + @Schema(description = "清洗状态0未清洗1已经清洗", example = "2") + private Integer cleanStatus; + + @Schema(description = "数据标注状态0未完成1进行中2已完成", example = "1") + private Integer markStatus; + + @Schema(description = "数据集父类型(1文本数据集2多模态数据集)", example = "2") + private Integer datasetParentType; + + @Schema(description = "备注", example = "你猜") + private String remark; + + @Schema(description = "创建时间") + @DateTimeFormat(pattern = FORMAT_YEAR_MONTH_DAY_HOUR_MINUTE_SECOND) + private LocalDateTime[] createTime; + + @Schema(description = "数据集类型 0 普通 1 官方", example = "1") + private Integer type; + + @Schema(description = "数据集描述") + private String datasetIntro; + + @Schema(description = "数据集类型,(1-训练数据集、2-评估数据集)", example = "2") + private Integer datasetType; + + @Schema(description = "数据集类型,使用字典(llm_dataset_category_1、llm_dataset_category_2)") + private Integer datasetCategory; + + @Schema(description = "数据长度") + private Long dataLength; + + @Schema(description = "标注进度") + private Integer annotateProgress; + + @Schema(description = "对应mongodb中的数据iD", example = "12642") + private Long mongoId; + +} \ No newline at end of file diff --git a/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/controller/dataset/vo/DataSetMiddleRespVO.java 
package cn.iocoder.yudao.module.mdpf.controller.dataset.vo;

import io.swagger.v3.oas.annotations.media.Schema;
import lombok.*;
import java.util.*;
import org.springframework.format.annotation.DateTimeFormat;
import java.time.LocalDateTime;
import com.alibaba.excel.annotation.*;

/**
 * Response VO describing one dataset of the data middle platform.
 * All fields except {@code datasetFiles} are also exposed as Excel columns.
 */
@Schema(description = "管理后台 - 中台中的数据集 Response VO")
@Data
@ExcelIgnoreUnannotated
public class DataSetMiddleRespVO {

    /** Primary key. */
    @Schema(description = "主键ID", requiredMode = Schema.RequiredMode.REQUIRED, example = "31179")
    @ExcelProperty("主键ID")
    private Long id;

    /** Dataset name. */
    @Schema(description = "数据集名称", example = "张三")
    @ExcelProperty("数据集名称")
    private String datasetName;

    /** Dataset source. */
    @Schema(description = "数据集来源")
    @ExcelProperty("数据集来源")
    private String datasetSource;

    /** Cleaning status: 0 = not cleaned, 1 = cleaned. */
    @Schema(description = "清洗状态0未清洗1已经清洗", example = "2")
    @ExcelProperty("清洗状态0未清洗1已经清洗")
    private Integer cleanStatus;

    /** Annotation status: 0 = not finished, 1 = in progress, 2 = finished. */
    @Schema(description = "数据标注状态0未完成1进行中2已完成", example = "1")
    @ExcelProperty("数据标注状态0未完成1进行中2已完成")
    private Integer markStatus;

    /** Parent type: 1 = text dataset, 2 = multimodal dataset. */
    @Schema(description = "数据集父类型(1文本数据集2多模态数据集)", example = "2")
    @ExcelProperty("数据集父类型(1文本数据集2多模态数据集)")
    private Integer datasetParentType;

    /** Free-text remark. */
    @Schema(description = "备注", example = "你猜")
    @ExcelProperty("备注")
    private String remark;

    /** Creation time. */
    @Schema(description = "创建时间", requiredMode = Schema.RequiredMode.REQUIRED)
    @ExcelProperty("创建时间")
    private LocalDateTime createTime;

    /** Dataset type: 0 = ordinary, 1 = official. */
    @Schema(description = "数据集类型 0 普通 1 官方", example = "1")
    @ExcelProperty("数据集类型 0 普通 1 官方")
    private Integer type;

    /** Dataset description. */
    @Schema(description = "数据集描述")
    @ExcelProperty("数据集描述")
    private String datasetIntro;

    /** Dataset usage: 1 = training dataset, 2 = evaluation dataset. */
    @Schema(description = "数据集类型,(1-训练数据集、2-评估数据集)", example = "2")
    @ExcelProperty("数据集类型,(1-训练数据集、2-评估数据集)")
    private Integer datasetType;

    /** Dataset category; values come from the dictionaries llm_dataset_category_1 / llm_dataset_category_2. */
    @Schema(description = "数据集类型,使用字典(llm_dataset_category_1、llm_dataset_category_2)")
    @ExcelProperty("数据集类型,使用字典(llm_dataset_category_1、llm_dataset_category_2)")
    private Integer datasetCategory;

    /** Data length. */
    @Schema(description = "数据长度")
    @ExcelProperty("数据长度")
    private Long dataLength;

    /** Annotation progress. */
    @Schema(description = "标注进度")
    @ExcelProperty("标注进度")
    private Integer annotateProgress;

    /** Id of the corresponding data in MongoDB. */
    @Schema(description = "对应mongodb中的数据iD", example = "12642")
    @ExcelProperty("对应mongodb中的数据iD")
    private Long mongoId;

    /**
     * Data files of the dataset; not exported to Excel.
     * NOTE(review): the element type is not visible in this view - confirm and
     * parameterize the list if it is currently raw.
     */
    @Schema(description = "数据集数据文件", example = "[]")
    private List datasetFiles;

}
package cn.iocoder.yudao.module.mdpf.controller.dataset.vo;

import cn.iocoder.yudao.framework.common.pojo.PageParam;
import io.swagger.v3.oas.annotations.media.Schema;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.ToString;
import org.springframework.format.annotation.DateTimeFormat;

import java.time.LocalDateTime;

import static cn.iocoder.yudao.framework.common.util.date.DateUtils.FORMAT_YEAR_MONTH_DAY_HOUR_MINUTE_SECOND;

/**
 * Paging request for dataset answers (annotation content).
 * Extends {@link PageParam} with optional filter fields.
 */
@Schema(description = "管理后台 - 数据集数据问题标注内容分页 Request VO")
@Data
@EqualsAndHashCode(callSuper = true)
@ToString(callSuper = true)
public class PlatformDatasetAnswerPageReqVO extends PageParam {

    /** Dataset id. */
    @Schema(description = "数据集ID", example = "31073")
    private Long datasetId;

    /** Data file id. */
    @Schema(description = "数据文件ID", example = "21597")
    private Long datasetFilesId;

    /** Question id. */
    @Schema(description = "问题ID", example = "23725")
    private Long questionId;

    /** Annotation content. */
    @Schema(description = "标注内容")
    private String answer;

    /**
     * Creation time, bound with the year-month-day hour-minute-second pattern.
     * NOTE(review): the array presumably carries a [start, end] range - confirm
     * against the mapper that consumes it.
     */
    @Schema(description = "创建时间")
    @DateTimeFormat(pattern = FORMAT_YEAR_MONTH_DAY_HOUR_MINUTE_SECOND)
    private LocalDateTime[] createTime;

}
@Schema(description = "创建时间", requiredMode = Schema.RequiredMode.REQUIRED) + @ExcelProperty("创建时间") + private LocalDateTime createTime; + +} \ No newline at end of file diff --git a/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/controller/dataset/vo/PlatformDatasetAnswerSaveReqVO.java b/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/controller/dataset/vo/PlatformDatasetAnswerSaveReqVO.java new file mode 100644 index 000000000..6de660659 --- /dev/null +++ b/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/controller/dataset/vo/PlatformDatasetAnswerSaveReqVO.java @@ -0,0 +1,28 @@ +package cn.iocoder.yudao.module.mdpf.controller.dataset.vo; + +import io.swagger.v3.oas.annotations.media.Schema; +import lombok.Data; + +import javax.validation.constraints.NotNull; + +@Schema(description = "管理后台 - 数据集数据问题标注内容新增/修改 Request VO") +@Data +public class PlatformDatasetAnswerSaveReqVO { + + @Schema(description = "数据集问题ID", requiredMode = Schema.RequiredMode.REQUIRED, example = "32153") + private Long id; + + @Schema(description = "数据集ID", requiredMode = Schema.RequiredMode.REQUIRED, example = "31073") + @NotNull(message = "数据集ID不能为空") + private Long datasetId; + + @Schema(description = "数据文件ID", example = "21597") + private Long datasetFilesId; + + @Schema(description = "问题ID", example = "23725") + private Long questionId; + + @Schema(description = "标注内容") + private String answer; + +} \ No newline at end of file diff --git a/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/controller/dataset/vo/PlatformDatasetFilesPageReqVO.java b/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/controller/dataset/vo/PlatformDatasetFilesPageReqVO.java new file mode 100644 index 000000000..29637ffec --- /dev/null +++ 
b/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/controller/dataset/vo/PlatformDatasetFilesPageReqVO.java @@ -0,0 +1,38 @@ +package cn.iocoder.yudao.module.mdpf.controller.dataset.vo; + +import cn.iocoder.yudao.framework.common.pojo.PageParam; +import io.swagger.v3.oas.annotations.media.Schema; +import lombok.Data; +import lombok.EqualsAndHashCode; +import lombok.ToString; +import org.springframework.format.annotation.DateTimeFormat; + +import java.time.LocalDateTime; + +import static cn.iocoder.yudao.framework.common.util.date.DateUtils.FORMAT_YEAR_MONTH_DAY_HOUR_MINUTE_SECOND; + +@Schema(description = "管理后台 - 数据集数据文件分页 Request VO") +@Data +@EqualsAndHashCode(callSuper = true) +@ToString(callSuper = true) +public class PlatformDatasetFilesPageReqVO extends PageParam { + + @Schema(description = "数据集ID", example = "8530") + private Long datasetId; + + @Schema(description = "数据长度") + private Long dataLength; + + @Schema(description = "数据文件(文件表的ID)") + private Long datasetFile; + + @Schema(description = "文件URL地址", example = "https://www.iocoder.cn") + private String datasetFileUrl; + + @Schema(description = "创建时间") + @DateTimeFormat(pattern = FORMAT_YEAR_MONTH_DAY_HOUR_MINUTE_SECOND) + private LocalDateTime[] createTime; + @Schema(description = "文件名称", example = "https://www.iocoder.cn") + private String datasetFileName; + +} \ No newline at end of file diff --git a/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/controller/dataset/vo/PlatformDatasetFilesRespVO.java b/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/controller/dataset/vo/PlatformDatasetFilesRespVO.java new file mode 100644 index 000000000..731f07066 --- /dev/null +++ b/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/controller/dataset/vo/PlatformDatasetFilesRespVO.java @@ -0,0 +1,45 @@ +package cn.iocoder.yudao.module.mdpf.controller.dataset.vo; + +import 
com.alibaba.excel.annotation.ExcelIgnoreUnannotated; +import com.alibaba.excel.annotation.ExcelProperty; +import io.swagger.v3.oas.annotations.media.Schema; +import lombok.Data; + +import java.time.LocalDateTime; +import java.util.List; + +@Schema(description = "管理后台 - 数据集数据文件 Response VO") +@Data +@ExcelIgnoreUnannotated +public class PlatformDatasetFilesRespVO { + + @Schema(description = "数据集ID", requiredMode = Schema.RequiredMode.REQUIRED, example = "31018") + @ExcelProperty("数据集ID") + private Long id; + + @Schema(description = "数据集ID", requiredMode = Schema.RequiredMode.REQUIRED, example = "8530") + @ExcelProperty("数据集ID") + private Long datasetId; + + @Schema(description = "数据长度") + @ExcelProperty("数据长度") + private Long dataLength; + + @Schema(description = "数据文件(文件表的ID)") + @ExcelProperty("数据文件(文件表的ID)") + private Long datasetFile; + + @Schema(description = "文件URL地址", example = "https://www.iocoder.cn") + @ExcelProperty("文件URL地址") + private String datasetFileUrl; + + @Schema(description = "创建时间", requiredMode = Schema.RequiredMode.REQUIRED) + @ExcelProperty("创建时间") + private LocalDateTime createTime; + @Schema(description = "文件名称", example = "https://www.iocoder.cn") + private String datasetFileName; + + @Schema(description = "数据集数据文件", example = "[]") + private List datasetFiles; + +} \ No newline at end of file diff --git a/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/controller/dataset/vo/PlatformDatasetFilesSaveReqVO.java b/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/controller/dataset/vo/PlatformDatasetFilesSaveReqVO.java new file mode 100644 index 000000000..43bef83f6 --- /dev/null +++ b/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/controller/dataset/vo/PlatformDatasetFilesSaveReqVO.java @@ -0,0 +1,28 @@ +package cn.iocoder.yudao.module.mdpf.controller.dataset.vo; + +import io.swagger.v3.oas.annotations.media.Schema; +import lombok.Data; + 
+@Schema(description = "管理后台 - 数据集数据文件新增/修改 Request VO") +@Data +public class PlatformDatasetFilesSaveReqVO { + + @Schema(description = "数据集文件主键ID", requiredMode = Schema.RequiredMode.REQUIRED, example = "31018") + private Long id; + + @Schema(description = "数据集ID", requiredMode = Schema.RequiredMode.REQUIRED, example = "8530") +// @NotNull(message = "数据集ID不能为空") + private Long datasetId; + + @Schema(description = "数据长度") + private Long dataLength; + + @Schema(description = "数据文件(文件表的ID)") + private Long datasetFile; + + @Schema(description = "文件URL地址", example = "https://www.iocoder.cn") + private String datasetFileUrl; + @Schema(description = "文件名称", example = "https://www.iocoder.cn") + private String datasetFileName; + +} \ No newline at end of file diff --git a/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/controller/dataset/vo/PlatformDatasetQuestionPageReqVO.java b/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/controller/dataset/vo/PlatformDatasetQuestionPageReqVO.java new file mode 100644 index 000000000..81868e0cd --- /dev/null +++ b/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/controller/dataset/vo/PlatformDatasetQuestionPageReqVO.java @@ -0,0 +1,38 @@ +package cn.iocoder.yudao.module.mdpf.controller.dataset.vo; + +import cn.iocoder.yudao.framework.common.pojo.PageParam; +import io.swagger.v3.oas.annotations.media.Schema; +import lombok.Data; +import lombok.EqualsAndHashCode; +import lombok.ToString; +import org.springframework.format.annotation.DateTimeFormat; + +import javax.validation.constraints.NotNull; +import java.time.LocalDateTime; + +import static cn.iocoder.yudao.framework.common.util.date.DateUtils.FORMAT_YEAR_MONTH_DAY_HOUR_MINUTE_SECOND; + +@Schema(description = "管理后台 - 数据集数据问题分页 Request VO") +@Data +@EqualsAndHashCode(callSuper = true) +@ToString(callSuper = true) +public class PlatformDatasetQuestionPageReqVO extends PageParam { 
+ + @Schema(description = "数据集ID", example = "15672") + @NotNull(message = "数据集ID不能为空") + private Long datasetId; + + @Schema(description = "数据文件ID", example = "23062") + private Long datasetFilesId; + + @Schema(description = "问题内容") + private String question; + + @Schema(description = "标注状态,使用字典(llm_dataset_mark_status)", example = "1") + private Integer status; + + @Schema(description = "创建时间") + @DateTimeFormat(pattern = FORMAT_YEAR_MONTH_DAY_HOUR_MINUTE_SECOND) + private LocalDateTime[] createTime; + +} \ No newline at end of file diff --git a/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/controller/dataset/vo/PlatformDatasetQuestionRespVO.java b/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/controller/dataset/vo/PlatformDatasetQuestionRespVO.java new file mode 100644 index 000000000..46ebd518a --- /dev/null +++ b/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/controller/dataset/vo/PlatformDatasetQuestionRespVO.java @@ -0,0 +1,47 @@ +package cn.iocoder.yudao.module.mdpf.controller.dataset.vo; + +import com.alibaba.excel.annotation.ExcelIgnoreUnannotated; +import com.alibaba.excel.annotation.ExcelProperty; +import io.swagger.v3.oas.annotations.media.Schema; +import lombok.Data; + +import java.time.LocalDateTime; +import java.util.List; + +@Schema(description = "管理后台 - 数据集数据问题 Response VO") +@Data +@ExcelIgnoreUnannotated +public class PlatformDatasetQuestionRespVO { + + @Schema(description = "数据集问题ID", requiredMode = Schema.RequiredMode.REQUIRED, example = "394") + @ExcelProperty("数据集问题ID") + private Long id; + + @Schema(description = "数据集ID", requiredMode = Schema.RequiredMode.REQUIRED, example = "15672") + @ExcelProperty("数据集ID") + private Long datasetId; + + @Schema(description = "数据文件ID", example = "23062") + @ExcelProperty("数据文件ID") + private Long datasetFilesId; + + @Schema(description = "问题内容") + @ExcelProperty("问题内容") + private String question; + 
+ @Schema(description = "标注状态,使用字典(llm_dataset_mark_status)", example = "1") + @ExcelProperty("标注状态,使用字典(llm_dataset_mark_status)") + private Integer status; + + @Schema(description = "创建时间", requiredMode = Schema.RequiredMode.REQUIRED) + @ExcelProperty("创建时间") + private LocalDateTime createTime; + @Schema(description = "系统身份") + private String system; + @Schema(description = "标注内容") + private List datasetAnswerRespVO; + + @Schema(description = "问题对应的图片") + private List imagesList; + +} \ No newline at end of file diff --git a/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/controller/dataset/vo/PlatformDatasetQuestionSaveReqVO.java b/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/controller/dataset/vo/PlatformDatasetQuestionSaveReqVO.java new file mode 100644 index 000000000..0963ed3e7 --- /dev/null +++ b/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/controller/dataset/vo/PlatformDatasetQuestionSaveReqVO.java @@ -0,0 +1,34 @@ +package cn.iocoder.yudao.module.mdpf.controller.dataset.vo; + +import io.swagger.v3.oas.annotations.media.Schema; +import lombok.Data; + +import javax.validation.constraints.NotNull; +import java.util.List; + +@Schema(description = "管理后台 - 数据集数据问题新增/修改 Request VO") +@Data +public class PlatformDatasetQuestionSaveReqVO { + + @Schema(description = "数据集问题ID", requiredMode = Schema.RequiredMode.REQUIRED, example = "394") + private Long id; + + @Schema(description = "数据集ID", requiredMode = Schema.RequiredMode.REQUIRED, example = "15672") + @NotNull(message = "数据集ID不能为空") + private Long datasetId; + + @Schema(description = "数据文件ID", example = "23062") + private Long datasetFilesId; + + @Schema(description = "问题内容") + private String question; + + @Schema(description = "标注状态,使用字典(llm_dataset_mark_status)", example = "1") + private Integer status; + + @Schema(description = "系统身份") + private String system; + @Schema(description = "标注内容") + 
private List datasetAnswerRespVO; + +} \ No newline at end of file diff --git a/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/dal/dataobject/dataset/DataSetFileMiddleDO.java b/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/dal/dataobject/dataset/DataSetFileMiddleDO.java new file mode 100644 index 000000000..7e360c47c --- /dev/null +++ b/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/dal/dataobject/dataset/DataSetFileMiddleDO.java @@ -0,0 +1,59 @@ +package cn.iocoder.yudao.module.mdpf.dal.dataobject.dataset; + +import lombok.*; + +import java.math.BigDecimal; +import java.util.*; +import java.time.LocalDateTime; +import java.time.LocalDateTime; +import com.baomidou.mybatisplus.annotation.*; +import cn.iocoder.yudao.framework.mybatis.core.dataobject.BaseDO; + +/** + * 数据集对应的文件地址 DO + * + * @author 管理员 + */ +@TableName("data_set_file_middle") +@KeySequence("data_set_file_middle_seq") // 用于 Oracle、PostgreSQL、Kingbase、DB2、H2 数据库的主键自增。如果是 MySQL 等数据库,可不写。 +@Data +@EqualsAndHashCode(callSuper = true) +@ToString(callSuper = true) +@Builder +@NoArgsConstructor +@AllArgsConstructor +public class DataSetFileMiddleDO extends BaseDO { + + /** + * 主键ID + */ + @TableId + private Long id; + /** + * 对应数据集的id + */ + private Long dataSetId; + /** + * 数据集文件对应的上传地址 + */ + private String dataSetFileUrl; + /** + * 数据集文件类型0数据文件1图片2视频 + */ + private String dataSetFileType; + /** + * 数据集文件名称 + */ + private String datasetFileName; + + private String sourceFileUrl; + private Long sourceFileId; + private String cleanedText; + private String cleanedTextHash; + private BigDecimal qualityScore; + private Integer tokenCount; + private LocalDateTime cleanTime; + private String sourceFileExtension; + private String sourceFileName; + +} \ No newline at end of file diff --git 
a/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/dal/dataobject/dataset/DataSetMiddleDO.java b/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/dal/dataobject/dataset/DataSetMiddleDO.java new file mode 100644 index 000000000..3a770a7b4 --- /dev/null +++ b/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/dal/dataobject/dataset/DataSetMiddleDO.java @@ -0,0 +1,83 @@ +package cn.iocoder.yudao.module.mdpf.dal.dataobject.dataset; + +import lombok.*; +import java.util.*; +import java.time.LocalDateTime; +import java.time.LocalDateTime; +import com.baomidou.mybatisplus.annotation.*; +import cn.iocoder.yudao.framework.mybatis.core.dataobject.BaseDO; + +/** + * 中台中的数据集 DO + * + * @author 管理员 + */ +@TableName("data_set_middle") +@KeySequence("data_set_middle_seq") // 用于 Oracle、PostgreSQL、Kingbase、DB2、H2 数据库的主键自增。如果是 MySQL 等数据库,可不写。 +@Data +@EqualsAndHashCode(callSuper = true) +@ToString(callSuper = true) +@Builder +@NoArgsConstructor +@AllArgsConstructor +public class DataSetMiddleDO extends BaseDO { + + /** + * 主键ID + */ + @TableId + private Long id; + /** + * 数据集名称 + */ + private String datasetName; + /** + * 数据集来源 + */ + private String datasetSource; + /** + * 清洗状态0未清洗1已经清洗 + */ + private Integer cleanStatus; + /** + * 数据标注状态0未完成1进行中2已完成 + */ + private Integer markStatus; + /** + * 数据集父类型(1文本数据集2多模态数据集) + */ + private Integer datasetParentType; + /** + * 备注 + */ + private String remark; + /** + * 数据集类型 0 普通 1 官方 + */ + private Integer type; + /** + * 数据集描述 + */ + private String datasetIntro; + /** + * 数据集类型,(1-训练数据集、2-评估数据集) + */ + private Integer datasetType; + /** + * 数据集类型,使用字典(llm_dataset_category_1、llm_dataset_category_2) + */ + private Integer datasetCategory; + /** + * 数据长度 + */ + private Long dataLength; + /** + * 标注进度 + */ + private Integer annotateProgress; + /** + * 对应mongodb中的数据iD + */ + private Long mongoId; + +} \ No newline at end of file diff --git 
a/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/dal/dataobject/dataset/PlatformDatasetAnswerDO.java b/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/dal/dataobject/dataset/PlatformDatasetAnswerDO.java new file mode 100644 index 000000000..ff315ae4c --- /dev/null +++ b/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/dal/dataobject/dataset/PlatformDatasetAnswerDO.java @@ -0,0 +1,48 @@ +package cn.iocoder.yudao.module.mdpf.dal.dataobject.dataset; + +import cn.iocoder.yudao.framework.mybatis.core.dataobject.BaseDO; +import com.baomidou.mybatisplus.annotation.KeySequence; +import com.baomidou.mybatisplus.annotation.TableId; +import com.baomidou.mybatisplus.annotation.TableName; +import lombok.*; + +/** + * 数据集数据问题标注内容 DO + * + * @author 华大大模型 + */ +@TableName("platform_dataset_answer") +@KeySequence("platform_dataset_answer_seq") // 用于 Oracle、PostgreSQL、Kingbase、DB2、H2 数据库的主键自增。如果是 MySQL 等数据库,可不写。 +@Data +@EqualsAndHashCode(callSuper = true) +@ToString(callSuper = true) +@Builder +@NoArgsConstructor +@AllArgsConstructor +public class PlatformDatasetAnswerDO extends BaseDO { + + /** + * 数据集问题ID + */ + @TableId + private Long id; + /** + * 数据集ID + */ + private Long datasetId; + /** + * 数据文件ID + */ + private Long datasetFilesId; + /** + * 问题ID + */ + private Long questionId; + /** + * 标注内容 + */ + private String answer; + + private String answerFrom; + +} \ No newline at end of file diff --git a/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/dal/dataobject/dataset/PlatformDatasetFilesDO.java b/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/dal/dataobject/dataset/PlatformDatasetFilesDO.java new file mode 100644 index 000000000..fd2b2d22f --- /dev/null +++ b/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/dal/dataobject/dataset/PlatformDatasetFilesDO.java @@ -0,0 
+1,48 @@ +package cn.iocoder.yudao.module.mdpf.dal.dataobject.dataset; + +import cn.iocoder.yudao.framework.mybatis.core.dataobject.BaseDO; +import com.baomidou.mybatisplus.annotation.KeySequence; +import com.baomidou.mybatisplus.annotation.TableId; +import com.baomidou.mybatisplus.annotation.TableName; +import lombok.*; + +/** + * 数据集数据文件 DO + * + * @author 华大大模型 + */ +@TableName("platform_dataset_files") +@KeySequence("platform_dataset_files_seq") // 用于 Oracle、PostgreSQL、Kingbase、DB2、H2 数据库的主键自增。如果是 MySQL 等数据库,可不写。 +@Data +@EqualsAndHashCode(callSuper = true) +@ToString(callSuper = true) +@Builder +@NoArgsConstructor +@AllArgsConstructor +public class PlatformDatasetFilesDO extends BaseDO { + + /** + * 数据集ID + */ + @TableId + private Long id; + /** + * 数据集ID + */ + private Long datasetId; + /** + * 数据长度 + */ + private Long dataLength; + /** + * 数据文件(文件表的ID) + */ + private Long datasetFile; + /** + * 文件URL地址 + */ + private String datasetFileUrl; + + private String datasetFileName; + +} \ No newline at end of file diff --git a/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/dal/dataobject/dataset/PlatformDatasetQuestionDO.java b/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/dal/dataobject/dataset/PlatformDatasetQuestionDO.java new file mode 100644 index 000000000..7a65c0841 --- /dev/null +++ b/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/dal/dataobject/dataset/PlatformDatasetQuestionDO.java @@ -0,0 +1,51 @@ +package cn.iocoder.yudao.module.mdpf.dal.dataobject.dataset; + +import cn.iocoder.yudao.framework.mybatis.core.dataobject.BaseDO; +import com.baomidou.mybatisplus.annotation.KeySequence; +import com.baomidou.mybatisplus.annotation.TableField; +import com.baomidou.mybatisplus.annotation.TableId; +import com.baomidou.mybatisplus.annotation.TableName; +import lombok.*; + +/** + * 数据集数据问题 DO + * + * @author 华大大模型 + */ 
+@TableName("platform_dataset_question") +@KeySequence("platform_dataset_question_seq") // 用于 Oracle、PostgreSQL、Kingbase、DB2、H2 数据库的主键自增。如果是 MySQL 等数据库,可不写。 +@Data +@EqualsAndHashCode(callSuper = true) +@ToString(callSuper = true) +@Builder +@NoArgsConstructor +@AllArgsConstructor +public class PlatformDatasetQuestionDO extends BaseDO { + + /** + * 数据集问题ID + */ + @TableId + private Long id; + /** + * 数据集ID + */ + private Long datasetId; + /** + * 数据文件ID + */ + private Long datasetFilesId; + /** + * 问题内容 + */ + private String question; + /** + * 标注状态,使用字典(llm_dataset_mark_status) + */ + private Integer status; + @TableField("`system`") + private String system; + + private String questionFrom; + +} \ No newline at end of file diff --git a/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/dal/mapper/dataset/DataSetFileMiddleMapper.java b/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/dal/mapper/dataset/DataSetFileMiddleMapper.java new file mode 100644 index 000000000..9dc9a972e --- /dev/null +++ b/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/dal/mapper/dataset/DataSetFileMiddleMapper.java @@ -0,0 +1,32 @@ +package cn.iocoder.yudao.module.mdpf.dal.mapper.dataset; + +import java.util.*; + +import cn.iocoder.yudao.framework.common.pojo.PageResult; +import cn.iocoder.yudao.framework.mybatis.core.query.LambdaQueryWrapperX; +import cn.iocoder.yudao.framework.mybatis.core.mapper.BaseMapperX; +import cn.iocoder.yudao.module.mdpf.controller.dataset.vo.DataSetFileMiddlePageReqVO; +import cn.iocoder.yudao.module.mdpf.dal.dataobject.dataset.DataSetFileMiddleDO; +import com.baomidou.dynamic.datasource.annotation.DS; +import org.apache.ibatis.annotations.Mapper; + +/** + * 数据集对应的文件地址 Mapper + * + * @author 管理员 + */ +@Mapper +@DS("slave") +public interface DataSetFileMiddleMapper extends BaseMapperX { + + default PageResult selectPage(DataSetFileMiddlePageReqVO reqVO) { + 
return selectPage(reqVO, new LambdaQueryWrapperX() + .eqIfPresent(DataSetFileMiddleDO::getDataSetId, reqVO.getDataSetId()) + .eqIfPresent(DataSetFileMiddleDO::getDataSetFileUrl, reqVO.getDataSetFileUrl()) + .eqIfPresent(DataSetFileMiddleDO::getDataSetFileType, reqVO.getDataSetFileType()) + .betweenIfPresent(DataSetFileMiddleDO::getCreateTime, reqVO.getCreateTime()) + .likeIfPresent(DataSetFileMiddleDO::getDatasetFileName, reqVO.getDatasetFileName()) + .orderByDesc(DataSetFileMiddleDO::getId)); + } + +} \ No newline at end of file diff --git a/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/dal/mapper/dataset/DataSetMiddleMapper.java b/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/dal/mapper/dataset/DataSetMiddleMapper.java new file mode 100644 index 000000000..b8eeba5db --- /dev/null +++ b/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/dal/mapper/dataset/DataSetMiddleMapper.java @@ -0,0 +1,45 @@ +package cn.iocoder.yudao.module.mdpf.dal.mapper.dataset; + +import java.util.*; + +import cn.iocoder.yudao.framework.common.pojo.PageResult; +import cn.iocoder.yudao.framework.mybatis.core.query.LambdaQueryWrapperX; +import cn.iocoder.yudao.framework.mybatis.core.mapper.BaseMapperX; +import cn.iocoder.yudao.module.mdpf.controller.dataset.vo.DataSetMiddlePageReqVO; +import cn.iocoder.yudao.module.mdpf.dal.dataobject.dataset.DataSetMiddleDO; +import org.apache.ibatis.annotations.Mapper; +import org.apache.ibatis.annotations.Param; +import org.apache.ibatis.annotations.Update; + +/** + * 中台中的数据集 Mapper + * + * @author 管理员 + */ +@Mapper +public interface DataSetMiddleMapper extends BaseMapperX { + + default PageResult selectPage(DataSetMiddlePageReqVO reqVO) { + return selectPage(reqVO, new LambdaQueryWrapperX() + .likeIfPresent(DataSetMiddleDO::getDatasetName, reqVO.getDatasetName()) + .eqIfPresent(DataSetMiddleDO::getDatasetSource, reqVO.getDatasetSource()) + 
.eqIfPresent(DataSetMiddleDO::getCleanStatus, reqVO.getCleanStatus()) + .eqIfPresent(DataSetMiddleDO::getMarkStatus, reqVO.getMarkStatus()) + .eqIfPresent(DataSetMiddleDO::getDatasetParentType, reqVO.getDatasetParentType()) + .eqIfPresent(DataSetMiddleDO::getRemark, reqVO.getRemark()) + .betweenIfPresent(DataSetMiddleDO::getCreateTime, reqVO.getCreateTime()) + .eqIfPresent(DataSetMiddleDO::getType, reqVO.getType()) + .eqIfPresent(DataSetMiddleDO::getDatasetIntro, reqVO.getDatasetIntro()) + .eqIfPresent(DataSetMiddleDO::getDatasetType, reqVO.getDatasetType()) + .eqIfPresent(DataSetMiddleDO::getDatasetCategory, reqVO.getDatasetCategory()) + .eqIfPresent(DataSetMiddleDO::getDataLength, reqVO.getDataLength()) + .eqIfPresent(DataSetMiddleDO::getAnnotateProgress, reqVO.getAnnotateProgress()) + .eqIfPresent(DataSetMiddleDO::getMongoId, reqVO.getMongoId()) + .orderByDesc(DataSetMiddleDO::getId)); + } + + @Update("update data_set_middle set annotate_progress = #{formattedRatio},status=#{status} where id = #{datasetId}") // Sets annotate_progress and status on the row with the given id. NOTE(review): DataSetMiddleDO declares cleanStatus/markStatus but no "status" field - confirm the data_set_middle table really has a "status" column (mark_status may be intended). + void updateProcess(@Param("formattedRatio") Integer formattedRatio, @Param("datasetId") Long datasetId, @Param("status") Integer status); + + +} \ No newline at end of file diff --git a/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/dal/mapper/dataset/PlatformDatasetAnswerMapper.java b/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/dal/mapper/dataset/PlatformDatasetAnswerMapper.java new file mode 100644 index 000000000..024ed8060 --- /dev/null +++ b/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/dal/mapper/dataset/PlatformDatasetAnswerMapper.java @@ -0,0 +1,40 @@ +package cn.iocoder.yudao.module.mdpf.dal.mapper.dataset; + +import cn.iocoder.yudao.framework.common.pojo.PageResult; +import cn.iocoder.yudao.framework.mybatis.core.mapper.BaseMapperX; +import cn.iocoder.yudao.framework.mybatis.core.query.LambdaQueryWrapperX; +import
cn.iocoder.yudao.module.mdpf.controller.dataset.vo.PlatformDatasetAnswerPageReqVO; +import cn.iocoder.yudao.module.mdpf.controller.dataset.vo.PlatformDatasetAnswerRespVO; +import cn.iocoder.yudao.module.mdpf.dal.dataobject.dataset.PlatformDatasetAnswerDO; +import com.baomidou.dynamic.datasource.annotation.DS; +import org.apache.ibatis.annotations.Delete; +import org.apache.ibatis.annotations.Mapper; +import org.apache.ibatis.annotations.Param; + +import java.util.List; + +/** + * 数据集数据问题标注内容 Mapper + * + * @author 华大大模型 + */ +@Mapper +@DS("slave") +public interface PlatformDatasetAnswerMapper extends BaseMapperX { + + default PageResult selectPage(PlatformDatasetAnswerPageReqVO reqVO) { + return selectPage(reqVO, new LambdaQueryWrapperX() + .eqIfPresent(PlatformDatasetAnswerDO::getDatasetId, reqVO.getDatasetId()) + .eqIfPresent(PlatformDatasetAnswerDO::getDatasetFilesId, reqVO.getDatasetFilesId()) + .eqIfPresent(PlatformDatasetAnswerDO::getQuestionId, reqVO.getQuestionId()) + .eqIfPresent(PlatformDatasetAnswerDO::getAnswer, reqVO.getAnswer()) + .betweenIfPresent(PlatformDatasetAnswerDO::getCreateTime, reqVO.getCreateTime()) + .orderByDesc(PlatformDatasetAnswerDO::getId)); + } + @Delete("delete from platform_dataset_answer where dataset_id = #{datasetPostId}") + void deleteTrue(@Param("datasetPostId") Long datasetPostId); + + List getAnswersToYourQuestions(@Param("collected") List collected); + + void deleteTheAnswer(@Param("id") Long id); +} diff --git a/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/dal/mapper/dataset/PlatformDatasetFilesMapper.java b/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/dal/mapper/dataset/PlatformDatasetFilesMapper.java new file mode 100644 index 000000000..1f2f1c261 --- /dev/null +++ b/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/dal/mapper/dataset/PlatformDatasetFilesMapper.java @@ -0,0 +1,31 @@ +package 
cn.iocoder.yudao.module.mdpf.dal.mapper.dataset; + +import cn.iocoder.yudao.framework.common.pojo.PageResult; +import cn.iocoder.yudao.framework.mybatis.core.mapper.BaseMapperX; +import cn.iocoder.yudao.framework.mybatis.core.query.LambdaQueryWrapperX; +import cn.iocoder.yudao.module.mdpf.controller.dataset.vo.PlatformDatasetFilesPageReqVO; +import cn.iocoder.yudao.module.mdpf.dal.dataobject.dataset.PlatformDatasetFilesDO; +import com.baomidou.dynamic.datasource.annotation.DS; +import org.apache.ibatis.annotations.Mapper; + + +/** + * 数据集数据文件 Mapper + * + * @author 华大大模型 + */ +@Mapper +@DS("slave") +public interface PlatformDatasetFilesMapper extends BaseMapperX { + + default PageResult selectPage(PlatformDatasetFilesPageReqVO reqVO) { + return selectPage(reqVO, new LambdaQueryWrapperX() + .eqIfPresent(PlatformDatasetFilesDO::getDatasetId, reqVO.getDatasetId()) + .eqIfPresent(PlatformDatasetFilesDO::getDataLength, reqVO.getDataLength()) + .eqIfPresent(PlatformDatasetFilesDO::getDatasetFile, reqVO.getDatasetFile()) + .eqIfPresent(PlatformDatasetFilesDO::getDatasetFileUrl, reqVO.getDatasetFileUrl()) + .betweenIfPresent(PlatformDatasetFilesDO::getCreateTime, reqVO.getCreateTime()) + .orderByDesc(PlatformDatasetFilesDO::getId)); + } + +} \ No newline at end of file diff --git a/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/dal/mapper/dataset/PlatformDatasetQuestionMapper.java b/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/dal/mapper/dataset/PlatformDatasetQuestionMapper.java new file mode 100644 index 000000000..7a4206902 --- /dev/null +++ b/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/dal/mapper/dataset/PlatformDatasetQuestionMapper.java @@ -0,0 +1,41 @@ +package cn.iocoder.yudao.module.mdpf.dal.mapper.dataset; + +import cn.iocoder.yudao.framework.common.pojo.PageResult; +import cn.iocoder.yudao.framework.mybatis.core.mapper.BaseMapperX; +import 
cn.iocoder.yudao.framework.mybatis.core.query.LambdaQueryWrapperX; +import cn.iocoder.yudao.module.mdpf.controller.dataset.vo.PlatformDatasetQuestionPageReqVO; +import cn.iocoder.yudao.module.mdpf.controller.dataset.vo.PlatformDatasetQuestionRespVO; +import cn.iocoder.yudao.module.mdpf.dal.dataobject.dataset.PlatformDatasetQuestionDO; +import com.baomidou.dynamic.datasource.annotation.DS; +import org.apache.ibatis.annotations.Delete; +import org.apache.ibatis.annotations.Mapper; +import org.apache.ibatis.annotations.Param; + +import java.util.List; + + +/** + * Mapper for dataset questions (table platform_dataset_question) + * + * @author 华大大模型 + */ +@Mapper +@DS("slave") +public interface PlatformDatasetQuestionMapper extends BaseMapperX { + + default PageResult selectPage(PlatformDatasetQuestionPageReqVO reqVO) { + return selectPage(reqVO, new LambdaQueryWrapperX() + .eqIfPresent(PlatformDatasetQuestionDO::getDatasetId, reqVO.getDatasetId()) + .eqIfPresent(PlatformDatasetQuestionDO::getDatasetFilesId, reqVO.getDatasetFilesId()) + .eqIfPresent(PlatformDatasetQuestionDO::getQuestion, reqVO.getQuestion()) + .eqIfPresent(PlatformDatasetQuestionDO::getStatus, reqVO.getStatus()) + .betweenIfPresent(PlatformDatasetQuestionDO::getCreateTime, reqVO.getCreateTime()) + .orderByDesc(PlatformDatasetQuestionDO::getId)); + } + @Delete("delete from platform_dataset_question where dataset_id = #{datasetPostId}") // Hard-deletes (raw SQL DELETE) every question row of the given dataset; answer rows are removed by PlatformDatasetAnswerMapper.deleteTrue. + void deleteTrue(@Param("datasetPostId") Long datasetPostId); + + List getAListOfIssues(@Param("datasetId") Long datasetId); + + void deleteTheIssue(@Param("id") Long id); +} diff --git a/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/dal/mongo/DataSetFileMiddleMongoDO.java b/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/dal/mongo/DataSetFileMiddleMongoDO.java new file mode 100644 index 000000000..164a52f08 --- /dev/null +++ b/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/dal/mongo/DataSetFileMiddleMongoDO.java
@@ -0,0 +1,43 @@ +package cn.iocoder.yudao.module.mdpf.dal.mongo; + +import cn.iocoder.yudao.framework.mybatis.core.dataobject.BaseDO; +import com.baomidou.mybatisplus.annotation.KeySequence; +import com.baomidou.mybatisplus.annotation.TableId; +import com.baomidou.mybatisplus.annotation.TableName; +import lombok.*; + +/** + * Data object describing the stored file location that belongs to a dataset. + * + * @author admin + */ +@Data +@EqualsAndHashCode(callSuper = true) +@ToString(callSuper = true) +@Builder +@NoArgsConstructor +@AllArgsConstructor +public class DataSetFileMiddleMongoDO extends BaseDO { + + /** + * Primary key ID + */ + private Long id; + /** + * ID of the dataset this file belongs to + */ + private Long dataSetId; + /** + * Upload URL of the dataset file + */ + private String dataSetFileUrl; + /** + * Dataset file type: 0 = data file, 1 = image, 2 = video + */ + private String dataSetFileType; + /** + * Dataset file name + */ + private String datasetFileName; + +} \ No newline at end of file diff --git a/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/dal/mongo/DataSetMiddleMongoDO.java b/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/dal/mongo/DataSetMiddleMongoDO.java new file mode 100644 index 000000000..2840a9425 --- /dev/null +++ b/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/dal/mongo/DataSetMiddleMongoDO.java @@ -0,0 +1,82 @@ +package cn.iocoder.yudao.module.mdpf.dal.mongo; + +import cn.iocoder.yudao.framework.mybatis.core.dataobject.BaseDO; +import lombok.*; +import org.springframework.data.annotation.Id; +import org.springframework.data.mongodb.core.mapping.Document; + +import java.util.List; + +/** + * Dataset data object of the middle platform, stored in the MongoDB collection {@code data_set}. + * + * @author admin + */ +@Data +@EqualsAndHashCode(callSuper = true) +@ToString(callSuper = true) +@Builder +@NoArgsConstructor +@AllArgsConstructor +@Document(collection = "data_set") +public class DataSetMiddleMongoDO extends BaseDO { + + /** + * Primary key ID + */ + @Id + private Long id; + /** + * Dataset name + */ + private String dataSetName; + /** + * Dataset source + */ + private String dataSetSource; 
+ /** + * Cleaning status: 0 = not cleaned, 1 = cleaned + */ + private Integer cleanStatus; + /** + * Annotation status: 0 = not finished, 1 = in progress, 2 = finished + */ + private Integer markStatus; + /** + * Parent dataset type (1 = text dataset, 2 = multimodal dataset) + */ + private Integer datasetParentType; + /** + * Remark + */ + private String remark; + /** + * Dataset type: 0 = normal, 1 = official + */ + private Integer type; + /** + * Dataset description + */ + private String datasetIntro; + /** + * Dataset type (1 = training dataset, 2 = evaluation dataset) + */ + private Integer datasetType; + /** + * Dataset category, taken from the dictionaries llm_dataset_category_1 / llm_dataset_category_2 + */ + private Integer datasetCategory; + /** + * Data length + */ + private Long dataLength; + /** + * Annotation progress + */ + private Integer annotateProgress; + /** + * IDs of the corresponding data stored in MongoDB + */ + private List dataSetFileList; + +} \ No newline at end of file diff --git a/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/dal/mongorepository/DataSetMiddleRepository.java b/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/dal/mongorepository/DataSetMiddleRepository.java new file mode 100644 index 000000000..c37d79910 --- /dev/null +++ b/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/dal/mongorepository/DataSetMiddleRepository.java @@ -0,0 +1,11 @@ +package cn.iocoder.yudao.module.mdpf.dal.mongorepository; + +import cn.iocoder.yudao.module.mdpf.dal.mongo.DataSetMiddleMongoDO; +import org.springframework.data.mongodb.repository.MongoRepository; +import org.springframework.stereotype.Repository; + +/** Spring Data MongoDB repository for DataSetMiddleMongoDO documents; it only re-declares the inherited insert with the concrete entity type. */ @Repository +public interface DataSetMiddleRepository extends MongoRepository { + @Override + DataSetMiddleMongoDO insert(DataSetMiddleMongoDO entity); +} diff --git a/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/factory/datset/FileParserStrategyFactory.java b/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/factory/datset/FileParserStrategyFactory.java new file mode 100644 index 000000000..e2ccb2406 --- /dev/null +++ 
b/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/factory/datset/FileParserStrategyFactory.java @@ -0,0 +1,78 @@ +package cn.iocoder.yudao.module.mdpf.factory.datset; + + +import cn.iocoder.yudao.module.mdpf.factory.datset.IFileParserStrategy; +import lombok.extern.slf4j.Slf4j; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.stereotype.Component; + +import java.util.HashMap; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import javax.annotation.PostConstruct; + +/** + * File parser strategy factory. + * Registers every injected IFileParserStrategy implementation at startup and hands out the strategy that matches a file extension. + */ +@Component +@Slf4j +public class FileParserStrategyFactory { + + private final List allStrategies; // Spring injects every IFileParserStrategy implementation + private final Map strategyMap = new HashMap<>(); // extension -> strategy lookup cache + + @Autowired + public FileParserStrategyFactory(List allStrategies) { + this.allStrategies = allStrategies; + } + + @PostConstruct // runs once dependency injection has completed + public void init() { + log.info("Initializing FileParserStrategyFactory, registering strategies..."); + for (IFileParserStrategy strategy : allStrategies) { + List supportedExtensions = strategy.getSupportedExtensions(); // ask the strategy which extensions it handles + if (supportedExtensions == null || supportedExtensions.isEmpty()) { + log.warn("Strategy {} does not declare any supported extensions. It will not be registered.", + strategy.getClass().getSimpleName()); + continue; + } + + for (String extension : supportedExtensions) { + String lowerCaseExtension = extension.toLowerCase(Locale.ROOT); + if (strategyMap.containsKey(lowerCaseExtension)) { + log.warn("Duplicate file parser strategy registered for extension '{}'. Overwriting with {}. 
" + + "Previous strategy was {}.", + lowerCaseExtension, + strategy.getClass().getSimpleName(), + strategyMap.get(lowerCaseExtension).getClass().getSimpleName()); + } + strategyMap.put(lowerCaseExtension, strategy); + log.info("Registered strategy {} for extension '{}'.", strategy.getClass().getSimpleName(), lowerCaseExtension); + } + } + log.info("FileParserStrategyFactory initialization complete. Total {} strategies registered for {} extensions.", + allStrategies.size(), strategyMap.size()); + } + + /** + * Returns the parser strategy registered for a file extension. + * The extension is lower-cased with Locale.ROOT (matching registration) and looked up directly in the pre-filled map. + * + * @param fileExtension file extension + * @return the matching IFileParserStrategy instance + * @throws IllegalArgumentException if the extension is null or no strategy supports it + */ + public IFileParserStrategy getStrategy(String fileExtension) { + if (fileExtension == null) { + throw new IllegalArgumentException("File extension must not be null"); + } + String lowerCaseExtension = fileExtension.toLowerCase(Locale.ROOT); + IFileParserStrategy strategy = strategyMap.get(lowerCaseExtension); + if (strategy == null) { + throw new IllegalArgumentException("No file parser strategy found for extension: " + fileExtension); + } + return strategy; + } +} \ No newline at end of file diff --git a/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/factory/datset/IFileParserStrategy.java b/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/factory/datset/IFileParserStrategy.java new file mode 100644 index 000000000..ac8c9cd34 --- /dev/null +++ b/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/factory/datset/IFileParserStrategy.java @@ -0,0 +1,21 @@ +package cn.iocoder.yudao.module.mdpf.factory.datset; + +import cn.iocoder.yudao.module.mdpf.controller.dataset.vo.DataSetMiddleSaveReqVO; +import cn.iocoder.yudao.module.mdpf.dal.dataobject.dataset.DataSetFileMiddleDO; +import cn.iocoder.yudao.module.mdpf.dal.dataobject.dataset.PlatformDatasetFilesDO; + +import java.io.File; +import java.io.InputStream; +import java.time.LocalDateTime; +import java.util.List; +import java.util.Map; + +public interface 
IFileParserStrategy { + /** Whether this strategy handles the given file extension. */ public boolean supports(String fileExtension); + /** Parses the content of one file into a list of segment maps. */ List> parseFileContentToString(File file, InputStream inputStream, String datasetMetaId, + String originalMinioPath, LocalDateTime processTime, + Map additionalMetadata, PlatformDatasetFilesDO platformDatasetFilesDO); + /** Extensions under which FileParserStrategyFactory registers this strategy. */ public List getSupportedExtensions(); + + /** Stores the cleaned records of one source file and returns the record describing the stored file; the ZIP strategy in this package currently returns null. */ public DataSetFileMiddleDO createFileToMiIO(List cleanedFileRecordsForOneFile, DataSetMiddleSaveReqVO createReqVO,Long fileid,String filename,String extendFilename,String url); +} diff --git a/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/factory/datset/JsonFileParser.java b/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/factory/datset/JsonFileParser.java new file mode 100644 index 000000000..9f680b80f --- /dev/null +++ b/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/factory/datset/JsonFileParser.java @@ -0,0 +1,181 @@ +package cn.iocoder.yudao.module.mdpf.factory.datset; + +import cn.iocoder.yudao.framework.common.util.object.BeanUtils; +import cn.iocoder.yudao.module.infra.service.file.FileService; +import cn.iocoder.yudao.module.mdpf.controller.dataset.dto.PlatformDataJsonTemplate; +import cn.iocoder.yudao.module.mdpf.controller.dataset.vo.DataSetMiddleSaveReqVO; +import cn.iocoder.yudao.module.mdpf.dal.dataobject.dataset.DataSetFileMiddleDO; +import cn.iocoder.yudao.module.mdpf.dal.dataobject.dataset.PlatformDatasetAnswerDO; +import cn.iocoder.yudao.module.mdpf.dal.dataobject.dataset.PlatformDatasetFilesDO; +import cn.iocoder.yudao.module.mdpf.dal.dataobject.dataset.PlatformDatasetQuestionDO; +import cn.iocoder.yudao.module.mdpf.dal.mapper.dataset.PlatformDatasetAnswerMapper; +import cn.iocoder.yudao.module.mdpf.dal.mapper.dataset.PlatformDatasetQuestionMapper; +import cn.iocoder.yudao.module.mdpf.util.ParserUtils; +import com.baomidou.mybatisplus.core.toolkit.CollectionUtils; +import 
com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.databind.node.ArrayNode; +import com.fasterxml.jackson.databind.node.ObjectNode; +import lombok.extern.slf4j.Slf4j; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.stereotype.Component; + +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.io.UnsupportedEncodingException; +import java.nio.charset.StandardCharsets; +import java.time.LocalDateTime; +import java.util.*; +import java.util.stream.Collectors; + +import com.fasterxml.jackson.core.type.TypeReference; +import org.springframework.util.StringUtils; +/** Parser strategy registered for the "json" extension: turns a whole JSON document into one segment and stores cleaned records through the file service. */ @Component +@Slf4j +public class JsonFileParser implements IFileParserStrategy { + + @Autowired + private ObjectMapper objectMapper; + + @Autowired + private ParserUtils parserUtils; + + @Autowired + private FileService fileService; + + @Autowired + private PlatformDatasetAnswerMapper platformDatasetAnswerMapper; + + @Autowired + private PlatformDatasetQuestionMapper platformDatasetQuestionMapper; + + + @Override + public boolean supports(String fileExtension) { + return "json".equals(fileExtension); + } + + /** Reads the whole stream as one JSON document and returns a single segment map for it; array roots are additionally persisted as question/answer rows. Returns an empty list when the stream is null and a single error segment when processing fails. */ @Override + public List> parseFileContentToString(File file, InputStream inputStream, String datasetMetaId, + String originalMinioPath, LocalDateTime processTime, Map additionalMetadata, PlatformDatasetFilesDO platformDatasetFilesDO) { + /* The content is read from inputStream only; the file argument is not used by this strategy. */ + + if (inputStream == null) { + log.error("Input stream is required for JSON file parsing for datasetMetaId: {}", datasetMetaId); + return Collections.emptyList(); + } + + try { + // Read the whole JSON document into a single JsonNode + JsonNode rootNode = objectMapper.readTree(inputStream); + + // Serialize the whole JsonNode back to a string and use it as extractedText + String extractedText = objectMapper.writeValueAsString(rootNode); + + // Build the segment metadata, starting from a copy of the caller-supplied metadata (if any) + Map segmentMetadata = new HashMap<>(additionalMetadata 
!= null ? additionalMetadata : Collections.emptyMap()); + + if (rootNode.isObject()) { + // Root is a JSON object: merge its fields straight into the metadata map + segmentMetadata.putAll(objectMapper.convertValue(rootNode, new TypeReference>() {})); + } else if (rootNode.isArray()) { + // Root is a JSON array: store it under a dedicated metadata key + segmentMetadata.put("json_root_array_content", objectMapper.convertValue(rootNode, new TypeReference>() {})); + // Parse the array entries and persist them to the database + jsonParsing(rootNode,platformDatasetFilesDO); + } else if (rootNode.isValueNode()) { + // Root is a single value (string, number, boolean, ...): store it under a dedicated key as well + segmentMetadata.put("json_root_value_content", rootNode.asText()); + } else { + // Any other node kind + segmentMetadata.put("json_root_content_type", "unsupported"); + segmentMetadata.put("json_root_content_raw", rootNode.toString()); + } + + log.info("Parsed entire JSON file as a single segment for datasetMetaId: {}.", datasetMetaId); + + // Let ParserUtils build the one segment map for this file + Map singleSegment = parserUtils.createSegmentMap( + datasetMetaId, originalMinioPath, "json", extractedText, + segmentMetadata, processTime, "json_full_file_segment"); + + return Collections.singletonList(singleSegment); // a list holding just this one segment + + } catch (Exception e) { /* also catches failures raised while persisting array entries in jsonParsing */ + log.error("Failed to parse JSON file content as single segment for datasetMetaId: {} (MinIO: {}): {}", + datasetMetaId, originalMinioPath, e.getMessage(), e); + String errorMessage = "Failed to parse entire JSON file as single segment: " + e.getMessage(); + Map errorMetadata = new HashMap<>(); + errorMetadata.putAll(additionalMetadata != null ? 
additionalMetadata : Collections.emptyMap()); + errorMetadata.put("error", errorMessage); + return Collections.singletonList(parserUtils.createSegmentMap( + datasetMetaId, originalMinioPath, "json", errorMessage, + errorMetadata, processTime, "json_parsing_error" + )); + } + } + + @Override + public List getSupportedExtensions() { + return Arrays.asList("json"); + } + + + /** + * Serializes the cleaned records as pretty-printed UTF-8 JSON, stores the bytes through the file service and returns the first record updated with the stored file URL, the source extension and the source URL. + * + * @throws IllegalArgumentException if the record list is null or empty (checked before anything is uploaded) + */ + @Override + public DataSetFileMiddleDO createFileToMiIO(List cleanedFileRecordsForOneFile, DataSetMiddleSaveReqVO createReqVO, Long fileid, String filename, String extendFilename, String url){ + if (cleanedFileRecordsForOneFile == null || cleanedFileRecordsForOneFile.isEmpty()) { + throw new IllegalArgumentException("cleanedFileRecordsForOneFile must not be empty"); + } + try { + String aggregatedJson = objectMapper.writerWithDefaultPrettyPrinter().writeValueAsString(cleanedFileRecordsForOneFile); + byte[] jsonBytes = aggregatedJson.getBytes(StandardCharsets.UTF_8); + + String fileurl = fileService.createFile(filename, "", jsonBytes); + DataSetFileMiddleDO dataSetFileMiddleDO = cleanedFileRecordsForOneFile.get(0); + dataSetFileMiddleDO.setDataSetFileUrl(fileurl); + dataSetFileMiddleDO.setSourceFileExtension(extendFilename); + dataSetFileMiddleDO.setSourceFileUrl(url); + return dataSetFileMiddleDO; + } catch (JsonProcessingException e) { + throw new IllegalStateException("Failed to serialize cleaned records for file " + filename, e); + } + } + + + + private void jsonParsing(JsonNode jsonNode, PlatformDatasetFilesDO datasetFilesDO) throws JsonProcessingException,IOException { + // Bind the array node to the list of template entries described by the TypeReference. + // Each entry yields one question row and one answer row per answer string; status is written as 2 when answers exist and 0 otherwise. + // NOTE(review): rows are inserted one at a time and no transaction is declared in this class -- confirm the caller supplies one. + List jsonList = objectMapper.readValue(objectMapper.treeAsTokens(jsonNode), new TypeReference>() 
{}); + jsonList.forEach( + dataJsonTemplate -> { + List answers = dataJsonTemplate.getAnswers(); + PlatformDatasetQuestionDO datasetQuestionDO = BeanUtils.toBean(dataJsonTemplate, PlatformDatasetQuestionDO.class); + datasetQuestionDO.setDatasetId(datasetFilesDO.getDatasetId()); + datasetQuestionDO.setDatasetFilesId(datasetFilesDO.getId()); + datasetQuestionDO.setStatus(CollectionUtils.isNotEmpty(answers) ? 2 : 0); + platformDatasetQuestionMapper.insert(datasetQuestionDO); + if (CollectionUtils.isNotEmpty(answers)) { + for (String answer : answers) { + PlatformDatasetAnswerDO datasetAnswerDO = new PlatformDatasetAnswerDO(); + datasetAnswerDO.setDatasetId(datasetFilesDO.getDatasetId()); + datasetAnswerDO.setDatasetFilesId(datasetFilesDO.getId()); + datasetAnswerDO.setQuestionId(datasetQuestionDO.getId()); + datasetAnswerDO.setAnswer(answer); + platformDatasetAnswerMapper.insert(datasetAnswerDO); + } + } + } + ); + + } + +} diff --git a/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/factory/datset/ZIPFileParser.java b/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/factory/datset/ZIPFileParser.java new file mode 100644 index 000000000..dae26d16c --- /dev/null +++ b/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/factory/datset/ZIPFileParser.java @@ -0,0 +1,37 @@ +package cn.iocoder.yudao.module.mdpf.factory.datset; + +import cn.iocoder.yudao.module.mdpf.controller.dataset.vo.DataSetMiddleSaveReqVO; +import cn.iocoder.yudao.module.mdpf.dal.dataobject.dataset.DataSetFileMiddleDO; +import cn.iocoder.yudao.module.mdpf.dal.dataobject.dataset.PlatformDatasetFilesDO; +import org.springframework.stereotype.Component; + +import java.io.File; +import java.io.InputStream; +import java.time.LocalDateTime; +import java.util.Arrays; +import java.util.List; +import java.util.Map; + +/** Placeholder strategy for the "zip" extension: it declares the extension but both processing methods currently return null. */ @Component +public class ZIPFileParser implements IFileParserStrategy { + @Override + public 
boolean supports(String fileExtension) { + return "zip".equals(fileExtension); + } + + @Override + public List> parseFileContentToString(File file, InputStream inputStream, String datasetMetaId, String originalMinioPath, LocalDateTime processTime, Map additionalMetadata, PlatformDatasetFilesDO platformDatasetFilesDO) { + return null; /* NOTE(review): stub -- callers receive null rather than an empty list */ + } + + @Override + public List getSupportedExtensions() { + return Arrays.asList("zip"); + } + + @Override + public DataSetFileMiddleDO createFileToMiIO(List cleanedFileRecordsForOneFile, DataSetMiddleSaveReqVO createReqVO, Long fileid, String filename, String extendFilename, String url) { + + return null; /* NOTE(review): stub -- nothing is stored for zip files */ + } +} diff --git a/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/framework/config/MdpfMongoRepositoryConfiguration.java b/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/framework/config/MdpfMongoRepositoryConfiguration.java new file mode 100644 index 000000000..b25be0af5 --- /dev/null +++ b/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/framework/config/MdpfMongoRepositoryConfiguration.java @@ -0,0 +1,12 @@ +package cn.iocoder.yudao.module.mdpf.framework.config; + +import org.springframework.context.annotation.Configuration; +import org.springframework.data.mongodb.repository.config.EnableMongoRepositories; + +@Configuration // marks this as a Spring configuration class +@EnableMongoRepositories( + basePackages = "cn.iocoder.yudao.module.mdpf.dal.mongorepository" // <--- explicit scan path for the MongoDB repositories +) +public class MdpfMongoRepositoryConfiguration { + // Intentionally empty: the class only exists to enable and configure MongoDB repository scanning through @EnableMongoRepositories. +} \ No newline at end of file diff --git a/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/service/dataset/DataSetFileMiddleService.java b/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/service/dataset/DataSetFileMiddleService.java new file mode 100644 index 
000000000..9ca483d9b --- /dev/null +++ b/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/service/dataset/DataSetFileMiddleService.java @@ -0,0 +1,65 @@ +package cn.iocoder.yudao.module.mdpf.service.dataset; + +import java.util.*; + +import cn.iocoder.yudao.module.mdpf.controller.dataset.vo.DataSetFileMiddlePageReqVO; +import cn.iocoder.yudao.module.mdpf.controller.dataset.vo.DataSetFileMiddleSaveReqVO; +import cn.iocoder.yudao.module.mdpf.dal.dataobject.dataset.DataSetFileMiddleDO; +import cn.iocoder.yudao.framework.common.pojo.PageResult; +import cn.iocoder.yudao.framework.common.pojo.PageParam; +import com.baomidou.dynamic.datasource.annotation.DS; + +import javax.validation.Valid; + +/** + * Service interface for the file locations that belong to a dataset. + * + * @author admin + */ +public interface DataSetFileMiddleService { + + /** + * Creates a dataset file-location record. + * + * @param createReqVO creation data + * @return the record ID + */ + Long createSetFileMiddle(@Valid DataSetFileMiddleSaveReqVO createReqVO); + + /** Overload that takes an already-built data object; annotated for the "slave" data source. */ @DS("slave") + Long createSetFileMiddle(DataSetFileMiddleDO createReqVO); + + /** + * Updates a dataset file-location record. + * + * @param updateReqVO update data + */ + void updateSetFileMiddle(@Valid DataSetFileMiddleSaveReqVO updateReqVO); + + /** Update variant annotated for the "slave" data source; its parameter carries no {@code @Valid}. */ @DS("slave") + void updateDataSetFileMiddle(DataSetFileMiddleSaveReqVO updateReqVO); + + /** + * Deletes a dataset file-location record. + * + * @param id the record ID + */ + void deleteSetFileMiddle(Long id); + + /** + * Gets a dataset file-location record. + * + * @param id the record ID + * @return the dataset file-location record + */ + DataSetFileMiddleDO getSetFileMiddle(Long id); + + /** + * Gets a page of dataset file-location records. + * + * @param pageReqVO paging query + * @return the page of dataset file-location records + */ + PageResult getSetFileMiddlePage(DataSetFileMiddlePageReqVO pageReqVO); + +} \ No newline at end of file diff --git a/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/service/dataset/DataSetMiddleService.java b/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/service/dataset/DataSetMiddleService.java new file mode 100644 index 
000000000..d14371c0f --- /dev/null +++ b/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/service/dataset/DataSetMiddleService.java @@ -0,0 +1,79 @@ +package cn.iocoder.yudao.module.mdpf.service.dataset; + +import java.util.*; +import cn.iocoder.yudao.framework.common.pojo.PageResult; +import cn.iocoder.yudao.framework.common.pojo.PageParam; +import cn.iocoder.yudao.module.mdpf.controller.dataset.vo.DataSetFileMiddleSaveReqVO; +import cn.iocoder.yudao.module.mdpf.controller.dataset.vo.DataSetMiddlePageReqVO; +import cn.iocoder.yudao.module.mdpf.controller.dataset.vo.DataSetMiddleRespVO; +import cn.iocoder.yudao.module.mdpf.controller.dataset.vo.DataSetMiddleSaveReqVO; +import cn.iocoder.yudao.module.mdpf.dal.dataobject.dataset.DataSetMiddleDO; +import com.baomidou.dynamic.datasource.annotation.DS; + +import javax.validation.Valid; + +/** + * Service interface for the datasets of the middle platform. + * + * @author admin + */ +public interface DataSetMiddleService { + + /** Creates a dataset from an already-built data object on the "slave" data source (the same key every other annotation in this interface uses). */ @DS("slave") + Long createDataSetMiddle(DataSetMiddleDO createReqVO); + + /** + * Creates a middle-platform dataset. + * + * @param createReqVO creation data + * @return the record ID + */ + Long createSetMiddle(@Valid DataSetMiddleSaveReqVO createReqVO); + + @DS("slave") + void updataDataSetMiddle(DataSetMiddleSaveReqVO updateReqVO); + + /** + * Updates a middle-platform dataset. + * + * @param updateReqVO update data + */ + void updateSetMiddle(@Valid DataSetMiddleSaveReqVO updateReqVO); + + /** + * Deletes a middle-platform dataset. + * + * @param id the record ID + */ + void deleteSetMiddle(Long id); + + /** + * Gets a middle-platform dataset. + * + * @param id the record ID + * @return the middle-platform dataset + */ + DataSetMiddleDO getSetMiddle(Long id); + + @DS("slave") + DataSetMiddleRespVO getOneInfo(Long id); + + /** + * Gets a page of middle-platform datasets. + * + * @param pageReqVO paging query + * @return the page of middle-platform datasets + */ + PageResult getSetMiddlePage(DataSetMiddlePageReqVO pageReqVO); + + @DS("slave") + DataSetMiddleDO getOne(Long datasetid); + + @DS("slave") + List getDataSetMiddleList(Integer datasetParentType); + + @DS("slave") + void updateProcess(Integer formattedRatio, Long 
datasetId, Integer status); + +// String getDataSetUrl(Long datasetId,String hostUrl); +} \ No newline at end of file diff --git a/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/service/dataset/PlatformDatasetAnswerService.java b/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/service/dataset/PlatformDatasetAnswerService.java new file mode 100644 index 000000000..a01236280 --- /dev/null +++ b/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/service/dataset/PlatformDatasetAnswerService.java @@ -0,0 +1,55 @@ +package cn.iocoder.yudao.module.mdpf.service.dataset; + +import cn.iocoder.yudao.framework.common.pojo.PageResult; +import cn.iocoder.yudao.module.mdpf.controller.dataset.vo.PlatformDatasetAnswerPageReqVO; +import cn.iocoder.yudao.module.mdpf.controller.dataset.vo.PlatformDatasetAnswerSaveReqVO; +import cn.iocoder.yudao.module.mdpf.dal.dataobject.dataset.PlatformDatasetAnswerDO; + +import javax.validation.Valid; + +/** + * Service interface for the annotated answers of dataset questions. + * + * @author Huada LLM + */ +public interface PlatformDatasetAnswerService { + + /** + * Creates an annotated answer for a dataset question. + * + * @param createReqVO creation data + * @return the record ID + */ + Long createDatasetAnswer(@Valid PlatformDatasetAnswerSaveReqVO createReqVO); + + /** + * Updates an annotated answer of a dataset question. + * + * @param updateReqVO update data + */ + void updateDatasetAnswer(@Valid PlatformDatasetAnswerSaveReqVO updateReqVO); + + /** + * Deletes an annotated answer of a dataset question. + * + * @param id the record ID + */ + void deleteDatasetAnswer(Long id); + + /** + * Gets an annotated answer of a dataset question. + * + * @param id the record ID + * @return the annotated answer + */ + PlatformDatasetAnswerDO getDatasetAnswer(Long id); + + /** + * Gets a page of annotated answers of dataset questions. + * + * @param pageReqVO paging query + * @return the page of annotated answers + */ + PageResult getDatasetAnswerPage(PlatformDatasetAnswerPageReqVO pageReqVO); + +} \ No newline at end of file diff --git 
a/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/service/dataset/PlatformDatasetFilesService.java b/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/service/dataset/PlatformDatasetFilesService.java new file mode 100644 index 000000000..543898290 --- /dev/null +++ b/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/service/dataset/PlatformDatasetFilesService.java @@ -0,0 +1,59 @@ +package cn.iocoder.yudao.module.mdpf.service.dataset; + +import cn.iocoder.yudao.framework.common.pojo.PageResult; +import cn.iocoder.yudao.module.mdpf.controller.dataset.vo.PlatformDatasetFilesPageReqVO; +import cn.iocoder.yudao.module.mdpf.controller.dataset.vo.PlatformDatasetFilesSaveReqVO; +import cn.iocoder.yudao.module.mdpf.dal.dataobject.dataset.PlatformDatasetFilesDO; +import com.baomidou.mybatisplus.core.conditions.query.LambdaQueryWrapper; + +import javax.validation.Valid; +import java.util.List; + +/** + * Service interface for dataset data files. + * + * @author Huada LLM + */ +public interface PlatformDatasetFilesService { + + /** + * Creates a dataset data file record. + * + * @param createReqVO creation data + * @return the record ID + */ + Long createDatasetFiles(@Valid PlatformDatasetFilesSaveReqVO createReqVO); + /** Overload that takes an already-built data object instead of a validated request. */ Long createDatasetFiles( PlatformDatasetFilesDO platformDatasetFilesDO); + + /** + * Updates a dataset data file record. + * + * @param updateReqVO update data + */ + void updateDatasetFiles(@Valid PlatformDatasetFilesSaveReqVO updateReqVO); + + /** + * Deletes a dataset data file record. + * + * @param id the record ID + */ + void deleteDatasetFiles(Long id); + + /** + * Gets a dataset data file record. + * + * @param id the record ID + * @return the dataset data file record + */ + PlatformDatasetFilesDO getDatasetFiles(Long id); + + /** + * Gets a page of dataset data file records. + * + * @param pageReqVO paging query + * @return the page of dataset data file records + */ + PageResult getDatasetFilesPage(PlatformDatasetFilesPageReqVO pageReqVO); + + /** Returns the records selected by a caller-built MyBatis-Plus query wrapper. */ List selectList(LambdaQueryWrapper query); +} \ No newline at end of file diff --git 
a/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/service/dataset/PlatformDatasetQuestionService.java b/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/service/dataset/PlatformDatasetQuestionService.java new file mode 100644 index 000000000..fe64c69ad --- /dev/null +++ b/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/service/dataset/PlatformDatasetQuestionService.java @@ -0,0 +1,71 @@ +package cn.iocoder.yudao.module.mdpf.service.dataset; + +import cn.iocoder.yudao.framework.common.pojo.PageResult; +import cn.iocoder.yudao.module.mdpf.controller.dataset.vo.PlatformDatasetQuestionPageReqVO; +import cn.iocoder.yudao.module.mdpf.controller.dataset.vo.PlatformDatasetQuestionRespVO; +import cn.iocoder.yudao.module.mdpf.controller.dataset.vo.PlatformDatasetQuestionSaveReqVO; +import cn.iocoder.yudao.module.mdpf.dal.dataobject.dataset.PlatformDatasetQuestionDO; +import com.baomidou.mybatisplus.core.conditions.query.LambdaQueryWrapper; + +import javax.validation.Valid; +import java.util.List; + +/** + * Service interface for dataset questions. + * + * @author Huada LLM + */ +public interface PlatformDatasetQuestionService { + + /** + * Creates a dataset question. + * + * @param createReqVO creation data + * @return the record ID + */ + Long createDatasetQuestion (@Valid PlatformDatasetQuestionSaveReqVO createReqVO); + + /** + * Updates a dataset question. + * + * @param updateReqVO update data + */ + void updateDatasetQuestion (@Valid PlatformDatasetQuestionSaveReqVO updateReqVO); + + /** + * Deletes a dataset question. + * + * @param id the record ID + */ + void deleteDatasetQuestion (Long id); + + /** + * Gets a dataset question. + * + * @param id the record ID + * @return the dataset question + */ + PlatformDatasetQuestionDO getDatasetQuestion (Long id); + + /** + * Gets a page of dataset questions. + * + * @param pageReqVO paging query + * @return the page of dataset questions + */ + PageResult getDatasetQuestionPage (PlatformDatasetQuestionPageReqVO pageReqVO); + + void updateDatasetQuestionDataAnno (List updateReqVOS); + + /** + * Gets the list of dataset questions. + * + * @param 
datasetId the dataset ID + * @return the list of dataset questions + */ + List getDatasetQuestionList (Long datasetId); + + Long getCountByDataSetId(Long datasetid); + + Long getCountByDatasetid(LambdaQueryWrapper query); +} diff --git a/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/service/dataset/impl/DataSetFileMiddleServiceImpl.java b/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/service/dataset/impl/DataSetFileMiddleServiceImpl.java new file mode 100644 index 000000000..207e48e67 --- /dev/null +++ b/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/service/dataset/impl/DataSetFileMiddleServiceImpl.java @@ -0,0 +1,96 @@ +package cn.iocoder.yudao.module.mdpf.service.dataset.impl; + +import cn.iocoder.module.mdpf.enums.ErrorCodeConstants; +import cn.iocoder.yudao.module.mdpf.controller.dataset.vo.DataSetFileMiddlePageReqVO; +import cn.iocoder.yudao.module.mdpf.controller.dataset.vo.DataSetFileMiddleSaveReqVO; +import cn.iocoder.yudao.module.mdpf.dal.dataobject.dataset.DataSetFileMiddleDO; +import cn.iocoder.yudao.module.mdpf.dal.mapper.dataset.DataSetFileMiddleMapper; +import cn.iocoder.yudao.module.mdpf.service.dataset.DataSetFileMiddleService; +import com.baomidou.dynamic.datasource.annotation.DS; +import org.springframework.stereotype.Service; +import org.springframework.validation.annotation.Validated; + +import cn.iocoder.yudao.framework.common.pojo.PageResult; +import cn.iocoder.yudao.framework.common.util.object.BeanUtils; + + +import javax.annotation.Resource; + +import static cn.iocoder.yudao.framework.common.exception.util.ServiceExceptionUtil.exception; + +/** + * Service implementation for the file locations that belong to a dataset; every operation delegates to DataSetFileMiddleMapper. + * + * @author admin + */ +@Service +@Validated +public class DataSetFileMiddleServiceImpl implements DataSetFileMiddleService { + + @Resource + private DataSetFileMiddleMapper setFileMiddleMapper; + + + + @Override + @DS("slave") + public Long createSetFileMiddle(DataSetFileMiddleSaveReqVO 
createReqVO) { + // Convert the request into a data object. NOTE(review): the insert on the next line is commented out, so this method persists nothing -- confirm that is intended. + DataSetFileMiddleDO setFileMiddle = BeanUtils.toBean(createReqVO, DataSetFileMiddleDO.class); +// setFileMiddleMapper.insert(setFileMiddle); + // Return the id carried by the converted object + return setFileMiddle.getId(); + } + + @Override + @DS("slave") + public Long createSetFileMiddle(DataSetFileMiddleDO createReqVO) { + // Insert + setFileMiddleMapper.insert(createReqVO); + // Return the id read back from the inserted object + return createReqVO.getId(); + } + + @Override + public void updateSetFileMiddle(DataSetFileMiddleSaveReqVO updateReqVO) { + // Verify the record exists + validateSetFileMiddleExists(updateReqVO.getId()); + // Update + DataSetFileMiddleDO updateObj = BeanUtils.toBean(updateReqVO, DataSetFileMiddleDO.class); + setFileMiddleMapper.updateById(updateObj); + } + + @Override + @DS("slave") + public void updateDataSetFileMiddle(DataSetFileMiddleSaveReqVO updateReqVO) { + // Existence check: NOTE(review) none is performed in this variant, unlike updateSetFileMiddle -- confirm that is intended + // Update + DataSetFileMiddleDO updateObj = BeanUtils.toBean(updateReqVO, DataSetFileMiddleDO.class); + setFileMiddleMapper.updateById(updateObj); + } + + @Override + public void deleteSetFileMiddle(Long id) { + // Verify the record exists + validateSetFileMiddleExists(id); + // Delete + setFileMiddleMapper.deleteById(id); + } + + private void validateSetFileMiddleExists(Long id) { + if (setFileMiddleMapper.selectById(id) == null) { + throw exception(ErrorCodeConstants.SET_FILE_MIDDLE_NOT_EXISTS); + } + } + + @Override + public DataSetFileMiddleDO getSetFileMiddle(Long id) { + return setFileMiddleMapper.selectById(id); + } + + @Override + public PageResult getSetFileMiddlePage(DataSetFileMiddlePageReqVO pageReqVO) { + return setFileMiddleMapper.selectPage(pageReqVO); + } + +} \ No newline at end of file diff --git a/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/service/dataset/impl/DataSetMiddleServiceImpl.java b/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/service/dataset/impl/DataSetMiddleServiceImpl.java new file mode 100644 index 000000000..9249d83e9 --- /dev/null +++ 
b/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/service/dataset/impl/DataSetMiddleServiceImpl.java @@ -0,0 +1,440 @@ +package cn.iocoder.yudao.module.mdpf.service.dataset.impl; + +import cn.hutool.core.util.IdUtil; +import cn.iocoder.module.mdpf.enums.DatasetStatusMiddleEnum; +import cn.iocoder.module.mdpf.enums.ErrorCodeConstants; +import cn.iocoder.yudao.framework.common.exception.ErrorCode; +import cn.iocoder.yudao.framework.common.exception.ServiceException; +import cn.iocoder.yudao.module.infra.service.file.FileService; +import cn.iocoder.yudao.module.mdpf.controller.dataset.vo.*; +import cn.iocoder.yudao.module.mdpf.dal.dataobject.dataset.DataSetFileMiddleDO; +import cn.iocoder.yudao.module.mdpf.dal.dataobject.dataset.DataSetMiddleDO; +import cn.iocoder.yudao.module.mdpf.dal.dataobject.dataset.PlatformDatasetFilesDO; +import cn.iocoder.yudao.module.mdpf.dal.dataobject.dataset.PlatformDatasetQuestionDO; +import cn.iocoder.yudao.module.mdpf.dal.mapper.dataset.DataSetFileMiddleMapper; +import cn.iocoder.yudao.module.mdpf.dal.mapper.dataset.DataSetMiddleMapper; +import cn.iocoder.yudao.module.mdpf.dal.mongorepository.DataSetMiddleRepository; +import cn.iocoder.yudao.module.mdpf.dal.mongo.DataSetMiddleMongoDO; +import cn.iocoder.yudao.module.mdpf.factory.datset.FileParserStrategyFactory; +import cn.iocoder.yudao.module.mdpf.factory.datset.IFileParserStrategy; +import cn.iocoder.yudao.module.mdpf.service.dataset.DataSetFileMiddleService; +import cn.iocoder.yudao.module.mdpf.service.dataset.DataSetMiddleService; +import cn.iocoder.yudao.module.mdpf.service.dataset.PlatformDatasetFilesService; +import cn.iocoder.yudao.module.mdpf.service.dataset.PlatformDatasetQuestionService; +import cn.iocoder.yudao.module.mdpf.util.HttpURLConnectionUtil; +import cn.iocoder.yudao.module.mdpf.util.TextProcessor; +import com.baomidou.dynamic.datasource.annotation.DS; +import com.baomidou.mybatisplus.core.conditions.query.LambdaQueryWrapper; 
+import com.baomidou.mybatisplus.core.toolkit.CollectionUtils; +import com.baomidou.mybatisplus.core.toolkit.Wrappers; +import com.fasterxml.jackson.databind.ObjectMapper; +import lombok.extern.slf4j.Slf4j; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.context.annotation.Lazy; +import org.springframework.scheduling.annotation.Async; +import org.springframework.stereotype.Service; +import org.springframework.validation.annotation.Validated; + +import cn.iocoder.yudao.framework.common.pojo.PageResult; +import cn.iocoder.yudao.framework.common.util.object.BeanUtils; + + +import javax.annotation.Resource; + +import java.io.IOException; +import java.io.InputStream; +import java.net.HttpURLConnection; +import java.time.LocalDateTime; +import java.util.*; + +import static cn.iocoder.yudao.framework.common.exception.util.ServiceExceptionUtil.exception; + +/** + * 中台中的数据集 Service 实现类 + * + * @author 管理员 + */ +@Service +@Validated +@Slf4j +public class DataSetMiddleServiceImpl implements DataSetMiddleService { + + @Resource + private DataSetMiddleMapper setMiddleMapper; + + @Autowired + private DataSetMiddleRepository dataSetMiddleRepository; + + @Autowired + private FileParserStrategyFactory fileParserStrategyFactory; + + @Autowired + private ObjectMapper objectMapper; + @Autowired + private TextProcessor textProcessor; + + @Autowired + private DataSetFileMiddleMapper dataSetFileMiddleMapper; + + @Autowired + private FileService fileService; + + @Autowired + private DataSetFileMiddleService dataSetFileMiddleService; + + @Autowired + @Lazy + private DataSetMiddleService dataSetMiddleService; + + @Autowired + private PlatformDatasetFilesService platformDatasetFilesService; + + @Autowired + private PlatformDatasetQuestionService platformDatasetQuestionService; + + + + @Override + @DS("slave") + public Long createDataSetMiddle(DataSetMiddleDO createReqVO){ + setMiddleMapper.insert(createReqVO); + return createReqVO.getId(); + } + + 
@Override + public Long createSetMiddle(DataSetMiddleSaveReqVO createReqVO) { + // 插入 + DataSetMiddleDO setMiddle = BeanUtils.toBean(createReqVO, DataSetMiddleDO.class); + DataSetMiddleMongoDO dataSetMiddleMongoDO = BeanUtils.toBean(createReqVO, DataSetMiddleMongoDO.class); + long mongoid = IdUtil.getSnowflake(1, 1).nextId(); + dataSetMiddleMongoDO.setId(mongoid); + //将原始数据入库到Mongodb + DataSetMiddleMongoDO DataSetMiddleMongoDO = dataSetMiddleRepository.insert(dataSetMiddleMongoDO); + setMiddle.setMongoId(dataSetMiddleMongoDO.getId()); + setMiddle.setCleanStatus(0); + setMiddle.setMarkStatus(0); + //将表单数据入库 + dataSetMiddleService.createDataSetMiddle(setMiddle); + createReqVO.setId(setMiddle.getId()); + dataProcessClean(createReqVO); + // 返回 + return setMiddle.getId(); + } + + @Async + public void dataProcessClean(DataSetMiddleSaveReqVO createReqVO){ + List> filesList = createReqVO.getFilesList(); + // + String filetypes[]=new String[]{"docx","doc","txt","json","xls","xlsx","pdf"}; + String ziptypes []=new String[]{"zip"}; + String imagetypes []=new String[]{"png","jpg","jpeg","gif"}; + String mediaTypes []=new String[]{"mp3","mp4","rmvb"}; + filesList.stream().forEach(filemap ->{ + //获取文件名 + String filename = filemap.get("filename"); + Long fileid = Long.parseLong(filemap.get("id")); + //获取文件名中最后一个.对应的索引值 + int index = filename.lastIndexOf("."); + //获取文件扩展名 + String extendFilename=filename.substring(index+1,filename.length()); + //获取文件的url + String url=filemap.get("url"); + //判断是否时文档文件 + HttpURLConnection connection = HttpURLConnectionUtil.readFile(url); + List> questionAnswerList=new ArrayList<>(); + List allCleanedFileRecords = new ArrayList<>(); + //将文件信息入库到platform_dataset_files表中 + PlatformDatasetFilesDO platformDatasetFilesDO=new PlatformDatasetFilesDO(); + platformDatasetFilesDO.setDatasetFile(fileid); + platformDatasetFilesDO.setDatasetFileUrl(url); + platformDatasetFilesDO.setDatasetId(createReqVO.getId()); + 
platformDatasetFilesDO.setCreateTime(LocalDateTime.now()); + platformDatasetFilesDO.setDatasetFileName(filename); + platformDatasetFilesService.createDatasetFiles(platformDatasetFilesDO); + if (connection != null) { + try { + InputStream inputStream = connection.getInputStream(); +// JsonNode rootNode = objectMapper.readTree(inputStream); + IFileParserStrategy parserStrategy = fileParserStrategyFactory.getStrategy(extendFilename); + List> extractedSegmentsForOneFile=parserStrategy.parseFileContentToString(null,inputStream,"","",null,new HashMap<>(),platformDatasetFilesDO); + Map additionalMetadataForParser = new HashMap<>(); + additionalMetadataForParser.put("datasetId", createReqVO.getId()); // MySQL 数据集 ID + additionalMetadataForParser.put("sourceFileId", fileid); // MongoDB 数据集元数据 ID +// additionalMetadataForParser.put("dataSetFileUrl", url); // 实际下载 URL + additionalMetadataForParser.put("dataSetFileType", 0); + additionalMetadataForParser.put("datasetFileName", filename); + additionalMetadataForParser.put("sourceFileName", filename); // 原始文件名 + additionalMetadataForParser.put("originalMinioPath", url); // 原始 MinIO 路径信息 + if (extractedSegmentsForOneFile != null && !extractedSegmentsForOneFile.isEmpty()) { + log.info("Successfully received {} segments from parser for file '{}'. Proceeding to clean.", extractedSegmentsForOneFile.size(), filename); + // **对当前文件解析出的原始文本片段进行深度清洗和质量评估** + // TextProcessor 现在返回 List + List cleanedFileRecordsForOneFile = textProcessor.cleanAndEvaluateList( + extractedSegmentsForOneFile, + createReqVO.getId(), + fileid + ); + if (!cleanedFileRecordsForOneFile.isEmpty()) { + allCleanedFileRecords.addAll(cleanedFileRecordsForOneFile); // 聚合清洗后的 DataSetFileMiddleDO + log.info("Successfully cleaned {} high-quality records from file '{}'.", cleanedFileRecordsForOneFile.size(), filename); + } else { + log.warn("No high-quality cleaned records found for file '{}' after cleaning. 
(Dataset ID: {})", filename, fileid); + } + DataSetFileMiddleDO dataSetFileMiddleDO=parserStrategy.createFileToMiIO(cleanedFileRecordsForOneFile,createReqVO,fileid,filename,extendFilename,url); + dataSetFileMiddleService.createSetFileMiddle(dataSetFileMiddleDO); + + } else { + log.warn("Parser returned no segments for file '{}' (URL: {}) in DataSetMiddleMongoDO ID {}. Skipping cleaning for this file.", + filename, url, fileid); + } + } catch (IOException e) { + throw new RuntimeException(e); + } + } + }); + + Long count = platformDatasetQuestionService.getCountByDatasetid(new LambdaQueryWrapper() + .eq(PlatformDatasetQuestionDO::getDatasetId, createReqVO.getId())); + + if (count <= 0) { + throw new ServiceException(new ErrorCode(20000, "数据集问题不能为空")); + } + + createReqVO.setDataLength(count); + Long annoCount = platformDatasetQuestionService.getCountByDatasetid(new LambdaQueryWrapper() + .eq(PlatformDatasetQuestionDO::getDatasetId, createReqVO.getId()) + .eq(PlatformDatasetQuestionDO::getStatus, 2)); + double ratio = count == 0 ? 0 : ((double) annoCount / count) * 100; + Integer formattedRatio = ratio == 0 ? 0 : (int) ratio; + Integer status = formattedRatio == 100 ? 
2 : 1; + if (formattedRatio != null) { + createReqVO.setAnnotateProgress(formattedRatio); + } + if (annoCount == 0) { + status = 0; + } + if (CollectionUtils.isEmpty(filesList)) { + throw new ServiceException(new ErrorCode( + 20000, "数据集文件不能为空")); + } + if (createReqVO.getDatasetType() == 2) { + if (status != 2) { + throw new ServiceException(new ErrorCode( + 20000, "评估数据集只能上传标注完成的数据")); + } + } else { + if (createReqVO.getMarkStatus() != status) { + throw new ServiceException(new ErrorCode( + 20000, "数据集标注状态错误!应该是【" + DatasetStatusMiddleEnum.getStatusByName(status) + "】")); + } + } + //所文件都处理完成以后将数据集中的clean_status字段设置为1 + createReqVO.setCleanStatus(1); + dataSetMiddleService.updataDataSetMiddle(createReqVO); + } + + public static void main(String[] args) { + String a="adsfdsa.234.erterter.pdf"; + int index = a.lastIndexOf("."); + String substring = a.substring(index + 1, a.length()); + System.out.println(substring); + } + + public void insertSetMiddle(){ + + } + + + @Override + @DS("slave") + public void updataDataSetMiddle(DataSetMiddleSaveReqVO updateReqVO){ + DataSetMiddleDO updateObj = BeanUtils.toBean(updateReqVO, DataSetMiddleDO.class); + setMiddleMapper.updateById(updateObj); + } + + @Override + @DS("slave") + public void updateSetMiddle(DataSetMiddleSaveReqVO updateReqVO) { + // 校验存在 + validateSetMiddleExists(updateReqVO.getId()); + // 更新 + DataSetMiddleDO updateObj = BeanUtils.toBean(updateReqVO, DataSetMiddleDO.class); + setMiddleMapper.updateById(updateObj); + } + + @Override + public void deleteSetMiddle(Long id) { + // 校验存在 + validateSetMiddleExists(id); + // 删除 + setMiddleMapper.deleteById(id); + } + + private void validateSetMiddleExists(Long id) { + if (setMiddleMapper.selectById(id) == null) { + throw exception(ErrorCodeConstants.SET_FILE_MIDDLE_NOT_EXISTS); + } + } + + @Override + @DS("slave") + public DataSetMiddleDO getSetMiddle(Long id) { + return setMiddleMapper.selectById(id); + } + + @Override + @DS("slave") + public DataSetMiddleRespVO 
getOneInfo(Long id) { + DataSetMiddleDO datasetDO = dataSetMiddleService.getOne(id); + DataSetMiddleRespVO datasetRespVO = BeanUtils.toBean(datasetDO, DataSetMiddleRespVO.class); + Integer datasetParentType = datasetDO.getDatasetParentType(); + List datasetFilesDOS = platformDatasetFilesService.selectList(new LambdaQueryWrapper().eq(PlatformDatasetFilesDO::getDatasetId, id)); + datasetRespVO.setDatasetFiles(BeanUtils.toBean(datasetFilesDOS, PlatformDatasetFilesRespVO.class)); + + /*List datasetQuestionDO = datasetQuestionMapper.selectList(new LambdaQueryWrapper().eq(DatasetQuestionDO::getDatasetId, id)); + List datasetQuestionRespVOS = BeanUtils.toBean(datasetQuestionDO, DatasetQuestionRespVO.class); + datasetRespVO.setDatasetQuestionRespVOS(datasetQuestionRespVOS);*/ + return datasetRespVO; + } + + @Override + @DS("slave") + public PageResult getSetMiddlePage(DataSetMiddlePageReqVO pageReqVO) { + return setMiddleMapper.selectPage(pageReqVO); + } + + @Override + @DS("slave") + public DataSetMiddleDO getOne(Long datasetid){ + return setMiddleMapper.selectById(datasetid); + } + + @Override + @DS("slave") + public List getDataSetMiddleList(Integer datasetParentType){ + List resultlist = setMiddleMapper.selectList(Wrappers.lambdaQuery() +// .eq(DataSetMiddleDO::getDatasetParentType, datasetParentType) + .eq(DataSetMiddleDO::getMarkStatus, 2) + .eq(DataSetMiddleDO::getDeleted,0) + ); + return resultlist; + } + + @Override + @DS("slave") + public void updateProcess(Integer formattedRatio, Long datasetId, Integer status){ + setMiddleMapper.updateProcess(formattedRatio,datasetId,status); + } + + +// @Override +// public String getDataSetUrl(Long datasetId,String hostUrl){ +// DataSetMiddleDO dataset = dataSetMiddleService.getOne(datasetId); +// if (dataset == null) { +// log.error("未找到数据集信息,数据集ID: {}", datasetId); +// throw new RuntimeException("数据集信息不存在"); +// } +// log.debug("数据集信息查询成功。数据集名称: {}", dataset.getDatasetName()); +// +// // 查询数据集问题列表 +// 
log.debug("正在查询数据集问题列表,数据集ID: {}", dataset.getId()); +// List datasetQuestionList = platformDatasetQuestionService.getDatasetQuestionList(dataset.getId()); +// log.debug("数据集问题列表查询成功。问题数量: {}", datasetQuestionList.size()); +// +// // 将数据集信息转换为 DO 对象 +// log.debug("正在转换数据集信息为 DO 对象..."); +// +// // 生成 JSON 文件并获取文件 URL +// log.debug("正在生成 JSON 文件并获取文件 URL..."); +// String fileUrl = JsonFileWriteFine(hostUrl, dataset, datasetQuestionList); +// return fileUrl; +// } +// public String JsonFileWriteFine (String hostUrl, DataSetMiddleDO datasetDO, List datasetQuestionList) { +// try { +// log.info("开始生成 JSON 文件并上传,数据集ID: {}", datasetDO.getId()); +// +// // 构建 AigcDatasetVo 列表 +// log.debug("正在构建 AigcDatasetVo 列表..."); +// List aigcDatasetVoList = new ArrayList<>(); +// for (PlatformDatasetQuestionRespVO dataSource : datasetQuestionList) { +// AigcDatasetMiddleVo aigcDatasetVo = new AigcDatasetMiddleVo(); +// aigcDatasetVo.setInstruction(StringUtils.isNotBlank(dataSource.getSystem()) ? dataSource.getSystem() : ""); +// aigcDatasetVo.setInput(StringUtils.isNotBlank(dataSource.getQuestion()) ? dataSource.getQuestion() : ""); +// +// // 检查答案列表是否为空 +// if (!CollectionUtils.isAnyEmpty(dataSource.getDatasetAnswerRespVO())) { +// aigcDatasetVo.setOutput(StringUtils.isNotBlank(dataSource.getDatasetAnswerRespVO().get(0).getAnswer()) ? 
+// dataSource.getDatasetAnswerRespVO().get(0).getAnswer() : ""); +// } else { +// aigcDatasetVo.setOutput(""); +// } +// aigcDatasetVoList.add(aigcDatasetVo); +// } +// log.debug("AigcDatasetVo 列表构建完成。记录数量: {}", aigcDatasetVoList.size()); +// +// // 将 AigcDatasetVo 列表转换为 JSON 字符串 +// log.debug("正在将 AigcDatasetVo 列表转换为 JSON 字符串..."); +// ObjectMapper mapper = new ObjectMapper(); +// StringBuilder sb = new StringBuilder(); +// for (AigcDatasetMiddleVo aigcDatasetVo : aigcDatasetVoList) { +// String json = mapper.writeValueAsString(aigcDatasetVo); +// sb.append(json).append("\n"); +// } +// +// // 将 JSON 字符串转换为输入流 +// log.debug("正在将 JSON 字符串转换为输入流..."); +// InputStream inputStream = new ByteArrayInputStream(sb.toString().getBytes()); +// +// // 上传文件 +// log.info("正在上传 JSON 文件..."); +// String fileName = datasetDO.getDatasetName() + "new" + datasetDO.getId() + ".json"; +// AigcDatasetFileMiddleRespV0 aigcDatasetFileRespV0 = trainHttpService.AigcUploadFile(new HashMap<>(), hostUrl, inputStream, fileName); +// +// if (aigcDatasetFileRespV0 != null) { +// log.debug("文件上传成功。文件ID: {}", aigcDatasetFileRespV0.getFileId()); +// +// // 更新数据集的 Job ID +// log.debug("正在更新数据集的 Job ID..."); +// datasetMapper.setJobid(datasetDO.getId(), aigcDatasetFileRespV0.getFileId()); +// +// log.info("hostUrl:{}", hostUrl); +// // 更新数据集的 URL +// String s3Url = aigcDatasetFileRespV0.getS3Url(); +// log.info("s3Url:{}", s3Url); +// +// // int lastIndex = s3Url.lastIndexOf("/storage"); +// // String url = s3Url.substring(lastIndex + 1); +// // log.info("url:{}", url); +// // 找到 "/uploads" 的位置 +// int uploadsIndex = s3Url.indexOf("/uploads"); +// if (uploadsIndex == -1) { +// log.error("s3Url 中未找到 '/uploads' 路径"); +// return ""; +// } +// +// // 提取 "/uploads" 及之后的部分 +// String uploadsPath = s3Url.substring(uploadsIndex); +// log.info("uploadsPath: {}", uploadsPath); +// +// // 构建新的完整 URL +// String newUrl = hostUrl + uploadsPath; +// log.info("newUrl: {}", newUrl); +// 
datasetMapper.setUrl(datasetDO.getId(), newUrl); +// +// // 返回结果 +// String result = newUrl.substring(hostUrl.length()); +// log.info("JSON 文件生成并上传成功。返回结果: {}", result); +// +// return result; +// } else { +// log.error("文件上传失败。数据集ID: {}", datasetDO.getId()); +// return ""; +// } +// +// } catch (IOException e) { +// log.error("生成或上传 JSON 文件时发生异常。数据集ID: {}", datasetDO.getId(), e); +// return ""; +// } +// } + +} \ No newline at end of file diff --git a/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/service/dataset/impl/PlatformDatasetAnswerServiceImpl.java b/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/service/dataset/impl/PlatformDatasetAnswerServiceImpl.java new file mode 100644 index 000000000..515444f5c --- /dev/null +++ b/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/service/dataset/impl/PlatformDatasetAnswerServiceImpl.java @@ -0,0 +1,70 @@ +package cn.iocoder.yudao.module.mdpf.service.dataset.impl; + +import cn.iocoder.yudao.framework.common.pojo.PageResult; +import cn.iocoder.yudao.framework.common.util.object.BeanUtils; +import cn.iocoder.yudao.module.mdpf.controller.dataset.vo.PlatformDatasetAnswerPageReqVO; +import cn.iocoder.yudao.module.mdpf.controller.dataset.vo.PlatformDatasetAnswerSaveReqVO; +import cn.iocoder.yudao.module.mdpf.dal.dataobject.dataset.PlatformDatasetAnswerDO; +import cn.iocoder.yudao.module.mdpf.dal.mapper.dataset.PlatformDatasetAnswerMapper; +import cn.iocoder.yudao.module.mdpf.service.dataset.PlatformDatasetAnswerService; +import org.springframework.stereotype.Service; +import org.springframework.validation.annotation.Validated; + +import javax.annotation.Resource; + + +/** + * 数据集数据问题标注内容 Service 实现类 + * + * @author 华大大模型 + */ +@Service +@Validated +public class PlatformDatasetAnswerServiceImpl implements PlatformDatasetAnswerService { + + @Resource + private PlatformDatasetAnswerMapper platformDatasetAnswerMapper; + + 
@Override + public Long createDatasetAnswer(PlatformDatasetAnswerSaveReqVO createReqVO) { + // 插入 + PlatformDatasetAnswerDO datasetAnswer = BeanUtils.toBean(createReqVO, PlatformDatasetAnswerDO.class); + platformDatasetAnswerMapper.insert(datasetAnswer); + // 返回 + return datasetAnswer.getId(); + } + + @Override + public void updateDatasetAnswer(PlatformDatasetAnswerSaveReqVO updateReqVO) { + // 校验存在 + validateDatasetAnswerExists(updateReqVO.getId()); + // 更新 + PlatformDatasetAnswerDO updateObj = BeanUtils.toBean(updateReqVO, PlatformDatasetAnswerDO.class); + platformDatasetAnswerMapper.updateById(updateObj); + } + + @Override + public void deleteDatasetAnswer(Long id) { + // 校验存在 + validateDatasetAnswerExists(id); + // 删除 + platformDatasetAnswerMapper.deleteById(id); + } + + private void validateDatasetAnswerExists(Long id) { + if (platformDatasetAnswerMapper.selectById(id) == null) { + throw new RuntimeException("数据集回答不存在"); + } + } + + @Override + public PlatformDatasetAnswerDO getDatasetAnswer(Long id) { + return platformDatasetAnswerMapper.selectById(id); + } + + @Override + public PageResult getDatasetAnswerPage(PlatformDatasetAnswerPageReqVO pageReqVO) { + return platformDatasetAnswerMapper.selectPage(pageReqVO); + } + +} \ No newline at end of file diff --git a/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/service/dataset/impl/PlatformDatasetFilesServiceImpl.java b/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/service/dataset/impl/PlatformDatasetFilesServiceImpl.java new file mode 100644 index 000000000..0f860ee29 --- /dev/null +++ b/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/service/dataset/impl/PlatformDatasetFilesServiceImpl.java @@ -0,0 +1,83 @@ +package cn.iocoder.yudao.module.mdpf.service.dataset.impl; + +import cn.iocoder.yudao.framework.common.pojo.PageResult; +import cn.iocoder.yudao.framework.common.util.object.BeanUtils; +import 
cn.iocoder.yudao.module.mdpf.controller.dataset.vo.PlatformDatasetFilesPageReqVO; +import cn.iocoder.yudao.module.mdpf.controller.dataset.vo.PlatformDatasetFilesSaveReqVO; +import cn.iocoder.yudao.module.mdpf.dal.dataobject.dataset.PlatformDatasetFilesDO; +import cn.iocoder.yudao.module.mdpf.dal.mapper.dataset.PlatformDatasetFilesMapper; +import cn.iocoder.yudao.module.mdpf.service.dataset.PlatformDatasetFilesService; +import com.baomidou.mybatisplus.core.conditions.query.LambdaQueryWrapper; +import org.springframework.stereotype.Service; +import org.springframework.validation.annotation.Validated; + +import javax.annotation.Resource; +import java.util.List; + + +/** + * 数据集数据文件 Service 实现类 + * + * @author 华大大模型 + */ +@Service +@Validated +public class PlatformDatasetFilesServiceImpl implements PlatformDatasetFilesService { + + @Resource + private PlatformDatasetFilesMapper platformDatasetFilesMapper; + @Override + public Long createDatasetFiles(PlatformDatasetFilesSaveReqVO createReqVO) { + // 插入 + PlatformDatasetFilesDO datasetFiles = BeanUtils.toBean(createReqVO, PlatformDatasetFilesDO.class); + platformDatasetFilesMapper.insert(datasetFiles); + // 返回 + return datasetFiles.getId(); + } + + @Override + public Long createDatasetFiles(PlatformDatasetFilesDO platformDatasetFilesDO) { + platformDatasetFilesMapper.insert(platformDatasetFilesDO); + + return platformDatasetFilesDO.getId(); + } + + @Override + public void updateDatasetFiles(PlatformDatasetFilesSaveReqVO updateReqVO) { + // 校验存在 + validateDatasetFilesExists(updateReqVO.getId()); + // 更新 + PlatformDatasetFilesDO updateObj = BeanUtils.toBean(updateReqVO, PlatformDatasetFilesDO.class); + platformDatasetFilesMapper.updateById(updateObj); + } + + @Override + public void deleteDatasetFiles(Long id) { + // 校验存在 + validateDatasetFilesExists(id); + // 删除 + platformDatasetFilesMapper.deleteById(id); + } + + private void validateDatasetFilesExists(Long id) { + if (platformDatasetFilesMapper.selectById(id) == null) { 
+ throw new RuntimeException("数据集文件不存在"); + } + } + + @Override + public PlatformDatasetFilesDO getDatasetFiles(Long id) { + return platformDatasetFilesMapper.selectById(id); + } + + @Override + public PageResult getDatasetFilesPage(PlatformDatasetFilesPageReqVO pageReqVO) { + return platformDatasetFilesMapper.selectPage(pageReqVO); + } + + @Override + public List selectList(LambdaQueryWrapper query){ + return platformDatasetFilesMapper.selectList(query); + } + +} \ No newline at end of file diff --git a/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/service/dataset/impl/PlatformDatasetQuestionServiceImpl.java b/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/service/dataset/impl/PlatformDatasetQuestionServiceImpl.java new file mode 100644 index 000000000..4d01b63e8 --- /dev/null +++ b/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/service/dataset/impl/PlatformDatasetQuestionServiceImpl.java @@ -0,0 +1,246 @@ +package cn.iocoder.yudao.module.mdpf.service.dataset.impl; + +import cn.iocoder.yudao.framework.common.pojo.PageResult; +import cn.iocoder.yudao.framework.common.util.object.BeanUtils; +import cn.iocoder.yudao.module.mdpf.controller.dataset.vo.*; +import cn.iocoder.yudao.module.mdpf.dal.dataobject.dataset.DataSetMiddleDO; +import cn.iocoder.yudao.module.mdpf.dal.dataobject.dataset.PlatformDatasetAnswerDO; +import cn.iocoder.yudao.module.mdpf.dal.dataobject.dataset.PlatformDatasetQuestionDO; +import cn.iocoder.yudao.module.mdpf.dal.mapper.dataset.DataSetMiddleMapper; +import cn.iocoder.yudao.module.mdpf.dal.mapper.dataset.PlatformDatasetAnswerMapper; +import cn.iocoder.yudao.module.mdpf.dal.mapper.dataset.PlatformDatasetQuestionMapper; +import cn.iocoder.yudao.module.mdpf.service.dataset.DataSetMiddleService; +import cn.iocoder.yudao.module.mdpf.service.dataset.PlatformDatasetQuestionService; +import 
com.baomidou.mybatisplus.core.conditions.query.LambdaQueryWrapper; +import com.baomidou.mybatisplus.core.toolkit.CollectionUtils; +import com.baomidou.mybatisplus.core.toolkit.Wrappers; +import jodd.util.StringUtil; +import org.springframework.context.annotation.Lazy; +import org.springframework.stereotype.Service; +import org.springframework.validation.annotation.Validated; + +import javax.annotation.Resource; +import java.util.*; +import java.util.stream.Collectors; + + +/** + * 数据集数据问题 Service 实现类 + * + * @author 华大大模型 + */ +@Service +@Validated +public class PlatformDatasetQuestionServiceImpl implements PlatformDatasetQuestionService { + + @Resource + private PlatformDatasetQuestionMapper platformDatasetQuestionMapper; + @Resource + private PlatformDatasetAnswerMapper platformDatasetAnswerMapper; + @Resource + private DataSetMiddleMapper dataSetMiddleMapper; + + @Resource + @Lazy + private DataSetMiddleService dataSetMiddleService; + +// @Resource +// private DatasetQuestionAnswerImageMapper datasetQuestionAnswerImageMapper; + + @Override + public Long createDatasetQuestion(PlatformDatasetQuestionSaveReqVO createReqVO) { + // 插入 + PlatformDatasetQuestionDO datasetQuestion = BeanUtils.toBean(createReqVO, PlatformDatasetQuestionDO.class); + platformDatasetQuestionMapper.insert(datasetQuestion); + // 返回 + return datasetQuestion.getId(); + } + + @Override + public void updateDatasetQuestion(PlatformDatasetQuestionSaveReqVO updateReqVO) { + // 校验存在 + validateDatasetQuestionExists(updateReqVO.getId()); + // 更新 + PlatformDatasetQuestionDO updateObj = BeanUtils.toBean(updateReqVO, PlatformDatasetQuestionDO.class); + platformDatasetQuestionMapper.updateById(updateObj); + } + + @Override + public void deleteDatasetQuestion(Long id) { + // 校验存在 + validateDatasetQuestionExists(id); + // 删除 + platformDatasetQuestionMapper.deleteById(id); + } + + private void validateDatasetQuestionExists(Long id) { + if (platformDatasetQuestionMapper.selectById(id) == null) { + throw new 
RuntimeException("数据不存在"); + } + } + + @Override + public PlatformDatasetQuestionDO getDatasetQuestion(Long id) { + return platformDatasetQuestionMapper.selectById(id); + } + + @Override + public PageResult getDatasetQuestionPage(PlatformDatasetQuestionPageReqVO pageReqVO) { + PageResult datasetQuestionDOPageResult = platformDatasetQuestionMapper.selectPage(pageReqVO); + Long datasetId = pageReqVO.getDatasetId(); + DataSetMiddleDO datasetDO = dataSetMiddleService.getOne(datasetId); + Integer datasetParentType = datasetDO.getDatasetParentType(); + PageResult result = BeanUtils.toBean(datasetQuestionDOPageResult, PlatformDatasetQuestionRespVO.class); + if (CollectionUtils.isNotEmpty(result.getList())) { +// result.getList().forEach(item -> { +// List datasetAnswerDOS = datasetAnswerMapper.selectList(new LambdaQueryWrapper<>(DatasetAnswerDO.class) +// .eq(DatasetAnswerDO::getQuestionId, item.getId())); +// item.setDatasetAnswerRespVO(BeanUtils.toBean(datasetAnswerDOS, DatasetAnswerRespVO.class)); +// +// }); + // 优化代码 + List list = result.getList(); + // 获取答案 + Set collect = list.stream().map(PlatformDatasetQuestionRespVO::getId).collect(Collectors.toSet()); + LambdaQueryWrapper wrapper = new LambdaQueryWrapper() + .in(PlatformDatasetAnswerDO::getQuestionId, collect); + List datasetAnswerDOS = platformDatasetAnswerMapper.selectList(wrapper); + List respVOS = BeanUtils.toBean(datasetAnswerDOS, PlatformDatasetAnswerRespVO.class); + Map> collect1 = respVOS.stream().collect(Collectors.groupingBy(PlatformDatasetAnswerRespVO::getQuestionId)); + list.forEach(item -> { + item.setDatasetAnswerRespVO(collect1.get(item.getId())); +// if(datasetParentType==2){ +// LambdaQueryWrapper imagewrapper = new LambdaQueryWrapper() +// .eq(DatasetQuestionAnswerImageDO::getQuestionId, item.getId()) +// .eq(DatasetQuestionAnswerImageDO::getDatasetId,item.getDatasetId()); +// +// List datasetQuestionAnswerImageDOList = datasetQuestionAnswerImageMapper.selectList(imagewrapper); +// List 
imageUrlList = datasetQuestionAnswerImageDOList.stream().map(DatasetQuestionAnswerImageDO::getImageUrl).collect(Collectors.toList()); +// item.setImagesList(imageUrlList); +// } + }); + + } + return result; + } + + /** + * 获得 数据集数据问题 列表 + * + * @param datasetId 数据集ID + * @return 数据集数据问题 列表 + */ + @Override + public List getDatasetQuestionList(Long datasetId) { +// List datasetQuestionDOS = datasetQuestionMapper.selectList(new LambdaQueryWrapper<>(DatasetQuestionDO.class) +// .eq(DatasetQuestionDO::getDatasetId, datasetId)); +// List result = BeanUtils.toBean(datasetQuestionDOS, DatasetQuestionRespVO.class); + List result = platformDatasetQuestionMapper.getAListOfIssues(datasetId); + + if (CollectionUtils.isNotEmpty(result)) { + // 1. 获取待查询的 questionId 列表(假设 result 已分页,避免过大) + List collected = result.stream() + .map(PlatformDatasetQuestionRespVO::getId) + .collect(Collectors.toList()); + + // 2. 分批次查询答案(避免单次 IN 语句过长) + int batchSize = 1000; + List allAnswers = new ArrayList<>(); + for (int i = 0; i < collected.size(); i += batchSize) { + int end = Math.min(i + batchSize, collected.size()); + List batchIds = collected.subList(i, end); + allAnswers.addAll(platformDatasetAnswerMapper.getAnswersToYourQuestions(batchIds)); + } + + // 3. 构建 questionId 到答案列表的映射(加速匹配) + Map> answerMap = new HashMap<>(); + for (PlatformDatasetAnswerRespVO answer : allAnswers) { + answerMap.computeIfAbsent(answer.getQuestionId(), k -> new ArrayList<>()) + .add(answer); + } + +// 4. 
为每个 question 绑定答案(O(n) 效率) + result.forEach(item -> { + item.setDatasetAnswerRespVO(answerMap.getOrDefault(item.getId(), Collections.emptyList())); + }); +// result.forEach(item -> { +// List datasetAnswerDOS = datasetAnswerMapper.selectList(new LambdaQueryWrapper<>(DatasetAnswerDO.class) +// .eq(DatasetAnswerDO::getQuestionId, item.getId())); +// item.setDatasetAnswerRespVO(BeanUtils.toBean(datasetAnswerDOS, DatasetAnswerRespVO.class)); +// }); + } + return result; + } + + @Override + public void updateDatasetQuestionDataAnno(List updateReqVOS) { + List ids = new ArrayList<>(); + Long datasetId = null; + for (PlatformDatasetQuestionSaveReqVO updateReqVO : updateReqVOS) { + if (datasetId == null) { + datasetId = updateReqVO.getDatasetId(); + } + PlatformDatasetQuestionDO datasetQuestionDO = BeanUtils.toBean(updateReqVO, PlatformDatasetQuestionDO.class); + + List datasetAnswerSaveReqVO = updateReqVO.getDatasetAnswerRespVO(); + List datasetAnswerDOS = BeanUtils.toBean(datasetAnswerSaveReqVO, PlatformDatasetAnswerDO.class); + if (CollectionUtils.isNotEmpty(datasetAnswerDOS)) { + for (PlatformDatasetAnswerDO datasetAnswerDO : datasetAnswerDOS) { + if (StringUtil.isNotBlank(datasetAnswerDO.getAnswer())) { + datasetQuestionDO.setStatus(2); + } + if (datasetAnswerDO.getId() == null) { + platformDatasetAnswerMapper.insert(datasetAnswerDO); + ids.add(datasetAnswerDO.getId()); + } else { + ids.add(datasetAnswerDO.getId()); + platformDatasetAnswerMapper.updateById(datasetAnswerDO); + } + } +// datasetAnswerMapper.insertOrUpdate(datasetAnswerDOS); + } + platformDatasetQuestionMapper.updateById(datasetQuestionDO); + } + List collect1 = updateReqVOS.stream().map(PlatformDatasetQuestionSaveReqVO::getId).collect(Collectors.toList()); + LambdaQueryWrapper queryWrapper = new LambdaQueryWrapper<>(); + queryWrapper.in(collect1 != null, PlatformDatasetAnswerDO::getQuestionId, collect1); + List datasetAnswerDOS = platformDatasetAnswerMapper.selectList(queryWrapper); + List collect = 
datasetAnswerDOS.stream().map(PlatformDatasetAnswerDO::getId).collect(Collectors.toList()); + List diff1 = new ArrayList<>(); + if (CollectionUtils.isNotEmpty(collect)) { + HashSet set1 = new HashSet<>(ids); + HashSet set2 = new HashSet<>(collect); + // 获取 set2 中有但 set1 中没有的元素 + set2.removeAll(set1); + diff1 = new ArrayList<>(set2); + } + if (CollectionUtils.isNotEmpty(diff1)) { + platformDatasetAnswerMapper.deleteBatchIds(diff1); + } + // 标注进度修改 + LambdaQueryWrapper wrapper = new LambdaQueryWrapper() + .eq(PlatformDatasetQuestionDO::getDatasetId, updateReqVOS.get(0).getDatasetId()); + Long sumCount = platformDatasetQuestionMapper.selectCount(wrapper); + wrapper.eq(PlatformDatasetQuestionDO::getStatus, 2); + Long annoCount = platformDatasetQuestionMapper.selectCount(wrapper); + double ratio = sumCount == 0 ? 0 : ((double) annoCount / sumCount) * 100; + Integer formattedRatio = ratio == 0 ? 0 : (int) ratio; + Integer status = formattedRatio == 100 ? 2 : 1; + dataSetMiddleService.updateProcess(formattedRatio, updateReqVOS.get(0).getDatasetId(), status); + } + + @Override + public Long getCountByDataSetId(Long datasetid){ + Long count = platformDatasetQuestionMapper.selectCount(Wrappers.lambdaQuery() + .eq(PlatformDatasetQuestionDO::getDatasetId, datasetid) + ); + return count; + } + + @Override + public Long getCountByDatasetid(LambdaQueryWrapper query){ + Long count = platformDatasetQuestionMapper.selectCount(query); + return count; + } + +} diff --git a/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/util/DataProcessPlatformUtil.java b/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/util/DataProcessPlatformUtil.java new file mode 100644 index 000000000..465ab2e85 --- /dev/null +++ b/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/util/DataProcessPlatformUtil.java @@ -0,0 +1,799 @@ +package cn.iocoder.yudao.module.mdpf.util; + + +import 
com.github.houbb.opencc4j.util.ZhConverterUtil; +import com.github.houbb.sensitive.word.core.SensitiveWordHelper; +import lombok.extern.slf4j.Slf4j; + +import java.nio.charset.StandardCharsets; +import java.text.DecimalFormat; +import java.time.Year; +import java.time.format.DateTimeFormatter; +import java.util.*; +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import java.util.stream.Collectors; + + @Slf4j + public class DataProcessPlatformUtil { + + /* + * --------------------------------------------------------------- + * 🔖 【 异常清洗配置 】 + * --------------------------------------------------------------- + */ + + /** + * 移除不可见字 + * 移除ASCII中的一些不可见字符, 如0-32 和127-160这两个范围 + * + * @param input + * @return + */ + public static String removeNonVisibleAsciiChars (String input) { + // 使用StringBuilder来构建正则表达式,因为我们需要动态地添加字符范围 + StringBuilder regex = new StringBuilder(); + regex.append("[\\x00-\\x1F]"); // 0-31范围的字符 + regex.append("|"); // OR 操作符 + regex.append("[\\x7F-\\xA0]"); // 127-160范围的字符 + + // 使用replaceAll方法和构建的正则表达式来移除不可见字符 + return input.replaceAll(regex.toString(), ""); + } + + /** + * 移除不可见字符 + *

+ * 将不同的unicode空格比如  u2008,转成正常的空格 + * + * @param input + * @return + */ + public static String convertUnicodeSpacesToNormalSpaces (String input) { + // Unicode空格字符的正则表达式,包括但不限于u2008等 + String unicodeSpacesRegex = "[\\u0020\\u00A0\\u1680\\u2000-\\u200A\\u2028\\u2029\\u202F\\u205F\\u3000]"; + + // 使用正则表达式替换匹配的Unicode空格字符为普通空格 + return input.replaceAll(unicodeSpacesRegex, " "); + } + + /** + * 移除不可见字符 + *

+ * 去除乱码和无意义的unicode + * + * @param input + * @return + */ + public static String removeNonPrintableUnicodeChars (String input) { + // 构建一个正则表达式,匹配所有非打印ASCII和非打印Unicode字符 + // \p{C} 匹配所有控制字符和格式字符 + // \p{Zs} 匹配所有空白分隔符(比如U+2000到U+200F之间的字符) + // 注意:有些空白字符可能是有意义的,比如空格(U+0020),所以这里的选择要谨慎 + // 如果你确定某些空白字符是无意义的,可以将其添加到正则表达式中 + String regex = "[\\p{C}\\p{Zs}&&[^\\s]]+|\\u0000"; // \\u0000 是NULL字符,通常是无意义的 + + // 使用replaceAll方法移除匹配的字符 + // 注意:这里使用了两个替换步骤,因为直接替换可能会导致正则表达式匹配问题 + // 首先替换掉所有匹配的字符为一个占位符(比如"*"),然后再替换掉占位符为空字符串 + // 这样做是为了避免在替换过程中正则表达式匹配到已经被替换掉的部分 + // 但在这种情况下,由于我们使用的是字符类匹配,其实直接替换为空字符串也是可以的 + // 下面的代码为了演示这种可能的复杂性而保留了两步替换的逻辑 + String intermediate = input.replaceAll(regex, "*"); // 这一步其实是多余的,但为了说明而保留 + return intermediate.replaceAll("[*]+", ""); // 这一步实际上完成了去除非打印字符的任务 + + // 简化版:直接替换为空字符串 + // return input.replaceAll(regex, ""); + } + + /** + * 繁体转简体 + *

+ * 繁体转简体,如“不經意,妳的笑容”清洗成“不经意,你的笑容” + * + * @param input + * @return + */ + public static String traditionalToSimplified (String input) { + return ZhConverterUtil.toSimple(input); + } + + // 使用正则表达式匹配HTML标签 + private static final String HTML_TAG_REGEX = "<[^>]+>"; + + /** + * 去除网页标识符 + *

+ * 移除文档中的html标签,如,

等 + * + * @param input + * @return + */ + public static String removeHtmlTags (String input) { + if (input == null || input.isEmpty()) { + return input; + } + // 使用replaceAll方法替换匹配的HTML标签为空字符串 + return input.replaceAll(HTML_TAG_REGEX, ""); + } + + // 这是一个简化的正则表达式,用于匹配常见的emoji表情符号。 + // 请注意,它可能不会涵盖所有可能的emoji,因为Unicode标准在不断发展。 + private static final String EMOJI_REGEX = "[\\uD83C-\\uD83D\\uD83E-\\uD83F\\u2600-\\u27FF" + + "\\u2B00-\\u2BFF\\u2F00-\\u2FFF\\u3000-\\u303F" + + "\\u3200-\\u32FF\\uA490-\\uA4CF\\uA900-\\uA97F" + + "\\uAC00-\\uAC7F\\uAC80-\\uACFF\\uD700-\\uD7AF" + + "\\uF900-\\uFAFF\\uFB00-\\uFB4F\\uFB50-\\uFDFF" + + "\\uFE00-\\uFE6F\\uFE70-\\uFEFF\\uFF00-\\uFFEF]"; + + /** + * 去除表情 + *

+ * 去除文档中的表情,如‘🐰’、‘👵’等 + * + * @param input + * @return + */ + public static String removeEmojis (String input) { + if (input == null || input.isEmpty()) { + return input; + } + Pattern pattern = Pattern.compile(EMOJI_REGEX, Pattern.UNICODE_CASE | Pattern.CASE_INSENSITIVE); + Matcher matcher = pattern.matcher(input); + return matcher.replaceAll(""); + } + + // 正则表达式用于匹配中文词汇(这里假设词汇由连续的中文字符组成) + private static final String CHINESE_WORD_REGEX = "[\\u4e00-\\u9fff]+"; + + // 方法:计算字符串中的中文字符数量 + // 注意:这里假设输入字符串只包含中文字符和可能的分隔符(如空格、标点符号等) + // 并且中文字符在UTF-16编码中占用两个char,但被视为一个逻辑字符 + private static int countChineseChars (String input) { + // 使用正则表达式匹配中文词汇,并计算匹配到的字符总数(这里需要除以2,因为每个中文字符占用两个char) + // 但为了简化,我们可以直接遍历字符,检查每个字符是否在中文范围内 + int count = 0; + for (char c : input.toCharArray()) { + if (Character.UnicodeBlock.of(c) == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS + || Character.UnicodeBlock.of(c) == Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS + || Character.UnicodeBlock.of(c) == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A + // 可以根据需要添加更多Unicode块 + ) { + count++; + } + } + return count; + } + + /* + * --------------------------------------------------------------- + * 🔖 【 过滤配置 】 + * --------------------------------------------------------------- + */ + + /** + * 检查文档的词数目 + * 词数目不在指定范围会被过滤掉,如中文[1,1000000] + * + * @param text + * @param minChars + * @param maxChars + * @return + */ + public static List filterWords (String text, int minChars, int maxChars) { + List result = new ArrayList<>(); + Pattern pattern = Pattern.compile(CHINESE_WORD_REGEX); + Matcher matcher = pattern.matcher(text); + + while (matcher.find()) { + String word = matcher.group(); + int chineseCharCount = countChineseChars(word); // 计算中文字符数量 + if (chineseCharCount >= minChars && chineseCharCount <= maxChars) { + result.add(word); + } + } + return result; + } + + /** + * 检查文档的字重复率 + *

+ * 如果字重复率太高,意味着文档中重复的字太多,文档会被过滤掉 + *

+ * + * @param content 文档行 + * @param threshold 设置字重复率的阈值,例如10% + * @return true表示字重复率低于阈值,false表示字重复率高于阈值,文档会被过滤掉 + */ + public static boolean calculateCharacterRepetitionRate(String content, double threshold) { + // 输入校验 + if (content == null || content.trim().isEmpty()) { + return false; + } + + // 预处理(去空格、标点等) + String processedContent = content + .replaceAll("\\s+", "") + .replaceAll("[\\pP\\pS]", ""); + + // 短文本不检查 + if (processedContent.length() < 5) { + return false; + } + + // 统计字符频率 + Map charCount = new HashMap<>(); + char[] chars = processedContent.toCharArray(); + for (char c : chars) { + if (isChineseCharacter(c)) { // 可选:仅统计中文 + charCount.put(c, charCount.getOrDefault(c, 0) + 1); + } + } + + // 计算重复率(方式1:传统重复率) + int totalChars = chars.length; + double repetitionRate = (double) (totalChars - charCount.size()) / totalChars; + + // 将重复率转换为百分比(0~100),以便与阈值直接比较 + double repetitionPercent = repetitionRate * 100; + + // 调试日志(输出百分比) + log.info("总字数: {}", totalChars); + log.info("重复字数: {}", totalChars - charCount.size()); + log.info("字重复率: {}%", String.format("%.2f", repetitionPercent)); + + // 比较前可添加浮点数容差(可选) + final double EPSILON = 0.0001; + return repetitionPercent - threshold > EPSILON; + } + + + // 判断是否为中文字符(可选) + private static boolean isChineseCharacter(char c) { + Character.UnicodeScript sc = Character.UnicodeScript.of(c); + return sc == Character.UnicodeScript.HAN; + } + + // 简单的基于空格和标点符号的分词方法 + private static List tokenize (String text) { + // 使用正则表达式匹配非单词字符(包括空格、标点符号等),并将它们作为分隔符 + Pattern pattern = Pattern.compile("\\W+"); + String[] words = pattern.split(text.toLowerCase()); // 转换为小写以进行不区分大小写的比较 + List tokens = new ArrayList<>(); + for (String word : words) { + if (!word.isEmpty()) { // 排除空字符串 + tokens.add(word); + } + } + return tokens; + } + + // 方法:计算文档的词重复率 + + /** + * 检查文档的词重复率 + *

+ * 如果词重复率太高,意味着文档中重复的词太多,文档会被过滤掉 + * + * @param content + * @param threshold + * @return + */ + public static boolean calculateWordRepetitionRate (String content, double threshold) { + // 分词 + List words = tokenize(content); + + // 统计词出现次数 + Map wordCount = new HashMap<>(); + for (String word : words) { + wordCount.put(word, wordCount.getOrDefault(word, 0) + 1); + } + + // 计算重复词数和总词数 + int totalWords = words.size(); + int repeatedWords = 0; + for (int count : wordCount.values()) { + if (count > 1) { + repeatedWords += (count - 1); // 只计算重复的部分 + } + } + + // 计算词重复率 + double repetitionRate = (double) repeatedWords / totalWords; + + // 打印重复率和阈值,方便调试 + log.info("词重复率: " + repetitionRate); + log.info("阈值: " + threshold); + + // 如果重复率超过阈值,返回true表示需要过滤掉文档 + return repetitionRate > threshold; + } + + /** + * 检查文档的特殊字符率 + * 如果特殊字符率太高,意味着文档中特殊字符太多,文档会被过滤掉 + * + * @param content + * @param threshold + * @return + */ + /** + * 检测文本中特殊字符率是否超过阈值(阈值范围0~100.00) + * @param content 待检测文本 + * @param threshold 百分比阈值(如传入10表示10%) + * @return 超过阈值返回true + */ + public static boolean checkSpecialCharacterRate(String content, double threshold) { + // 参数校验 + if (content == null || content.isEmpty()) { + log.warn("输入内容为空"); + return false; + } + if (threshold < 0 || threshold > 100) { + throw new IllegalArgumentException("阈值必须是0~100之间的数值"); + } + + // 预处理:去除所有空白字符(可选) + String processedContent = content.replaceAll("\\s+", ""); + int totalCharCount = processedContent.length(); + + // 空文本或纯空白内容处理 + if (totalCharCount == 0) { + log.info("有效字符数为0"); + return false; + } + + // 统计特殊字符(非字母、数字、汉字) + // 正则说明: + // [^a-zA-Z0-9\\p{Script=Han}] → 排除字母数字和汉字 + // 如需包含其他语言字符,需调整正则 + Pattern pattern = Pattern.compile("[^a-zA-Z0-9\\p{Script=Han}]"); + Matcher matcher = pattern.matcher(processedContent); + + int specialCharCount = 0; + while (matcher.find()) { + specialCharCount++; + } + + // 计算特殊字符率(转换为百分比) + double specialCharRatePercent = (double) specialCharCount / totalCharCount * 100; + + // 
调试日志(保留2位小数) + DecimalFormat df = new DecimalFormat("0.00"); + log.info("特殊字符检测结果: {}/{}={}% (阈值: {}%)", + specialCharCount, + totalCharCount, + df.format(specialCharRatePercent), + df.format(threshold)); + + // 浮点数精确比较(添加1e-6容差) + final double EPSILON = 1e-6; + return specialCharRatePercent - threshold > EPSILON; + } + + /** + * 检查文档的色情暴力词率 + *

+ * 如果色情暴力词率太高,文档会被过滤掉,取值范围[0,100]。 + *

+ * + * @param content 文本内容 + * @param threshold 阈值 + * @return 是否过滤文档 + */ + public static boolean checkSensitiveWordRate (String content, double threshold) { + // TODO: 先使用 sensitive-word 处理,有修改再调整 + + // 检测是否包含色情暴力词 + boolean isFalse = SensitiveWordHelper.contains(content); + if (!isFalse) { + return false; + } + + //返回所有敏感词 + List wordList = SensitiveWordHelper.findAll(content); + log.info("返回所有敏感词====>>>>{}", wordList); + + // 统计敏感词的字符数量 + int sensitiveWordLength = 0; + for (String word : wordList) { + sensitiveWordLength += word.length(); + } + // 计算文档的总字符数(不包括换行符等空白字符,可以根据需要调整) + // 或者使用 content.replaceAll("\\s+", "").length() 来排除空白字符 + int totalCharCount = content.length(); + + // 计算敏感词长度占总长度的百分比 + double specialCharRate = ((double) sensitiveWordLength / totalCharCount) * 100; + + // 打印敏感词字符率和阈值,方便调试 + log.info("敏感词字符率: {}", String.format("%.3f", specialCharRate)); + log.info("阈值: {}", threshold); + + // 如果敏感词字符率超过阈值,返回true表示需要过滤掉文档 + return specialCharRate > threshold; + } + /* + * --------------------------------------------------------------- + * 🔖 【 去重配置 】 + * --------------------------------------------------------------- + */ + + /** + * 相似度去重配置 + * + * @param contentMap 文本内容列表 + * @param threshold 相似度阈值 + * @return 是否需要去重 + */ + /** + * 基于SimHash的文本相似度去重 + * @param contentMap 文本集合(Key: 文档ID, Value: 文本内容) + * @param threshold 相似度阈值(0~1,如0.8表示80%相似) + * @return 需要删除的文档ID列表 + */ + public static List similarityDeduplication(Map contentMap, double threshold) { + // 参数校验 + if (contentMap == null || contentMap.isEmpty()) { + return Collections.emptyList(); + } + if (threshold < 0 || threshold > 1) { + throw new IllegalArgumentException("相似度阈值必须在0~1之间"); + } + + long startTime = System.currentTimeMillis(); + + // 1. 按文档ID排序(保持处理顺序确定性) + LinkedHashMap sortedMap = contentMap.entrySet().stream() + .sorted(Map.Entry.comparingByKey()) + .collect(Collectors.toMap( + Map.Entry::getKey, + Map.Entry::getValue, + (e1, e2) -> e1, + LinkedHashMap::new)); + + // 2. 
并行计算SimHash(提升大数据量性能) + Map simHashMap = sortedMap.entrySet().parallelStream() + .collect(Collectors.toMap( + Map.Entry::getKey, + entry -> HammingUtils.getSimHash(entry.getValue()), + (e1, e2) -> e1, + LinkedHashMap::new)); + + // 3. 相似度检测 + List duplicateKeys = new ArrayList<>(); + List processedIds = new ArrayList<>(simHashMap.keySet()); + + for (int i = 0; i < processedIds.size(); i++) { + Long currentId = processedIds.get(i); + if (duplicateKeys.contains(currentId)) { + continue; + } + + String hash1 = simHashMap.get(currentId); + + // 只与后续未处理的文档比较 + for (int j = i + 1; j < processedIds.size(); j++) { + Long compareId = processedIds.get(j); + if (duplicateKeys.contains(compareId)) { + continue; + } + + double similarity = HammingUtils.getSimilarity( + hash1, + simHashMap.get(compareId)); + + log.debug("文档 {} 与 {} 的相似度: {:.2f}%", + currentId, compareId, similarity * 100); + + if (similarity > threshold) { + duplicateKeys.add(compareId); + log.info("标记为相似: {} ≈ {} (相似度: {:.2f}%)", + currentId, compareId, similarity * 100); + } + } + } + + // 4. 性能日志 + long cost = System.currentTimeMillis() - startTime; + log.info("去重完成: 总数={}, 重复数={}, 耗时={}ms", + contentMap.size(), + duplicateKeys.size(), + cost); + + return duplicateKeys; + } + + /* + * --------------------------------------------------------------- + * 🔖 【 去隐私配置 】 + * --------------------------------------------------------------- + */ + // 定义一个正则表达式来匹配电子邮件地址 + private static final String EMAIL_REGEX = + "[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,6}"; + + // 编译正则表达式为Pattern对象 + private static final Pattern EMAIL_PATTERN = Pattern.compile(EMAIL_REGEX); + + // 去除文本中的电子邮件地址 + private static String removeEmails (String text) { + Matcher matcher = EMAIL_PATTERN.matcher(text); + // 使用空字符串替换匹配的电子邮件地址 + return matcher.replaceAll(""); + } + + /** + * 去除Email + *

+ * 去除email地址 + * + * @param content + */ + public static String processFile (String content) { + + // 去除电子邮件地址 + String modifiedContent = removeEmails(content); + + // 或者打印到控制台以查看结果 + log.info("去除电子邮件地址:{}", modifiedContent); + return modifiedContent; + } + + // 定义一个正则表达式来匹配IPv4地址 + private static final String IPV4_REGEX = + "(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\." + + "(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\." + + "(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\." + + "(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)"; + + // 定义一个正则表达式来匹配IPv6地址 + // 这个正则表达式相对简单,可能无法匹配所有复杂的IPv6地址格式 + // 但它可以匹配常见的IPv6地址,如2001:0db8:85a3:0000:0000:8a2e:0370:7334 + private static final String IPV6_REGEX = + "([0-9a-fA-F]{1,4}:){7}([0-9a-fA-F]{1,4})"; + + // 编译IPv4正则表达式为Pattern对象 + private static final Pattern IPV4_PATTERN = Pattern.compile(IPV4_REGEX); + + // 编译IPv6正则表达式为Pattern对象 + private static final Pattern IPV6_PATTERN = Pattern.compile(IPV6_REGEX); + + /** + * 去除文本中的IPv4和IPv6地址 + */ + public static String removeIPAddresses (String text) { + Matcher ipv4Matcher = IPV4_PATTERN.matcher(text); + text = ipv4Matcher.replaceAll(""); + Matcher ipv6Matcher = IPV6_PATTERN.matcher(text); + return ipv6Matcher.replaceAll(""); + } + + /** + * 手机号码的正则表达式 + */ + private static final String MOBILE_REGEX = "1\\d{10}"; + + /** + * 国内电话号码的正则表达式 + */ + private static final String DOMESTIC_PHONE_REGEX = "(\\d{4}-|\\d{3}-)?(\\d{8}|\\d{7})"; + + private static final String HOTLINE_REGEX = "^\\d{3,4}(-\\d{3,4})+$"; + /** + * 电话号码(400)的正则表达式 + */ + private static final String PHONE_REGEX = "400(-\\d{3,4}){2}|^800(-\\d{3,4}){2}"; + + /** + * 信用卡号的正则表达式 + */ + private static final String CREDIT_CARD_REGEX = "^([1-9]{1})(\\d{15}|\\d{18})$"; + + /** + * 十六进制散列的正则表达式(32或24 位十六进制数,用于 SHA-256 等) + */ + private static final String HASH_REGEX = "[a-fA-F0-9]{32}|[a-fA-F0-9]{24}"; + + // 编译正则表达式为Pattern对象 + private static final Pattern MOBILE_PATTERN = Pattern.compile(MOBILE_REGEX); + private static final Pattern 
DOMESTIC_PHONE_PATTERN = Pattern.compile(DOMESTIC_PHONE_REGEX); + private static final Pattern PHONE_PATTERN = Pattern.compile(PHONE_REGEX); + private static final Pattern HOTLINE_PATTERN = Pattern.compile(HOTLINE_REGEX); + private static final Pattern CREDIT_CARD_PATTERN = Pattern.compile(CREDIT_CARD_REGEX); + private static final Pattern HASH_PATTERN = Pattern.compile(HASH_REGEX); + + // 定义一个年份格式 + private static final DateTimeFormatter YEAR_FORMAT = DateTimeFormatter.ofPattern("yyyy"); + + // 定义一个集合来存储要跳过的年份(这里我们假设跳过当前年份和前几年的范围) + private static final Set YEARS_TO_SKIP = new HashSet<>(); + + static { + int currentYear = Year.now().getValue(); + for (int i = currentYear - 5; i <= currentYear + 5; i++) { + YEARS_TO_SKIP.add(String.valueOf(i)); + } + } + + /** + * 去除数字 + *

+ * 去除数字和字母数字标识符,如电话号码、信用卡号、十六进制散列等,同时跳过年份和简单数字的实例 + * + * @param text + * @return + */ + public static String removeIdentifiers (String text) { + // 使用正则表达式匹配电话号码 + text = removePhone(text); + + // 使用正则表达式匹配信用卡号 + text = removeCreditCard(text); + + // 使用正则表达式匹配十六进制散列 + text = removeHashMatcher(text); + + // // 使用StringBuilder和StringBuilder的replace方法去除其他数字,但跳过年份和简单数字 + // // TODO: 这里目前有bug,先注释掉了。 + // StringBuilder sb = new StringBuilder(text); + // int index = 0; + // while ((index = findNextNumberToReplace(sb.toString())) != -1) { + // String number = sb.substring(index, findEndOfNumber(sb.toString(), index)); + // if (!isYear(number) && !isSimpleNumber(number)) { + // sb.replace(index, index + number.length(), ""); + // } + // } + return text; + } + + + /** + * 去除电话号码 + * + * @param text 文本 + * @return 去除电话号码后的文本 + */ + private static String removePhone (String text) { + // 手机号码的正则表达式 + Matcher mobileMatcher = MOBILE_PATTERN.matcher(text); + text = mobileMatcher.replaceAll(""); + + // 国内电话号码的正则表达式 + Matcher domesticPhoneMatcher = DOMESTIC_PHONE_PATTERN.matcher(text); + text = domesticPhoneMatcher.replaceAll(""); + + // 电话号码(400)的正则表达式 + Matcher phoneMatcher = PHONE_PATTERN.matcher(text); + text = phoneMatcher.replaceAll(""); + + // 热线电话格式的正则表达式 + Matcher hotlinePhoneMatcher = HOTLINE_PATTERN.matcher(text); + text = hotlinePhoneMatcher.replaceAll(""); + + return text; + } + + /** + * 去除信用卡号 + * + * @param text 文本 + * @return 去除信用卡号后的文本 + */ + private static String removeCreditCard (String text) { + Matcher creditCardMatcher = CREDIT_CARD_PATTERN.matcher(text); + text = creditCardMatcher.replaceAll(""); + return text; + } + + /** + * 去除十六进制散列 + * + * @param text 文本 + * @return 去除十六进制散列后的文本 + */ + private static String removeHashMatcher (String text) { + Matcher hashMatcher = HASH_PATTERN.matcher(text); + text = hashMatcher.replaceAll(""); + return text; + } + + // 查找下一个要替换的数字的起始索引 + private static int findNextNumberToReplace (String text) { + // 
这里可以添加更复杂的逻辑来定位要替换的数字,但为了简化,我们假设数字以空格或非数字字符分隔 + for (int i = 0; i < text.length(); i++) { + char c = text.charAt(i); + if (Character.isDigit(c) || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F')) { + // 找到数字的起始位置 + while (i < text.length() && (Character.isDigit(text.charAt(i)) || + (text.charAt(i) >= 'a' && text.charAt(i) <= 'f') || + (text.charAt(i) >= 'A' && text.charAt(i) <= 'F'))) { + i++; + } + // 返回数字的起始索引(减1,因为我们要在循环外部处理i的递增) + return i - 1 > 0 ? i - 1 : 0; + } + } + return -1; // 没有找到要替换的数字 + } + + // 找到数字的结束索引 + private static int findEndOfNumber (String text, int startIndex) { + // 从startIndex开始向后查找,直到遇到非数字字符 + for (int i = startIndex; i < text.length(); i++) { + if (!(Character.isDigit(text.charAt(i)) || + (text.charAt(i) >= 'a' && text.charAt(i) <= 'f') || + (text.charAt(i) >= 'A' && text.charAt(i) <= 'F'))) { + return i; + } + } + return text.length(); // 如果字符串以数字结束,则返回字符串的长度 + } + + // 检查一个字符串是否是年份 + private static boolean isYear (String str) { + try { + int year = Integer.parseInt(str); + Year y = Year.parse(str, YEAR_FORMAT); + return YEARS_TO_SKIP.contains(str); + } catch (Exception e) { + return false; + } + } + + // 检查一个字符串是否是简单数字(这里假设不超过六位的连续数字) + private static boolean isSimpleNumber (String str) { + try { + int number = Integer.parseInt(str); + return String.valueOf(number).equals(str) && number >= 0 && number < 1000000; + } catch (NumberFormatException e) { + return false; + } + } + + public static void main (String[] args) { + String textWithIdentifiers = "Here are some identifiers: 123-456-7890, 1234567812345678, a1b2c3d4e5f6a1b2c3d4e5f6, 2023, and 987654."; + // 去除标识符 + String textWithoutIdentifiers = removeIdentifiers(textWithIdentifiers); + // 打印结果 + log.info(textWithoutIdentifiers); + + // String traditionalText = "不經意,妳的笑容"; + // String simplifiedText = traditionalToSimplified(traditionalText); + // + // log.info("繁体文本: [" + traditionalText + "]"); + // log.info("简体文本: [" + simplifiedText + "]"); + //String 
dirtyString="?��简体文���f?�G��?��??�G�G��پ?�l?,,,杩欐槸涓€涓\\uE043贡鐮"; + // // 先进行编码转换 + // dirtyString = convertEncoding(dirtyString); + // // 再进行乱码和无意义 Unicode 字符的清理 + // String cleanString = clean(dirtyString); + //// String s1 = removeNonPrintableUnicodeChars(s); + // log.info("去除乱码:[{}]", cleanString); + } + + public static String clean (String input) { + // 更广泛的乱码字符范围,包括一些扩展的不可打印字符 + String cleanString = input.replaceAll("[\\x00-\\x1F\\x7F-\\x9F\\uFFFD]", ""); + // 去除无意义的 Unicode 字符,这里范围可根据实际情况修改 + cleanString = cleanString.replaceAll("[\\uE000-\\uF8FF]", ""); + return cleanString; + } + + public static String convertEncoding (String input) { + // 尝试多种编码转换,找到正确的编码 + String[] encodings = {"UTF-8", "GBK", "Big5", "ISO-8859-1"}; + for (String encoding : encodings) { + try { + byte[] bytes = input.getBytes(encoding); + String result = new String(bytes, StandardCharsets.UTF_8); + return result; + } catch (Exception e) { + // 编码转换失败,继续尝试下一个编码 + continue; + } + } + return input; + } + } + diff --git a/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/util/HammingUtils.java b/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/util/HammingUtils.java new file mode 100644 index 000000000..2a744a3ca --- /dev/null +++ b/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/util/HammingUtils.java @@ -0,0 +1,83 @@ +package cn.iocoder.yudao.module.mdpf.util; + +import com.hankcs.hanlp.HanLP; +import lombok.extern.slf4j.Slf4j; + +import java.math.BigInteger; +import java.nio.charset.StandardCharsets; +import java.security.MessageDigest; +import java.util.ArrayList; +import java.util.List; + +@Slf4j +public class HammingUtils { + + // ======================== 新增方法 ======================== + /** + * 短文本处理逻辑(按字符拆分) + */ + private static List handleShortText(String str) { + List result = new ArrayList<>(); + for (char c : str.toCharArray()) { + result.add(String.valueOf(c)); + } + return 
result; + } + + // ======================== 原始方法(优化后) ======================== + public static String getHash(String str) { + try { + MessageDigest md = MessageDigest.getInstance("MD5"); + byte[] hash = md.digest(str.getBytes(StandardCharsets.UTF_8)); + return new BigInteger(1, hash).toString(2); + } catch (Exception e) { + log.error("Hash计算失败: {}", e.getMessage()); + return str; // 降级处理 + } + } + + public static String getSimHash(String str) { + int[] v = new int[128]; + // 修复点:调用已定义的handleShortText方法 + List keywords = str.length() < 200 ? + handleShortText(str) : + HanLP.extractKeyword(str, str.length()); + + for (int i = 0; i < keywords.size(); i++) { + String keywordHash = getHash(keywords.get(i)); + // 补全128位 + keywordHash = String.format("%128s", keywordHash) + .replace(' ', '0') + .substring(0, 128); + + int weight = 10 - (i / (keywords.size() / 10)); + for (int j = 0; j < 128; j++) { + v[j] += (keywordHash.charAt(j) == '1') ? weight : -weight; + } + } + + StringBuilder simHash = new StringBuilder(); + for (int bit : v) { + simHash.append(bit > 0 ? 
"1" : "0"); + } + return simHash.toString(); + } + + public static int getHammingDistance(String hash1, String hash2) { + if (hash1.length() != hash2.length()) { + return -1; + } + int distance = 0; + for (int i = 0; i < hash1.length(); i++) { + if (hash1.charAt(i) != hash2.charAt(i)) { + distance++; + } + } + return distance; + } + + public static double getSimilarity(String hash1, String hash2) { + int distance = getHammingDistance(hash1, hash2); + return 1.0 - (double) distance / 128; // 标准化到[0,1] + } +} diff --git a/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/util/HttpURLConnectionUtil.java b/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/util/HttpURLConnectionUtil.java new file mode 100644 index 000000000..4ecaf3b0f --- /dev/null +++ b/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/util/HttpURLConnectionUtil.java @@ -0,0 +1,24 @@ +package cn.iocoder.yudao.module.mdpf.util; + +import java.net.HttpURLConnection; +import java.net.URL; + +public class HttpURLConnectionUtil { + public static HttpURLConnection readFile (String filePath) { + try { + URL url = new URL(filePath); + HttpURLConnection connection = (HttpURLConnection) url.openConnection(); + connection.setRequestMethod("GET"); + + if (connection.getResponseCode() == HttpURLConnection.HTTP_OK) { + return connection; + } else { + System.out.println("Failed to fetch file. 
Server returned HTTP code: " + connection.getResponseCode()); + } + connection.disconnect(); + } catch (Exception e) { + System.out.println("Error fetching file from URL: " + e.getMessage()); + } + return null; + } +} diff --git a/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/util/ParserUtils.java b/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/util/ParserUtils.java new file mode 100644 index 000000000..241b836cb --- /dev/null +++ b/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/util/ParserUtils.java @@ -0,0 +1,44 @@ +package cn.iocoder.yudao.module.mdpf.util; + + +import com.fasterxml.jackson.databind.ObjectMapper; +import lombok.extern.slf4j.Slf4j; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.stereotype.Component; + +import java.time.LocalDateTime; +import java.util.Collections; +import java.util.HashMap; +import java.util.Map; +import java.util.UUID; + +/** + * 文件解析通用辅助工具类,提供创建文本片段Map等功能。 + */ +@Component +@Slf4j +public class ParserUtils { + + @Autowired + private ObjectMapper objectMapper; + + /** + * 创建一个包含原始文本和元数据的 Map。 + * 这个 Map 将作为中间数据结构,传递给 TextProcessor。 + */ + public Map createSegmentMap(String datasetMetaId, String originalMinioPath, + String fileExtension, String extractedText, + Map sourceSpecificMetadata, LocalDateTime processTime, + String segmentType) { + Map segmentMap = new HashMap<>(); + segmentMap.put("id", UUID.randomUUID().toString()); // 临时ID,便于在内存中追踪或作为MySQL的rawTextSegmentMongoId字段 + segmentMap.put("datasetMetaId", datasetMetaId); + segmentMap.put("originalMinioPath", originalMinioPath); + segmentMap.put("sourceFileExtension", fileExtension); + segmentMap.put("extractedText", extractedText != null ? extractedText : ""); + segmentMap.put("sourceSpecificMetadata", sourceSpecificMetadata != null ? 
sourceSpecificMetadata : Collections.emptyMap()); + segmentMap.put("extractTime", processTime); + segmentMap.put("segmentType", segmentType); + return segmentMap; + } +} \ No newline at end of file diff --git a/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/util/TextCleaningUtil.java b/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/util/TextCleaningUtil.java new file mode 100644 index 000000000..5d96c0721 --- /dev/null +++ b/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/util/TextCleaningUtil.java @@ -0,0 +1,158 @@ +package cn.iocoder.yudao.module.mdpf.util; + +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +/** + * 文本清洗辅助工具类 + */ +public class TextCleaningUtil { + + // 简单HTML标签去除 + private static final Pattern HTML_TAG_PATTERN = Pattern.compile("<[^>]*>"); + // 简单Markdown格式去除 (粗体、斜体、链接、图片等) + private static final Pattern MARKDOWN_PATTERN = Pattern.compile("(\\*\\*|__)(.*?)\\1|(\\*|_)(.*?)\\3|\\[(.*?)\\]\\((.*?)\\)|!\\((.*?)\\)\\[(.*?)\\]"); + // 简单的邮箱和电话号码识别 (用于PII匿名化) + private static final Pattern EMAIL_PII_PATTERN = Pattern.compile("[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,6}"); + private static final Pattern PHONE_PII_PATTERN = Pattern.compile("\\d{3}[-\\s]?\\d{3}[-\\s]?\\d{4}|\\(\\d{3}\\)[-\\s]?\\d{3}[-\\s]?\\d{4}"); + + + /** + * 规范化空白字符:将多个空格、制表符、换行符替换为单个空格,并去除首尾空白。 + */ + public static String normalizeWhitespace(String text) { + if (text == null) return null; + return text.replaceAll("\\s+", " ").trim(); + } + + /** + * 规范化标点符号:将全角标点转半角,统一常见标点,去除重复标点等。 + * 这是一个简化版,实际可能需要更复杂的规则或第三方库。 + */ + public static String normalizePunctuation(String text) { + if (text == null) return null; + // 修复:移除Java中不支持的命名参数 (target: replacement:) + String result = text.replace(",", ",") + .replace("。", ".") + .replace("!", "!") + .replace("?", "?"); 
+ + // 修复:String.replaceAll 不支持 lambda 表达式作为替换字符串。 + // 需要使用 Pattern 和 Matcher 显式处理来保留重复标点的第一个字符。 + Pattern p = Pattern.compile("[\\.,!?;]{2,}"); // 匹配两个或更多连续的 .,!?; 标点符号 + Matcher m = p.matcher(result); + StringBuffer sb = new StringBuffer(); // 用于构建替换后的字符串 + while (m.find()) { + // 对于每个匹配项,替换为该匹配项的第一个字符 + m.appendReplacement(sb, Matcher.quoteReplacement(m.group().substring(0, 1))); + } + m.appendTail(sb); // 将匹配后的剩余部分追加到StringBuffer + return sb.toString(); + } + + /** + * 去除HTML标签。 + */ + public static String removeHtmlTags(String text) { + if (text == null) return null; + return HTML_TAG_PATTERN.matcher(text).replaceAll(""); + } + + /** + * 去除常见的Markdown格式。 + */ + public static String removeMarkdownFormatting(String text) { + if (text == null) return null; + return MARKDOWN_PATTERN.matcher(text).replaceAll("$2$4$5$7"); // 替换为捕获组中的内容 + } + + /** + * 简单匿名化 PII (个人身份信息),例如邮箱和电话号码。 + * 返回一个包含清洗后文本和是否包含 PII 的 Map。 + */ + public static Map anonymizePii(String text) { + HashMap map = new HashMap<>(); + boolean hasPii = false; + if (text == null) { + // 修正:移除 'new *' 冗余行 + return new HashMap() {{ + put("text", null); + put("has_pii", false); + }}; + } + + Matcher emailMatcher = EMAIL_PII_PATTERN.matcher(text); + // 修正:移除命名参数 'replacement:' + if (emailMatcher.find()) { + text = emailMatcher.replaceAll("[EMAIL_REDACTED]"); + hasPii = true; + } + + Matcher phoneMatcher = PHONE_PII_PATTERN.matcher(text); + // 修正:移除命名参数 'replacement:' + if (phoneMatcher.find()) { + text = phoneMatcher.replaceAll("[PHONE_REDACTED]"); + hasPii = true; + } + + map.put("text",text); + map.put("has_pii",hasPii); + // 修正:移除 'new *' 冗余行 + return map; + } + + /** + * 检查文本是否包含敏感词。 + */ + public static boolean containsSensitiveWords(String text, List sensitiveWords) { + if (text == null || sensitiveWords == null || sensitiveWords.isEmpty()) { + return false; + } + String lowerText = text.toLowerCase(); + for (String word : sensitiveWords) { + if (lowerText.contains(word.toLowerCase())) { + return 
true; + } + } + return false; + } + + /** + * 简单计算文本质量得分 (占位符)。 + * 实际可能基于:可读性指数(Flesch-Kincaid),语法正确性API,内容相关性等。 + */ + public static Double calculateQualityScore(String text) { + if (text == null || text.trim().isEmpty()) { + return 0.0; + } + // 示例:基于文本长度和非标点字符比例的简单评分 + int length = text.length(); + long alphaNumericCount = text.chars().filter(Character::isLetterOrDigit).count(); + if (length == 0) return 0.0; + // 假设长度越长,字母数字占比越高,质量越高 + return Math.min(1.0, (double) alphaNumericCount / length + (double) length / 500.0); // 简单示例 + } + + /** + * 简单计算文本的Token数量 (占位符)。 + * 实际可能需要调用大模型分词器,如 SentencePiece, BPE 等。 + * 这里用空格分割词语来粗略估计。 + */ + public static Integer countTokens(String text) { + if (text == null || text.trim().isEmpty()) { + return 0; + } + // 简单的空格分词 + return text.split("\\s+").length; + } + + // TODO: 实现更复杂的文本处理功能,例如: + // - 语言检测 (使用 Apache Tika, Lingua 等库) + // - 关键词提取 + // - 实体识别 + // - 文本摘要 +} \ No newline at end of file diff --git a/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/util/TextProcessor.java b/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/util/TextProcessor.java new file mode 100644 index 000000000..600723271 --- /dev/null +++ b/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/java/cn/iocoder/yudao/module/mdpf/util/TextProcessor.java @@ -0,0 +1,127 @@ +package cn.iocoder.yudao.module.mdpf.util; + +import cn.iocoder.yudao.module.mdpf.dal.dataobject.dataset.DataSetFileMiddleDO; +import cn.iocoder.yudao.module.mdpf.util.TextCleaningUtil; // 此导入现在相对于新包是正确的 +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.databind.ObjectMapper; +import lombok.extern.slf4j.Slf4j; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.stereotype.Component; +import org.springframework.util.DigestUtils; +import org.springframework.util.StringUtils; + +import java.math.BigDecimal; +import 
java.nio.charset.StandardCharsets; +import java.time.LocalDateTime; +import java.util.*; +import java.util.stream.Collectors; + +/** + * 文本处理器,负责对原始文本片段进行深度清洗、质量评估和格式转换。 + */ +@Component +@Slf4j +public class TextProcessor { + + @Autowired + private ObjectMapper objectMapper; + + /** + * 清洗单个原始文本片段并评估其质量。 + * + * @param rawSegmentMap 原始文本片段信息 (来自解析策略的 Map) + * @param datasetId 关联的 DataSetMiddleDO 的 ID (MySQL 主表 ID) + * @param sourceFileId 关联的 DataSetMiddleMongoDO 的 ID (MongoDB 元数据 ID) + * @return 清洗后的 DataSetFileMiddleDO 实体,如果文本质量过低或被过滤则返回 null + */ + public DataSetFileMiddleDO cleanAndEvaluate(Map rawSegmentMap, Long datasetId, Long sourceFileId) { + // 从 Map 中提取原始文本片段的各项信息 + String originalMinioPath = (String) rawSegmentMap.get("originalMinioPath"); + String fileExtension = (String) rawSegmentMap.get("sourceFileExtension"); + String extractedText = (String) rawSegmentMap.get("extractedText"); + Map sourceSpecificMetadata = (Map) rawSegmentMap.get("sourceSpecificMetadata"); + + // 从 additionalMetadata (现在是 sourceSpecificMetadata 的一部分) 中获取新字段 + String dataSetFileUrl = (String) sourceSpecificMetadata.get("dataSetFileUrl"); + String dataSetFileType = (String) sourceSpecificMetadata.get("dataSetFileType"); + String datasetFileName = (String) sourceSpecificMetadata.get("datasetFileName"); + String sourceFileName = (String) sourceSpecificMetadata.get("sourceFileName"); + + + if (!StringUtils.hasText(extractedText)) { + log.warn("Skipping null or empty extracted text for datasetId: {}, Source File ID: {}", datasetId, sourceFileId); + return null; + } + + String cleanedText = extractedText; + StringBuilder remarks = new StringBuilder(); // 仅用于日志,不存入 DB + + // --- 深度清洗步骤 (使用 TextCleaningUtil) --- + cleanedText = TextCleaningUtil.normalizeWhitespace(cleanedText); + cleanedText = TextCleaningUtil.normalizePunctuation(cleanedText); + cleanedText = TextCleaningUtil.removeHtmlTags(cleanedText); + cleanedText = TextCleaningUtil.removeMarkdownFormatting(cleanedText); + Map 
piiResult = TextCleaningUtil.anonymizePii(cleanedText); + cleanedText = (String) piiResult.get("text"); + if ((Boolean) piiResult.get("has_pii")) { + remarks.append("包含PII,已匿名化; "); + } +// if (TextCleaningUtil.containsSensitiveWords(cleanedText,null)) { // <-- 这里的调用现在应该能正确解析了 +// cleanedText = TextCleaningUtil.filterSensitiveWords(cleanedText); // <-- 这里的调用现在应该能正确解析了 +// remarks.append("包含敏感词,已过滤; "); +// } + + // --- 质量评估与过滤 --- + Double qualityScoreDouble = TextCleaningUtil.calculateQualityScore(cleanedText); + BigDecimal qualityScore = BigDecimal.valueOf(qualityScoreDouble); // 转换为 BigDecimal + Integer tokenCount = TextCleaningUtil.countTokens(cleanedText); + + DataSetFileMiddleDO cleanedFileDO = new DataSetFileMiddleDO(); + + // 填充来自 DataSetMiddleServiceImpl 传递的 ID + cleanedFileDO.setDataSetId(datasetId); + cleanedFileDO.setSourceFileId(sourceFileId); // 对应 MongoDB 元数据 ID + + // 填充来自文件元数据的字段 + cleanedFileDO.setDataSetFileUrl(dataSetFileUrl); + cleanedFileDO.setDataSetFileType(dataSetFileType); + cleanedFileDO.setDatasetFileName(datasetFileName); + cleanedFileDO.setSourceFileUrl(originalMinioPath); // originalMinioPath 对应 source_file_url + cleanedFileDO.setSourceFileName(sourceFileName); + cleanedFileDO.setSourceFileExtension(fileExtension); + + // 填充清洗后的文本和相关度量 + cleanedFileDO.setCleanedText(cleanedText); + cleanedFileDO.setCleanedTextHash(DigestUtils.md5DigestAsHex(cleanedText.getBytes(StandardCharsets.UTF_8))); + cleanedFileDO.setQualityScore(qualityScore); + cleanedFileDO.setTokenCount(tokenCount); + cleanedFileDO.setCleanTime(LocalDateTime.now()); // 清洗时间是当前时间 + + // 最终过滤逻辑:文本太短或质量分过低 +// if (!StringUtils.hasText(cleanedText) || tokenCount <= 10 || qualityScore.compareTo(BigDecimal.valueOf(0.2)) < 0) { +// log.warn("Filtered out text segment due to final quality check (datasetId: {}, sourceFileId: {}), remarks: {}", datasetId, sourceFileId, remarks.toString()); +// return null; // 返回 null 表示该片段被过滤 +// } + + return cleanedFileDO; + } + + /** + * 
批量清洗原始文本片段列表。 + * + * @param rawSegments 原始文本片段信息列表 + * @param datasetId 关联的 DataSetMiddleDO 的 ID (MySQL 主表 ID) + * @param sourceFileId 关联的 DataSetMiddleMongoDO 的 ID (MongoDB 元数据 ID) + * @return 清洗后的 DataSetFileMiddleDO 实体列表 + */ + public List cleanAndEvaluateList(List> rawSegments, Long datasetId, Long sourceFileId) { + if (rawSegments == null || rawSegments.isEmpty()) { + return Collections.emptyList(); + } + List list = rawSegments.stream() + .map(rawSegmentMap -> cleanAndEvaluate(rawSegmentMap, datasetId, sourceFileId)) + .filter(Objects::nonNull) // 过滤掉返回 null (即被过滤掉) 的片段 + .collect(Collectors.toList()); + return list; + } +} \ No newline at end of file diff --git a/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/resources/mapper/dataset/DataSetFileMiddleMapper.xml b/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/resources/mapper/dataset/DataSetFileMiddleMapper.xml new file mode 100644 index 000000000..8f50ba329 --- /dev/null +++ b/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/resources/mapper/dataset/DataSetFileMiddleMapper.xml @@ -0,0 +1,12 @@ + + + + + + + \ No newline at end of file diff --git a/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/resources/mapper/dataset/PlatformDatasetAnswerMapper.xml b/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/resources/mapper/dataset/PlatformDatasetAnswerMapper.xml new file mode 100644 index 000000000..629f0bc9f --- /dev/null +++ b/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/resources/mapper/dataset/PlatformDatasetAnswerMapper.xml @@ -0,0 +1,32 @@ + + + + + DELETE FROM platform_dataset_answer WHERE dataset_id = #{id} + + + + + + diff --git a/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/resources/mapper/dataset/PlatformDatasetQuestionMapper.xml b/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/resources/mapper/dataset/PlatformDatasetQuestionMapper.xml new file mode 100644 index 000000000..f2e4fbf81 --- /dev/null +++ 
b/yudao-module-mdpf/yudao-module-mdpf-biz/src/main/resources/mapper/dataset/PlatformDatasetQuestionMapper.xml @@ -0,0 +1,30 @@ + + + + + DELETE FROM platform_dataset_question + WHERE dataset_id = #{id} + + + + + +