From 9e0a17c99637db40e01b796052072db0d5002ceb Mon Sep 17 00:00:00 2001 From: limin Date: Sun, 5 Jan 2025 19:44:47 +0800 Subject: [PATCH] =?UTF-8?q?=E6=95=B0=E6=8D=AE=E5=A4=84=E7=90=86?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../module/llm/enums/ErrorCodeConstants.java | 1 + .../async/AsyncDataProcessService.java | 64 +++++++++++++++++++ .../DataProcessTaskServiceImpl.java | 16 +++++ 3 files changed, 81 insertions(+) diff --git a/yudao-module-llm/yudao-module-llm-api/src/main/java/cn/iocoder/yudao/module/llm/enums/ErrorCodeConstants.java b/yudao-module-llm/yudao-module-llm-api/src/main/java/cn/iocoder/yudao/module/llm/enums/ErrorCodeConstants.java index 3cb95882e..18e4d6e71 100644 --- a/yudao-module-llm/yudao-module-llm-api/src/main/java/cn/iocoder/yudao/module/llm/enums/ErrorCodeConstants.java +++ b/yudao-module-llm/yudao-module-llm-api/src/main/java/cn/iocoder/yudao/module/llm/enums/ErrorCodeConstants.java @@ -91,6 +91,7 @@ public interface ErrorCodeConstants { ErrorCode PROMPT_TEMPLATES_APPLICATIONS_BACKUP_NOT_EXISTS = new ErrorCode(10036, "模板信息不存在"); ErrorCode PROMPT_TEMPLATES_TAGS_BACKUP_NOT_EXISTS = new ErrorCode(10037, "模板信息不存在"); + ErrorCode DATA_PROCESS_TASK_NAME_NOT_EXISTS = new ErrorCode(10038, "数据处理任务名称已存在"); } diff --git a/yudao-module-llm/yudao-module-llm-biz/src/main/java/cn/iocoder/yudao/module/llm/service/async/AsyncDataProcessService.java b/yudao-module-llm/yudao-module-llm-biz/src/main/java/cn/iocoder/yudao/module/llm/service/async/AsyncDataProcessService.java index f26755db7..64523a8a2 100644 --- a/yudao-module-llm/yudao-module-llm-biz/src/main/java/cn/iocoder/yudao/module/llm/service/async/AsyncDataProcessService.java +++ b/yudao-module-llm/yudao-module-llm-biz/src/main/java/cn/iocoder/yudao/module/llm/service/async/AsyncDataProcessService.java @@ -1,5 +1,7 @@ package cn.iocoder.yudao.module.llm.service.async; +import cn.hutool.json.JSONArray; +import cn.hutool.json.JSONObject; import cn.iocoder.yudao.framework.common.util.collection.CollectionUtils; import cn.iocoder.yudao.framework.common.util.object.BeanUtils; import cn.iocoder.yudao.module.llm.controller.admin.dataset.vo.DatasetQuestionRespVO; @@ -12,6 +14,8 @@ import cn.iocoder.yudao.module.llm.dal.mysql.dataset.DatasetAnswerMapper; import cn.iocoder.yudao.module.llm.dal.mysql.dataset.DatasetMapper; import cn.iocoder.yudao.module.llm.dal.mysql.dataset.DatasetQuestionMapper; import cn.iocoder.yudao.module.llm.service.dataset.DatasetQuestionService; +import cn.iocoder.yudao.module.llm.utils.DataProcessUtil; +import com.alibaba.druid.util.StringUtils; import org.springframework.scheduling.annotation.Async; import org.springframework.stereotype.Service; @@ -58,6 +62,7 @@ public class AsyncDataProcessService { DatasetQuestionDO questionDO = BeanUtils.toBean(item, DatasetQuestionDO.class); questionDO.setDatasetId(dataProcessTask.getDatasetPostId()); questionDO.setId(null); + questionDO.setQuestion(StringUtils.isEmpty(item.getQuestion()) ?item.getQuestion():task(item.getQuestion(), dataProcessTask.getOptions())); questionDO.setCreator(dataProcessTask.getCreator()); questionDO.setUpdater(dataProcessTask.getUpdater()); datasetQuestionMapper.insert(questionDO); @@ -69,6 +74,7 @@ public class AsyncDataProcessService { answerDO.setQuestionId(questionDO.getId()); answerDO.setCreator(dataProcessTask.getCreator()); answerDO.setUpdater(dataProcessTask.getCreator()); + answerDO.setAnswer(StringUtils.isEmpty(item1.getAnswer()) ? item1.getAnswer():task(item1.getAnswer(), dataProcessTask.getOptions())); answerDO.setId(null); answerDO.setDatasetId(dataProcessTask.getDatasetPostId()); datasetAnswerMapper.insert(answerDO); @@ -96,4 +102,62 @@ public class AsyncDataProcessService { } return sb.toString(); } + + public static String task(String str, JSONObject options) { + String result = str; + // 异常清洗配置处理 + JSONObject a = options.getJSONObject("a"); + //移除不可见字符 移除ASCII中的一些不可见字符, 如0-32 和127-160这两个范围 + JSONObject a_1 = a.getJSONObject("a_1"); + if (a_1.getBool("is_on")) { + result = DataProcessUtil.removeNonVisibleAsciiChars(result); + } + //移除不可见字符 将不同的unicode空格比如  u2008,转成正常的空格 + JSONObject a_2 = a.getJSONObject("a_2"); + if (a_2.getBool("is_on")) { + result = DataProcessUtil.convertUnicodeSpacesToNormalSpaces(result); + } + //移除不可见字符 去除乱码和无意义的unicode + JSONObject a_3 = a.getJSONObject("a_3"); + if (a_3.getBool("is_on")) { + result = DataProcessUtil.removeNonPrintableUnicodeChars(result); + } + // 繁体转简体 繁体转简体,如“不經意,妳的笑容”清洗成“不经意,你的笑容” + JSONObject a_4 = a.getJSONObject("a_4"); + if (a_4.getBool("is_on")) { + result = DataProcessUtil.TraditionalToSimplified(result); + } + // 去除网页标识符 移除文档中的html标签,如,

等 + JSONObject a_5 = a.getJSONObject("a_5"); + if (a_5.getBool("is_on")) { + result = DataProcessUtil.removeHtmlTags(result); + } + // 去除表情 去除文档中的表情,如‘🐰’、‘👵’等 + JSONObject a_6 = a.getJSONObject("a_6"); + if (a_6.getBool("is_on")) { + result = DataProcessUtil.removeEmojis(result); + } + // TODO 过滤配置 去重配置 暂停处理 + + // 去隐私配置 + JSONObject d = options.getJSONObject("d"); + // 去除EMAIL地址 + JSONObject d_1 = d.getJSONObject("d_1"); + if (d_1.getBool("is_on")) { + result = DataProcessUtil.processFile(result); + } + // 去除IP地址 去除IPV4或者IPV6地址 + JSONObject d_2 = d.getJSONObject("d_2"); + if (d_2.getBool("is_on")) { + result = DataProcessUtil.removeIPAddresses(result); + } + //去除数字 去除数字和字母数字标识符,如电话号码、信用卡号、 + //十六进制散列等,同时跳过年份和简单数字的实例 + JSONObject d_3 = d.getJSONObject("d_3"); + if (d_3.getBool("is_on")) { +// result = DataProcessUtil.removeIdentifiers(result); + } + return result; + + } } diff --git a/yudao-module-llm/yudao-module-llm-biz/src/main/java/cn/iocoder/yudao/module/llm/service/dataprocesstask/DataProcessTaskServiceImpl.java b/yudao-module-llm/yudao-module-llm-biz/src/main/java/cn/iocoder/yudao/module/llm/service/dataprocesstask/DataProcessTaskServiceImpl.java index 414ca4121..93f145646 100644 --- a/yudao-module-llm/yudao-module-llm-biz/src/main/java/cn/iocoder/yudao/module/llm/service/dataprocesstask/DataProcessTaskServiceImpl.java +++ b/yudao-module-llm/yudao-module-llm-biz/src/main/java/cn/iocoder/yudao/module/llm/service/dataprocesstask/DataProcessTaskServiceImpl.java @@ -5,6 +5,7 @@ import cn.iocoder.yudao.module.llm.dal.dataobject.basemodel.BaseModelDO; import cn.iocoder.yudao.module.llm.dal.dataobject.dataset.DatasetDO; import cn.iocoder.yudao.module.llm.dal.mysql.dataset.DatasetMapper; import cn.iocoder.yudao.module.llm.service.async.AsyncDataProcessService; +import cn.iocoder.yudao.module.llm.utils.DataProcessUtil; import com.baomidou.mybatisplus.core.conditions.query.LambdaQueryWrapper; import org.springframework.scheduling.annotation.Async; import org.springframework.stereotype.Service; @@ -45,6 +46,7 @@ public class DataProcessTaskServiceImpl implements DataProcessTaskService { @Override public Long createDataProcessTask(DataProcessTaskSaveReqVO createReqVO) { + validateDataProcessTaskNameExists(createReqVO); // 插入 DataProcessTaskDO dataProcessTask = BeanUtils.toBean(createReqVO, DataProcessTaskDO.class); dataProcessTask.setStatus(1); @@ -53,12 +55,26 @@ public class DataProcessTaskServiceImpl implements DataProcessTaskService { // 返回 return dataProcessTask.getId(); } + private void validateDataProcessTaskNameExists(DataProcessTaskSaveReqVO task) { + LambdaQueryWrapper wrapper = new LambdaQueryWrapper() + .eq(DataProcessTaskDO::getTaskName, task.getTaskName()); + + if (task.getId() != null){ + wrapper.ne(DataProcessTaskDO::getId, task.getId()); + } + List dataProcessTaskDO = dataProcessTaskMapper.selectList(wrapper); + if (com.baomidou.mybatisplus.core.toolkit.CollectionUtils.isNotEmpty(dataProcessTaskDO)){ + throw exception(DATA_PROCESS_TASK_NAME_NOT_EXISTS); + } + } + @Override public void updateDataProcessTask(DataProcessTaskSaveReqVO updateReqVO) { // 校验存在 validateDataProcessTaskExists(updateReqVO.getId()); + validateDataProcessTaskNameExists(updateReqVO); // 更新 DataProcessTaskDO updateObj = BeanUtils.toBean(updateReqVO, DataProcessTaskDO.class); dataProcessTaskMapper.updateById(updateObj);