[update] 数据处理修改1

2025-01-17 20:34:20 +08:00 · 2025-01-17 20:34:20 +08:00 · 499bd3df78
commit 499bd3df78
parent e765907cd0
6 changed files with 548 additions and 126 deletions
--- a/yudao-module-llm/yudao-module-llm-biz/src/main/java/cn/iocoder/yudao/module/llm/service/async/AsyncDataProcessService.java
+++ b/yudao-module-llm/yudao-module-llm-biz/src/main/java/cn/iocoder/yudao/module/llm/service/async/AsyncDataProcessService.java
@ -3,6 +3,7 @@ package cn.iocoder.yudao.module.llm.service.async;
 import cn.hutool.json.JSONObject;
 import cn.iocoder.yudao.framework.common.util.collection.CollectionUtils;
 import cn.iocoder.yudao.framework.common.util.object.BeanUtils;
+import cn.iocoder.yudao.module.llm.controller.admin.dataset.vo.DatasetAnswerRespVO;
 import cn.iocoder.yudao.module.llm.controller.admin.dataset.vo.DatasetQuestionRespVO;
 import cn.iocoder.yudao.module.llm.dal.dataobject.dataprocesstask.DataProcessTaskDO;
 import cn.iocoder.yudao.module.llm.dal.dataobject.dataset.DatasetAnswerDO;
@ -15,13 +16,17 @@ import cn.iocoder.yudao.module.llm.dal.mysql.dataset.DatasetQuestionMapper;
 import cn.iocoder.yudao.module.llm.service.dataset.DatasetQuestionService;
 import cn.iocoder.yudao.module.llm.utils.DataProcessUtil;
 import com.alibaba.druid.util.StringUtils;
+import com.alibaba.fastjson.JSON;
+import lombok.extern.slf4j.Slf4j;
 import org.springframework.scheduling.annotation.Async;
 import org.springframework.stereotype.Service;

 import javax.annotation.Resource;
 import java.security.SecureRandom;
-import java.util.List;
+import java.util.*;
+import java.util.stream.Collectors;

+@Slf4j
@Service
 public class AsyncDataProcessService {
    @Resource
@ -36,13 +41,12 @@ public class AsyncDataProcessService {
    private DatasetAnswerMapper datasetAnswerMapper;
    private static final String CHARACTERS = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789";
    private static final SecureRandom RANDOM = new SecureRandom();
-  /*  @Resource
-    private */
+
    @Async
-    public void backups(DataProcessTaskDO dataProcessTask) {
+    public void backups (DataProcessTaskDO dataProcessTask) {
        try {
-            // 判断备份数据集是否存在
-            if (dataProcessTask.getDatasetPostId() == null){
+            // 判断是否备份
+            if (dataProcessTask.getDatasetPostId() == null) {
                DatasetDO datasetDO = datasetMapper.selectById(dataProcessTask.getDatasetId());
                DatasetDO newData = BeanUtils.toBean(datasetDO, DatasetDO.class);
                newData.setId(null);
@ -55,34 +59,10 @@ public class AsyncDataProcessService {
            datasetQuestionMapper.deleteTrue(dataProcessTask.getDatasetPostId());
            datasetAnswerMapper.deleteTrue(dataProcessTask.getDatasetPostId());

-            List<DatasetQuestionRespVO> datasetQuestionList = datasetQuestionService.getDatasetQuestionList(dataProcessTask.getDatasetId());
-            if (!CollectionUtils.isAnyEmpty(datasetQuestionList)){
-                datasetQuestionList.forEach(item -> {
-                    DatasetQuestionDO questionDO = BeanUtils.toBean(item, DatasetQuestionDO.class);
-                    questionDO.setDatasetId(dataProcessTask.getDatasetPostId());
-                    questionDO.setId(null);
-                    questionDO.setQuestion(StringUtils.isEmpty(item.getQuestion()) ?item.getQuestion():task(item.getQuestion(), dataProcessTask.getOptions()));
-                    questionDO.setCreator(dataProcessTask.getCreator());
-                    questionDO.setUpdater(dataProcessTask.getUpdater());
-                    datasetQuestionMapper.insert(questionDO);
-                    System.out.println(item);
-                    System.out.println(questionDO);
-                    if (!CollectionUtils.isAnyEmpty(item.getDatasetAnswerRespVO())){
-                        item.getDatasetAnswerRespVO().forEach(item1 -> {
-                            DatasetAnswerDO answerDO = BeanUtils.toBean(item1, DatasetAnswerDO.class);
-                            answerDO.setQuestionId(questionDO.getId());
-                            answerDO.setCreator(dataProcessTask.getCreator());
-                            answerDO.setUpdater(dataProcessTask.getCreator());
-                            answerDO.setAnswer(StringUtils.isEmpty(item1.getAnswer()) ? item1.getAnswer():task(item1.getAnswer(), dataProcessTask.getOptions()));
-                            answerDO.setId(null);
-                            answerDO.setDatasetId(dataProcessTask.getDatasetPostId());
-                            datasetAnswerMapper.insert(answerDO);
-                        });
-                    }
-                });
-            }
-            dataProcessTaskMapper.updateStatus(dataProcessTask.getId(), 2);
-        }catch(Exception e){
+            // 开始数据处理
+            startDataProcess(dataProcessTask);
+
+        } catch (Exception e) {
            datasetQuestionMapper.deleteTrue(dataProcessTask.getDatasetPostId());
            datasetAnswerMapper.deleteTrue(dataProcessTask.getDatasetPostId());
            datasetMapper.deleteTrue(dataProcessTask.getDatasetPostId());
@ -90,10 +70,139 @@ public class AsyncDataProcessService {
            dataProcessTask.setStatus(5);
            dataProcessTaskMapper.updateById(dataProcessTask);

-//            dataProcessTaskMapper.updateStatus(dataProcessTask.getId(), 5);
-        };
+        }
+
    }
-    public static String generateRandomString(int length) {
+
+    /**
+     * Runs the processing pipeline over a dataset and stores the result under the task's target dataset.
+     *
+     * <p>Loads every question (with its answers) of the source dataset, sends the questions and then the
+     * answers through {@code processQuestion} / {@code processAnswer}, inserts the surviving rows with
+     * {@code datasetPostId} as their dataset id, and finally sets the task status to {@code 2}.
+     *
+     * @param dataProcessTask data processing task; supplies the source dataset id, the target dataset id,
+     *                        the processing options and the creator/updater written to the new rows
+     */
+    private void startDataProcess (DataProcessTaskDO dataProcessTask) {
+        log.info(" =========== 数据处理开始 =========== {}", JSON.toJSONString(dataProcessTask));
+
+        // Load all questions together with their answers.
+        List<DatasetQuestionRespVO> datasetQuestionList = datasetQuestionService.getDatasetQuestionList(dataProcessTask.getDatasetId());
+        if (!CollectionUtils.isAnyEmpty(datasetQuestionList)) {
+            JSONObject options = dataProcessTask.getOptions();
+
+            // Questions first: a question dropped here takes its answers with it.
+            datasetQuestionList = processQuestion(datasetQuestionList, options);
+
+            // Then the answers of the questions that are left.
+            datasetQuestionList = processAnswer(datasetQuestionList, options);
+        }
+
+        // Processing may have removed every question, so check again before persisting.
+        if (!CollectionUtils.isAnyEmpty(datasetQuestionList)) {
+            for (DatasetQuestionRespVO item : datasetQuestionList) {
+                DatasetQuestionDO questionDO = BeanUtils.toBean(item, DatasetQuestionDO.class);
+                questionDO.setDatasetId(dataProcessTask.getDatasetPostId());
+                // Drop the id copied from the source row; presumably the insert then assigns a new one.
+                questionDO.setId(null);
+                // replaceStr strips every '[' and ']' from the processed text.
+                questionDO.setQuestion(StringUtils.isEmpty(item.getQuestion()) ? "" : replaceStr(item.getQuestion()));
+                questionDO.setCreator(dataProcessTask.getCreator());
+                questionDO.setUpdater(dataProcessTask.getUpdater());
+                datasetQuestionMapper.insert(questionDO);
+
+                if (CollectionUtils.isAnyEmpty(item.getDatasetAnswerRespVO())) {
+                    continue;
+                }
+                for (DatasetAnswerRespVO item1 : item.getDatasetAnswerRespVO()) {
+                    DatasetAnswerDO answerDO = BeanUtils.toBean(item1, DatasetAnswerDO.class);
+                    // Link the answer to the question row inserted just above.
+                    answerDO.setQuestionId(questionDO.getId());
+                    answerDO.setCreator(dataProcessTask.getCreator());
+                    // Use the task's updater, as the question rows do (this used to copy the creator).
+                    answerDO.setUpdater(dataProcessTask.getUpdater());
+                    answerDO.setAnswer(StringUtils.isEmpty(item1.getAnswer()) ? "" : replaceStr(item1.getAnswer()));
+                    answerDO.setId(null);
+                    answerDO.setDatasetId(dataProcessTask.getDatasetPostId());
+                    datasetAnswerMapper.insert(answerDO);
+                }
+            }
+        }
+
+        dataProcessTaskMapper.updateStatus(dataProcessTask.getId(), 2);
+        log.info(" =========== 数据处理结束 =========== {}", JSON.toJSONString(dataProcessTask));
+    }
+
+    /**
+     * Runs every question text through {@code processTask}.
+     *
+     * <p>Questions whose id is missing from the processed result are removed (and their answers with
+     * them); the remaining questions get their text replaced by the processed text.
+     *
+     * @param datasetQuestionList questions to process
+     * @param options             processing options
+     * @return a new list with the surviving questions, in their original order
+     */
+    private List<DatasetQuestionRespVO> processQuestion (List<DatasetQuestionRespVO> datasetQuestionList, JSONObject options) {
+        // id -> question text. Filled with a plain loop because Collectors.toMap throws a
+        // NullPointerException for a null value, so one question without text aborted the whole run.
+        Map<Long, String> questionMap = new HashMap<>();
+        for (DatasetQuestionRespVO question : datasetQuestionList) {
+            questionMap.put(question.getId(), question.getQuestion());
+        }
+
+        // id -> processed text; ids dropped by the pipeline are absent from this map.
+        Map<Long, String> processAfterMap = processTask(questionMap, options);
+
+        List<DatasetQuestionRespVO> processedQuestions = new ArrayList<>();
+        for (DatasetQuestionRespVO question : datasetQuestionList) {
+            if (!processAfterMap.containsKey(question.getId())) {
+                continue;
+            }
+            question.setQuestion(processAfterMap.get(question.getId()));
+            processedQuestions.add(question);
+        }
+        return processedQuestions;
+    }
+
+    /**
+     * Runs every answer text through {@code processTask}.
+     *
+     * <p>Questions without any answer are removed from the given list first. For the remaining
+     * questions, answers whose id is missing from the processed result are dropped and the others get
+     * their text replaced by the processed text. A question whose answers were all dropped stays in
+     * the list with an empty answer list.
+     *
+     * @param datasetQuestionList questions whose answers are processed; modified in place
+     * @param options             processing options
+     * @return the same list instance that was passed in
+     */
+    private List<DatasetQuestionRespVO> processAnswer (List<DatasetQuestionRespVO> datasetQuestionList, JSONObject options) {
+        // Remove questions that have no answers at all.
+        datasetQuestionList.removeIf(item -> com.baomidou.mybatisplus.core.toolkit.CollectionUtils.isEmpty(item.getDatasetAnswerRespVO()));
+
+        // id -> answer text over all questions. Filled with a plain loop because Collectors.toMap
+        // throws a NullPointerException for a null value.
+        Map<Long, String> answerMap = new HashMap<>();
+        for (DatasetQuestionRespVO question : datasetQuestionList) {
+            for (DatasetAnswerRespVO answer : question.getDatasetAnswerRespVO()) {
+                answerMap.put(answer.getId(), answer.getAnswer());
+            }
+        }
+
+        // id -> processed text; ids dropped by the pipeline are absent from this map.
+        Map<Long, String> processAfterMap = processTask(answerMap, options);
+
+        // Filter each question's own answers directly instead of scanning all answers per question.
+        for (DatasetQuestionRespVO question : datasetQuestionList) {
+            List<DatasetAnswerRespVO> keptAnswers = new ArrayList<>();
+            for (DatasetAnswerRespVO answer : question.getDatasetAnswerRespVO()) {
+                if (processAfterMap.containsKey(answer.getId())) {
+                    answer.setAnswer(processAfterMap.get(answer.getId()));
+                    keptAnswers.add(answer);
+                }
+            }
+            question.setDatasetAnswerRespVO(keptAnswers);
+        }
+
+        return datasetQuestionList;
+    }
+
+    /**
+     * Removes every {@code [} and {@code ]} character from the text.
+     *
+     * @param text text to strip; may be {@code null}
+     * @return the text without square brackets, or {@code null} when {@code text} is {@code null}
+     */
+    public static String replaceStr(String text) {
+        if (text == null) {
+            return null;
+        }
+        return text.replace("[", "").replace("]", "");
+    }
+
+    public static String generateRandomString (int length) {
        StringBuilder sb = new StringBuilder(length);
        for (int i = 0; i < length; i++) {
            int index = RANDOM.nextInt(CHARACTERS.length());
@ -102,61 +211,285 @@ public class AsyncDataProcessService {
        return sb.toString();
    }

-    public static String task(String str, JSONObject options) {
-        String result = str;
-        // 异常清洗配置处理
-        JSONObject a = options.getJSONObject("a");
-        //移除不可见字符 移除ASCII中的一些不可见字符, 如0-32 和127-160这两个范围
-        JSONObject a_1 = a.getJSONObject("a_1");
+
+    /**
+     * Sends the texts through the four processing stages in a fixed order:
+     * cleaning, filtering, de-duplication, anonymization.
+     *
+     * @param texts   texts to process, keyed by id
+     * @param options processing options handed to every stage
+     * @return the map produced by the last stage
+     */
+    public static Map<Long, String> processTask (Map<Long, String> texts, JSONObject options) {
+        // Stage 1: cleaning.
+        Map<Long, String> cleaned = dataCleaning(texts, options);
+        // Stage 2: filtering.
+        Map<Long, String> filtered = dataFilter(cleaned, options);
+        // Stage 3: de-duplication.
+        Map<Long, String> deduplicated = dataDistinct(filtered, options);
+        // Stage 4: anonymization.
+        return dataAnonymization(deduplicated, options);
+    }
+
+
+    /**
+     * Processing stage 1: applies the cleaning settings (option group {@code "a"}) to every text.
+     *
+     * <p>Entries whose text is blank are left out of the result.
+     *
+     * @param texts   texts to clean, keyed by id
+     * @param options processing options; the {@code "a"} object is passed to {@code dataCleaningDetail}
+     * @return a new map with the cleaned texts
+     */
+    private static Map<Long, String> dataCleaning (Map<Long, String> texts, JSONObject options) {
+        JSONObject cleaningOptions = options.getJSONObject("a");
+        Map<Long, String> cleanedTexts = new HashMap<>();
+
+        if (!com.baomidou.mybatisplus.core.toolkit.CollectionUtils.isNotEmpty(texts)) {
+            return cleanedTexts;
+        }
+
+        for (Map.Entry<Long, String> entry : texts.entrySet()) {
+            String text = entry.getValue();
+            if (org.apache.commons.lang3.StringUtils.isBlank(text)) {
+                continue;
+            }
+            cleanedTexts.put(entry.getKey(), dataCleaningDetail(cleaningOptions, text));
+        }
+        return cleanedTexts;
+    }
+
+    /**
+     * 异常清洗配置处理
+     *
+     * @param options 异常清洗配置
+     * @param text    要处理的文本
+     * @return 处理后的文本
+     */
+    public static String dataCleaningDetail (JSONObject options, String text) {
+
+        String result = text;
+        // 移除不可见字符 移除 ASCII 中的一些不可见字符, 如 0-32 和 127-160 这两个范围
+        JSONObject a_1 = options.getJSONObject("a_1");
        if (a_1.getBool("is_on")) {
            result = DataProcessUtil.removeNonVisibleAsciiChars(result);
        }
-        //移除不可见字符 将不同的unicode空格比如  u2008，转成正常的空格
-        JSONObject a_2 = a.getJSONObject("a_2");
+        // 移除不可见字符 将不同的 unicode 空格比如  u2008，转成正常的空格
+        JSONObject a_2 = options.getJSONObject("a_2");
        if (a_2.getBool("is_on")) {
            result = DataProcessUtil.convertUnicodeSpacesToNormalSpaces(result);
        }
-        //移除不可见字符 去除乱码和无意义的unicode
-        JSONObject a_3 = a.getJSONObject("a_3");
+        // 移除不可见字符 去除乱码和无意义的 unicode
+        JSONObject a_3 = options.getJSONObject("a_3");
        if (a_3.getBool("is_on")) {
            result = DataProcessUtil.removeNonPrintableUnicodeChars(result);
        }
        // 繁体转简体 繁体转简体，如“不經意，妳的笑容”清洗成“不经意，你的笑容”
-        JSONObject a_4 = a.getJSONObject("a_4");
+        JSONObject a_4 = options.getJSONObject("a_4");
        if (a_4.getBool("is_on")) {
            result = DataProcessUtil.traditionalToSimplified(result);
        }
-        // 去除网页标识符 移除文档中的html标签，如<html>,<dev><p>等
-        JSONObject a_5 = a.getJSONObject("a_5");
+        // 去除网页标识符 移除文档中的 html 标签，如<html>,<div><p>等
+        JSONObject a_5 = options.getJSONObject("a_5");
        if (a_5.getBool("is_on")) {
            result = DataProcessUtil.removeHtmlTags(result);
        }
        // 去除表情 去除文档中的表情，如‘🐰’、‘👵’等
-        JSONObject a_6 = a.getJSONObject("a_6");
+        JSONObject a_6 = options.getJSONObject("a_6");
        if (a_6.getBool("is_on")) {
            result = DataProcessUtil.removeEmojis(result);
        }
-        //  TODO 过滤配置 去重配置 暂停处理
+        return result;
+    }

-        // 去隐私配置
-        JSONObject d = options.getJSONObject("d");
-        // 去除EMAIL地址
-        JSONObject d_1 = d.getJSONObject("d_1");
+    /**
+     * Processing stage 2: applies the filter settings (option group {@code "b"}) to every text.
+     *
+     * <p>Each non-blank text first goes through the word-count check; the text returned by that check
+     * is then tested by the remaining filters and kept only when none of them rejects it. Blank texts
+     * are left out of the result.
+     *
+     * @param texts   texts to filter, keyed by id
+     * @param options processing options; the {@code "b"} object is passed to the detail checks
+     * @return a new map with the texts that passed all filters
+     */
+    private static Map<Long, String> dataFilter (Map<Long, String> texts, JSONObject options) {
+        JSONObject filterOptions = options.getJSONObject("b");
+        Map<Long, String> keptTexts = new HashMap<>();
+
+        if (!com.baomidou.mybatisplus.core.toolkit.CollectionUtils.isNotEmpty(texts)) {
+            return keptTexts;
+        }
+
+        for (Map.Entry<Long, String> entry : texts.entrySet()) {
+            log.info("======================");
+            String text = entry.getValue();
+            if (org.apache.commons.lang3.StringUtils.isBlank(text)) {
+                continue;
+            }
+            // Word-count check; its output is what the other filters see.
+            String afterWordCheck = dataFilterDetailByWordNumber(filterOptions, text);
+            log.info("检测文档次数目后的文本：{}", afterWordCheck);
+            // Remaining filters decide whether the text is dropped.
+            if (dataFilterDetailByOther(filterOptions, afterWordCheck)) {
+                continue;
+            }
+            keptTexts.put(entry.getKey(), afterWordCheck);
+        }
+        return keptTexts;
+    }
+
+    /**
+     * Filter check {@code b_1}: word count of the text.
+     *
+     * <p>When {@code b_1.is_on} is set, the text and the bounds {@code num1} / {@code num2} are handed to
+     * {@code DataProcessUtil.filterWords} and the {@code toString()} form of its result is returned;
+     * otherwise the text is returned unchanged.
+     *
+     * @param options filter settings (the {@code "b"} option group)
+     * @param text    text to check
+     * @return the text after the word-count check
+     */
+    public static String dataFilterDetailByWordNumber (JSONObject options, String text) {
+        JSONObject b_1 = options.getJSONObject("b_1");
+        if (!b_1.getBool("is_on")) {
+            return text;
+        }
+
+        Integer lowerBound = b_1.getInt("num1");
+        Integer upperBound = b_1.getInt("num2");
+        return DataProcessUtil.filterWords(text, lowerBound, upperBound).toString();
+    }
+
+    /**
+     * Filter checks {@code b_2} to {@code b_5}: decides whether a text must be dropped.
+     *
+     * <p>The checks run in order and only when their {@code is_on} flag is set; the method returns
+     * {@code true} as soon as one of them reports that the text should be filtered.
+     *
+     * @param options filter settings (the {@code "b"} option group)
+     * @param text    text to check
+     * @return {@code true} when the text should be filtered out, {@code false} otherwise
+     */
+    public static boolean dataFilterDetailByOther (JSONObject options, String text) {
+        // Character repetition rate: too many repeated characters means the text is dropped. Range [0,1].
+        // NOTE(review): this threshold is read with getInt while b_3..b_5 use getDouble; for a value in
+        // [0,1] that likely truncates to 0 - confirm against calculateCharacterRepetitionRate's signature.
+        JSONObject b_2 = options.getJSONObject("b_2");
+        if (b_2.getBool("is_on")) {
+            int threshold = b_2.getInt("num1");
+            if (DataProcessUtil.calculateCharacterRepetitionRate(text, threshold)) {
+                return true;
+            }
+        }
+
+        // Word repetition rate: too many repeated words means the text is dropped. Range [0,1].
+        JSONObject b_3 = options.getJSONObject("b_3");
+        if (b_3.getBool("is_on")) {
+            double threshold = b_3.getDouble("num1");
+            if (DataProcessUtil.calculateWordRepetitionRate(text, threshold)) {
+                return true;
+            }
+        }
+
+        // Special character rate: too many special characters means the text is dropped. Range [0,1].
+        JSONObject b_4 = options.getJSONObject("b_4");
+        if (b_4.getBool("is_on")) {
+            double threshold = b_4.getDouble("num1");
+            if (DataProcessUtil.checkSpecialCharacterRate(text, threshold)) {
+                log.info("过滤特殊字符检测：{}", text);
+                return true;
+            }
+        }
+
+        // Pornographic/violent word rate: a rate that is too high means the text is dropped. Range [0,1].
+        JSONObject b_5 = options.getJSONObject("b_5");
+        if (b_5.getBool("is_on")) {
+            double threshold = b_5.getDouble("num1");
+            if (DataProcessUtil.checkSensitiveWordRate(text, threshold)) {
+                return true;
+            }
+        }
+
+        return false;
+    }
+
+    /**
+     * Processing stage 3: applies the de-duplication settings (option group {@code "c"}).
+     *
+     * <p>Every entry whose id is reported by {@code dataDistinctDetail} is left out of the result.
+     * The given map itself is not modified.
+     *
+     * @param texts   texts to de-duplicate, keyed by id
+     * @param options processing options; the {@code "c"} object is passed to {@code dataDistinctDetail}
+     * @return a new map with the remaining texts
+     */
+    private static Map<Long, String> dataDistinct (Map<Long, String> texts, JSONObject options) {
+        JSONObject optionsC = options.getJSONObject("c");
+        Map<Long, String> resultMap = new HashMap<>();
+        if (com.baomidou.mybatisplus.core.toolkit.CollectionUtils.isNotEmpty(texts)) {
+            // A HashSet keeps the per-entry lookup constant-time; the returned list would be scanned
+            // once per entry otherwise.
+            Set<Long> repeatedKeys = new HashSet<>(dataDistinctDetail(optionsC, texts));
+            // Copy the survivors instead of removing keys from the caller's map.
+            for (Map.Entry<Long, String> entry : texts.entrySet()) {
+                if (!repeatedKeys.contains(entry.getKey())) {
+                    resultMap.put(entry.getKey(), entry.getValue());
+                }
+            }
+        }
+        return resultMap;
+    }
+
+    /**
+     * De-duplication check {@code c_1}: similarity between texts.
+     *
+     * <p>When {@code c_1.is_on} is set, the texts and the threshold {@code num1} are handed to
+     * {@code DataProcessUtil.similarityDeduplication} and its result is returned; otherwise an empty
+     * list is returned.
+     *
+     * @param options de-duplication settings (the {@code "c"} option group)
+     * @param texts   texts to compare, keyed by id
+     * @return ids of the entries to remove
+     */
+    private static List<Long> dataDistinctDetail (JSONObject options, Map<Long, String> texts) {
+        JSONObject c_1 = options.getJSONObject("c_1");
+        if (!c_1.getBool("is_on")) {
+            return new ArrayList<>();
+        }
+
+        Double similarityThreshold = c_1.getDouble("num1");
+        return DataProcessUtil.similarityDeduplication(texts, similarityThreshold);
+    }
+
+    /**
+     * Processing stage 4: applies the anonymization settings (option group {@code "d"}) to every text.
+     *
+     * <p>Entries whose text is blank are left out of the result.
+     *
+     * @param texts   texts to anonymize, keyed by id
+     * @param options processing options; the {@code "d"} object is passed to {@code dataAnonymizationDetail}
+     * @return a new map with the anonymized texts
+     */
+    private static Map<Long, String> dataAnonymization (Map<Long, String> texts, JSONObject options) {
+        JSONObject anonymizationOptions = options.getJSONObject("d");
+        Map<Long, String> anonymizedTexts = new HashMap<>();
+
+        if (!com.baomidou.mybatisplus.core.toolkit.CollectionUtils.isNotEmpty(texts)) {
+            return anonymizedTexts;
+        }
+
+        for (Map.Entry<Long, String> entry : texts.entrySet()) {
+            String text = entry.getValue();
+            if (org.apache.commons.lang3.StringUtils.isBlank(text)) {
+                continue;
+            }
+            anonymizedTexts.put(entry.getKey(), dataAnonymizationDetail(anonymizationOptions, text));
+        }
+        return anonymizedTexts;
+    }
+
+    /**
+     * 去隐私配置处理
+     *
+     * @param options 去隐私配置
+     * @param text    要处理的文本
+     * @return 处理后的文本
+     */
+    public static String dataAnonymizationDetail (JSONObject options, String text) {
+
+        String result = text;
+        // 去除Email：去除Email地址。
+        JSONObject d_1 = options.getJSONObject("d_1");
        if (d_1.getBool("is_on")) {
            result = DataProcessUtil.processFile(result);
        }
-        // 去除IP地址 去除IPV4或者IPV6地址
-        JSONObject d_2 = d.getJSONObject("d_2");
+        // 去除IP地址：去除IPv4 或者 IPv6 地址。
+        JSONObject d_2 = options.getJSONObject("d_2");
        if (d_2.getBool("is_on")) {
            result = DataProcessUtil.removeIPAddresses(result);
        }
-        //去除数字 去除数字和字母数字标识符，如电话号码、信用卡号、
-        //十六进制散列等，同时跳过年份和简单数字的实例
-        JSONObject d_3 = d.getJSONObject("d_3");
+        // 去除数字：去除数字和字母数字标识符，如电话号码、信用卡号、十六进制散列等，同时跳过年份和简单数字的实例。
+        JSONObject d_3 = options.getJSONObject("d_3");
        if (d_3.getBool("is_on")) {
-//            result = DataProcessUtil.removeIdentifiers(result);
+            result = DataProcessUtil.removeIdentifiers(result);
        }
        return result;
-
    }
 }
--- a/yudao-module-llm/yudao-module-llm-biz/src/main/java/cn/iocoder/yudao/module/llm/service/dataset/DatasetQuestionService.java
+++ b/yudao-module-llm/yudao-module-llm-biz/src/main/java/cn/iocoder/yudao/module/llm/service/dataset/DatasetQuestionService.java
@ -1,14 +1,14 @@
 package cn.iocoder.yudao.module.llm.service.dataset;

-import java.util.*;
-import javax.validation.*;
 import cn.iocoder.yudao.framework.common.pojo.PageResult;
-import cn.iocoder.yudao.framework.common.pojo.PageParam;
 import cn.iocoder.yudao.module.llm.controller.admin.dataset.vo.DatasetQuestionPageReqVO;
 import cn.iocoder.yudao.module.llm.controller.admin.dataset.vo.DatasetQuestionRespVO;
 import cn.iocoder.yudao.module.llm.controller.admin.dataset.vo.DatasetQuestionSaveReqVO;
 import cn.iocoder.yudao.module.llm.dal.dataobject.dataset.DatasetQuestionDO;

+import javax.validation.Valid;
+import java.util.List;
+
 /**
 * 数据集数据问题 Service 接口
 *
@ -22,21 +22,21 @@ public interface DatasetQuestionService {
     * @param createReqVO 创建信息
     * @return 编号
     */
-    Long createDatasetQuestion(@Valid DatasetQuestionSaveReqVO createReqVO);
+    Long createDatasetQuestion (@Valid DatasetQuestionSaveReqVO createReqVO);

    /**
     * 更新数据集数据问题
     *
     * @param updateReqVO 更新信息
     */
-    void updateDatasetQuestion(@Valid DatasetQuestionSaveReqVO updateReqVO);
+    void updateDatasetQuestion (@Valid DatasetQuestionSaveReqVO updateReqVO);

    /**
     * 删除数据集数据问题
     *
     * @param id 编号
     */
-    void deleteDatasetQuestion(Long id);
+    void deleteDatasetQuestion (Long id);

    /**
     * 获得数据集数据问题
@ -44,7 +44,7 @@ public interface DatasetQuestionService {
     * @param id 编号
     * @return 数据集数据问题
     */
-    DatasetQuestionDO getDatasetQuestion(Long id);
+    DatasetQuestionDO getDatasetQuestion (Long id);

    /**
     * 获得数据集数据问题分页
@ -52,9 +52,15 @@ public interface DatasetQuestionService {
     * @param pageReqVO 分页查询
     * @return 数据集数据问题分页
     */
-    PageResult<DatasetQuestionRespVO> getDatasetQuestionPage(DatasetQuestionPageReqVO pageReqVO);
+    PageResult<DatasetQuestionRespVO> getDatasetQuestionPage (DatasetQuestionPageReqVO pageReqVO);

-    void updateDatasetQuestionDataAnno(List<DatasetQuestionSaveReqVO> updateReqVOS);
+    void updateDatasetQuestionDataAnno (List<DatasetQuestionSaveReqVO> updateReqVOS);

-    List<DatasetQuestionRespVO> getDatasetQuestionList(Long datasetId);
-}
+    /**
+     * 获得 数据集数据问题 列表
+     *
+     * @param datasetId 数据集ID
+     * @return 数据集数据问题 列表
+     */
+    List<DatasetQuestionRespVO> getDatasetQuestionList (Long datasetId);
+}
--- a/yudao-module-llm/yudao-module-llm-biz/src/main/java/cn/iocoder/yudao/module/llm/service/dataset/DatasetQuestionServiceImpl.java
+++ b/yudao-module-llm/yudao-module-llm-biz/src/main/java/cn/iocoder/yudao/module/llm/service/dataset/DatasetQuestionServiceImpl.java
@ -92,6 +92,13 @@ public class DatasetQuestionServiceImpl implements DatasetQuestionService {
        }
        return result;
    }
+
+    /**
+     * 获得 数据集数据问题 列表
+     *
+     * @param datasetId 数据集ID
+     * @return 数据集数据问题 列表
+     */
    @Override
    public List<DatasetQuestionRespVO> getDatasetQuestionList(Long datasetId) {
        List<DatasetQuestionDO> datasetQuestionDOS = datasetQuestionMapper.selectList(new LambdaQueryWrapper<>(DatasetQuestionDO.class)
--- a/yudao-module-llm/yudao-module-llm-biz/src/main/java/cn/iocoder/yudao/module/llm/service/http/TrainHttpService.java
+++ b/yudao-module-llm/yudao-module-llm-biz/src/main/java/cn/iocoder/yudao/module/llm/service/http/TrainHttpService.java
@ -11,18 +11,10 @@ import kong.unirest.HttpResponse;
 import kong.unirest.Unirest;
 import kong.unirest.UnirestException;
 import lombok.extern.slf4j.Slf4j;
-import lombok.val;
-import org.apache.http.HttpEntity;
-import org.apache.http.client.methods.CloseableHttpResponse;
-import org.apache.http.client.methods.HttpGet;
-import org.apache.http.impl.client.CloseableHttpClient;
-import org.apache.http.impl.client.HttpClients;
 import org.springframework.stereotype.Service;

 import javax.annotation.Resource;
 import java.io.*;
-import java.nio.charset.StandardCharsets;
-import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
--- a/yudao-module-llm/yudao-module-llm-biz/src/main/java/cn/iocoder/yudao/module/llm/utils/DataProcessUtil.java
+++ b/yudao-module-llm/yudao-module-llm-biz/src/main/java/cn/iocoder/yudao/module/llm/utils/DataProcessUtil.java
@ -4,11 +4,14 @@ import com.github.houbb.opencc4j.util.ZhConverterUtil;
 import com.github.houbb.sensitive.word.core.SensitiveWordHelper;
 import lombok.extern.slf4j.Slf4j;

+import java.io.UnsupportedEncodingException;
+import java.nio.charset.StandardCharsets;
 import java.time.Year;
 import java.time.format.DateTimeFormatter;
 import java.util.*;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
+import java.util.stream.Collectors;

@Slf4j
 public class DataProcessUtil {
@ -196,6 +199,7 @@ public class DataProcessUtil {
     * <p>
     * 如果字重复率太高，意味着文档中重复的字太多，文档会被过滤掉
     * </p>
+     *
     * @param content   文档行
     * @param threshold 设置字重复率的阈值，例如10%
     * @return true表示字重复率低于阈值，false表示字重复率高于阈值，文档会被过滤掉
@ -226,8 +230,8 @@ public class DataProcessUtil {
        double repetitionRate = (double) repeatedChars / totalChars;

        // 打印重复率和阈值，方便调试
-        System.out.println("字重复率: " + repetitionRate);
-        System.out.println("阈值: " + threshold);
+        log.info("字重复率: " + repetitionRate);
+        log.info("阈值: " + threshold);

        // 如果重复率超过阈值，返回true表示需要过滤掉文档
        return repetitionRate > threshold;
@ -281,8 +285,8 @@ public class DataProcessUtil {
        double repetitionRate = (double) repeatedWords / totalWords;

        // 打印重复率和阈值，方便调试
-        System.out.println("词重复率: " + repetitionRate);
-        System.out.println("阈值: " + threshold);
+        log.info("词重复率: " + repetitionRate);
+        log.info("阈值: " + threshold);

        // 如果重复率超过阈值，返回true表示需要过滤掉文档
        return repetitionRate > threshold;
@ -298,6 +302,7 @@ public class DataProcessUtil {
     */
    public static boolean checkSpecialCharacterRate (String content, double threshold) {

+        log.info("特殊字符检测：{}", content);
        // 使用正则表达式匹配特殊字符（非字母数字字符）
        Pattern pattern = Pattern.compile("[^a-zA-Z0-9]");
        Matcher matcher = pattern.matcher(content);
@ -315,8 +320,8 @@ public class DataProcessUtil {
        double specialCharRate = (double) specialCharCount / totalCharCount;

        // 打印特殊字符率和阈值，方便调试
-        System.out.println("特殊字符率: " + specialCharRate);
-        System.out.println("阈值: " + threshold);
+        log.info("特殊字符率: " + specialCharRate);
+        log.info("阈值: " + threshold);

        // 如果特殊字符率超过阈值，返回true表示需要过滤掉文档
        return specialCharRate > threshold;
@ -373,21 +378,50 @@ public class DataProcessUtil {
    /**
     * 相似度去重配置
     *
-     * @param contents   文本内容列表
-     * @param threshold   相似度阈值
+     * @param contentMap 文本内容列表
+     * @param threshold  相似度阈值
     * @return 是否需要去重
     */
-    public static List<Integer> similarityDeduplication (List<String> contents, double threshold) {
+    public static List<Long> similarityDeduplication (Map<Long, String> contentMap, double threshold) {
        long l3 = System.currentTimeMillis();
-        List<String> simHashes = new ArrayList<>();
+        // 先将 Map 转换为有顺序的 Map，使用 LinkedHashMap 保持插入顺序
+        LinkedHashMap<Long, String> sortedContentMap = new LinkedHashMap<>();
+        // 将原始内容映射的条目按照键排序，并添加到 sortedContentMap 中
+        contentMap.entrySet().stream()
+                .sorted(Map.Entry.comparingByKey())
+                .forEachOrdered(x -> sortedContentMap.put(x.getKey(), x.getValue()));
+
+        sortedContentMap.forEach((key, value)->{
+            log.info("key:{},value:{}", key, value);
+        });
+        // 有顺序的转换成文本内容，使用 LinkedList 存储文本内容
+        LinkedList<String> contents = sortedContentMap.entrySet().stream()
+                .sorted(Comparator.comparing(Map.Entry::getKey))
+                .map(Map.Entry::getValue)
+                .collect(Collectors.toCollection(LinkedList::new));
+
+        contents.forEach(v->{
+            log.info("value:{}", v);
+        });
+
+        // 存储每个文本的 simHash
+        LinkedList<String> simHashes = new LinkedList<>();
+        // 计算每个文本的 simHash 并添加到 simHashes 中
        for (String content : contents) {
            simHashes.add(HammingUtils.getSimHash(content));
        }
+
+
        // 存储相似元素的索引
-        List<Integer> similarityIndex = new ArrayList<>();
+        List<Long> similarityIndex = new ArrayList<>();
+        // 存储相似元素的键
+        List<Long> similarityKey = new ArrayList<>();
+
+
+        // 计算相似性
        for (int i = 0; i < simHashes.size(); i++) {
            // 如果当前元素已经标记为相似，则跳过
-            if (similarityIndex.contains(i)) {
+            if (similarityIndex.contains((long) i)) {
                continue;
            }
            for (int j = i + 1; j < simHashes.size(); j++) {
@ -395,27 +429,34 @@ public class DataProcessUtil {
                String hash2 = simHashes.get(j);
                // 从 1 开始计数，所以 i 和 j 都加 1
                double similarity = HammingUtils.getSimilarity(hash1, hash2);
-                log.info("第 " + (i + 1) + " 个元素 " + " 和第 " + (j + 1) + " 个元素 " + " 的文本相似度是：" + similarity);
+                // 打印相似性信息
+                log.info("第 {} 个元素  和第 {} 个元素  的文本相似度是：{}", i + 1, j + 1, similarity);

+
+                // 如果相似度超过阈值，标记为相似
                if (similarity > threshold) {
-                    log.info("相似度大于 {} 的文本：{} 和 {}", threshold,hash1, hash2);
-                    // 移除相似的文本
-                    similarityIndex.add(j);
+                    // 移除相似的文本的索引
+                    similarityIndex.add((long) j);
+
+                    // 根据索引位置获取map对应位置的key
+                    similarityKey.add(sortedContentMap.keySet().toArray(new Long[0])[j]);
                }
            }
        }
-        log.info("相似索引列表：" + similarityIndex);
+        log.info("相似索引列表：{}", similarityIndex);
+        log.info("相似Key列表：{}", similarityKey);

        long l4 = System.currentTimeMillis();
        long diff = l4 - l3;
        long minutes = diff / (60 * 1000);
-        long seconds = (diff % (60 * 1000)) / 1000; 
-        long milliseconds = diff % 1000; 
+        long seconds = (diff % (60 * 1000)) / 1000;
+        long milliseconds = diff % 1000;

        log.info("总耗时: " + minutes + " 分 " + seconds + " 秒 " + milliseconds + " 毫秒");
        log.info("======================================");

-        return similarityIndex;
+        return similarityKey;
+
    }

    /*
@ -450,7 +491,7 @@ public class DataProcessUtil {
        String modifiedContent = removeEmails(content);

        // 或者打印到控制台以查看结果
-        System.out.println(modifiedContent);
+        log.info(modifiedContent);
        return modifiedContent;
    }

@ -599,12 +640,42 @@ public class DataProcessUtil {
        // 去除标识符
        String textWithoutIdentifiers = removeIdentifiers(textWithIdentifiers);
        // 打印结果
-        System.out.println(textWithoutIdentifiers);
+        log.info(textWithoutIdentifiers);

        String traditionalText = "不經意，妳的笑容";
        String simplifiedText = traditionalToSimplified(traditionalText);

-        System.out.println("繁体文本: [" + traditionalText + "]");
-        System.out.println("简体文本: [" + simplifiedText + "]");
+        log.info("繁体文本: [" + traditionalText + "]");
+        log.info("简体文本: [" + simplifiedText + "]");
+String dirtyString="?<3F><>简体文<E4BD93><E69687><EFBFBD>f?<3F>G<EFBFBD><47>?<3F><>??<3F>G<EFBFBD>G<EFBFBD><47>پ?<3F>l?,,,杩欐槸涓€涓\\uE043贡鐮";
+        // 先进行编码转换
+        dirtyString = convertEncoding(dirtyString);
+        // 再进行乱码和无意义 Unicode 字符的清理
+        String cleanString = clean(dirtyString);
+//        String s1 = removeNonPrintableUnicodeChars(s);
+        log.info("去除乱码:[{}]", cleanString);
+    }
+    public static String clean(String input) { // Returns input with control, replacement and private-use characters removed; input must be non-null.
+        // Drop U+0000-U+001F (this includes tab, CR and LF), U+007F-U+009F and the replacement character U+FFFD.
+        String cleanString = input.replaceAll("[\\x00-\\x1F\\x7F-\\x9F\\uFFFD]", "");
+        // Drop the private-use range U+E000-U+F8FF; adjust this range to the data being cleaned.
+        cleanString = cleanString.replaceAll("[\\uE000-\\uF8FF]", "");
+        return cleanString;
+    }
+
+    public static String convertEncoding(String input) { // Encodes input with a candidate charset, then decodes those bytes as UTF-8.
+        // Candidate charsets, tried in the order listed; the first attempt that does not throw is returned.
+        String[] encodings = {"UTF-8", "GBK", "Big5", "ISO-8859-1"};
+        for (String encoding : encodings) { // NOTE(review): "UTF-8" is supported by every JVM, so iteration 1 always returns and later entries are never tried
+            try {
+                byte[] bytes = input.getBytes(encoding); // throws only for an unsupported charset name (or NPE when input is null)
+                String result = new String(bytes, StandardCharsets.UTF_8); // never throws: malformed bytes are replaced with U+FFFD
+                return result;
+            } catch (Exception e) {
+                // Attempt failed for this charset; move on to the next candidate (the exception is not logged).
+                continue;
+            }
+        }
+        return input; // reached only when every attempt threw, e.g. a null input is returned as null
     }
 }
--- a/yudao-module-llm/yudao-module-llm-biz/src/main/java/cn/iocoder/yudao/module/llm/utils/HammingUtils.java
+++ b/yudao-module-llm/yudao-module-llm-biz/src/main/java/cn/iocoder/yudao/module/llm/utils/HammingUtils.java
@ -6,6 +6,7 @@ import lombok.extern.slf4j.Slf4j;
 import java.math.BigInteger;
 import java.nio.charset.StandardCharsets;
 import java.security.MessageDigest;
+import java.util.ArrayList;
 import java.util.List;

 /**
@ -30,32 +31,35 @@ public class HammingUtils {
     * @param str 传入的String类型字符串
     * @return 返回str的simHash值
     */
-    public static String getSimHash (String str) {
-
+    public static String getSimHash(String str) {
        // 用数组表示特征向量,取128位,从 0 1 2 位开始表示从高位到低位
        int[] v = new int[128];
-        // 1、分词（使用了外部依赖hankcs包提供的接口）
-        //取出所有关键词
-        List<String> keywordList = HanLP.extractKeyword(str, str.length());
+        // 1、分词（使用了外部依赖 hankcs 包提供的接口）
+        List<String> keywordList;
+        if (str.length() < 200) {
+            // 对于短文本，采取不同的处理方式，例如使用更简单的分词或添加额外的处理逻辑
+            keywordList = handleShortText(str);
+        } else {
+            keywordList = HanLP.extractKeyword(str, str.length());// 取出所有关键词
+        }
        // hash
        int size = keywordList.size();
-        //以i做外层循环
-        int i = 0;
+        int i = 0; // 以 i 做外层循环
        for (String keyword : keywordList) {
-            // 2、获取hash值
-            StringBuilder keywordHash = new StringBuilder(getHash(keyword));
+            // 2、获取 hash 值
+            String keywordHash = getHash(keyword);
            if (keywordHash.length() < 128) {
-                // hash值可能少于128位，在低位以0补齐
+                // hash 值可能少于 128 位，在低位以 0 补齐
                int dif = 128 - keywordHash.length();
                for (int j = 0; j < dif; j++) {
-                    keywordHash.append("0");
+                    keywordHash += "0";
                }
            }
            // 3、加权、合并
            for (int j = 0; j < v.length; j++) {
-                // 对keywordHash的每一位与'1'进行比较
+                // 对 keywordHash 的每一位与 '1' 进行比较
                if (keywordHash.charAt(j) == '1') {
-                    //权重分10级，由词频从高到低，取权重10~0
+                    // 权重分 10 级，由词频从高到低，取权重 10~0
                    v[j] += (10 - (i / (size / 10)));
                } else {
                    v[j] -= (10 - (i / (size / 10)));
@ -64,19 +68,28 @@ public class HammingUtils {
            i++;
        }
        // 4、降维
-        // 储存返回的simHash值
-        StringBuilder simHash = new StringBuilder();
-        for (int k : v) {
+        String simHash = ""; // 储存返回的 simHash 值
+        for (int j = 0; j < v.length; j++) {
            // 从高位遍历到低位
-            if (k <= 0) {
-                simHash.append("0");
+            if (v[j] <= 0) {
+                simHash += "0";
            } else {
-                simHash.append("1");
+                simHash += "1";
            }
        }
-        return simHash.toString();
+        return simHash;
    }

+    private static List<String> handleShortText(String str) { // Keyword list used by getSimHash when str.length() < 200.
+        // Short-text strategy: each char of the input becomes its own keyword, in input order (empty input gives an empty list).
+        // A simpler tokenizer or extra preprocessing could be substituted here; this is only a basic example.
+        // NOTE(review): splitting is per UTF-16 char, so a supplementary code point yields two surrogate keywords.
+        List<String> result = new ArrayList<>();
+        for (char c : str.toCharArray()) {
+            result.add(String.valueOf(c));
+        }
+        return result; // NOTE(review): fewer than 10 entries makes (size / 10) zero in getSimHash's weight formula - confirm short inputs are handled
+    }

    /** 输入两个 simHash 值，计算它们的海明距离
     *