From 7d48924be653196fce3ad58766eafe4519d14a8f Mon Sep 17 00:00:00 2001
From: Liuyang <2746366019@qq.com>
Date: Mon, 7 Jul 2025 15:15:57 +0800
Subject: [PATCH] =?UTF-8?q?refactor(llm):=20=E4=BC=98=E5=8C=96=E6=96=87?=
 =?UTF-8?q?=E6=9C=AC=E8=BF=87=E6=BB=A4=E5=8A=9F=E8=83=BD=E5=92=8C=E6=97=A5?=
 =?UTF-8?q?=E5=BF=97=E8=BE=93=E5=87=BA?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- 修复字重复率计算逻辑：去除空白和标点后统计，少于 5 个字符的短文本不检查，并按百分比（0～100）与阈值比较
- 增加对词重复率、特殊字符率等过滤条件的日志输出
- 优化特殊字符率计算方法：先去除空白字符，汉字不再计为特殊字符，并按百分比（0～100）与阈值比较
- 调整日志输出格式，提高可读性
---
 .../async/AsyncDataProcessService.java        |   7 +-
 .../module/llm/utils/DataProcessUtil.java     | 126 ++++++++++++------
 2 files changed, 91 insertions(+), 42 deletions(-)

diff --git a/yudao-module-llm/yudao-module-llm-biz/src/main/java/cn/iocoder/yudao/module/llm/service/async/AsyncDataProcessService.java b/yudao-module-llm/yudao-module-llm-biz/src/main/java/cn/iocoder/yudao/module/llm/service/async/AsyncDataProcessService.java
index 9a2808e41..6ed8a3e71 100644
--- a/yudao-module-llm/yudao-module-llm-biz/src/main/java/cn/iocoder/yudao/module/llm/service/async/AsyncDataProcessService.java
+++ b/yudao-module-llm/yudao-module-llm-biz/src/main/java/cn/iocoder/yudao/module/llm/service/async/AsyncDataProcessService.java
@@ -335,7 +335,7 @@ public class AsyncDataProcessService {
                     // 检查文档的词数目
                     String cleanedValue = dataFilterDetailByWordNumber(optionsB, value);
                     // 使用检测文档次数目后的文本继续进行检测
-                    log.info("检测文档次数目后的文本：{}", cleanedValue);
+                    log.info("检测文档词数目后的文本：{}", cleanedValue);
                     // 其他过滤配置
                     boolean shouldFilter = dataFilterDetailByOther(optionsB, cleanedValue);
                     if (!shouldFilter) {
@@ -377,6 +377,7 @@ public class AsyncDataProcessService {
     public static boolean dataFilterDetailByOther (JSONObject options, String text) {
         // 检查文档的字重复率：如果字重复率太高，意味着文档中重复的字太多，文档会被过滤掉，取值范围[0,1]。
         JSONObject b_2 = options.getJSONObject("b_2");
+        log.info("检查文档的字重复率：{}", b_2);
         if (b_2.getBool("is_on")) {
             int threshold = b_2.getInt("num1");
             if (DataProcessUtil.calculateCharacterRepetitionRate(text, threshold)) {
@@ -386,6 +387,7 @@ public class AsyncDataProcessService {
 
         // 检查文档的词重复率：如果词重复率太高，意味着文档中重复的词太多，文档会被过滤掉，取值范围[0,1]。
         JSONObject b_3 = options.getJSONObject("b_3");
+        log.info("检查文档的词重复率：{}", b_3);
         if (b_3.getBool("is_on")) {
             double threshold = b_3.getDouble("num1");
             if (DataProcessUtil.calculateWordRepetitionRate(text, threshold)) {
@@ -395,6 +397,7 @@ public class AsyncDataProcessService {
 
         // 检查文档的特殊字符率：如果特殊字符率太高，意味着文档中特殊字符太多，文档会被过滤掉，取值范围[0,1]。
         JSONObject b_4 = options.getJSONObject("b_4");
+        log.info("检查文档的特殊字符率：{}", b_4);
         if (b_4.getBool("is_on")) {
             double threshold = b_4.getDouble("num1");
             if (DataProcessUtil.checkSpecialCharacterRate(text, threshold)) {
@@ -405,9 +408,11 @@ public class AsyncDataProcessService {
 
         // 检查文档的色情暴力词率：如果色情暴力词率太高，文档会被过滤掉，取值范围[0,1]。
         JSONObject b_5 = options.getJSONObject("b_5");
+        log.info("检查文档的色情暴力词率：{}", b_5);
         if (b_5.getBool("is_on")) {
             double threshold = b_5.getDouble("num1");
             if (DataProcessUtil.checkSensitiveWordRate(text, threshold)) {
+
                 return true;
             }
         }
diff --git a/yudao-module-llm/yudao-module-llm-biz/src/main/java/cn/iocoder/yudao/module/llm/utils/DataProcessUtil.java b/yudao-module-llm/yudao-module-llm-biz/src/main/java/cn/iocoder/yudao/module/llm/utils/DataProcessUtil.java
index fc1cca498..269f83314 100644
--- a/yudao-module-llm/yudao-module-llm-biz/src/main/java/cn/iocoder/yudao/module/llm/utils/DataProcessUtil.java
+++ b/yudao-module-llm/yudao-module-llm-biz/src/main/java/cn/iocoder/yudao/module/llm/utils/DataProcessUtil.java
@@ -5,6 +5,7 @@ import com.github.houbb.sensitive.word.core.SensitiveWordHelper;
 import lombok.extern.slf4j.Slf4j;
 
 import java.nio.charset.StandardCharsets;
+import java.text.DecimalFormat;
 import java.time.Year;
 import java.time.format.DateTimeFormatter;
 import java.util.*;
@@ -203,37 +204,53 @@ public class DataProcessUtil {
      * @param threshold 设置字重复率的阈值，例如10%
      * @return true表示字重复率低于阈值，false表示字重复率高于阈值，文档会被过滤掉
      */
-    public static boolean calculateCharacterRepetitionRate (String content, double threshold) {
-        //        StringBuilder content = new StringBuilder();
-        //        for (String line : lines) {
-        //            content.append(line);
-        //        }
-
-        // 统计字符出现次数
-        Map<Character, Integer> charCount = new HashMap<>();
-        for (char c : content.toCharArray()) {
-            charCount.put(c, charCount.getOrDefault(c, 0) + 1);
+    public static boolean calculateCharacterRepetitionRate(String content, double threshold) {
+        // 输入校验
+        if (content == null || content.trim().isEmpty()) {
+            return false;
         }
 
-        // 计算重复字符数和总字符数
-        int totalChars = content.length();
-        int repeatedChars = 0;
-        for (int count : charCount.values()) {
-            if (count > 1) {
-                // 只计算重复的部分
-                repeatedChars += count - 1;
+        // 预处理（去空格、标点等）
+        String processedContent = content
+                .replaceAll("\\s+", "")
+                .replaceAll("[\\pP\\pS]", "");
+
+        // 短文本不检查
+        if (processedContent.length() < 5) {
+            return false;
+        }
+
+        // 统计字符频率
+        Map<Character, Integer> charCount = new HashMap<>();
+        char[] chars = processedContent.toCharArray();
+        for (char c : chars) {
+            if (isChineseCharacter(c)) {  // 可选：仅统计中文
+                charCount.put(c, charCount.getOrDefault(c, 0) + 1);
             }
         }
 
-        // 计算字重复率
-        double repetitionRate = (double) repeatedChars / totalChars;
+        // 计算重复率（方式1：传统重复率）
+        int totalChars = chars.length;
+        double repetitionRate = (double) (totalChars - charCount.size()) / totalChars;
 
-        // 打印重复率和阈值，方便调试
-        log.info("字重复率: " + repetitionRate);
-        log.info("阈值: " + threshold);
+        // 将重复率转换为百分比（0～100），以便与阈值直接比较
+        double repetitionPercent = repetitionRate * 100;
 
-        // 如果重复率超过阈值，返回true表示需要过滤掉文档
-        return repetitionRate > threshold;
+        // 调试日志（输出百分比）
+        log.info("总字数: {}", totalChars);
+        log.info("重复字数: {}", totalChars - charCount.size());
+        log.info("字重复率: {}%", String.format("%.2f", repetitionPercent));
+
+        // 比较前可添加浮点数容差（可选）
+        final double EPSILON = 0.0001;
+        return repetitionPercent - threshold > EPSILON;
+    }
+
+
+    // 判断是否为中文字符（可选）
+    private static boolean isChineseCharacter(char c) {
+        Character.UnicodeScript sc = Character.UnicodeScript.of(c);
+        return sc == Character.UnicodeScript.HAN;
     }
 
     // 简单的基于空格和标点符号的分词方法
@@ -299,31 +316,58 @@ public class DataProcessUtil {
      * @param threshold
      * @return
      */
-    public static boolean checkSpecialCharacterRate (String content, double threshold) {
+    /**
+     * 检测文本中特殊字符率是否超过阈值（阈值范围0～100.00）
+     * @param content 待检测文本
+     * @param threshold 百分比阈值（如传入10表示10%）
+     * @return 超过阈值返回true
+     */
+    public static boolean checkSpecialCharacterRate(String content, double threshold) {
+        // 参数校验
+        if (content == null || content.isEmpty()) {
+            log.warn("输入内容为空");
+            return false;
+        }
+        if (threshold < 0 || threshold > 100) {
+            throw new IllegalArgumentException("阈值必须是0～100之间的数值");
+        }
 
-        log.info("特殊字符检测：{}", content);
-        // 使用正则表达式匹配特殊字符（非字母数字字符）
-        Pattern pattern = Pattern.compile("[^a-zA-Z0-9]");
-        Matcher matcher = pattern.matcher(content);
+        // 预处理：去除所有空白字符（可选）
+        String processedContent = content.replaceAll("\\s+", "");
+        int totalCharCount = processedContent.length();
+
+        // 空文本或纯空白内容处理
+        if (totalCharCount == 0) {
+            log.info("有效字符数为0");
+            return false;
+        }
+
+        // 统计特殊字符（非字母、数字、汉字）
+        // 正则说明：
+        // [^a-zA-Z0-9\\p{Script=Han}] → 排除字母数字和汉字
+        // 如需包含其他语言字符，需调整正则
+        Pattern pattern = Pattern.compile("[^a-zA-Z0-9\\p{Script=Han}]");
+        Matcher matcher = pattern.matcher(processedContent);
 
-        // 统计特殊字符的数量
         int specialCharCount = 0;
         while (matcher.find()) {
             specialCharCount++;
         }
 
-        // 计算文档的总字符数（不包括换行符等空白字符，可以根据需要调整）
-        int totalCharCount = content.length(); // 或者使用 content.replaceAll("\\s+", "").length() 来排除空白字符
+        // 计算特殊字符率（转换为百分比）
+        double specialCharRatePercent = (double) specialCharCount / totalCharCount * 100;
 
-        // 计算特殊字符率
-        double specialCharRate = (double) specialCharCount / totalCharCount;
+        // 调试日志（保留2位小数）
+        DecimalFormat df = new DecimalFormat("0.00");
+        log.info("特殊字符检测结果: {}/{}={}% (阈值: {}%)",
+                specialCharCount,
+                totalCharCount,
+                df.format(specialCharRatePercent),
+                df.format(threshold));
 
-        // 打印特殊字符率和阈值，方便调试
-        log.info("特殊字符率: " + specialCharRate);
-        log.info("阈值: " + threshold);
-
-        // 如果特殊字符率超过阈值，返回true表示需要过滤掉文档
-        return specialCharRate > threshold;
+        // 浮点数精确比较（添加1e-6容差）
+        final double EPSILON = 1e-6;
+        return specialCharRatePercent - threshold > EPSILON;
     }
 
     /**
@@ -490,7 +534,7 @@ public class DataProcessUtil {
         String modifiedContent = removeEmails(content);
 
         // 或者打印到控制台以查看结果
-        log.info(modifiedContent);
+        log.info("去除电子邮件地址:{}", modifiedContent);
         return modifiedContent;
     }