refactor(llm): 优化文本过滤功能和日志输出

- 修复字重复率计算逻辑，提高准确性
- 增加对词重复率、特殊字符率等过滤条件的日志输出
- 优化特殊字符率计算方法：先去除空白字符，且不再将汉字计为特殊字符
- 调整日志输出格式，提高可读性
2025-07-07 15:15:57 +08:00 · 2025-07-07 15:15:57 +08:00 · 7d48924be6
commit 7d48924be6
parent 2a3bc9c0a9
2 changed files with 91 additions and 42 deletions
--- a/yudao-module-llm/yudao-module-llm-biz/src/main/java/cn/iocoder/yudao/module/llm/service/async/AsyncDataProcessService.java
+++ b/yudao-module-llm/yudao-module-llm-biz/src/main/java/cn/iocoder/yudao/module/llm/service/async/AsyncDataProcessService.java
@ -335,7 +335,7 @@ public class AsyncDataProcessService {
                    // 检查文档的词数目
                    String cleanedValue = dataFilterDetailByWordNumber(optionsB, value);
                    // 使用检测文档词数目后的文本继续进行检测
-                    log.info("检测文档次数目后的文本：{}", cleanedValue);
+                    log.info("检测文档词数目后的文本：{}", cleanedValue);
                    // 其他过滤配置
                    boolean shouldFilter = dataFilterDetailByOther(optionsB, cleanedValue);
                    if (!shouldFilter) {
@ -377,6 +377,7 @@ public class AsyncDataProcessService {
    public static boolean dataFilterDetailByOther (JSONObject options, String text) {
        // 检查文档的字重复率：如果字重复率太高，意味着文档中重复的字太多，文档会被过滤掉。阈值按百分比比较，取值范围[0,100]。
        JSONObject b_2 = options.getJSONObject("b_2");
+        log.info("检查文档的字重复率：{}", b_2);
        if (b_2.getBool("is_on")) {
            int threshold = b_2.getInt("num1");
            if (DataProcessUtil.calculateCharacterRepetitionRate(text, threshold)) {
@ -386,6 +387,7 @@ public class AsyncDataProcessService {

        // 检查文档的词重复率：如果词重复率太高，意味着文档中重复的词太多，文档会被过滤掉，取值范围[0,1]。
        JSONObject b_3 = options.getJSONObject("b_3");
+        log.info("检查文档的词重复率：{}", b_3);
        if (b_3.getBool("is_on")) {
            double threshold = b_3.getDouble("num1");
            if (DataProcessUtil.calculateWordRepetitionRate(text, threshold)) {
@ -395,6 +397,7 @@ public class AsyncDataProcessService {

        // 检查文档的特殊字符率：如果特殊字符率太高，意味着文档中特殊字符太多，文档会被过滤掉。阈值为百分比，取值范围[0,100]。
        JSONObject b_4 = options.getJSONObject("b_4");
+        log.info("检查文档的特殊字符率：{}", b_4);
        if (b_4.getBool("is_on")) {
            double threshold = b_4.getDouble("num1");
            if (DataProcessUtil.checkSpecialCharacterRate(text, threshold)) {
@ -405,9 +408,11 @@ public class AsyncDataProcessService {

        // 检查文档的色情暴力词率：如果色情暴力词率太高，文档会被过滤掉，取值范围[0,1]。
        JSONObject b_5 = options.getJSONObject("b_5");
+        log.info("检查文档的色情暴力词率：{}", b_5);
        if (b_5.getBool("is_on")) {
            double threshold = b_5.getDouble("num1");
            if (DataProcessUtil.checkSensitiveWordRate(text, threshold)) {
+
                return true;
            }
        }
--- a/yudao-module-llm/yudao-module-llm-biz/src/main/java/cn/iocoder/yudao/module/llm/utils/DataProcessUtil.java
+++ b/yudao-module-llm/yudao-module-llm-biz/src/main/java/cn/iocoder/yudao/module/llm/utils/DataProcessUtil.java
@ -5,6 +5,7 @@ import com.github.houbb.sensitive.word.core.SensitiveWordHelper;
 import lombok.extern.slf4j.Slf4j;

 import java.nio.charset.StandardCharsets;
+import java.text.DecimalFormat;
 import java.time.Year;
 import java.time.format.DateTimeFormatter;
 import java.util.*;
@ -203,37 +204,53 @@ public class DataProcessUtil {
     * @param threshold 设置字重复率的阈值，例如10%
     * @return true表示字重复率高于阈值，文档会被过滤掉；false表示字重复率未超过阈值
     */
-    public static boolean calculateCharacterRepetitionRate (String content, double threshold) {
-        //        StringBuilder content = new StringBuilder();
-        //        for (String line : lines) {
-        //            content.append(line);
-        //        }
-
-        // 统计字符出现次数
-        Map<Character, Integer> charCount = new HashMap<>();
-        for (char c : content.toCharArray()) {
-            charCount.put(c, charCount.getOrDefault(c, 0) + 1);
+    public static boolean calculateCharacterRepetitionRate(String content, double threshold) {
+        // 输入校验
+        if (content == null || content.trim().isEmpty()) {
+            return false;
        }

-        // 计算重复字符数和总字符数
-        int totalChars = content.length();
-        int repeatedChars = 0;
-        for (int count : charCount.values()) {
-            if (count > 1) {
-                // 只计算重复的部分
-                repeatedChars += count - 1;
+        // 预处理（去空格、标点等）
+        String processedContent = content
+                .replaceAll("\\s+", "")
+                .replaceAll("[\\pP\\pS]", "");
+
+        // 短文本不检查
+        if (processedContent.length() < 5) {
+            return false;
+        }
+
+        // 统计字符频率
+        Map<Character, Integer> charCount = new HashMap<>();
+        char[] chars = processedContent.toCharArray();
+        for (char c : chars) {
+            if (isChineseCharacter(c)) {  // 可选：仅统计中文
+                charCount.put(c, charCount.getOrDefault(c, 0) + 1);
            }
        }

-        // 计算字重复率
-        double repetitionRate = (double) repeatedChars / totalChars;
+        // 计算重复率（方式1：传统重复率）
+        int totalChars = chars.length;
+        double repetitionRate = (double) (totalChars - charCount.size()) / totalChars;

-        // 打印重复率和阈值，方便调试
-        log.info("字重复率: " + repetitionRate);
-        log.info("阈值: " + threshold);
+        // 将重复率转换为百分比（0～100），以便与阈值直接比较
+        double repetitionPercent = repetitionRate * 100;

-        // 如果重复率超过阈值，返回true表示需要过滤掉文档
-        return repetitionRate > threshold;
+        // 调试日志（输出百分比）
+        log.info("总字数: {}", totalChars);
+        log.info("重复字数: {}", totalChars - charCount.size());
+        log.info("字重复率: {}%", String.format("%.2f", repetitionPercent));
+
+        // 比较前可添加浮点数容差（可选）
+        final double EPSILON = 0.0001;
+        return repetitionPercent - threshold > EPSILON;
+    }
+
+
+    // 判断是否为中文字符（可选）
+    private static boolean isChineseCharacter(char c) {
+        Character.UnicodeScript sc = Character.UnicodeScript.of(c);
+        return sc == Character.UnicodeScript.HAN;
    }

    // 简单的基于空格和标点符号的分词方法
@ -299,31 +316,58 @@ public class DataProcessUtil {
     * @param threshold
     * @return
     */
-    public static boolean checkSpecialCharacterRate (String content, double threshold) {
+    /**
+     * 检测文本中特殊字符率是否超过阈值（阈值范围0～100.00）
+     * @param content 待检测文本
+     * @param threshold 百分比阈值（如传入10表示10%）
+     * @return 超过阈值返回true
+     */
+    public static boolean checkSpecialCharacterRate(String content, double threshold) {
+        // 参数校验
+        if (content == null || content.isEmpty()) {
+            log.warn("输入内容为空");
+            return false;
+        }
+        if (threshold < 0 || threshold > 100) {
+            throw new IllegalArgumentException("阈值必须是0～100之间的数值");
+        }

-        log.info("特殊字符检测：{}", content);
-        // 使用正则表达式匹配特殊字符（非字母数字字符）
-        Pattern pattern = Pattern.compile("[^a-zA-Z0-9]");
-        Matcher matcher = pattern.matcher(content);
+        // 预处理：去除所有空白字符（可选）
+        String processedContent = content.replaceAll("\\s+", "");
+        int totalCharCount = processedContent.length();
+
+        // 空文本或纯空白内容处理
+        if (totalCharCount == 0) {
+            log.info("有效字符数为0");
+            return false;
+        }
+
+        // 统计特殊字符（非字母、数字、汉字）
+        // 正则说明：
+        // [^a-zA-Z0-9\\p{Script=Han}] → 排除字母数字和汉字
+        // 如需包含其他语言字符，需调整正则
+        Pattern pattern = Pattern.compile("[^a-zA-Z0-9\\p{Script=Han}]");
+        Matcher matcher = pattern.matcher(processedContent);

-        // 统计特殊字符的数量
        int specialCharCount = 0;
        while (matcher.find()) {
            specialCharCount++;
        }

-        // 计算文档的总字符数（不包括换行符等空白字符，可以根据需要调整）
-        int totalCharCount = content.length(); // 或者使用 content.replaceAll("\\s+", "").length() 来排除空白字符
+        // 计算特殊字符率（转换为百分比）
+        double specialCharRatePercent = (double) specialCharCount / totalCharCount * 100;

-        // 计算特殊字符率
-        double specialCharRate = (double) specialCharCount / totalCharCount;
+        // 调试日志（保留2位小数）
+        DecimalFormat df = new DecimalFormat("0.00");
+        log.info("特殊字符检测结果: {}/{}={}% (阈值: {}%)",
+                specialCharCount,
+                totalCharCount,
+                df.format(specialCharRatePercent),
+                df.format(threshold));

-        // 打印特殊字符率和阈值，方便调试
-        log.info("特殊字符率: " + specialCharRate);
-        log.info("阈值: " + threshold);
-
-        // 如果特殊字符率超过阈值，返回true表示需要过滤掉文档
-        return specialCharRate > threshold;
+        // 浮点数精确比较（添加1e-6容差）
+        final double EPSILON = 1e-6;
+        return specialCharRatePercent - threshold > EPSILON;
    }

    /**
@ -490,7 +534,7 @@ public class DataProcessUtil {
        String modifiedContent = removeEmails(content);

        // 或者打印到控制台以查看结果
-        log.info(modifiedContent);
+        log.info("去除电子邮件地址:{}", modifiedContent);
        return modifiedContent;
    }