From aa05b784e2ae8f8ce98bc5093566349b53ee94c2 Mon Sep 17 00:00:00 2001
From: Liuyang <2746366019@qq.com>
Date: Wed, 15 Jan 2025 14:28:07 +0800
Subject: [PATCH] =?UTF-8?q?[update]=20=E6=95=B0=E6=8D=AE=E5=A4=84=E7=90=86?=
 =?UTF-8?q?-=E8=BF=87=E6=BB=A4=E9=85=8D=E7=BD=AE=E8=A1=A5=E5=85=85?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 yudao-module-llm/yudao-module-llm-biz/pom.xml |   7 +
 .../module/llm/utils/DataProcessUtil.java     | 165 ++++++++++++------
 2 files changed, 118 insertions(+), 54 deletions(-)
diff --git a/yudao-module-llm/yudao-module-llm-biz/pom.xml b/yudao-module-llm/yudao-module-llm-biz/pom.xml
index ef61452cb..eb8eccfe5 100644
--- a/yudao-module-llm/yudao-module-llm-biz/pom.xml
+++ b/yudao-module-llm/yudao-module-llm-biz/pom.xml
@@ -78,6 +78,13 @@
             <version>5.9</version>
         </dependency>
 
+        <dependency>
+            <groupId>com.github.houbb</groupId>
+            <artifactId>sensitive-word</artifactId>
+            <version>0.24.0</version>
+            <scope>compile</scope>
+        </dependency>
+
     </dependencies>
 
 </project>
diff --git a/yudao-module-llm/yudao-module-llm-biz/src/main/java/cn/iocoder/yudao/module/llm/utils/DataProcessUtil.java b/yudao-module-llm/yudao-module-llm-biz/src/main/java/cn/iocoder/yudao/module/llm/utils/DataProcessUtil.java
index 4f8c6adb6..67fe58167 100644
--- a/yudao-module-llm/yudao-module-llm-biz/src/main/java/cn/iocoder/yudao/module/llm/utils/DataProcessUtil.java
+++ b/yudao-module-llm/yudao-module-llm-biz/src/main/java/cn/iocoder/yudao/module/llm/utils/DataProcessUtil.java
@@ -1,6 +1,8 @@
 package cn.iocoder.yudao.module.llm.utils;
 
 import com.github.houbb.opencc4j.util.ZhConverterUtil;
+import com.github.houbb.sensitive.word.core.SensitiveWordHelper;
+import lombok.extern.slf4j.Slf4j;
 
 import java.time.Year;
 import java.time.format.DateTimeFormatter;
@@ -8,6 +10,7 @@ import java.util.*;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
+@Slf4j
 public class DataProcessUtil {
 
     /*
@@ -19,10 +22,11 @@ public class DataProcessUtil {
     /**
      * 移除不可见字
      * 移除ASCII中的一些不可见字符, 如0-32 和127-160这两个范围
+     *
      * @param input
      * @return
      */
-    public static String removeNonVisibleAsciiChars(String input) {
+    public static String removeNonVisibleAsciiChars (String input) {
         // 使用StringBuilder来构建正则表达式，因为我们需要动态地添加字符范围
         StringBuilder regex = new StringBuilder();
         regex.append("[\\x00-\\x1F]"); // 0-31范围的字符
@@ -35,12 +39,13 @@ public class DataProcessUtil {
 
     /**
      * 移除不可见字符
-     *
+     * <p>
      * 将不同的unicode空格比如  u2008，转成正常的空格
+     *
      * @param input
      * @return
      */
-    public static String convertUnicodeSpacesToNormalSpaces(String input) {
+    public static String convertUnicodeSpacesToNormalSpaces (String input) {
         // Unicode空格字符的正则表达式，包括但不限于u2008等
         String unicodeSpacesRegex = "[\\u0020\\u00A0\\u1680\\u2000-\\u200A\\u2028\\u2029\\u202F\\u205F\\u3000]";
 
@@ -50,12 +55,13 @@ public class DataProcessUtil {
 
     /**
      * 移除不可见字符
-     *
+     * <p>
      * 去除乱码和无意义的unicode
+     *
      * @param input
      * @return
      */
-    public static String removeNonPrintableUnicodeChars(String input) {
+    public static String removeNonPrintableUnicodeChars (String input) {
         // 构建一个正则表达式，匹配所有非打印ASCII和非打印Unicode字符
         // \p{C} 匹配所有控制字符和格式字符
         // \p{Zs} 匹配所有空白分隔符（比如U+2000到U+200F之间的字符）
@@ -78,12 +84,13 @@ public class DataProcessUtil {
 
     /**
      * 繁体转简体
-     *
+     * <p>
      * 繁体转简体，如“不經意，妳的笑容”清洗成“不经意，你的笑容”
+     *
      * @param input
      * @return
      */
-    public static String traditionalToSimplified(String input) {
+    public static String traditionalToSimplified (String input) {
         return ZhConverterUtil.toSimple(input);
     }
 
@@ -92,12 +99,13 @@ public class DataProcessUtil {
 
     /**
      * 去除网页标识符
-     *
+     * <p>
      * 移除文档中的html标签，如<html>,<dev><p>等
+     *
      * @param input
      * @return
      */
-    public static String removeHtmlTags(String input) {
+    public static String removeHtmlTags (String input) {
         if (input == null || input.isEmpty()) {
             return input;
         }
@@ -108,20 +116,21 @@ public class DataProcessUtil {
     // 这是一个简化的正则表达式，用于匹配常见的emoji表情符号。
     // 请注意，它可能不会涵盖所有可能的emoji，因为Unicode标准在不断发展。
     private static final String EMOJI_REGEX = "[\\uD83C-\\uD83D\\uD83E-\\uD83F\\u2600-\\u27FF"
-            + "\\u2B00-\\u2BFF\\u2F00-\\u2FFF\\u3000-\\u303F"
-            + "\\u3200-\\u32FF\\uA490-\\uA4CF\\uA900-\\uA97F"
-            + "\\uAC00-\\uAC7F\\uAC80-\\uACFF\\uD700-\\uD7AF"
-            + "\\uF900-\\uFAFF\\uFB00-\\uFB4F\\uFB50-\\uFDFF"
-            + "\\uFE00-\\uFE6F\\uFE70-\\uFEFF\\uFF00-\\uFFEF]";
+                                              + "\\u2B00-\\u2BFF\\u2F00-\\u2FFF\\u3000-\\u303F"
+                                              + "\\u3200-\\u32FF\\uA490-\\uA4CF\\uA900-\\uA97F"
+                                              + "\\uAC00-\\uAC7F\\uAC80-\\uACFF\\uD700-\\uD7AF"
+                                              + "\\uF900-\\uFAFF\\uFB00-\\uFB4F\\uFB50-\\uFDFF"
+                                              + "\\uFE00-\\uFE6F\\uFE70-\\uFEFF\\uFF00-\\uFFEF]";
 
     /**
      * 去除表情
-     *
+     * <p>
      * 去除文档中的表情，如‘🐰’、‘👵’等
+     *
      * @param input
      * @return
      */
-    public static String removeEmojis(String input) {
+    public static String removeEmojis (String input) {
         if (input == null || input.isEmpty()) {
             return input;
         }
@@ -136,14 +145,14 @@ public class DataProcessUtil {
     // 方法：计算字符串中的中文字符数量
     // 注意：这里假设输入字符串只包含中文字符和可能的分隔符（如空格、标点符号等）
     // 并且中文字符在UTF-16编码中占用两个char，但被视为一个逻辑字符
-    private static int countChineseChars(String input) {
+    private static int countChineseChars (String input) {
         // 使用正则表达式匹配中文词汇，并计算匹配到的字符总数（这里需要除以2，因为每个中文字符占用两个char）
         // 但为了简化，我们可以直接遍历字符，检查每个字符是否在中文范围内
         int count = 0;
         for (char c : input.toCharArray()) {
             if (Character.UnicodeBlock.of(c) == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS
-                    || Character.UnicodeBlock.of(c) == Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS
-                    || Character.UnicodeBlock.of(c) == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A
+                || Character.UnicodeBlock.of(c) == Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS
+                || Character.UnicodeBlock.of(c) == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A
                 // 可以根据需要添加更多Unicode块
             ) {
                 count++;
@@ -161,12 +170,13 @@ public class DataProcessUtil {
     /**
      * 检查文档的词数目
      * 词数目不在指定范围会被过滤掉，如中文[1,1000000]
+     *
      * @param text
      * @param minChars
      * @param maxChars
      * @return
      */
-    public static List<String> filterWords(String text, int minChars, int maxChars) {
+    public static List<String> filterWords (String text, int minChars, int maxChars) {
         List<String> result = new ArrayList<>();
         Pattern pattern = Pattern.compile(CHINESE_WORD_REGEX);
         Matcher matcher = pattern.matcher(text);
@@ -183,13 +193,14 @@ public class DataProcessUtil {
 
     /**
      * 检查文档的字重复率
-     *
+     * <p>
      * 如果字重复率太高，意味着文档中重复的字太多，文档会被过滤掉
-     * @param lines  文档行列表
-     * @param threshold  // 设置字重复率的阈值，例如10%
+     *
+     * @param lines     文档行列表
+     * @param threshold // 设置字重复率的阈值，例如10%
      * @return
      */
-    public static boolean calculateCharacterRepetitionRate(List<String> lines, double threshold) {
+    public static boolean calculateCharacterRepetitionRate (List<String> lines, double threshold) {
         StringBuilder content = new StringBuilder();
         for (String line : lines) {
             content.append(line);
@@ -222,7 +233,7 @@ public class DataProcessUtil {
     }
 
     // 简单的基于空格和标点符号的分词方法
-    private static List<String> tokenize(String text) {
+    private static List<String> tokenize (String text) {
         // 使用正则表达式匹配非单词字符（包括空格、标点符号等），并将它们作为分隔符
         Pattern pattern = Pattern.compile("\\W+");
         String[] words = pattern.split(text.toLowerCase()); // 转换为小写以进行不区分大小写的比较
@@ -239,13 +250,14 @@ public class DataProcessUtil {
 
     /**
      * 检查文档的词重复率
-     *
+     * <p>
      * 如果词重复率太高，意味着文档中重复的词太多，文档会被过滤掉
+     *
      * @param content
      * @param threshold
      * @return
      */
-    public static boolean calculateWordRepetitionRate(String content, double threshold) {
+    public static boolean calculateWordRepetitionRate (String content, double threshold) {
         // 分词
         List<String> words = tokenize(content);
 
@@ -278,11 +290,12 @@ public class DataProcessUtil {
     /**
      * 检查文档的特殊字符率
      * 如果特殊字符率太高，意味着文档中特殊字符太多，文档会被过滤掉
+     *
      * @param content
      * @param threshold
      * @return
      */
-    public static boolean checkSpecialCharacterRate(String content, double threshold) {
+    public static boolean checkSpecialCharacterRate (String content, double threshold) {
 
         // 使用正则表达式匹配特殊字符（非字母数字字符）
         Pattern pattern = Pattern.compile("[^a-zA-Z0-9]");
@@ -308,6 +321,48 @@ public class DataProcessUtil {
         return specialCharRate > threshold;
     }
 
+    /**
+     * Checks whether the share of sensitive (pornographic / violent) words in a document
+     * exceeds a threshold.
+     * <p>
+     * The rate is the summed length of every match returned by {@code SensitiveWordHelper.findAll}
+     * divided by {@code content.length()}, expressed as a percentage.
+     * </p>
+     *
+     * @param content   document text; {@code null} or empty text is never filtered
+     * @param threshold maximum tolerated rate as a percentage (documented range [0, 100])
+     * @return {@code true} if the document should be filtered out, {@code false} otherwise
+     */
+    public static boolean checkSensitiveWordRate (String content, double threshold) {
+        // TODO: sensitive-word is the first implementation; adjust if requirements change.
+
+        // Guard explicitly so the division below can never see a zero-length document.
+        if (content == null || content.isEmpty()) {
+            return false;
+        }
+
+        // One findAll pass is enough: an empty result means no sensitive word was found.
+        List<String> wordList = SensitiveWordHelper.findAll(content);
+        if (wordList.isEmpty()) {
+            return false;
+        }
+        log.info("返回所有敏感词====>>>>{}", wordList);
+
+        // Total number of characters covered by the sensitive-word matches.
+        int sensitiveWordLength = 0;
+        for (String word : wordList) {
+            sensitiveWordLength += word.length();
+        }
+
+        // Whitespace is counted in the denominator; strip it first if that should change.
+        double sensitiveWordRate = ((double) sensitiveWordLength / content.length()) * 100;
+
+        // Log the rate and the threshold so filtering decisions can be traced.
+        log.info("敏感词字符率: {}", String.format("%.3f", sensitiveWordRate));
+        log.info("阈值: {}", threshold);
+
+        return sensitiveWordRate > threshold;
+    }
     /*
      * ---------------------------------------------------------------
      *              🔖 【 去重配置 】
@@ -327,7 +382,7 @@ public class DataProcessUtil {
     private static final Pattern EMAIL_PATTERN = Pattern.compile(EMAIL_REGEX);
 
     // 去除文本中的电子邮件地址
-    private static String removeEmails(String text) {
+    private static String removeEmails (String text) {
         Matcher matcher = EMAIL_PATTERN.matcher(text);
         // 使用空字符串替换匹配的电子邮件地址
         return matcher.replaceAll("");
@@ -335,11 +390,12 @@ public class DataProcessUtil {
 
     /**
      * 去除Email
-     *
+     * <p>
      * 去除email地址
+     *
      * @param content
      */
-    public static String processFile(String content) {
+    public static String processFile (String content) {
 
         // 去除电子邮件地址
         String modifiedContent = removeEmails(content);
@@ -352,9 +408,9 @@ public class DataProcessUtil {
     // 定义一个正则表达式来匹配IPv4地址
     private static final String IPV4_REGEX =
             "(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\." +
-                    "(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\." +
-                    "(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\." +
-                    "(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)";
+            "(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\." +
+            "(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\." +
+            "(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)";
 
     // 定义一个正则表达式来匹配IPv6地址
     // 这个正则表达式相对简单，可能无法匹配所有复杂的IPv6地址格式
@@ -371,7 +427,7 @@ public class DataProcessUtil {
     /**
      * 去除文本中的IPv4和IPv6地址
      */
-    public static String removeIPAddresses(String text) {
+    public static String removeIPAddresses (String text) {
         Matcher ipv4Matcher = IPV4_PATTERN.matcher(text);
         text = ipv4Matcher.replaceAll("");
         Matcher ipv6Matcher = IPV6_PATTERN.matcher(text);
@@ -407,12 +463,13 @@ public class DataProcessUtil {
 
     /**
      * 去除数字
-     *
+     * <p>
      * 去除数字和字母数字标识符，如电话号码、信用卡号、十六进制散列等，同时跳过年份和简单数字的实例
+     *
      * @param text
      * @return
      */
-    public static String removeIdentifiers(String text) {
+    public static String removeIdentifiers (String text) {
         Matcher phoneMatcher = PHONE_PATTERN.matcher(text);
         text = phoneMatcher.replaceAll("");
 
@@ -424,27 +481,27 @@ public class DataProcessUtil {
 
         // 使用StringBuilder和StringBuilder的replace方法去除其他数字，但跳过年份和简单数字
         // TODO: 这里目前有bug，先注释掉了。
-//        StringBuilder sb = new StringBuilder(text);
-//        int index = 0;
-//        while ((index = findNextNumberToReplace(sb.toString())) != -1) {
-//            String number = sb.substring(index, findEndOfNumber(sb.toString(), index));
-//            if (!isYear(number) && !isSimpleNumber(number)) {
-//                sb.replace(index, index + number.length(), "");
-//            }
-//        }
+        //        StringBuilder sb = new StringBuilder(text);
+        //        int index = 0;
+        //        while ((index = findNextNumberToReplace(sb.toString())) != -1) {
+        //            String number = sb.substring(index, findEndOfNumber(sb.toString(), index));
+        //            if (!isYear(number) && !isSimpleNumber(number)) {
+        //                sb.replace(index, index + number.length(), "");
+        //            }
+        //        }
         return text;
     }
 
     // 查找下一个要替换的数字的起始索引
-    private static int findNextNumberToReplace(String text) {
+    private static int findNextNumberToReplace (String text) {
         // 这里可以添加更复杂的逻辑来定位要替换的数字，但为了简化，我们假设数字以空格或非数字字符分隔
         for (int i = 0; i < text.length(); i++) {
             char c = text.charAt(i);
             if (Character.isDigit(c) || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F')) {
                 // 找到数字的起始位置
                 while (i < text.length() && (Character.isDigit(text.charAt(i)) ||
-                        (text.charAt(i) >= 'a' && text.charAt(i) <= 'f') ||
-                        (text.charAt(i) >= 'A' && text.charAt(i) <= 'F'))) {
+                                             (text.charAt(i) >= 'a' && text.charAt(i) <= 'f') ||
+                                             (text.charAt(i) >= 'A' && text.charAt(i) <= 'F'))) {
                     i++;
                 }
                 // 返回数字的起始索引（减1，因为我们要在循环外部处理i的递增）
@@ -455,12 +512,12 @@ public class DataProcessUtil {
     }
 
     // 找到数字的结束索引
-    private static int findEndOfNumber(String text, int startIndex) {
+    private static int findEndOfNumber (String text, int startIndex) {
         // 从startIndex开始向后查找，直到遇到非数字字符
         for (int i = startIndex; i < text.length(); i++) {
             if (!(Character.isDigit(text.charAt(i)) ||
-                    (text.charAt(i) >= 'a' && text.charAt(i) <= 'f') ||
-                    (text.charAt(i) >= 'A' && text.charAt(i) <= 'F'))) {
+                  (text.charAt(i) >= 'a' && text.charAt(i) <= 'f') ||
+                  (text.charAt(i) >= 'A' && text.charAt(i) <= 'F'))) {
                 return i;
             }
         }
@@ -468,7 +525,7 @@ public class DataProcessUtil {
     }
 
     // 检查一个字符串是否是年份
-    private static boolean isYear(String str) {
+    private static boolean isYear (String str) {
         try {
             int year = Integer.parseInt(str);
             Year y = Year.parse(str, YEAR_FORMAT);
@@ -479,7 +536,7 @@ public class DataProcessUtil {
     }
 
     // 检查一个字符串是否是简单数字（这里假设不超过六位的连续数字）
-    private static boolean isSimpleNumber(String str) {
+    private static boolean isSimpleNumber (String str) {
         try {
             int number = Integer.parseInt(str);
             return String.valueOf(number).equals(str) && number >= 0 && number < 1000000;
@@ -488,7 +545,7 @@ public class DataProcessUtil {
         }
     }
 
-    public static void main(String[] args) {
+    public static void main (String[] args) {
         String textWithIdentifiers = "Here are some identifiers: 123-456-7890, 1234567812345678, a1b2c3d4e5f6a1b2c3d4e5f6, 2023, and 987654.";
         // 去除标识符
         String textWithoutIdentifiers = removeIdentifiers(textWithIdentifiers);