数据过滤工具类

2025-01-05 17:09:48 +08:00 · 2025-01-05 17:09:48 +08:00 · 6a073287dd
commit 6a073287dd
parent c1798ef5a2
2 changed files with 471 additions and 0 deletions
--- a/yudao-module-llm/yudao-module-llm-biz/pom.xml
+++ b/yudao-module-llm/yudao-module-llm-biz/pom.xml
@@ -55,6 +55,11 @@
            <groupId>cn.iocoder.boot</groupId>
            <artifactId>yudao-spring-boot-starter-mybatis</artifactId>
        </dependency>
+        <dependency>
+            <groupId>com.github.houbb</groupId>
+            <artifactId>opencc4j</artifactId>
+            <version>1.8.1</version>
+        </dependency>

    </dependencies>

--- a/yudao-module-llm/yudao-module-llm-biz/src/main/java/cn/iocoder/yudao/module/llm/utils/DataProcessUtil.java
+++ b/yudao-module-llm/yudao-module-llm-biz/src/main/java/cn/iocoder/yudao/module/llm/utils/DataProcessUtil.java
@@ -0,0 +1,466 @@
+package cn.iocoder.yudao.module.llm.utils;
+
+import com.github.houbb.opencc4j.util.ZhConverterUtil;
+
+import java.time.Year;
+import java.time.format.DateTimeFormatter;
+import java.util.*;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+public class DataProcessUtil {
+
+    /** Matches U+0000-U+001F (C0 controls) and U+007F-U+00A0 (DEL, C1 controls, no-break space). */
+    private static final Pattern NON_VISIBLE_ASCII_PATTERN = Pattern.compile("[\\x00-\\x1F\\x7F-\\xA0]");
+
+    /**
+     * Removes non-visible characters in the ranges U+0000-U+001F and U+007F-U+00A0.
+     *
+     * <p>Tab, line feed and carriage return fall inside the first range and are removed too;
+     * the ordinary space (U+0020) is kept.
+     *
+     * @param input text to clean; may be {@code null}
+     * @return the text without the characters above; {@code null} or empty input is returned unchanged
+     */
+    public static String removeNonVisibleAsciiChars(String input) {
+        // Same null/empty contract as removeHtmlTags and removeEmojis.
+        if (input == null || input.isEmpty()) {
+            return input;
+        }
+        return NON_VISIBLE_ASCII_PATTERN.matcher(input).replaceAll("");
+    }
+
+    /**
+     * Normalizes Unicode space characters to the plain ASCII space.
+     *
+     * <p>Each occurrence of U+0020, U+00A0, U+1680, U+2000-U+200A, U+2028, U+2029, U+202F,
+     * U+205F or U+3000 is replaced by one U+0020. Runs are not collapsed: every matched
+     * character yields exactly one space.
+     *
+     * @param input text to normalize; must not be {@code null}
+     * @return the text with the listed space characters replaced by ASCII spaces
+     */
+    public static String convertUnicodeSpacesToNormalSpaces(String input) {
+        final Pattern spaceLike = Pattern.compile(
+                "[\\u0020\\u00A0\\u1680\\u2000-\\u200A\\u2028\\u2029\\u202F\\u205F\\u3000]");
+        final Matcher spaces = spaceLike.matcher(input);
+        return spaces.replaceAll(" ");
+    }
+
+    /**
+     * Characters in Unicode category C (control, format, surrogate, private use, unassigned)
+     * or Zs (space separator), excluding the ASCII whitespace matched by \s.
+     */
+    private static final Pattern NON_PRINTABLE_PATTERN =
+            Pattern.compile("[\\p{C}\\p{Zs}&&[^\\s]]+|\\u0000");
+
+    /**
+     * Removes non-printable Unicode characters.
+     *
+     * <p>Removes every character of category C or Zs except the ASCII whitespace characters
+     * (space, tab, line feed, vertical tab, form feed, carriage return), which are kept.
+     *
+     * <p>The matched characters are replaced directly with the empty string. The previous
+     * implementation first substituted a {@code *} placeholder and then stripped all
+     * asterisks, which also deleted asterisks that were part of the original text.
+     *
+     * @param input text to clean; may be {@code null}
+     * @return the cleaned text; {@code null} or empty input is returned unchanged
+     */
+    public static String removeNonPrintableUnicodeChars(String input) {
+        if (input == null || input.isEmpty()) {
+            return input;
+        }
+        return NON_PRINTABLE_PATTERN.matcher(input).replaceAll("");
+    }
+
+    /**
+     * Converts Traditional Chinese text to Simplified Chinese.
+     *
+     * <p>Delegates to opencc4j's {@code ZhConverterUtil.toSimple}; the conversion rules are
+     * whatever that library applies.
+     *
+     * @param input text that may contain Traditional Chinese characters
+     * @return the value returned by {@code ZhConverterUtil.toSimple(input)}
+     */
+    public static String TraditionalToSimplified(String input) {
+        final String simplified = ZhConverterUtil.toSimple(input);
+        return simplified;
+    }
+
+    // Anything between '<' and the next '>' is treated as an HTML tag.
+    private static final String HTML_TAG_REGEX = "<[^>]+>";
+
+    /**
+     * Strips HTML-style tags such as {@code <html>}, {@code <div>} or {@code <p>} from the text.
+     *
+     * <p>A tag is any {@code <...>} span containing at least one character other than
+     * {@code >}; the text between tags is left as is.
+     *
+     * @param input text that may contain tags; may be {@code null}
+     * @return the text without tags; {@code null} or empty input is returned unchanged
+     */
+    public static String removeHtmlTags(String input) {
+        if (input != null && !input.isEmpty()) {
+            final Matcher tags = Pattern.compile(HTML_TAG_REGEX).matcher(input);
+            return tags.replaceAll("");
+        }
+        return input;
+    }
+
+    /**
+     * Regex for one emoji cluster: a base code point from the common emoji blocks followed by
+     * the joiners/selectors attached to it, or a leftover emoji-only mark.
+     *
+     * <p>Code points are written with the \x{...} form so that supplementary-plane emoji are
+     * matched as whole code points. A class listing lone surrogates does not match them,
+     * because the regex engine reads a valid surrogate pair as a single code point.
+     * The list covers the common ranges only and is not an exhaustive emoji definition.
+     * CJK punctuation, full-width forms, Hangul and CJK ideographs are deliberately not included.
+     */
+    private static final String EMOJI_REGEX =
+            "[\\x{1F000}-\\x{1FAFF}"   // pictographs, emoticons, transport, flags, skin tones
+            + "\\x{2600}-\\x{27BF}"    // Miscellaneous Symbols and Dingbats
+            + "\\x{2B05}-\\x{2B07}\\x{2B1B}\\x{2B1C}\\x{2B50}\\x{2B55}]" // arrows, squares, star, circle
+            + "[\\x{200D}\\x{FE0F}\\x{20E3}\\x{E0020}-\\x{E007F}]*"      // ZWJ, VS-16, keycap, tag characters
+            + "|[\\x{FE0F}\\x{20E3}]"; // VS-16 / keycap marks left after a non-emoji base such as a digit
+
+    // Compiled once; Pattern instances are immutable and safe to share.
+    private static final Pattern EMOJI_PATTERN = Pattern.compile(EMOJI_REGEX);
+
+    /**
+     * Removes emoji (for example U+1F430 "rabbit face" or U+1F475 "old woman") from the text.
+     *
+     * <p>Zero-width joiners are removed only when they directly follow an emoji, so joiners
+     * used by other scripts are left alone.
+     *
+     * @param input text that may contain emoji; may be {@code null}
+     * @return the text without emoji; {@code null} or empty input is returned unchanged
+     */
+    public static String removeEmojis(String input) {
+        if (input == null || input.isEmpty()) {
+            return input;
+        }
+        return EMOJI_PATTERN.matcher(input).replaceAll("");
+    }
+
+    // A "word" here is a maximal run of characters in U+4E00-U+9FFF.
+    private static final String CHINESE_WORD_REGEX = "[\\u4e00-\\u9fff]+";
+
+    /**
+     * Counts the UTF-16 code units of {@code input} that belong to the CJK Unified Ideographs,
+     * CJK Compatibility Ideographs or CJK Unified Ideographs Extension A blocks.
+     * Characters in these blocks are in the Basic Multilingual Plane, i.e. one {@code char} each.
+     */
+    private static int countChineseChars(String input) {
+        int total = 0;
+        for (int i = 0; i < input.length(); i++) {
+            final Character.UnicodeBlock block = Character.UnicodeBlock.of(input.charAt(i));
+            final boolean chinese = block == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS
+                    || block == Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS
+                    || block == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A;
+            if (chinese) {
+                total++;
+            }
+        }
+        return total;
+    }
+
+    /**
+     * Extracts the runs of Chinese characters whose length lies within a given range.
+     *
+     * <p>The text is scanned for maximal runs matching {@code CHINESE_WORD_REGEX}; a run is
+     * kept when its Chinese character count is between {@code minChars} and {@code maxChars},
+     * both inclusive. Runs are returned in order of appearance.
+     *
+     * @param text     text to scan; must not be {@code null}
+     * @param minChars minimum number of Chinese characters a run must have (inclusive)
+     * @param maxChars maximum number of Chinese characters a run may have (inclusive)
+     * @return a new mutable list with the runs that satisfy the range; empty when none do
+     */
+    public static List<String> filterWords(String text, int minChars, int maxChars) {
+        final Matcher runs = Pattern.compile(CHINESE_WORD_REGEX).matcher(text);
+        final List<String> kept = new ArrayList<>();
+        while (runs.find()) {
+            final String run = runs.group();
+            final int size = countChineseChars(run);
+            if (size < minChars || size > maxChars) {
+                continue;
+            }
+            kept.add(run);
+        }
+        return kept;
+    }
+
+    /**
+     * Checks whether the character repetition rate of a document exceeds a threshold.
+     *
+     * <p>The rate is {@code (total - distinct) / total}, where {@code total} is the number of
+     * Unicode code points in all lines and {@code distinct} is the number of different code
+     * points among them. Counting code points rather than UTF-16 units keeps supplementary
+     * characters (for example emoji) from being counted as two half-characters.
+     *
+     * <p>{@code null} lines are skipped. A {@code null} or empty document has nothing to
+     * repeat and is never flagged.
+     *
+     * @param lines     the lines of the document; may be {@code null}
+     * @param threshold repetition rate above which the document should be filtered, e.g. 0.1
+     * @return {@code true} when the repetition rate is strictly greater than {@code threshold}
+     */
+    public static boolean calculateCharacterRepetitionRate(List<String> lines, double threshold) {
+        if (lines == null || lines.isEmpty()) {
+            return false;
+        }
+
+        Set<Integer> distinctCodePoints = new HashSet<>();
+        int totalCodePoints = 0;
+        for (String line : lines) {
+            if (line == null) {
+                // StringBuilder.append(null) would have counted the literal text "null".
+                continue;
+            }
+            for (int i = 0; i < line.length(); ) {
+                int codePoint = line.codePointAt(i);
+                distinctCodePoints.add(codePoint);
+                totalCodePoints++;
+                i += Character.charCount(codePoint);
+            }
+        }
+
+        if (totalCodePoints == 0) {
+            return false; // avoid 0/0
+        }
+
+        // Every occurrence beyond the first of each code point counts as a repetition.
+        int repeated = totalCodePoints - distinctCodePoints.size();
+        double repetitionRate = (double) repeated / totalCodePoints;
+        return repetitionRate > threshold;
+    }
+
+    // Separator between tokens: one or more non-word characters. UNICODE_CHARACTER_CLASS makes
+    // \W Unicode-aware, so Chinese characters and accented letters count as word characters
+    // instead of separators.
+    private static final Pattern TOKEN_SEPARATOR =
+            Pattern.compile("\\W+", Pattern.UNICODE_CHARACTER_CLASS);
+
+    /**
+     * Splits text into lower-cased tokens on whitespace and punctuation.
+     * No dictionary-based segmentation is done: an unbroken run of Chinese characters between
+     * two punctuation marks is one token.
+     */
+    private static List<String> tokenize(String text) {
+        List<String> tokens = new ArrayList<>();
+        // Locale.ROOT keeps lower-casing independent of the JVM default locale.
+        for (String word : TOKEN_SEPARATOR.split(text.toLowerCase(Locale.ROOT))) {
+            if (!word.isEmpty()) { // split() yields a leading empty string when the text starts with a separator
+                tokens.add(word);
+            }
+        }
+        return tokens;
+    }
+
+    /**
+     * Checks whether the word repetition rate of a document exceeds a threshold.
+     *
+     * <p>The rate is {@code (total - distinct) / total} over the case-insensitive tokens of
+     * {@code content}. Content that is {@code null} or contains no token is never flagged.
+     *
+     * @param content   document text; may be {@code null}
+     * @param threshold repetition rate above which the document should be filtered
+     * @return {@code true} when the repetition rate is strictly greater than {@code threshold}
+     */
+    public static boolean calculateWordRepetitionRate(String content, double threshold) {
+        if (content == null) {
+            return false;
+        }
+        List<String> words = tokenize(content);
+        if (words.isEmpty()) {
+            return false; // avoid 0/0
+        }
+
+        // Every occurrence beyond the first of each token counts as a repetition.
+        int repeatedWords = words.size() - new HashSet<>(words).size();
+        double repetitionRate = (double) repeatedWords / words.size();
+        return repetitionRate > threshold;
+    }
+
+    /**
+     * Checks whether the share of special characters in a document exceeds a threshold.
+     *
+     * <p>A special character is any Unicode code point that is neither a letter nor a digit
+     * according to {@link Character#isLetterOrDigit(int)}; punctuation, symbols and whitespace
+     * all count as special. Letters of every script (including Chinese) are regular
+     * characters, so non-English text is not penalised merely for being non-ASCII.
+     *
+     * <p>The rate is {@code special / total}, both counted in code points. {@code null} or
+     * empty content is never flagged.
+     *
+     * @param content   document text; may be {@code null}
+     * @param threshold special-character rate above which the document should be filtered
+     * @return {@code true} when the rate is strictly greater than {@code threshold}
+     */
+    public static boolean checkSpecialCharacterRate(String content, double threshold) {
+        if (content == null || content.isEmpty()) {
+            return false; // avoid 0/0
+        }
+
+        int totalCharCount = 0;
+        int specialCharCount = 0;
+        for (int i = 0; i < content.length(); ) {
+            int codePoint = content.codePointAt(i);
+            if (!Character.isLetterOrDigit(codePoint)) {
+                specialCharCount++;
+            }
+            totalCharCount++;
+            i += Character.charCount(codePoint);
+        }
+
+        double specialCharRate = (double) specialCharCount / totalCharCount;
+        return specialCharRate > threshold;
+    }
+
+    // E-mail address: local part, '@', domain, and a top-level domain of two or more letters.
+    // The TLD length has no upper bound; a fixed cap would leave the tail of long TLDs behind.
+    private static final String EMAIL_REGEX =
+            "[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}";
+
+    // Compiled once and shared.
+    private static final Pattern EMAIL_PATTERN = Pattern.compile(EMAIL_REGEX);
+
+    /** Replaces every e-mail address in {@code text} with the empty string. */
+    private static String removeEmails(String text) {
+        return EMAIL_PATTERN.matcher(text).replaceAll("");
+    }
+
+    /**
+     * Removes e-mail addresses from the given content.
+     *
+     * <p>The result is only returned; it is not written to standard output.
+     *
+     * @param content text that may contain e-mail addresses; may be {@code null}
+     * @return the content without e-mail addresses; {@code null} or empty input is returned unchanged
+     */
+    public static String processFile(String content) {
+        if (content == null || content.isEmpty()) {
+            return content;
+        }
+        return removeEmails(content);
+    }
+
+    // One IPv4 octet, 0-255.
+    private static final String IPV4_OCTET = "(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)";
+
+    // Dotted-quad IPv4 address. The lookarounds require that the match is not part of a longer
+    // dotted number: without them "999.1.1.1" would lose "99.1.1.1" and keep a stray "9".
+    private static final String IPV4_REGEX =
+            "(?<![0-9.])" + IPV4_OCTET + "(?:\\." + IPV4_OCTET + "){3}" + "(?![0-9])(?!\\.[0-9])";
+
+    // Full (uncompressed) IPv6 address: eight groups of 1-4 hex digits, for example
+    // 2001:0db8:85a3:0000:0000:8a2e:0370:7334. Compressed forms using "::" are not matched.
+    // The lookarounds keep the match from starting or ending inside a longer alphanumeric token.
+    private static final String IPV6_REGEX =
+            "(?<![0-9A-Za-z])(?:[0-9a-fA-F]{1,4}:){7}[0-9a-fA-F]{1,4}(?![0-9A-Za-z])";
+
+    // Compiled once and shared.
+    private static final Pattern IPV4_PATTERN = Pattern.compile(IPV4_REGEX);
+    private static final Pattern IPV6_PATTERN = Pattern.compile(IPV6_REGEX);
+
+    /**
+     * Removes IPv4 addresses and full-form IPv6 addresses from the text.
+     *
+     * @param text text that may contain IP addresses; may be {@code null}
+     * @return the text without the matched addresses; {@code null} or empty input is returned unchanged
+     */
+    public static String removeIPAddresses(String text) {
+        if (text == null || text.isEmpty()) {
+            return text;
+        }
+        String withoutIpv4 = IPV4_PATTERN.matcher(text).replaceAll("");
+        return IPV6_PATTERN.matcher(withoutIpv4).replaceAll("");
+    }
+
+    // Token delimiters. Explicit lookarounds are used instead of \b so that a token is bounded
+    // by anything that is not an ASCII letter, digit or underscore. This lets identifiers that
+    // sit directly next to Chinese text be found, whatever \b does with non-ASCII letters.
+    private static final String TOKEN_START = "(?<![0-9A-Za-z_])";
+    private static final String TOKEN_END = "(?![0-9A-Za-z_])";
+
+    // Nine digits grouped 3-2-4 with optional dashes (a deliberately simple number format).
+    private static final String PHONE_REGEX = TOKEN_START + "\\d{3}-?\\d{2}-?\\d{4}" + TOKEN_END;
+
+    // Sixteen consecutive digits (a common credit card number length).
+    private static final String CREDIT_CARD_REGEX = TOKEN_START + "\\d{16}" + TOKEN_END;
+
+    // Thirty-two hex digits (the length of an MD5 digest written in hex).
+    private static final String HASH_REGEX = TOKEN_START + "[a-fA-F0-9]{32}" + TOKEN_END;
+
+    // Any stand-alone token made only of hex digits; candidates for the generic clean-up step.
+    private static final String HEX_TOKEN_REGEX = TOKEN_START + "[0-9a-fA-F]+" + TOKEN_END;
+
+    // Compiled once and shared.
+    private static final Pattern PHONE_PATTERN = Pattern.compile(PHONE_REGEX);
+    private static final Pattern CREDIT_CARD_PATTERN = Pattern.compile(CREDIT_CARD_REGEX);
+    private static final Pattern HASH_PATTERN = Pattern.compile(HASH_REGEX);
+    private static final Pattern HEX_TOKEN_PATTERN = Pattern.compile(HEX_TOKEN_REGEX);
+
+    // Longest token that is still considered "simple" and therefore kept.
+    private static final int SIMPLE_TOKEN_MAX_LENGTH = 6;
+
+    // Years that must never be removed: the current year (at class-load time) plus/minus five.
+    private static final Set<String> YEARS_TO_SKIP = new HashSet<>();
+
+    static {
+        int currentYear = Year.now().getValue();
+        for (int i = currentYear - 5; i <= currentYear + 5; i++) {
+            YEARS_TO_SKIP.add(String.valueOf(i));
+        }
+    }
+
+    /**
+     * Removes numeric and alphanumeric identifiers such as phone numbers, credit card numbers
+     * and hex hashes, while keeping years and simple numbers.
+     *
+     * <p>Steps:
+     * <ol>
+     *   <li>remove tokens matching the phone, credit card and hash patterns;</li>
+     *   <li>scan the remaining stand-alone hex-digit tokens and remove those that look like
+     *       identifiers (see {@code isIdentifier}).</li>
+     * </ol>
+     * Tokens without any digit (ordinary words such as "cafe" or "decade") are never touched.
+     * Removed tokens are replaced with the empty string; surrounding whitespace is kept.
+     *
+     * @param text text to clean; may be {@code null}
+     * @return the cleaned text; {@code null} or empty input is returned unchanged
+     */
+    public static String removeIdentifiers(String text) {
+        if (text == null || text.isEmpty()) {
+            return text;
+        }
+
+        String cleaned = PHONE_PATTERN.matcher(text).replaceAll("");
+        cleaned = CREDIT_CARD_PATTERN.matcher(cleaned).replaceAll("");
+        cleaned = HASH_PATTERN.matcher(cleaned).replaceAll("");
+
+        // Single left-to-right pass: copy everything except the tokens judged to be identifiers.
+        Matcher tokens = HEX_TOKEN_PATTERN.matcher(cleaned);
+        StringBuilder result = new StringBuilder(cleaned.length());
+        int copiedUpTo = 0;
+        while (tokens.find()) {
+            if (isIdentifier(tokens.group())) {
+                result.append(cleaned, copiedUpTo, tokens.start());
+                copiedUpTo = tokens.end();
+            }
+        }
+        result.append(cleaned, copiedUpTo, cleaned.length());
+        return result.toString();
+    }
+
+    /**
+     * Decides whether a stand-alone hex-digit token should be removed.
+     * <ul>
+     *   <li>no digit at all: an ordinary word, kept;</li>
+     *   <li>digits mixed with hex letters: removed only when longer than
+     *       {@code SIMPLE_TOKEN_MAX_LENGTH} (short tokens such as "3d" are kept);</li>
+     *   <li>digits only: removed unless it is a year or a simple number.</li>
+     * </ul>
+     */
+    private static boolean isIdentifier(String token) {
+        boolean hasDigit = false;
+        boolean hasLetter = false;
+        for (int i = 0; i < token.length(); i++) {
+            char c = token.charAt(i);
+            if (c >= '0' && c <= '9') {
+                hasDigit = true;
+            } else {
+                hasLetter = true;
+            }
+        }
+        if (!hasDigit) {
+            return false;
+        }
+        if (hasLetter) {
+            return token.length() > SIMPLE_TOKEN_MAX_LENGTH;
+        }
+        return !isYear(token) && !isSimpleNumber(token);
+    }
+
+    /**
+     * Returns whether the string is one of the years in {@code YEARS_TO_SKIP}.
+     * Kept as an explicit rule even though such years also satisfy {@code isSimpleNumber}.
+     */
+    private static boolean isYear(String str) {
+        return YEARS_TO_SKIP.contains(str);
+    }
+
+    /**
+     * Returns whether the string is a simple number: a non-negative integer below 1,000,000
+     * written without leading zeros or sign.
+     */
+    private static boolean isSimpleNumber(String str) {
+        try {
+            int number = Integer.parseInt(str);
+            return String.valueOf(number).equals(str) && number >= 0 && number < 1000000;
+        } catch (NumberFormatException e) {
+            // Not an int (too long or not purely numeric): not a simple number.
+            return false;
+        }
+    }
+}