[update] 添加注释

This commit is contained in:
Liuyang 2025-01-15 13:16:55 +08:00
parent 530659f12b
commit 8b9649d2d3

View File

@ -10,6 +10,12 @@ import java.util.regex.Pattern;
public class DataProcessUtil {
/*
* ---------------------------------------------------------------
* 🔖 异常清洗配置
* ---------------------------------------------------------------
*/
/**
* 移除不可见字符
* 移除ASCII中的一些不可见字符, 如0-32 和127-160这两个范围
@ -146,6 +152,12 @@ public class DataProcessUtil {
return count;
}
/*
* ---------------------------------------------------------------
* 🔖 过滤配置
* ---------------------------------------------------------------
*/
/**
* 检查文档的词数目
* 词数目不在指定范围内的文档会被过滤掉, 如中文[1,1000000]
@ -296,6 +308,17 @@ public class DataProcessUtil {
return specialCharRate > threshold;
}
/*
* ---------------------------------------------------------------
* 🔖 去重配置
* ---------------------------------------------------------------
*/
/*
* ---------------------------------------------------------------
* 🔖 去隐私配置
* ---------------------------------------------------------------
*/
// Regular expression used to match e-mail addresses:
//   local part : one or more of letters, digits, '.', '_', '%', '+', '-'
//   '@'
//   domain     : one or more of letters, digits, '.', '-'
//   final label: a literal '.' followed by 2 to 6 ASCII letters
// NOTE(review): because of the {2,6} bound, an address whose last label is
// longer than 6 letters may not be matched in full, depending on how this
// pattern is applied (find vs. full match) — confirm this is intended.
private static final String EMAIL_REGEX =
"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,6}";