[update] 添加注释
This commit is contained in:
parent
530659f12b
commit
8b9649d2d3
@ -10,6 +10,12 @@ import java.util.regex.Pattern;
|
||||
|
||||
public class DataProcessUtil {
|
||||
|
||||
/*
|
||||
* ---------------------------------------------------------------
|
||||
* 🔖 【 异常清洗配置 】
|
||||
* ---------------------------------------------------------------
|
||||
*/
|
||||
|
||||
/**
|
||||
* 移除不可见字符
|
||||
* 移除ASCII中的一些不可见字符, 如0-32 和127-160这两个范围
|
||||
@ -146,6 +152,12 @@ public class DataProcessUtil {
|
||||
return count;
|
||||
}
|
||||
|
||||
/*
|
||||
* ---------------------------------------------------------------
|
||||
* 🔖 【 过滤配置 】
|
||||
* ---------------------------------------------------------------
|
||||
*/
|
||||
|
||||
/**
|
||||
* 检查文档的词数目
|
||||
* 词数目不在指定范围会被过滤掉,如中文[1,1000000]
|
||||
@ -296,6 +308,17 @@ public class DataProcessUtil {
|
||||
return specialCharRate > threshold;
|
||||
}
|
||||
|
||||
/*
|
||||
* ---------------------------------------------------------------
|
||||
* 🔖 【 去重配置 】
|
||||
* ---------------------------------------------------------------
|
||||
*/
|
||||
|
||||
/*
|
||||
* ---------------------------------------------------------------
|
||||
* 🔖 【 去隐私配置 】
|
||||
* ---------------------------------------------------------------
|
||||
*/
|
||||
// Regular expression used to match e-mail addresses.
// Shape: a local part of one or more of [a-zA-Z0-9._%+-], an '@', a domain of
// one or more of [a-zA-Z0-9.-], then a literal dot and a final label of
// 2 to 6 ASCII letters.
// NOTE(review): the {2,6} bound means a final label longer than six letters is
// not matched in full — confirm this is intended for the addresses to be handled.
|
||||
private static final String EMAIL_REGEX =
|
||||
"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,6}";
|
||||
|
Loading…
x
Reference in New Issue
Block a user