[update] 检查文档的字重复率参数修改
This commit is contained in:
parent
a6dff7b3ab
commit
2d35090608
@ -195,20 +195,20 @@ public class DataProcessUtil {
|
||||
* 检查文档的字重复率
|
||||
* <p>
|
||||
* 如果字重复率太高,意味着文档中重复的字太多,文档会被过滤掉
|
||||
*
|
||||
* @param lines 文档行列表
|
||||
* @param threshold // 设置字重复率的阈值,例如10%
|
||||
* @return
|
||||
* </p>
|
||||
* @param content full text of the document (all lines already concatenated into one string)
|
||||
* @param threshold 设置字重复率的阈值,例如10%
|
||||
* @return true表示字重复率低于阈值,false表示字重复率高于阈值,文档会被过滤掉
|
||||
*/
|
||||
public static boolean calculateCharacterRepetitionRate (List<String> lines, double threshold) {
|
||||
StringBuilder content = new StringBuilder();
|
||||
for (String line : lines) {
|
||||
content.append(line);
|
||||
}
|
||||
public static boolean calculateCharacterRepetitionRate (String content, double threshold) {
|
||||
// StringBuilder content = new StringBuilder();
|
||||
// for (String line : lines) {
|
||||
// content.append(line);
|
||||
// }
|
||||
|
||||
// 统计字符出现次数
|
||||
Map<Character, Integer> charCount = new HashMap<>();
|
||||
for (char c : content.toString().toCharArray()) {
|
||||
for (char c : content.toCharArray()) {
|
||||
charCount.put(c, charCount.getOrDefault(c, 0) + 1);
|
||||
}
|
||||
|
||||
@ -217,7 +217,8 @@ public class DataProcessUtil {
|
||||
int repeatedChars = 0;
|
||||
for (int count : charCount.values()) {
|
||||
if (count > 1) {
|
||||
repeatedChars += count - 1; // 只计算重复的部分
|
||||
// 只计算重复的部分
|
||||
repeatedChars += count - 1;
|
||||
}
|
||||
}
|
||||
|
||||
@ -336,7 +337,7 @@ public class DataProcessUtil {
|
||||
|
||||
// 检测是否包含色情暴力词
|
||||
boolean isFalse = SensitiveWordHelper.contains(content);
|
||||
if (!isFalse){
|
||||
if (!isFalse) {
|
||||
return false;
|
||||
}
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user