[update] 检查文档的字重复率参数修改

This commit is contained in:
Liuyang 2025-01-16 11:24:15 +08:00
parent a6dff7b3ab
commit 2d35090608

View File

@ -195,20 +195,20 @@ public class DataProcessUtil {
* 检查文档的字重复率
* <p>
* 如果字重复率太高，意味着文档中重复的字太多，文档会被过滤掉。
*
* @param lines 文档行列表
* @param threshold // 设置字重复率的阈值例如10%
* @return
* </p>
* @param content 文档行
* @param threshold 设置字重复率的阈值，例如10%
* @return true表示字重复率低于阈值；false表示字重复率高于阈值，文档会被过滤掉
*/
public static boolean calculateCharacterRepetitionRate (List<String> lines, double threshold) {
StringBuilder content = new StringBuilder();
for (String line : lines) {
content.append(line);
}
public static boolean calculateCharacterRepetitionRate (String content, double threshold) {
// StringBuilder content = new StringBuilder();
// for (String line : lines) {
// content.append(line);
// }
// 统计字符出现次数
Map<Character, Integer> charCount = new HashMap<>();
for (char c : content.toString().toCharArray()) {
for (char c : content.toCharArray()) {
charCount.put(c, charCount.getOrDefault(c, 0) + 1);
}
@ -217,7 +217,8 @@ public class DataProcessUtil {
int repeatedChars = 0;
for (int count : charCount.values()) {
if (count > 1) {
repeatedChars += count - 1; // 只计算重复的部分
// 只计算重复的部分
repeatedChars += count - 1;
}
}
@ -336,7 +337,7 @@ public class DataProcessUtil {
// 检测是否包含色情暴力词
boolean isFalse = SensitiveWordHelper.contains(content);
if (!isFalse){
if (!isFalse) {
return false;
}