refactor(llm): 优化文本过滤功能和日志输出

- 修复字重复率计算逻辑,提高准确性
- 增加对词重复率、特殊字符率等过滤条件的日志输出
- 优化特殊字符率计算方法,考虑非中文字符
- 调整日志输出格式,提高可读性
This commit is contained in:
Liuyang 2025-07-07 15:15:57 +08:00
parent 2a3bc9c0a9
commit 7d48924be6
2 changed files with 91 additions and 42 deletions

View File

@ -335,7 +335,7 @@ public class AsyncDataProcessService {
// 检查文档的词数目
String cleanedValue = dataFilterDetailByWordNumber(optionsB, value);
// 使用检测文档次数目后的文本继续进行检测
log.info("检测文档数目后的文本:{}", cleanedValue);
log.info("检测文档数目后的文本:{}", cleanedValue);
// 其他过滤配置
boolean shouldFilter = dataFilterDetailByOther(optionsB, cleanedValue);
if (!shouldFilter) {
@ -377,6 +377,7 @@ public class AsyncDataProcessService {
public static boolean dataFilterDetailByOther (JSONObject options, String text) {
// 检查文档的字重复率如果字重复率太高意味着文档中重复的字太多文档会被过滤掉取值范围[0,1]
JSONObject b_2 = options.getJSONObject("b_2");
log.info("检查文档的字重复率:{}", b_2);
if (b_2.getBool("is_on")) {
int threshold = b_2.getInt("num1");
if (DataProcessUtil.calculateCharacterRepetitionRate(text, threshold)) {
@ -386,6 +387,7 @@ public class AsyncDataProcessService {
// 检查文档的词重复率如果词重复率太高意味着文档中重复的词太多文档会被过滤掉取值范围[0,1]
JSONObject b_3 = options.getJSONObject("b_3");
log.info("检查文档的词重复率:{}", b_3);
if (b_3.getBool("is_on")) {
double threshold = b_3.getDouble("num1");
if (DataProcessUtil.calculateWordRepetitionRate(text, threshold)) {
@ -395,6 +397,7 @@ public class AsyncDataProcessService {
// 检查文档的特殊字符率如果特殊字符率太高意味着文档中特殊字符太多文档会被过滤掉取值范围[0,1]
JSONObject b_4 = options.getJSONObject("b_4");
log.info("检查文档的特殊字符率:{}", b_4);
if (b_4.getBool("is_on")) {
double threshold = b_4.getDouble("num1");
if (DataProcessUtil.checkSpecialCharacterRate(text, threshold)) {
@ -405,9 +408,11 @@ public class AsyncDataProcessService {
// 检查文档的色情暴力词率如果色情暴力词率太高文档会被过滤掉取值范围[0,1]
JSONObject b_5 = options.getJSONObject("b_5");
log.info("检查文档的色情暴力词率:{}", b_5);
if (b_5.getBool("is_on")) {
double threshold = b_5.getDouble("num1");
if (DataProcessUtil.checkSensitiveWordRate(text, threshold)) {
return true;
}
}

View File

@ -5,6 +5,7 @@ import com.github.houbb.sensitive.word.core.SensitiveWordHelper;
import lombok.extern.slf4j.Slf4j;
import java.nio.charset.StandardCharsets;
import java.text.DecimalFormat;
import java.time.Year;
import java.time.format.DateTimeFormatter;
import java.util.*;
@ -203,37 +204,53 @@ public class DataProcessUtil {
* @param threshold 设置字重复率的阈值例如10%
* @return true if the character repetition rate exceeds the threshold (the document should be filtered out); false otherwise
*/
public static boolean calculateCharacterRepetitionRate (String content, double threshold) {
// StringBuilder content = new StringBuilder();
// for (String line : lines) {
// content.append(line);
// }
// 统计字符出现次数
Map<Character, Integer> charCount = new HashMap<>();
for (char c : content.toCharArray()) {
charCount.put(c, charCount.getOrDefault(c, 0) + 1);
public static boolean calculateCharacterRepetitionRate(String content, double threshold) {
// 输入校验
if (content == null || content.trim().isEmpty()) {
return false;
}
// 计算重复字符数和总字符数
int totalChars = content.length();
int repeatedChars = 0;
for (int count : charCount.values()) {
if (count > 1) {
// 只计算重复的部分
repeatedChars += count - 1;
// 预处理去空格标点等
String processedContent = content
.replaceAll("\\s+", "")
.replaceAll("[\\pP\\pS]", "");
// 短文本不检查
if (processedContent.length() < 5) {
return false;
}
// 统计字符频率
Map<Character, Integer> charCount = new HashMap<>();
char[] chars = processedContent.toCharArray();
for (char c : chars) {
if (isChineseCharacter(c)) { // 可选仅统计中文
charCount.put(c, charCount.getOrDefault(c, 0) + 1);
}
}
// 计算字重复率
double repetitionRate = (double) repeatedChars / totalChars;
// 计算重复率方式1传统重复率
int totalChars = chars.length;
double repetitionRate = (double) (totalChars - charCount.size()) / totalChars;
// 打印重复率和阈值方便调试
log.info("字重复率: " + repetitionRate);
log.info("阈值: " + threshold);
// 将重复率转换为百分比0100以便与阈值直接比较
double repetitionPercent = repetitionRate * 100;
// 如果重复率超过阈值返回true表示需要过滤掉文档
return repetitionRate > threshold;
// 调试日志输出百分比
log.info("总字数: {}", totalChars);
log.info("重复字数: {}", totalChars - charCount.size());
log.info("字重复率: {}%", String.format("%.2f", repetitionPercent));
// 比较前可添加浮点数容差可选
final double EPSILON = 0.0001;
return repetitionPercent - threshold > EPSILON;
}
// 判断是否为中文字符可选
private static boolean isChineseCharacter(char c) {
Character.UnicodeScript sc = Character.UnicodeScript.of(c);
return sc == Character.UnicodeScript.HAN;
}
// 简单的基于空格和标点符号的分词方法
@ -299,31 +316,58 @@ public class DataProcessUtil {
* @param threshold
* @return
*/
public static boolean checkSpecialCharacterRate (String content, double threshold) {
/**
* 检测文本中特殊字符率是否超过阈值阈值范围0100.00
* @param content 待检测文本
* @param threshold 百分比阈值如传入10表示10%
* @return 超过阈值返回true
*/
public static boolean checkSpecialCharacterRate(String content, double threshold) {
// 参数校验
if (content == null || content.isEmpty()) {
log.warn("输入内容为空");
return false;
}
if (threshold < 0 || threshold > 100) {
throw new IllegalArgumentException("阈值必须是0100之间的数值");
}
log.info("特殊字符检测:{}", content);
// 使用正则表达式匹配特殊字符非字母数字字符
Pattern pattern = Pattern.compile("[^a-zA-Z0-9]");
Matcher matcher = pattern.matcher(content);
// 预处理去除所有空白字符可选
String processedContent = content.replaceAll("\\s+", "");
int totalCharCount = processedContent.length();
// 空文本或纯空白内容处理
if (totalCharCount == 0) {
log.info("有效字符数为0");
return false;
}
// 统计特殊字符非字母数字汉字
// 正则说明
// [^a-zA-Z0-9\\p{Script=Han}] 排除字母数字和汉字
// 如需包含其他语言字符需调整正则
Pattern pattern = Pattern.compile("[^a-zA-Z0-9\\p{Script=Han}]");
Matcher matcher = pattern.matcher(processedContent);
// 统计特殊字符的数量
int specialCharCount = 0;
while (matcher.find()) {
specialCharCount++;
}
// 计算文档的总字符数不包括换行符等空白字符可以根据需要调整
int totalCharCount = content.length(); // 或者使用 content.replaceAll("\\s+", "").length() 来排除空白字符
// 计算特殊字符率转换为百分比
double specialCharRatePercent = (double) specialCharCount / totalCharCount * 100;
// 计算特殊字符率
double specialCharRate = (double) specialCharCount / totalCharCount;
// 调试日志保留2位小数
DecimalFormat df = new DecimalFormat("0.00");
log.info("特殊字符检测结果: {}/{}={}% (阈值: {}%)",
specialCharCount,
totalCharCount,
df.format(specialCharRatePercent),
df.format(threshold));
// 打印特殊字符率和阈值方便调试
log.info("特殊字符率: " + specialCharRate);
log.info("阈值: " + threshold);
// 如果特殊字符率超过阈值返回true表示需要过滤掉文档
return specialCharRate > threshold;
// 浮点数精确比较添加1e-6容差
final double EPSILON = 1e-6;
return specialCharRatePercent - threshold > EPSILON;
}
/**
@ -490,7 +534,7 @@ public class DataProcessUtil {
String modifiedContent = removeEmails(content);
// 或者打印到控制台以查看结果
log.info(modifiedContent);
log.info("去除电子邮件地址:{}", modifiedContent);
return modifiedContent;
}