refactor(llm): 优化文本过滤功能和日志输出
- 修复字重复率计算逻辑,提高准确性
- 增加对词重复率、特殊字符率等过滤条件的日志输出
- 优化特殊字符率计算方法,不再将汉字计为特殊字符
- 调整日志输出格式,提高可读性
This commit is contained in:
parent
2a3bc9c0a9
commit
7d48924be6
@ -335,7 +335,7 @@ public class AsyncDataProcessService {
|
||||
// 检查文档的词数目
|
||||
String cleanedValue = dataFilterDetailByWordNumber(optionsB, value);
|
||||
// Continue the remaining checks using the text returned by the word-count check
|
||||
log.info("检测文档次数目后的文本:{}", cleanedValue);
|
||||
log.info("检测文档词数目后的文本:{}", cleanedValue);
|
||||
// 其他过滤配置
|
||||
boolean shouldFilter = dataFilterDetailByOther(optionsB, cleanedValue);
|
||||
if (!shouldFilter) {
|
||||
@ -377,6 +377,7 @@ public class AsyncDataProcessService {
|
||||
public static boolean dataFilterDetailByOther (JSONObject options, String text) {
|
||||
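// Character repetition rate check: the document is filtered out when the rate exceeds the threshold. The rate is compared as a percentage (0-100), not as a [0,1] fraction. NOTE(review): num1 is read as an int below, so a fractional threshold is not preserved - confirm this is intended.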
|
||||
JSONObject b_2 = options.getJSONObject("b_2");
|
||||
log.info("检查文档的字重复率:{}", b_2);
|
||||
if (b_2.getBool("is_on")) {
|
||||
int threshold = b_2.getInt("num1");
|
||||
if (DataProcessUtil.calculateCharacterRepetitionRate(text, threshold)) {
|
||||
@ -386,6 +387,7 @@ public class AsyncDataProcessService {
|
||||
|
||||
// 检查文档的词重复率:如果词重复率太高,意味着文档中重复的词太多,文档会被过滤掉,取值范围[0,1]。
|
||||
JSONObject b_3 = options.getJSONObject("b_3");
|
||||
log.info("检查文档的词重复率:{}", b_3);
|
||||
if (b_3.getBool("is_on")) {
|
||||
double threshold = b_3.getDouble("num1");
|
||||
if (DataProcessUtil.calculateWordRepetitionRate(text, threshold)) {
|
||||
@ -395,6 +397,7 @@ public class AsyncDataProcessService {
|
||||
|
||||
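// Special character rate check: the document is filtered out when the rate exceeds the threshold. The threshold is a percentage in the range 0-100 (not a [0,1] fraction); checkSpecialCharacterRate throws IllegalArgumentException for values outside that range.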
|
||||
JSONObject b_4 = options.getJSONObject("b_4");
|
||||
log.info("检查文档的特殊字符率:{}", b_4);
|
||||
if (b_4.getBool("is_on")) {
|
||||
double threshold = b_4.getDouble("num1");
|
||||
if (DataProcessUtil.checkSpecialCharacterRate(text, threshold)) {
|
||||
@ -405,9 +408,11 @@ public class AsyncDataProcessService {
|
||||
|
||||
// 检查文档的色情暴力词率:如果色情暴力词率太高,文档会被过滤掉,取值范围[0,1]。
|
||||
JSONObject b_5 = options.getJSONObject("b_5");
|
||||
log.info("检查文档的色情暴力词率:{}", b_5);
|
||||
if (b_5.getBool("is_on")) {
|
||||
double threshold = b_5.getDouble("num1");
|
||||
if (DataProcessUtil.checkSensitiveWordRate(text, threshold)) {
|
||||
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
@ -5,6 +5,7 @@ import com.github.houbb.sensitive.word.core.SensitiveWordHelper;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.text.DecimalFormat;
|
||||
import java.time.Year;
|
||||
import java.time.format.DateTimeFormatter;
|
||||
import java.util.*;
|
||||
@ -203,37 +204,53 @@ public class DataProcessUtil {
|
||||
* @param threshold 设置字重复率的阈值,例如10%
|
||||
* @return true if the character repetition rate exceeds the threshold (the document should be filtered out); false otherwise
|
||||
*/
|
||||
public static boolean calculateCharacterRepetitionRate (String content, double threshold) {
|
||||
// StringBuilder content = new StringBuilder();
|
||||
// for (String line : lines) {
|
||||
// content.append(line);
|
||||
// }
|
||||
|
||||
// 统计字符出现次数
|
||||
Map<Character, Integer> charCount = new HashMap<>();
|
||||
for (char c : content.toCharArray()) {
|
||||
charCount.put(c, charCount.getOrDefault(c, 0) + 1);
|
||||
public static boolean calculateCharacterRepetitionRate(String content, double threshold) {
|
||||
// 输入校验
|
||||
if (content == null || content.trim().isEmpty()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// 计算重复字符数和总字符数
|
||||
int totalChars = content.length();
|
||||
int repeatedChars = 0;
|
||||
for (int count : charCount.values()) {
|
||||
if (count > 1) {
|
||||
// 只计算重复的部分
|
||||
repeatedChars += count - 1;
|
||||
// 预处理(去空格、标点等)
|
||||
String processedContent = content
|
||||
.replaceAll("\\s+", "")
|
||||
.replaceAll("[\\pP\\pS]", "");
|
||||
|
||||
// 短文本不检查
|
||||
if (processedContent.length() < 5) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// 统计字符频率
|
||||
Map<Character, Integer> charCount = new HashMap<>();
|
||||
char[] chars = processedContent.toCharArray();
|
||||
for (char c : chars) {
|
||||
if (isChineseCharacter(c)) { // 可选:仅统计中文
|
||||
charCount.put(c, charCount.getOrDefault(c, 0) + 1);
|
||||
}
|
||||
}
|
||||
|
||||
// 计算字重复率
|
||||
double repetitionRate = (double) repeatedChars / totalChars;
|
||||
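// Repetition rate = (total characters - distinct counted characters) / total characters
// NOTE(review): totalChars counts every character left after preprocessing, but charCount only holds Han characters, so each non-Han character (letters, digits) is counted as a repeat - confirm this is intended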
|
||||
int totalChars = chars.length;
|
||||
double repetitionRate = (double) (totalChars - charCount.size()) / totalChars;
|
||||
|
||||
// 打印重复率和阈值,方便调试
|
||||
log.info("字重复率: " + repetitionRate);
|
||||
log.info("阈值: " + threshold);
|
||||
// 将重复率转换为百分比(0~100),以便与阈值直接比较
|
||||
double repetitionPercent = repetitionRate * 100;
|
||||
|
||||
// 如果重复率超过阈值,返回true表示需要过滤掉文档
|
||||
return repetitionRate > threshold;
|
||||
// 调试日志(输出百分比)
|
||||
log.info("总字数: {}", totalChars);
|
||||
log.info("重复字数: {}", totalChars - charCount.size());
|
||||
log.info("字重复率: {}%", String.format("%.2f", repetitionPercent));
|
||||
|
||||
// 比较前可添加浮点数容差(可选)
|
||||
final double EPSILON = 0.0001;
|
||||
return repetitionPercent - threshold > EPSILON;
|
||||
}
|
||||
|
||||
|
||||
// 判断是否为中文字符(可选)
|
||||
private static boolean isChineseCharacter(char c) {
|
||||
Character.UnicodeScript sc = Character.UnicodeScript.of(c);
|
||||
return sc == Character.UnicodeScript.HAN;
|
||||
}
|
||||
|
||||
// 简单的基于空格和标点符号的分词方法
|
||||
@ -299,31 +316,58 @@ public class DataProcessUtil {
|
||||
* @param threshold
|
||||
* @return
|
||||
*/
|
||||
public static boolean checkSpecialCharacterRate (String content, double threshold) {
|
||||
/**
|
||||
* 检测文本中特殊字符率是否超过阈值(阈值范围0~100.00)
|
||||
* @param content 待检测文本
|
||||
* @param threshold 百分比阈值(如传入10表示10%)
|
||||
* @return 超过阈值返回true
|
||||
*/
|
||||
public static boolean checkSpecialCharacterRate(String content, double threshold) {
|
||||
// 参数校验
|
||||
if (content == null || content.isEmpty()) {
|
||||
log.warn("输入内容为空");
|
||||
return false;
|
||||
}
|
||||
if (threshold < 0 || threshold > 100) {
|
||||
throw new IllegalArgumentException("阈值必须是0~100之间的数值");
|
||||
}
|
||||
|
||||
log.info("特殊字符检测:{}", content);
|
||||
// 使用正则表达式匹配特殊字符(非字母数字字符)
|
||||
Pattern pattern = Pattern.compile("[^a-zA-Z0-9]");
|
||||
Matcher matcher = pattern.matcher(content);
|
||||
// 预处理:去除所有空白字符(可选)
|
||||
String processedContent = content.replaceAll("\\s+", "");
|
||||
int totalCharCount = processedContent.length();
|
||||
|
||||
// 空文本或纯空白内容处理
|
||||
if (totalCharCount == 0) {
|
||||
log.info("有效字符数为0");
|
||||
return false;
|
||||
}
|
||||
|
||||
// 统计特殊字符(非字母、数字、汉字)
|
||||
// 正则说明:
|
||||
// [^a-zA-Z0-9\\p{Script=Han}] → 排除字母数字和汉字
|
||||
// 如需包含其他语言字符,需调整正则
|
||||
Pattern pattern = Pattern.compile("[^a-zA-Z0-9\\p{Script=Han}]");
|
||||
Matcher matcher = pattern.matcher(processedContent);
|
||||
|
||||
// 统计特殊字符的数量
|
||||
int specialCharCount = 0;
|
||||
while (matcher.find()) {
|
||||
specialCharCount++;
|
||||
}
|
||||
|
||||
// 计算文档的总字符数(不包括换行符等空白字符,可以根据需要调整)
|
||||
int totalCharCount = content.length(); // 或者使用 content.replaceAll("\\s+", "").length() 来排除空白字符
|
||||
// 计算特殊字符率(转换为百分比)
|
||||
double specialCharRatePercent = (double) specialCharCount / totalCharCount * 100;
|
||||
|
||||
// 计算特殊字符率
|
||||
double specialCharRate = (double) specialCharCount / totalCharCount;
|
||||
// 调试日志(保留2位小数)
|
||||
DecimalFormat df = new DecimalFormat("0.00");
|
||||
log.info("特殊字符检测结果: {}/{}={}% (阈值: {}%)",
|
||||
specialCharCount,
|
||||
totalCharCount,
|
||||
df.format(specialCharRatePercent),
|
||||
df.format(threshold));
|
||||
|
||||
// 打印特殊字符率和阈值,方便调试
|
||||
log.info("特殊字符率: " + specialCharRate);
|
||||
log.info("阈值: " + threshold);
|
||||
|
||||
// 如果特殊字符率超过阈值,返回true表示需要过滤掉文档
|
||||
return specialCharRate > threshold;
|
||||
// Compare with a 1e-6 tolerance so floating-point rounding error does not push a borderline value over the threshold
|
||||
final double EPSILON = 1e-6;
|
||||
return specialCharRatePercent - threshold > EPSILON;
|
||||
}
|
||||
|
||||
/**
|
||||
@ -490,7 +534,7 @@ public class DataProcessUtil {
|
||||
String modifiedContent = removeEmails(content);
|
||||
|
||||
// 或者打印到控制台以查看结果
|
||||
log.info(modifiedContent);
|
||||
log.info("去除电子邮件地址:{}", modifiedContent);
|
||||
return modifiedContent;
|
||||
}
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user