[update] 数据处理:去隐私配置 去除电话号码修改

This commit is contained in:
Liuyang 2025-01-22 18:19:59 +08:00
parent d80c8ba7b6
commit c7a0e2468a

View File

@ -4,7 +4,6 @@ import com.github.houbb.opencc4j.util.ZhConverterUtil;
import com.github.houbb.sensitive.word.core.SensitiveWordHelper;
import lombok.extern.slf4j.Slf4j;
import java.io.UnsupportedEncodingException;
import java.nio.charset.StandardCharsets;
import java.time.Year;
import java.time.format.DateTimeFormatter;
@ -391,7 +390,7 @@ public class DataProcessUtil {
.sorted(Map.Entry.comparingByKey())
.forEachOrdered(x -> sortedContentMap.put(x.getKey(), x.getValue()));
sortedContentMap.forEach((key, value)->{
sortedContentMap.forEach((key, value) -> {
log.info("key:{},value:{}", key, value);
});
// 有顺序的转换成文本内容使用 LinkedList 存储文本内容
@ -400,7 +399,7 @@ public class DataProcessUtil {
.map(Map.Entry::getValue)
.collect(Collectors.toCollection(LinkedList::new));
contents.forEach(v->{
contents.forEach(v -> {
log.info("value:{}", v);
});
@ -524,17 +523,24 @@ public class DataProcessUtil {
return ipv6Matcher.replaceAll("");
}
// Legacy, deliberately simplified phone pattern: digit groups of 3-2-4 with optional hyphens.
private static final String PHONE_REGEX = "\\b\\d{3}-?\\d{2}-?\\d{4}\\b";
/**
 * Mobile number: an 11-digit run starting with 1.
 * Digit lookarounds are used instead of {@code \b} so that a number directly adjacent to
 * letters is still matched, while an 11-digit slice of a longer digit run (card number,
 * ID, ...) is left alone.
 */
private static final String MOBILE_REGEX = "(?<!\\d)1\\d{10}(?!\\d)";
/**
 * Domestic landline: an optional 3- or 4-digit area code followed by a hyphen, then a
 * 7- or 8-digit subscriber number. The lookarounds stop the pattern from consuming part
 * of a longer digit run. Any standalone 7- or 8-digit run still matches, including
 * values that are not phone numbers (for example a yyyyMMdd date).
 */
private static final String DOMESTIC_PHONE_REGEX = "(?<!\\d)(\\d{4}-|\\d{3}-)?(\\d{8}|\\d{7})(?!\\d)";
/**
 * Card number: 16 or 19 digits with a non-zero leading digit.
 * Bounded by digit lookarounds rather than {@code ^...$} so the pattern can be used with
 * {@code find()/replaceAll()} on running text; {@code matches()} on a bare card number
 * still succeeds exactly as with the anchored form.
 */
private static final String CREDIT_CARD_REGEX = "(?<!\\d)([1-9])(\\d{15}|\\d{18})(?!\\d)";
// Hexadecimal digest of exactly 32 hex characters (the length of a 128-bit digest such as MD5).
// NOTE(review): a SHA-256 digest is 64 hex characters and is not matched by this pattern.
private static final String HASH_REGEX = "\\b[a-fA-F0-9]{32}\\b";
// Compile each expression once; Pattern instances are immutable and safe to share.
private static final Pattern PHONE_PATTERN = Pattern.compile(PHONE_REGEX);
private static final Pattern MOBILE_PATTERN = Pattern.compile(MOBILE_REGEX);
private static final Pattern DOMESTIC_PHONE_PATTERN = Pattern.compile(DOMESTIC_PHONE_REGEX);
private static final Pattern CREDIT_CARD_PATTERN = Pattern.compile(CREDIT_CARD_REGEX);
private static final Pattern HASH_PATTERN = Pattern.compile(HASH_REGEX);
@ -560,25 +566,39 @@ public class DataProcessUtil {
* @return
*/
public static String removeIdentifiers(String text) {
    // Purpose: strip identifying tokens (phone numbers, card numbers, hex hashes) from text.
    // Null or empty input is returned unchanged; Pattern.matcher(null) would otherwise throw.
    if (text == null || text.isEmpty()) {
        return text;
    }
    // Phone numbers: handled by removePhone, which replaces the former single PHONE_PATTERN pass.
    text = removePhone(text);
    // Credit card numbers.
    text = CREDIT_CARD_PATTERN.matcher(text).replaceAll("");
    // Hexadecimal hashes.
    text = HASH_PATTERN.matcher(text).replaceAll("");
    // Intended follow-up pass: remove the remaining numbers while skipping years and simple numbers.
    // TODO: this pass currently has a bug and is disabled.
    // StringBuilder sb = new StringBuilder(text);
    // int index = 0;
    // while ((index = findNextNumberToReplace(sb.toString())) != -1) {
    //     String number = sb.substring(index, findEndOfNumber(sb.toString(), index));
    //     if (!isYear(number) && !isSimpleNumber(number)) {
    //         sb.replace(index, index + number.length(), "");
    //     }
    // }
    return text;
}
/**
 * Removes phone numbers from the given text.
 * <p>
 * Matches of {@code MOBILE_PATTERN} are deleted first; matches of
 * {@code DOMESTIC_PHONE_PATTERN} are then deleted from that intermediate result.
 *
 * @param text the text to scrub
 * @return the text with every match of both patterns replaced by the empty string
 */
private static String removePhone(String text) {
    String withoutMobile = MOBILE_PATTERN.matcher(text).replaceAll("");
    return DOMESTIC_PHONE_PATTERN.matcher(withoutMobile).replaceAll("");
}
@ -642,20 +662,21 @@ public class DataProcessUtil {
// 打印结果
log.info(textWithoutIdentifiers);
String traditionalText = "不經意,妳的笑容";
String simplifiedText = traditionalToSimplified(traditionalText);
log.info("繁体文本: [" + traditionalText + "]");
log.info("简体文本: [" + simplifiedText + "]");
String dirtyString="?<3F><>简体文<E4BD93><E69687><EFBFBD>f?<3F>G<EFBFBD><47>?<3F><>??<3F>G<EFBFBD>G<EFBFBD><47>پ?<3F>l?,,,杩欐槸涓€涓\\uE043贡鐮";
// 先进行编码转换
dirtyString = convertEncoding(dirtyString);
// 再进行乱码和无意义 Unicode 字符的清理
String cleanString = clean(dirtyString);
// String s1 = removeNonPrintableUnicodeChars(s);
log.info("去除乱码:[{}]", cleanString);
// String traditionalText = "不經意,妳的笑容";
// String simplifiedText = traditionalToSimplified(traditionalText);
//
// log.info("繁体文本: [" + traditionalText + "]");
// log.info("简体文本: [" + simplifiedText + "]");
//String dirtyString="?<3F><>简体文<E4BD93><E69687><EFBFBD>f?<3F>G<EFBFBD><47>?<3F><>??<3F>G<EFBFBD>G<EFBFBD><47>پ?<3F>l?,,,杩欐槸涓€涓\\uE043贡鐮";
// // 先进行编码转换
// dirtyString = convertEncoding(dirtyString);
// // 再进行乱码和无意义 Unicode 字符的清理
// String cleanString = clean(dirtyString);
//// String s1 = removeNonPrintableUnicodeChars(s);
// log.info("去除乱码:[{}]", cleanString);
}
public static String clean(String input) {
public static String clean (String input) {
// 更广泛的乱码字符范围包括一些扩展的不可打印字符
String cleanString = input.replaceAll("[\\x00-\\x1F\\x7F-\\x9F\\uFFFD]", "");
// 去除无意义的 Unicode 字符这里范围可根据实际情况修改
@ -663,7 +684,7 @@ String dirtyString="?<3F><>简体文<E4BD93><E69687><EFBFBD>f?<3F>G<EFBFBD><47>?<3F><>??<3F>G<EFBFBD>G<EFBFBD><47>
return cleanString;
}
public static String convertEncoding(String input) {
public static String convertEncoding (String input) {
// 尝试多种编码转换找到正确的编码
String[] encodings = {"UTF-8", "GBK", "Big5", "ISO-8859-1"};
for (String encoding : encodings) {