Merge remote-tracking branch 'origin/master'
This commit is contained in:
commit
0c3b5793f0
@ -4,7 +4,6 @@ import com.github.houbb.opencc4j.util.ZhConverterUtil;
|
||||
import com.github.houbb.sensitive.word.core.SensitiveWordHelper;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
import java.io.UnsupportedEncodingException;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.time.Year;
|
||||
import java.time.format.DateTimeFormatter;
|
||||
@ -391,7 +390,7 @@ public class DataProcessUtil {
|
||||
.sorted(Map.Entry.comparingByKey())
|
||||
.forEachOrdered(x -> sortedContentMap.put(x.getKey(), x.getValue()));
|
||||
|
||||
sortedContentMap.forEach((key, value)->{
|
||||
sortedContentMap.forEach((key, value) -> {
|
||||
log.info("key:{},value:{}", key, value);
|
||||
});
|
||||
// 有顺序的转换成文本内容,使用 LinkedList 存储文本内容
|
||||
@ -400,7 +399,7 @@ public class DataProcessUtil {
|
||||
.map(Map.Entry::getValue)
|
||||
.collect(Collectors.toCollection(LinkedList::new));
|
||||
|
||||
contents.forEach(v->{
|
||||
contents.forEach(v -> {
|
||||
log.info("value:{}", v);
|
||||
});
|
||||
|
||||
@ -524,17 +523,27 @@ public class DataProcessUtil {
|
||||
return ipv6Matcher.replaceAll("");
|
||||
}
|
||||
|
||||
// 定义一个正则表达式来匹配常见的电话号码格式(这是一个非常简化的示例)
|
||||
private static final String PHONE_REGEX = "\\b\\d{3}-?\\d{2}-?\\d{4}\\b";
|
||||
/**
|
||||
* 手机号码的正则表达式
|
||||
*/
|
||||
private static final String MOBILE_REGEX = "1\\d{10}";
|
||||
|
||||
// 定义一个正则表达式来匹配常见的信用卡号格式(16位数字)
|
||||
private static final String CREDIT_CARD_REGEX = "\\b\\d{16}\\b";
|
||||
/**
|
||||
* 国内电话号码的正则表达式
|
||||
*/
|
||||
private static final String DOMESTIC_PHONE_REGEX = "(\\d{4}-|\\d{3}-)?(\\d{8}|\\d{7})";
|
||||
|
||||
/**
|
||||
* 信用卡号的正则表达式
|
||||
*/
|
||||
private static final String CREDIT_CARD_REGEX = "^([1-9]{1})(\\d{15}|\\d{18})$";
|
||||
|
||||
// 定义一个正则表达式来匹配常见的十六进制散列格式(32位十六进制数,用于SHA-256等)
|
||||
private static final String HASH_REGEX = "\\b[a-fA-F0-9]{32}\\b";
|
||||
|
||||
// 编译正则表达式为Pattern对象
|
||||
private static final Pattern PHONE_PATTERN = Pattern.compile(PHONE_REGEX);
|
||||
private static final Pattern MOBILE_PATTERN = Pattern.compile(MOBILE_REGEX);
|
||||
private static final Pattern DOMESTIC_PHONE_PATTERN = Pattern.compile(DOMESTIC_PHONE_REGEX);
|
||||
private static final Pattern CREDIT_CARD_PATTERN = Pattern.compile(CREDIT_CARD_REGEX);
|
||||
private static final Pattern HASH_PATTERN = Pattern.compile(HASH_REGEX);
|
||||
|
||||
@ -560,25 +569,56 @@ public class DataProcessUtil {
|
||||
* @return
|
||||
*/
|
||||
public static String removeIdentifiers (String text) {
|
||||
Matcher phoneMatcher = PHONE_PATTERN.matcher(text);
|
||||
text = phoneMatcher.replaceAll("");
|
||||
// 使用正则表达式匹配电话号码
|
||||
text = removePhone(text);
|
||||
|
||||
Matcher creditCardMatcher = CREDIT_CARD_PATTERN.matcher(text);
|
||||
text = creditCardMatcher.replaceAll("");
|
||||
// 使用正则表达式匹配信用卡号
|
||||
text = removeCreditCard(text);
|
||||
|
||||
// 使用正则表达式匹配十六进制散列
|
||||
Matcher hashMatcher = HASH_PATTERN.matcher(text);
|
||||
text = hashMatcher.replaceAll("");
|
||||
|
||||
// 使用StringBuilder和StringBuilder的replace方法去除其他数字,但跳过年份和简单数字
|
||||
// TODO: 这里目前有bug,先注释掉了。
|
||||
// StringBuilder sb = new StringBuilder(text);
|
||||
// int index = 0;
|
||||
// while ((index = findNextNumberToReplace(sb.toString())) != -1) {
|
||||
// String number = sb.substring(index, findEndOfNumber(sb.toString(), index));
|
||||
// if (!isYear(number) && !isSimpleNumber(number)) {
|
||||
// sb.replace(index, index + number.length(), "");
|
||||
// }
|
||||
// }
|
||||
// // 使用StringBuilder和StringBuilder的replace方法去除其他数字,但跳过年份和简单数字
|
||||
// // TODO: 这里目前有bug,先注释掉了。
|
||||
// StringBuilder sb = new StringBuilder(text);
|
||||
// int index = 0;
|
||||
// while ((index = findNextNumberToReplace(sb.toString())) != -1) {
|
||||
// String number = sb.substring(index, findEndOfNumber(sb.toString(), index));
|
||||
// if (!isYear(number) && !isSimpleNumber(number)) {
|
||||
// sb.replace(index, index + number.length(), "");
|
||||
// }
|
||||
// }
|
||||
return text;
|
||||
}
|
||||
|
||||
/**
|
||||
* 去除电话号码
|
||||
*
|
||||
* @param text 文本
|
||||
* @return 去除电话号码后的文本
|
||||
*/
|
||||
private static String removePhone (String text) {
|
||||
// 手机号码的正则表达式
|
||||
Matcher mobileMatcher = MOBILE_PATTERN.matcher(text);
|
||||
text = mobileMatcher.replaceAll("");
|
||||
|
||||
// 国内电话号码的正则表达式
|
||||
Matcher domesticPhoneMatcher = DOMESTIC_PHONE_PATTERN.matcher(text);
|
||||
text = domesticPhoneMatcher.replaceAll("");
|
||||
|
||||
return text;
|
||||
}
|
||||
|
||||
/**
|
||||
* 去除信用卡号
|
||||
*
|
||||
* @param text 文本
|
||||
* @return 去除信用卡号后的文本
|
||||
*/
|
||||
private static String removeCreditCard (String text) {
|
||||
Matcher creditCardMatcher = CREDIT_CARD_PATTERN.matcher(text);
|
||||
text = creditCardMatcher.replaceAll("");
|
||||
return text;
|
||||
}
|
||||
|
||||
@ -642,20 +682,21 @@ public class DataProcessUtil {
|
||||
// 打印结果
|
||||
log.info(textWithoutIdentifiers);
|
||||
|
||||
String traditionalText = "不經意,妳的笑容";
|
||||
String simplifiedText = traditionalToSimplified(traditionalText);
|
||||
|
||||
log.info("繁体文本: [" + traditionalText + "]");
|
||||
log.info("简体文本: [" + simplifiedText + "]");
|
||||
String dirtyString="?<3F><>简体文<E4BD93><E69687><EFBFBD>f?<3F>G<EFBFBD><47>?<3F><>??<3F>G<EFBFBD>G<EFBFBD><47>پ?<3F>l?,,,杩欐槸涓€涓\\uE043贡鐮";
|
||||
// 先进行编码转换
|
||||
dirtyString = convertEncoding(dirtyString);
|
||||
// 再进行乱码和无意义 Unicode 字符的清理
|
||||
String cleanString = clean(dirtyString);
|
||||
// String s1 = removeNonPrintableUnicodeChars(s);
|
||||
log.info("去除乱码:[{}]", cleanString);
|
||||
// String traditionalText = "不經意,妳的笑容";
|
||||
// String simplifiedText = traditionalToSimplified(traditionalText);
|
||||
//
|
||||
// log.info("繁体文本: [" + traditionalText + "]");
|
||||
// log.info("简体文本: [" + simplifiedText + "]");
|
||||
//String dirtyString="?<3F><>简体文<E4BD93><E69687><EFBFBD>f?<3F>G<EFBFBD><47>?<3F><>??<3F>G<EFBFBD>G<EFBFBD><47>پ?<3F>l?,,,杩欐槸涓€涓\\uE043贡鐮";
|
||||
// // 先进行编码转换
|
||||
// dirtyString = convertEncoding(dirtyString);
|
||||
// // 再进行乱码和无意义 Unicode 字符的清理
|
||||
// String cleanString = clean(dirtyString);
|
||||
//// String s1 = removeNonPrintableUnicodeChars(s);
|
||||
// log.info("去除乱码:[{}]", cleanString);
|
||||
}
|
||||
public static String clean(String input) {
|
||||
|
||||
public static String clean (String input) {
|
||||
// 更广泛的乱码字符范围,包括一些扩展的不可打印字符
|
||||
String cleanString = input.replaceAll("[\\x00-\\x1F\\x7F-\\x9F\\uFFFD]", "");
|
||||
// 去除无意义的 Unicode 字符,这里范围可根据实际情况修改
|
||||
@ -663,7 +704,7 @@ String dirtyString="?<3F><>简体文<E4BD93><E69687><EFBFBD>f?<3F>G<EFBFBD><47>?<3F><>??<3F>G<EFBFBD>G<EFBFBD><47>
|
||||
return cleanString;
|
||||
}
|
||||
|
||||
public static String convertEncoding(String input) {
|
||||
public static String convertEncoding (String input) {
|
||||
// 尝试多种编码转换,找到正确的编码
|
||||
String[] encodings = {"UTF-8", "GBK", "Big5", "ISO-8859-1"};
|
||||
for (String encoding : encodings) {
|
||||
|
Loading…
x
Reference in New Issue
Block a user