From 47c88e3ebc15f4e5168d6517df0c00060e475dc5 Mon Sep 17 00:00:00 2001 From: zhangtao Date: Sun, 5 Jan 2025 19:58:51 +0800 Subject: [PATCH 1/2] =?UTF-8?q?=E6=95=B0=E6=8D=AE=E8=BF=87=E6=BB=A4?= =?UTF-8?q?=E5=B7=A5=E5=85=B7=E7=B1=BB?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../module/llm/utils/DataProcessUtil.java | 29 +++++++++++++------ 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/yudao-module-llm/yudao-module-llm-biz/src/main/java/cn/iocoder/yudao/module/llm/utils/DataProcessUtil.java b/yudao-module-llm/yudao-module-llm-biz/src/main/java/cn/iocoder/yudao/module/llm/utils/DataProcessUtil.java index 7ca626f09..0de44a2fc 100644 --- a/yudao-module-llm/yudao-module-llm-biz/src/main/java/cn/iocoder/yudao/module/llm/utils/DataProcessUtil.java +++ b/yudao-module-llm/yudao-module-llm-biz/src/main/java/cn/iocoder/yudao/module/llm/utils/DataProcessUtil.java @@ -400,15 +400,16 @@ public class DataProcessUtil { text = hashMatcher.replaceAll(""); // 使用StringBuilder和StringBuilder的replace方法去除其他数字,但跳过年份和简单数字 - StringBuilder sb = new StringBuilder(text); - int index = 0; - while ((index = findNextNumberToReplace(sb.toString())) != -1) { - String number = sb.substring(index, findEndOfNumber(sb.toString(), index)); - if (!isYear(number) && !isSimpleNumber(number)) { - sb.replace(index, index + number.length(), ""); - } - } - return sb.toString(); + // TODO: 这里目前有bug,先注释掉了。 +// StringBuilder sb = new StringBuilder(text); +// int index = 0; +// while ((index = findNextNumberToReplace(sb.toString())) != -1) { +// String number = sb.substring(index, findEndOfNumber(sb.toString(), index)); +// if (!isYear(number) && !isSimpleNumber(number)) { +// sb.replace(index, index + number.length(), ""); +// } +// } + return text; } // 查找下一个要替换的数字的起始索引 @@ -463,4 +464,14 @@ public class DataProcessUtil { return false; } } + + public static void main(String[] args) { + String textWithIdentifiers = "Here are some identifiers: 123-456-7890, 1234567812345678, a1b2c3d4e5f6a1b2c3d4e5f6, 2023, and 987654."; + + // 去除标识符 + String textWithoutIdentifiers = removeIdentifiers(textWithIdentifiers); + + // 打印结果 + System.out.println(textWithoutIdentifiers); + } } From ea506b84db51b8ab0b374890189f1def82f04b7e Mon Sep 17 00:00:00 2001 From: zhangtao Date: Sun, 5 Jan 2025 20:01:18 +0800 Subject: [PATCH 2/2] =?UTF-8?q?=E6=95=B0=E6=8D=AE=E8=BF=87=E6=BB=A4?= =?UTF-8?q?=E5=B7=A5=E5=85=B7=E7=B1=BB?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../llm/service/async/AsyncDataProcessService.java | 3 +-- .../yudao/module/llm/utils/DataProcessUtil.java | 10 +++++++--- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/yudao-module-llm/yudao-module-llm-biz/src/main/java/cn/iocoder/yudao/module/llm/service/async/AsyncDataProcessService.java b/yudao-module-llm/yudao-module-llm-biz/src/main/java/cn/iocoder/yudao/module/llm/service/async/AsyncDataProcessService.java index 64523a8a2..5d8ec52a9 100644 --- a/yudao-module-llm/yudao-module-llm-biz/src/main/java/cn/iocoder/yudao/module/llm/service/async/AsyncDataProcessService.java +++ b/yudao-module-llm/yudao-module-llm-biz/src/main/java/cn/iocoder/yudao/module/llm/service/async/AsyncDataProcessService.java @@ -1,6 +1,5 @@ package cn.iocoder.yudao.module.llm.service.async; -import cn.hutool.json.JSONArray; import cn.hutool.json.JSONObject; import cn.iocoder.yudao.framework.common.util.collection.CollectionUtils; import cn.iocoder.yudao.framework.common.util.object.BeanUtils; @@ -125,7 +124,7 @@ public class AsyncDataProcessService { // 繁体转简体 繁体转简体,如“不經意,妳的笑容”清洗成“不经意,你的笑容” JSONObject a_4 = a.getJSONObject("a_4"); if (a_4.getBool("is_on")) { - result = DataProcessUtil.TraditionalToSimplified(result); + result = DataProcessUtil.traditionalToSimplified(result); } // 去除网页标识符 移除文档中的html标签,如,

等 JSONObject a_5 = a.getJSONObject("a_5"); diff --git a/yudao-module-llm/yudao-module-llm-biz/src/main/java/cn/iocoder/yudao/module/llm/utils/DataProcessUtil.java b/yudao-module-llm/yudao-module-llm-biz/src/main/java/cn/iocoder/yudao/module/llm/utils/DataProcessUtil.java index 0de44a2fc..47d3a3ab1 100644 --- a/yudao-module-llm/yudao-module-llm-biz/src/main/java/cn/iocoder/yudao/module/llm/utils/DataProcessUtil.java +++ b/yudao-module-llm/yudao-module-llm-biz/src/main/java/cn/iocoder/yudao/module/llm/utils/DataProcessUtil.java @@ -77,7 +77,7 @@ public class DataProcessUtil { * @param input * @return */ - public static String TraditionalToSimplified(String input) { + public static String traditionalToSimplified(String input) { return ZhConverterUtil.toSimple(input); } @@ -467,11 +467,15 @@ public class DataProcessUtil { public static void main(String[] args) { String textWithIdentifiers = "Here are some identifiers: 123-456-7890, 1234567812345678, a1b2c3d4e5f6a1b2c3d4e5f6, 2023, and 987654."; - // 去除标识符 String textWithoutIdentifiers = removeIdentifiers(textWithIdentifiers); - // 打印结果 System.out.println(textWithoutIdentifiers); + + String traditionalText = "不經意,妳的笑容"; + String simplifiedText = traditionalToSimplified(traditionalText); + + System.out.println("繁体文本: [" + traditionalText + "]"); + System.out.println("简体文本: [" + simplifiedText + "]"); } }