feat(llm): improve data backup and deduplication

- Refine the data backup logic and keep the dataset length up to date
- Rework deduplication around the SimHash algorithm for efficient duplicate detection (see the sketch below)
- Improve log output for readability and performance
parent 7d48924be6
commit ce64bd451c
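The rework leans on HammingUtils.getSimHash and HammingUtils.getSimilarity, which are not part of this diff. As background, the sketch below shows one common way such a utility is written: hash each token of a text into a 64-bit fingerprint and turn the Hamming distance between two fingerprints into a 0~1 similarity score. The class name, the FNV-1a token hash and the whitespace tokenisation are assumptions for illustration only; the project's real HammingUtils may differ.

// Illustrative sketch only: what a SimHash fingerprint + Hamming similarity
// utility might look like. Not the repository's actual HammingUtils.
public final class SimHashSketch {

    // Build a 64-bit SimHash bit string from whitespace-separated tokens
    // (real implementations usually use a proper tokenizer and term weights).
    public static String getSimHash(String text) {
        int[] weights = new int[64];
        for (String token : text.split("\\s+")) {
            long h = fnv1a64(token);
            for (int bit = 0; bit < 64; bit++) {
                weights[bit] += ((h >>> bit) & 1L) == 1L ? 1 : -1;
            }
        }
        StringBuilder sb = new StringBuilder(64);
        for (int bit = 63; bit >= 0; bit--) {
            sb.append(weights[bit] > 0 ? '1' : '0');
        }
        return sb.toString();
    }

    // Similarity = 1 - hammingDistance / bitLength, on equal-length bit strings.
    public static double getSimilarity(String hash1, String hash2) {
        int distance = 0;
        for (int i = 0; i < hash1.length(); i++) {
            if (hash1.charAt(i) != hash2.charAt(i)) {
                distance++;
            }
        }
        return 1.0 - (double) distance / hash1.length();
    }

    // Simple 64-bit FNV-1a hash for tokens.
    private static long fnv1a64(String s) {
        long hash = 0xcbf29ce484222325L;
        for (int i = 0; i < s.length(); i++) {
            hash ^= s.charAt(i);
            hash *= 0x100000001b3L;
        }
        return hash;
    }
}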
@@ -48,6 +48,7 @@ public class AsyncDataProcessService {
     @Async
     public void backups (DataProcessTaskDO dataProcessTask) {
         try {
+            DatasetDO updateDatasetDO = new DatasetDO();
             // Check whether a backup is needed
             if (dataProcessTask.getDatasetPostId() == null) {
                 DatasetDO datasetDO = datasetMapper.selectById(dataProcessTask.getDatasetId());
@@ -55,6 +56,7 @@ public class AsyncDataProcessService {
                 newData.setId(null);
                 newData.setDatasetName(datasetDO.getDatasetName() + generateRandomString(4));
                 datasetMapper.insert(newData);
+                updateDatasetDO = newData;
                 dataProcessTask.setDatasetPostId(newData.getId());
                 dataProcessTaskMapper.updateById(dataProcessTask);
             }
@@ -63,8 +65,10 @@ public class AsyncDataProcessService {
             datasetAnswerMapper.deleteTrue(dataProcessTask.getDatasetPostId());

             // Start data processing
-            startDataProcess(dataProcessTask);
+            Integer size = startDataProcess(dataProcessTask);
+            updateDatasetDO.setDataLength(Long.valueOf(size));
+            // Update the dataset length
+            datasetMapper.updateById(updateDatasetDO);
         } catch (Exception e) {
             datasetQuestionMapper.deleteTrue(dataProcessTask.getDatasetPostId());
             datasetAnswerMapper.deleteTrue(dataProcessTask.getDatasetPostId());
@@ -82,7 +86,7 @@ public class AsyncDataProcessService {
      *
      * @param dataProcessTask the data processing task
      */
-    private void startDataProcess (DataProcessTaskDO dataProcessTask) {
+    private Integer startDataProcess (DataProcessTaskDO dataProcessTask) {
        log.info(" =========== data processing started =========== {}", JSON.toJSONString(dataProcessTask));

        // Fetch all questions and answers
@@ -129,6 +133,7 @@ public class AsyncDataProcessService {

        dataProcessTaskMapper.updateStatus(dataProcessTask.getId(), 2);
        log.info(" =========== data processing finished =========== {}", JSON.toJSONString(dataProcessTask));
+        return datasetQuestionList.size();
        // log.info(" *********** processed data: ************ {}", JSON.toJSONString(datasetQuestionList));
    }

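With startDataProcess now returning the number of processed questions, backups() can write the record count back to the backed-up dataset. A rough usage sketch follows; the task setters and the injected service bean are assumptions, not shown in this diff.

// Hypothetical call site: trigger an async backup-and-process run.
DataProcessTaskDO task = new DataProcessTaskDO();
task.setDatasetId(1001L);     // dataset to back up (assumed setter)
task.setDatasetPostId(null);  // null -> backups() clones the dataset first
asyncDataProcessService.backups(task);
// When the @Async run finishes, the clone's dataLength equals the value
// returned by startDataProcess(), i.e. the processed question count.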
@@ -425,81 +425,82 @@ public class DataProcessUtil {
-     * @param threshold similarity threshold
-     * @return whether deduplication is needed
-     */
-    public static List<Long> similarityDeduplication (Map<Long, String> contentMap, double threshold) {
-        long l3 = System.currentTimeMillis();
-        // First convert the map into an ordered map, using LinkedHashMap to preserve insertion order
-        LinkedHashMap<Long, String> sortedContentMap = new LinkedHashMap<>();
-        // Sort the entries of the original content map by key and add them to sortedContentMap
-        contentMap.entrySet().stream()
-                .sorted(Map.Entry.comparingByKey())
-                .forEachOrdered(x -> sortedContentMap.put(x.getKey(), x.getValue()));
-
-        sortedContentMap.forEach((key, value) -> {
-            log.info("key:{},value:{}", key, value);
-        });
-        // Convert to text content in order, using a LinkedList to hold the texts
-        LinkedList<String> contents = sortedContentMap.entrySet().stream()
-                .sorted(Comparator.comparing(Map.Entry::getKey))
-                .map(Map.Entry::getValue)
-                .collect(Collectors.toCollection(LinkedList::new));
-
-        contents.forEach(v -> {
-            log.info("value:{}", v);
-        });
-
-        // SimHash of each text
-        LinkedList<String> simHashes = new LinkedList<>();
-        // Compute the SimHash of each text and add it to simHashes
-        for (String content : contents) {
-            simHashes.add(HammingUtils.getSimHash(content));
-        }
-
-        // Indexes of similar elements
-        List<Long> similarityIndex = new ArrayList<>();
-        // Keys of similar elements
-        List<Long> similarityKey = new ArrayList<>();
-
-        // Compute similarity
-        for (int i = 0; i < simHashes.size(); i++) {
-            // Skip if the current element has already been marked as similar
-            if (similarityIndex.contains((long) i)) {
-                continue;
-            }
-            for (int j = i + 1; j < simHashes.size(); j++) {
-                String hash1 = simHashes.get(i);
-                String hash2 = simHashes.get(j);
-                // Counting starts at 1, so add 1 to both i and j
-                double similarity = HammingUtils.getSimilarity(hash1, hash2);
-                // Log the similarity
-                log.info("Text similarity between element {} and element {}: {}", i + 1, j + 1, similarity);
-
-                // If the similarity exceeds the threshold, mark as similar
-                if (similarity > threshold) {
-                    // Record the index of the similar text
-                    similarityIndex.add((long) j);
-
-                    // Look up the key at that position in the map by index
-                    similarityKey.add(sortedContentMap.keySet().toArray(new Long[0])[j]);
-                }
-            }
-        }
-        log.info("Similar index list: {}", similarityIndex);
-        log.info("Similar key list: {}", similarityKey);
-
-        long l4 = System.currentTimeMillis();
-        long diff = l4 - l3;
-        long minutes = diff / (60 * 1000);
-        long seconds = (diff % (60 * 1000)) / 1000;
-        long milliseconds = diff % 1000;
-
-        log.info("Total time: " + minutes + " min " + seconds + " s " + milliseconds + " ms");
-        log.info("======================================");
-
-        return similarityKey;
+    /**
+     * SimHash-based text similarity deduplication
+     * @param contentMap text collection (key: document ID, value: text content)
+     * @param threshold similarity threshold (0~1, e.g. 0.8 means 80% similar)
+     * @return list of document IDs to delete
+     */
+    public static List<Long> similarityDeduplication(Map<Long, String> contentMap, double threshold) {
+        // Parameter validation
+        if (contentMap == null || contentMap.isEmpty()) {
+            return Collections.emptyList();
+        }
+        if (threshold < 0 || threshold > 1) {
+            throw new IllegalArgumentException("The similarity threshold must be between 0 and 1");
+        }
+
+        long startTime = System.currentTimeMillis();
+
+        // 1. Sort by document ID (keeps the processing order deterministic)
+        LinkedHashMap<Long, String> sortedMap = contentMap.entrySet().stream()
+                .sorted(Map.Entry.comparingByKey())
+                .collect(Collectors.toMap(
+                        Map.Entry::getKey,
+                        Map.Entry::getValue,
+                        (e1, e2) -> e1,
+                        LinkedHashMap::new));
+
+        // 2. Compute SimHash in parallel (better throughput on large data sets)
+        Map<Long, String> simHashMap = sortedMap.entrySet().parallelStream()
+                .collect(Collectors.toMap(
+                        Map.Entry::getKey,
+                        entry -> HammingUtils.getSimHash(entry.getValue()),
+                        (e1, e2) -> e1,
+                        LinkedHashMap::new));
+
+        // 3. Similarity detection
+        List<Long> duplicateKeys = new ArrayList<>();
+        List<Long> processedIds = new ArrayList<>(simHashMap.keySet());
+
+        for (int i = 0; i < processedIds.size(); i++) {
+            Long currentId = processedIds.get(i);
+            if (duplicateKeys.contains(currentId)) {
+                continue;
+            }
+
+            String hash1 = simHashMap.get(currentId);
+
+            // Compare only with later, not-yet-processed documents
+            for (int j = i + 1; j < processedIds.size(); j++) {
+                Long compareId = processedIds.get(j);
+                if (duplicateKeys.contains(compareId)) {
+                    continue;
+                }
+
+                double similarity = HammingUtils.getSimilarity(
+                        hash1,
+                        simHashMap.get(compareId));
+
+                log.debug("Similarity between document {} and {}: {}%",
+                        currentId, compareId, similarity * 100);
+
+                // If the similarity exceeds the threshold, mark as similar
+                if (similarity > threshold) {
+                    duplicateKeys.add(compareId);
+                    log.info("Marked as similar: {} ≈ {} (similarity: {}%)",
+                            currentId, compareId, similarity * 100);
+                }
+            }
+        }
+
+        // 4. Performance logging
+        long cost = System.currentTimeMillis() - startTime;
+        log.info("Deduplication finished: total={}, duplicates={}, cost={}ms",
+                contentMap.size(),
+                duplicateKeys.size(),
+                cost);
+
+        return duplicateKeys;
     }

     /*
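A minimal usage sketch of the reworked method. The sample IDs and texts are invented, and the exact result depends on how HammingUtils scores them.

// Hypothetical call site for SimHash deduplication.
Map<Long, String> contentMap = new LinkedHashMap<>();
contentMap.put(1L, "How do I reset my account password?");
contentMap.put(2L, "How can I reset my account password?");
contentMap.put(3L, "What are your opening hours?");

// Document IDs judged more than 80% similar to an earlier document are returned for deletion.
List<Long> duplicateIds = DataProcessUtil.similarityDeduplication(contentMap, 0.8);
System.out.println("IDs to remove: " + duplicateIds); // likely [2]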