refactor(llm): 重构数据集服务实现类

- 优化了文件读取和解析逻辑,支持 JSON、TXT、CSV 和 XLSX 文件格式
- 新增通用的 JSON 解析方法,提高代码复用性
- 改进了错误处理和异常提示,提升用户体验
- 重构了部分方法,提高了代码的可读性和可维护性
This commit is contained in:
sunxiqing 2025-08-12 13:39:31 +08:00
parent 0d9ea7c3c3
commit f460ddc38b
2 changed files with 119 additions and 84 deletions

View File

@ -29,6 +29,7 @@ import cn.iocoder.yudao.module.llm.utils.vo.CsvDataSetVO;
import com.baomidou.mybatisplus.core.conditions.query.LambdaQueryWrapper;
import com.baomidou.mybatisplus.core.toolkit.CollectionUtils;
import com.baomidou.mybatisplus.core.toolkit.StringUtils;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.core.type.TypeReference;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.opencsv.exceptions.CsvValidationException;
@ -91,9 +92,20 @@ public class DatasetServiceImpl implements DatasetService {
@Resource
private ModelAssessTaskManualBackupService modelAssessTaskManualBackupService;
/**
 * Counts the characters (UTF-16 code units) contained in a file.
 *
 * <p>The file is decoded as UTF-8, matching the charset the other readers in this class use.
 * Only the characters actually returned by each read are counted; the previous implementation
 * decoded the entire 1024-byte buffer on every iteration, so short reads and the final partial
 * chunk were over-counted with stale or zero bytes.
 *
 * @param file the file to measure
 * @return the number of decoded characters, {@code 0} for an empty file
 * @throws IOException if the file cannot be opened or read
 */
private static long getFileContentLength(File file) throws IOException {
    long charCount = 0;
    // try-with-resources closes the stream even when a read fails midway.
    try (java.io.InputStreamReader reader = new java.io.InputStreamReader(
            new FileInputStream(file), java.nio.charset.StandardCharsets.UTF_8)) {
        char[] buffer = new char[1024];
        int read;
        // Decoding through a Reader keeps multi-byte sequences intact across chunk boundaries.
        while ((read = reader.read(buffer)) != -1) {
            charCount += read;
        }
    }
    return charCount;
}
@Transactional
@Override
public Long createDataset (DatasetSaveReqVO createReqVO) {
public Long createDataset(DatasetSaveReqVO createReqVO) {
// 校验
validateDatasetNameExists(createReqVO);
if (createReqVO.getType() == null) {
@ -113,7 +125,7 @@ public class DatasetServiceImpl implements DatasetService {
Long count = datasetQuestionMapper.selectCount(new LambdaQueryWrapper<DatasetQuestionDO>()
.eq(DatasetQuestionDO::getDatasetId, dataset.getId()));
if (count <= 0){
if (count <= 0) {
throw new ServiceException(new ErrorCode(20000, "数据集问题不能为空"));
}
@ -130,16 +142,16 @@ public class DatasetServiceImpl implements DatasetService {
if (annoCount == 0) {
status = 0;
}
if (CollectionUtils.isEmpty(datasetFiles)){
if (CollectionUtils.isEmpty(datasetFiles)) {
throw new ServiceException(new ErrorCode(
20000, "数据集文件不能为空"));
}
if (dataset.getDatasetType() == 2){
if (status != 2 ){
if (dataset.getDatasetType() == 2) {
if (status != 2) {
throw new ServiceException(new ErrorCode(
20000, "评估数据集只能上传标注完成的数据"));
}
}else {
} else {
if (dataset.getStatus() != status) {
throw new ServiceException(new ErrorCode(
20000, "数据集标注状态错误!应该是【" + DatasetStatusEnum.getStatusByName(status) + ""));
@ -152,21 +164,9 @@ public class DatasetServiceImpl implements DatasetService {
return dataset.getId();
}
/**
 * Counts the characters (UTF-16 code units) contained in a file.
 *
 * <p>The file is decoded as UTF-8, matching the charset the other readers in this class use.
 * Only the characters actually returned by each read are counted; decoding the whole 1024-byte
 * buffer on every iteration would over-count short reads and the final partial chunk.
 *
 * @param file the file to measure
 * @return the number of decoded characters, {@code 0} for an empty file
 * @throws IOException if the file cannot be opened or read
 */
private static long getFileContentLength(File file) throws IOException {
    long total = 0;
    // try-with-resources guarantees the stream is released on every exit path.
    try (java.io.InputStreamReader in = new java.io.InputStreamReader(
            new FileInputStream(file), java.nio.charset.StandardCharsets.UTF_8)) {
        char[] chunk = new char[1024];
        int n;
        // A Reader decodes across chunk boundaries, so multi-byte characters are never split.
        while ((n = in.read(chunk)) != -1) {
            total += n;
        }
    }
    return total;
}
@Override
@Transactional
public void updateDataset (DatasetSaveReqVO updateReqVO) {
public void updateDataset(DatasetSaveReqVO updateReqVO) {
// 校验存在
validateDatasetExists(updateReqVO.getId());
validateDatasetNameExists(updateReqVO);
@ -190,7 +190,7 @@ public class DatasetServiceImpl implements DatasetService {
List<Long> deleteIds = new ArrayList<>(selectids);
deleteIds.removeAll(updateIds);
// 删除操作
if (CollectionUtils.isNotEmpty(deleteIds)){
if (CollectionUtils.isNotEmpty(deleteIds)) {
datasetFilesMapper.deleteBatchIds(deleteIds);
datasetQuestionMapper.delete(new LambdaQueryWrapper<DatasetQuestionDO>()
.in(DatasetQuestionDO::getDatasetFilesId, deleteIds));
@ -209,13 +209,13 @@ public class DatasetServiceImpl implements DatasetService {
datasetFilesSaveReqVO.setDatasetId(updateObj.getId());
}
);
if (CollectionUtils.isNotEmpty(noIdsDatasetFiles)){
if (CollectionUtils.isNotEmpty(noIdsDatasetFiles)) {
parseFile(datasetFiles);
}
Long count = datasetQuestionMapper.selectCount(new LambdaQueryWrapper<DatasetQuestionDO>()
.eq(DatasetQuestionDO::getDatasetId, updateObj.getId()));
if (count <= 0){
if (count <= 0) {
throw new ServiceException(new ErrorCode(20000, "数据集问题不能为空"));
}
@ -230,14 +230,14 @@ public class DatasetServiceImpl implements DatasetService {
updateObj.setAnnotateProgress(formattedRatio);
}
if (annoCount == 0) {
status=0;
status = 0;
}
if (CollectionUtils.isEmpty(datasetFiles)){
if (CollectionUtils.isEmpty(datasetFiles)) {
throw new ServiceException(new ErrorCode(
20000, "数据集文件不能为空"));
}
if (updateObj.getDatasetType() == 2){
if (status != 2 ){
if (updateObj.getDatasetType() == 2) {
if (status != 2) {
throw new ServiceException(new ErrorCode(
20000, "评估数据集只能上传标注完成的数据"));
}
@ -253,7 +253,7 @@ public class DatasetServiceImpl implements DatasetService {
}
@Override
public void deleteDataset (Long id) {
public void deleteDataset(Long id) {
// 校验存在
validateDatasetExists(id);
// 校验使用
@ -262,7 +262,7 @@ public class DatasetServiceImpl implements DatasetService {
datasetMapper.deleteById(id);
}
private void validateDatasetExists (Long id) {
private void validateDatasetExists(Long id) {
if (datasetMapper.selectById(id) == null) {
throw exception(DATASET_NOT_EXISTS);
}
@ -273,7 +273,7 @@ public class DatasetServiceImpl implements DatasetService {
*
* @param id 数据集ID
*/
private void validateDatasetUse (Long id) {
private void validateDatasetUse(Long id) {
DatasetDO dataset = datasetMapper.selectById(id);
String datasetName = dataset.getDatasetName();
@ -293,7 +293,7 @@ public class DatasetServiceImpl implements DatasetService {
* @param id 数据集ID
* @param datasetName 数据集名称
*/
private void validateDatasetUsesInModelTuning (Long id, String datasetName) {
private void validateDatasetUsesInModelTuning(Long id, String datasetName) {
Map<Long, String> modelTuning = fineTuningTaskService.getModelTuningByDatasetId(id);
if (CollectionUtils.isNotEmpty(modelTuning)) {
String msg = String.format("数据集【%s】在模型调优 %s 有正在等待的任务,请先结束任务", datasetName, modelTuning.values());
@ -307,7 +307,7 @@ public class DatasetServiceImpl implements DatasetService {
* @param id 数据集ID
* @param datasetName 数据集名称
*/
private void validateDatasetUsesInModelAssessTaskManual (Long id, String datasetName) {
private void validateDatasetUsesInModelAssessTaskManual(Long id, String datasetName) {
Map<Long, String> modelAssessmentTask = modelAssessTaskManualService.getModelAssessmentTaskByDatasetId(id);
if (com.baomidou.mybatisplus.core.toolkit.CollectionUtils.isNotEmpty(modelAssessmentTask)) {
String msg = String.format("数据集【%s】在模型评估-人工评估 %s 有正在进行的任务,请先结束任务", datasetName, modelAssessmentTask.values());
@ -318,7 +318,7 @@ public class DatasetServiceImpl implements DatasetService {
/**
* 校验 数据集 是否在 模型评估任务-自动 有使用
*
* @param id 数据集ID
* @param id 数据集ID
* @param datasetName 数据集名称
*/
private void validateDatasetUsesInModelAssessTaskAuto(Long id, String datasetName) {
@ -332,10 +332,10 @@ public class DatasetServiceImpl implements DatasetService {
/**
* 校验 数据集 是否在 模型评估任务备份-人工 有使用
*
* @param id 数据集ID
* @param id 数据集ID
* @param datasetName 数据集名称
*/
private void validateDatasetUsesInModelAssessTaskBackupManual (Long id, String datasetName) {
private void validateDatasetUsesInModelAssessTaskBackupManual(Long id, String datasetName) {
Map<Long, String> modelAssessmentTask = modelAssessTaskManualBackupService.getModelAssessmentTaskByDatasetId(id);
if (CollectionUtils.isNotEmpty(modelAssessmentTask)) {
String msg = String.format("数据集【%s】在模型评估-人工评估 %s 存在备份,请先取消备份", datasetName, modelAssessmentTask.values());
@ -343,7 +343,7 @@ public class DatasetServiceImpl implements DatasetService {
}
}
private void validateDatasetNameExists (DatasetSaveReqVO dateReqVO) {
private void validateDatasetNameExists(DatasetSaveReqVO dateReqVO) {
LambdaQueryWrapper<DatasetDO> wrapper = new LambdaQueryWrapper<DatasetDO>()
.eq(DatasetDO::getDatasetName, dateReqVO.getDatasetName());
@ -357,7 +357,7 @@ public class DatasetServiceImpl implements DatasetService {
}
@Override
public DatasetRespVO getDataset (Long id) {
public DatasetRespVO getDataset(Long id) {
DatasetDO datasetDO = datasetMapper.selectById(id);
DatasetRespVO datasetRespVO = BeanUtils.toBean(datasetDO, DatasetRespVO.class);
List<DatasetFilesDO> datasetFilesDOS = datasetFilesMapper.selectList(new LambdaQueryWrapper<DatasetFilesDO>().eq(DatasetFilesDO::getDatasetId, id));
@ -369,12 +369,12 @@ public class DatasetServiceImpl implements DatasetService {
}
@Override
public PageResult<DatasetDO> getDatasetPage (DatasetPageReqVO pageReqVO) {
public PageResult<DatasetDO> getDatasetPage(DatasetPageReqVO pageReqVO) {
return datasetMapper.selectPage(pageReqVO);
}
@Override
public List<DatasetTreeNode> queryAll () {
public List<DatasetTreeNode> queryAll() {
/*List<DatasetDO> datasetDOS0 = datasetMapper.selectList(new LambdaQueryWrapper<DatasetDO>().eq(DatasetDO::getType, DataConstants.dataTypePrivate));
List<DatasetRespVO> datasetRespVOS0 = BeanUtils.toBean(datasetDOS0, DatasetRespVO.class);
List<DatasetDO> datasetDOS1 = datasetMapper.selectList(new LambdaQueryWrapper<DatasetDO>().eq(DatasetDO::getType, DataConstants.dataTypePublic));
@ -404,7 +404,7 @@ public class DatasetServiceImpl implements DatasetService {
return root;
}
public void readZipFile (List<DatasetFilesDO> zipFiles, List<DatasetFilesSaveReqVO> datasetFiles) {
public void readZipFile(List<DatasetFilesDO> zipFiles, List<DatasetFilesSaveReqVO> datasetFiles) {
DatasetFilesSaveReqVO datasetFilesSaveReqVO = datasetFiles.get(0);
List<DatasetFilesSaveReqVO> res = new ArrayList<>();
zipFiles.forEach(datasetFilesDO -> {
@ -446,7 +446,7 @@ public class DatasetServiceImpl implements DatasetService {
parseFile(res);
}
public void readTarGzFile (List<DatasetFilesDO> tarGzFiles, List<DatasetFilesSaveReqVO> datasetFiles) {
public void readTarGzFile(List<DatasetFilesDO> tarGzFiles, List<DatasetFilesSaveReqVO> datasetFiles) {
DatasetFilesSaveReqVO datasetFilesSaveReqVO = datasetFiles.get(0);
List<DatasetFilesSaveReqVO> res = new ArrayList<>();
tarGzFiles.forEach(datasetFilesDO -> {
@ -494,7 +494,7 @@ public class DatasetServiceImpl implements DatasetService {
}
// 暂时先不用
public void readTarFile (List<DatasetFilesDO> tarFiles, List<DatasetFilesSaveReqVO> datasetFiles) {
public void readTarFile(List<DatasetFilesDO> tarFiles, List<DatasetFilesSaveReqVO> datasetFiles) {
DatasetFilesSaveReqVO datasetFilesSaveReqVO = datasetFiles.get(0);
List<DatasetFilesSaveReqVO> res = new ArrayList<>();
tarFiles.forEach(datasetFilesDO -> {
@ -537,7 +537,7 @@ public class DatasetServiceImpl implements DatasetService {
}
public void readJsonFile (List<DatasetFilesDO> jsonFiles) {
public void readJsonFile(List<DatasetFilesDO> jsonFiles) {
jsonFiles.forEach(datasetFilesDO -> {
HttpURLConnection connection = DataSetReadFileUtils.readFile(datasetFilesDO.getDatasetFileUrl());
if (connection != null) {
@ -547,32 +547,8 @@ public class DatasetServiceImpl implements DatasetService {
while ((line = in.readLine()) != null) {
content.append(line);
}
// 使用Jackson解析 Json 字符串为List<String>对象
// 使用Jackson解析 Json 字符串为List<String>对象
ObjectMapper mapper = new ObjectMapper();
// 使用 TypeReference 解析 JSON 字符串为 List<String>
List<DataJsonTemplate> jsonList = mapper.readValue(content.toString(), new TypeReference<List<DataJsonTemplate>>() {
});
jsonList.forEach(
dataJsonTemplate -> {
List<String> answers = dataJsonTemplate.getAnswers();
DatasetQuestionDO datasetQuestionDO = BeanUtils.toBean(dataJsonTemplate, DatasetQuestionDO.class);
datasetQuestionDO.setDatasetId(datasetFilesDO.getDatasetId());
datasetQuestionDO.setDatasetFilesId(datasetFilesDO.getId());
datasetQuestionDO.setStatus(CollectionUtils.isNotEmpty(answers) ? 2 : 0);
datasetQuestionMapper.insert(datasetQuestionDO);
if (CollectionUtils.isNotEmpty(answers)) {
for (String answer : answers) {
DatasetAnswerDO datasetAnswerDO = new DatasetAnswerDO();
datasetAnswerDO.setDatasetId(datasetFilesDO.getDatasetId());
datasetAnswerDO.setDatasetFilesId(datasetFilesDO.getId());
datasetAnswerDO.setQuestionId(datasetQuestionDO.getId());
datasetAnswerDO.setAnswer(answer);
datasetAnswerMapper.insert(datasetAnswerDO);
}
}
}
);
// 解析JSON数据
jsonParsing(content, datasetFilesDO);
} catch (Exception e) {
throw exception(new ErrorCode(11000, "请正确上传json格式得数据"));
} finally {
@ -582,31 +558,68 @@ public class DatasetServiceImpl implements DatasetService {
});
}
/**
 * Deserializes JSON text into question/answer records and stores them for one dataset file.
 *
 * <p>The content is read with Jackson as a JSON array of {@code DataJsonTemplate} entries. Each
 * entry is converted to a {@code DatasetQuestionDO}, tagged with the dataset id and file id taken
 * from {@code datasetFilesDO}, given status 2 when it carries at least one answer (0 otherwise),
 * and inserted. Every answer of the entry is then inserted as a {@code DatasetAnswerDO} that
 * references the id read from the question after its insert.
 *
 * @param content        the raw JSON text
 * @param datasetFilesDO the file record supplying the dataset id and file id
 * @throws JsonProcessingException if the content cannot be deserialized into the expected list
 */
private void jsonParsing(StringBuilder content, DatasetFilesDO datasetFilesDO) throws JsonProcessingException {
    ObjectMapper objectMapper = new ObjectMapper();
    // The anonymous TypeReference preserves the generic element type for Jackson.
    TypeReference<List<DataJsonTemplate>> templateListType = new TypeReference<List<DataJsonTemplate>>() {
    };
    List<DataJsonTemplate> templates = objectMapper.readValue(content.toString(), templateListType);

    for (DataJsonTemplate template : templates) {
        List<String> answers = template.getAnswers();
        boolean hasAnswers = CollectionUtils.isNotEmpty(answers);

        DatasetQuestionDO question = BeanUtils.toBean(template, DatasetQuestionDO.class);
        question.setDatasetId(datasetFilesDO.getDatasetId());
        question.setDatasetFilesId(datasetFilesDO.getId());
        question.setStatus(hasAnswers ? 2 : 0);
        datasetQuestionMapper.insert(question);

        if (!hasAnswers) {
            continue;
        }
        // One answer row per answer string, linked back to the question inserted above.
        for (String answerText : answers) {
            DatasetAnswerDO answerRow = new DatasetAnswerDO();
            answerRow.setDatasetId(datasetFilesDO.getDatasetId());
            answerRow.setDatasetFilesId(datasetFilesDO.getId());
            answerRow.setQuestionId(question.getId());
            answerRow.setAnswer(answerText);
            datasetAnswerMapper.insert(answerRow);
        }
    }
}
/**
* txt文本数据
*
* @param txtFiles
*/
public void readTxtFile (List<DatasetFilesDO> txtFiles) {
public void readTxtFile(List<DatasetFilesDO> txtFiles) {
txtFiles.forEach(datasetFilesDO -> {
List<String> newContent = new ArrayList<>();
HttpURLConnection connection = DataSetReadFileUtils.readFile(datasetFilesDO.getDatasetFileUrl());
if (connection != null) {
try (BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream(), "UTF-8"))) {
String inputLine;
while ((inputLine = in.readLine()) != null) {
inputLine = inputLine.trim(); // 去除行首和行尾的空白字符
if (!inputLine.isEmpty()) {
DatasetQuestionDO datasetQuestionDO = new DatasetQuestionDO();// 检查是否为空行
datasetQuestionDO.setDatasetId(datasetFilesDO.getDatasetId());
datasetQuestionDO.setDatasetFilesId(datasetFilesDO.getId());
datasetQuestionDO.setQuestion(inputLine);
datasetQuestionMapper.insert(datasetQuestionDO);
}
StringBuilder content = new StringBuilder();
String line;
while ((line = in.readLine()) != null) {
content.append(line);
}
// 使用Jackson解析 JSON
jsonParsing(content, datasetFilesDO);
// String inputLine;
// while ((inputLine = in.readLine()) != null) {
// inputLine = inputLine.trim(); // 去除行首和行尾的空白字符
// if (!inputLine.isEmpty()) {
// DatasetQuestionDO datasetQuestionDO = new DatasetQuestionDO();// 检查是否为空行
// datasetQuestionDO.setDatasetId(datasetFilesDO.getDatasetId());
// datasetQuestionDO.setDatasetFilesId(datasetFilesDO.getId());
// datasetQuestionDO.setQuestion(inputLine);
// datasetQuestionMapper.insert(datasetQuestionDO);
// }
// }
} catch (Exception e) {
e.printStackTrace();
throw exception(new ErrorCode(11001, "请正确上传txt格式得数据"));
} finally {
connection.disconnect();
}
@ -619,7 +632,7 @@ public class DatasetServiceImpl implements DatasetService {
*
* @param csvFiles csv文件
*/
private void readCsvFile (List<DatasetFilesDO> csvFiles) {
private void readCsvFile(List<DatasetFilesDO> csvFiles) {
csvFiles.forEach(datasetFilesDO -> {
try {
@ -668,7 +681,7 @@ public class DatasetServiceImpl implements DatasetService {
*
* @param xlsxFiles
*/
public void readXlsxFile (List<DatasetFilesDO> xlsxFiles) {
public void readXlsxFile(List<DatasetFilesDO> xlsxFiles) {
xlsxFiles.forEach(datasetFilesDO -> {
Workbook sheets = DataSetReadFileUtils.readXlsxFromUrl(datasetFilesDO.getDatasetFileUrl());
if (sheets != null) {
@ -716,7 +729,7 @@ public class DatasetServiceImpl implements DatasetService {
});
}
public void parseFile (List<DatasetFilesSaveReqVO> datasetFiles) {
public void parseFile(List<DatasetFilesSaveReqVO> datasetFiles) {
List<DatasetFilesDO> insertDatasetFiles = BeanUtils.toBean(datasetFiles, DatasetFilesDO.class);
datasetFilesMapper.insertBatch(insertDatasetFiles, 100);

View File

@ -1 +1,23 @@
请根据下面的新闻生成摘要, 内容如下:新华社受权于18日全文播发修改后的《中华人民共和国立法法》修改后的立法法分为“总则”“法律”“行政法规”“地方性法规、自治条例和单行条例、规章”“适用与备案审查”“附则”等6章共计105条。\n生成摘要如下: [["修改后的立法法全文公布"]]
[
{
"system": "你是一个专业的新闻摘要撰写助手,擅长使用简洁明了的语言来提炼核心信息。",
"question": "据路透社报道俄罗斯经济发展部部长AlexeiUlyukayev当地时间周六(1月31日)表示俄经济发展部已向政府提交了2015年度经济发展指标最新预测此次预测是基于原油年平均价格为每桶50美元而去年12月份的预测基于原油年平均价格为每桶80美元",
"answers": [
"俄罗斯预计今年国内GDP将萎缩3%"
]
},
{
"system": "你是一个专业的新闻摘要撰写助手,擅长使用简洁明了的语言来提炼核心信息。",
"question": "1973年一桩“奸污”谜案将符福山的人生划成了对等的两半前40年他是人民教师后40年他被三女生揭发奸污遭除名并一生背负辱名。40年后3个“被奸污”女生终承认真相为能被推荐上高中3人是受人蛊惑作伪证诬告“遭奸污”。",
"answers": [
"海南教师被诬告奸污3女生背负辱名40年"
]
},
{
"system": "你是一个专业的新闻摘要撰写助手,擅长使用简洁明了的语言来提炼核心信息。",
"question": "8日白天海南北部地区阴天有小阵雨南部地区多云。9、10日全岛多云。11日一股较强冷空气来袭全岛阴天为主气温下降明显。岛民们未来一周看不见太阳了要记得保暖防寒哦~南海君最不喜欢湿冷的天气了,你呢?",
"answers": [
"11日较强冷空气再袭海南"
]
}
]