diff --git a/yudao-module-llm/yudao-module-llm-biz/src/main/java/cn/iocoder/yudao/module/llm/service/dataset/DatasetServiceImpl.java b/yudao-module-llm/yudao-module-llm-biz/src/main/java/cn/iocoder/yudao/module/llm/service/dataset/DatasetServiceImpl.java index d29757a54..055a6c2f4 100644 --- a/yudao-module-llm/yudao-module-llm-biz/src/main/java/cn/iocoder/yudao/module/llm/service/dataset/DatasetServiceImpl.java +++ b/yudao-module-llm/yudao-module-llm-biz/src/main/java/cn/iocoder/yudao/module/llm/service/dataset/DatasetServiceImpl.java @@ -29,6 +29,7 @@ import cn.iocoder.yudao.module.llm.utils.vo.CsvDataSetVO; import com.baomidou.mybatisplus.core.conditions.query.LambdaQueryWrapper; import com.baomidou.mybatisplus.core.toolkit.CollectionUtils; import com.baomidou.mybatisplus.core.toolkit.StringUtils; +import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.core.type.TypeReference; import com.fasterxml.jackson.databind.ObjectMapper; import com.opencsv.exceptions.CsvValidationException; @@ -91,9 +92,20 @@ public class DatasetServiceImpl implements DatasetService { @Resource private ModelAssessTaskManualBackupService modelAssessTaskManualBackupService; + private static long getFileContentLength(File file) throws IOException { + FileInputStream fis = new FileInputStream(file); + byte[] buffer = new byte[1024]; + long charCount = 0; + while (fis.read(buffer) != -1) { + charCount += new String(buffer).length(); + } + fis.close(); + return charCount; + } + @Transactional @Override - public Long createDataset (DatasetSaveReqVO createReqVO) { + public Long createDataset(DatasetSaveReqVO createReqVO) { // 校验 validateDatasetNameExists(createReqVO); if (createReqVO.getType() == null) { @@ -113,7 +125,7 @@ public class DatasetServiceImpl implements DatasetService { Long count = datasetQuestionMapper.selectCount(new LambdaQueryWrapper() .eq(DatasetQuestionDO::getDatasetId, dataset.getId())); - if (count <= 0){ + if (count <= 0) { throw new ServiceException(new ErrorCode(20000, "数据集问题不能为空")); } @@ -130,16 +142,16 @@ public class DatasetServiceImpl implements DatasetService { if (annoCount == 0) { status = 0; } - if (CollectionUtils.isEmpty(datasetFiles)){ + if (CollectionUtils.isEmpty(datasetFiles)) { throw new ServiceException(new ErrorCode( 20000, "数据集文件不能为空")); } - if (dataset.getDatasetType() == 2){ - if (status != 2 ){ + if (dataset.getDatasetType() == 2) { + if (status != 2) { throw new ServiceException(new ErrorCode( 20000, "评估数据集只能上传标注完成的数据")); } - }else { + } else { if (dataset.getStatus() != status) { throw new ServiceException(new ErrorCode( 20000, "数据集标注状态错误!应该是【" + DatasetStatusEnum.getStatusByName(status) + "】")); @@ -152,21 +164,9 @@ public class DatasetServiceImpl implements DatasetService { return dataset.getId(); } - - private static long getFileContentLength (File file) throws IOException { - FileInputStream fis = new FileInputStream(file); - byte[] buffer = new byte[1024]; - long charCount = 0; - while (fis.read(buffer) != -1) { - charCount += new String(buffer).length(); - } - fis.close(); - return charCount; - } - @Override @Transactional - public void updateDataset (DatasetSaveReqVO updateReqVO) { + public void updateDataset(DatasetSaveReqVO updateReqVO) { // 校验存在 validateDatasetExists(updateReqVO.getId()); validateDatasetNameExists(updateReqVO); @@ -190,7 +190,7 @@ public class DatasetServiceImpl implements DatasetService { List deleteIds = new ArrayList<>(selectids); deleteIds.removeAll(updateIds); // 删除操作 - if (CollectionUtils.isNotEmpty(deleteIds)){ + if (CollectionUtils.isNotEmpty(deleteIds)) { datasetFilesMapper.deleteBatchIds(deleteIds); datasetQuestionMapper.delete(new LambdaQueryWrapper() .in(DatasetQuestionDO::getDatasetFilesId, deleteIds)); @@ -209,13 +209,13 @@ public class DatasetServiceImpl implements DatasetService { datasetFilesSaveReqVO.setDatasetId(updateObj.getId()); } ); - if (CollectionUtils.isNotEmpty(noIdsDatasetFiles)){ + if (CollectionUtils.isNotEmpty(noIdsDatasetFiles)) { parseFile(datasetFiles); } Long count = datasetQuestionMapper.selectCount(new LambdaQueryWrapper() .eq(DatasetQuestionDO::getDatasetId, updateObj.getId())); - if (count <= 0){ + if (count <= 0) { throw new ServiceException(new ErrorCode(20000, "数据集问题不能为空")); } @@ -230,14 +230,14 @@ public class DatasetServiceImpl implements DatasetService { updateObj.setAnnotateProgress(formattedRatio); } if (annoCount == 0) { - status=0; + status = 0; } - if (CollectionUtils.isEmpty(datasetFiles)){ + if (CollectionUtils.isEmpty(datasetFiles)) { throw new ServiceException(new ErrorCode( 20000, "数据集文件不能为空")); } - if (updateObj.getDatasetType() == 2){ - if (status != 2 ){ + if (updateObj.getDatasetType() == 2) { + if (status != 2) { throw new ServiceException(new ErrorCode( 20000, "评估数据集只能上传标注完成的数据")); } @@ -253,7 +253,7 @@ public class DatasetServiceImpl implements DatasetService { } @Override - public void deleteDataset (Long id) { + public void deleteDataset(Long id) { // 校验存在 validateDatasetExists(id); // 校验使用 @@ -262,7 +262,7 @@ public class DatasetServiceImpl implements DatasetService { datasetMapper.deleteById(id); } - private void validateDatasetExists (Long id) { + private void validateDatasetExists(Long id) { if (datasetMapper.selectById(id) == null) { throw exception(DATASET_NOT_EXISTS); } @@ -273,7 +273,7 @@ public class DatasetServiceImpl implements DatasetService { * * @param id 数据集ID */ - private void validateDatasetUse (Long id) { + private void validateDatasetUse(Long id) { DatasetDO dataset = datasetMapper.selectById(id); String datasetName = dataset.getDatasetName(); @@ -293,7 +293,7 @@ public class DatasetServiceImpl implements DatasetService { * @param id 数据集ID * @param datasetName 数据集名称 */ - private void validateDatasetUsesInModelTuning (Long id, String datasetName) { + private void validateDatasetUsesInModelTuning(Long id, String datasetName) { Map modelTuning = fineTuningTaskService.getModelTuningByDatasetId(id); if (CollectionUtils.isNotEmpty(modelTuning)) { String msg = String.format("数据集【%s】在模型调优 %s 有正在等待的任务,请先结束任务", datasetName, modelTuning.values()); @@ -307,7 +307,7 @@ public class DatasetServiceImpl implements DatasetService { * @param id 数据集ID * @param datasetName 数据集名称 */ - private void validateDatasetUsesInModelAssessTaskManual (Long id, String datasetName) { + private void validateDatasetUsesInModelAssessTaskManual(Long id, String datasetName) { Map modelAssessmentTask = modelAssessTaskManualService.getModelAssessmentTaskByDatasetId(id); if (com.baomidou.mybatisplus.core.toolkit.CollectionUtils.isNotEmpty(modelAssessmentTask)) { String msg = String.format("数据集【%s】在模型评估-人工评估 %s 有正在进行的任务,请先结束任务", datasetName, modelAssessmentTask.values()); @@ -318,7 +318,7 @@ public class DatasetServiceImpl implements DatasetService { /** * 校验 数据集 是否在 模型评估任务-自动 有使用 * - * @param id 数据集ID + * @param id 数据集ID * @param datasetName 数据集名称 */ private void validateDatasetUsesInModelAssessTaskAuto(Long id, String datasetName) { @@ -332,10 +332,10 @@ public class DatasetServiceImpl implements DatasetService { /** * 校验 数据集 是否在 模型评估任务备份-人工 有使用 * - * @param id 数据集ID + * @param id 数据集ID * @param datasetName 数据集名称 */ - private void validateDatasetUsesInModelAssessTaskBackupManual (Long id, String datasetName) { + private void validateDatasetUsesInModelAssessTaskBackupManual(Long id, String datasetName) { Map modelAssessmentTask = modelAssessTaskManualBackupService.getModelAssessmentTaskByDatasetId(id); if (CollectionUtils.isNotEmpty(modelAssessmentTask)) { String msg = String.format("数据集【%s】在模型评估-人工评估 %s 存在备份,请先取消备份", datasetName, modelAssessmentTask.values()); @@ -343,7 +343,7 @@ public class DatasetServiceImpl implements DatasetService { } } - private void validateDatasetNameExists (DatasetSaveReqVO dateReqVO) { + private void validateDatasetNameExists(DatasetSaveReqVO dateReqVO) { LambdaQueryWrapper wrapper = new LambdaQueryWrapper() .eq(DatasetDO::getDatasetName, dateReqVO.getDatasetName()); @@ -357,7 +357,7 @@ public class DatasetServiceImpl implements DatasetService { } @Override - public DatasetRespVO getDataset (Long id) { + public DatasetRespVO getDataset(Long id) { DatasetDO datasetDO = datasetMapper.selectById(id); DatasetRespVO datasetRespVO = BeanUtils.toBean(datasetDO, DatasetRespVO.class); List datasetFilesDOS = datasetFilesMapper.selectList(new LambdaQueryWrapper().eq(DatasetFilesDO::getDatasetId, id)); @@ -369,12 +369,12 @@ public class DatasetServiceImpl implements DatasetService { } @Override - public PageResult getDatasetPage (DatasetPageReqVO pageReqVO) { + public PageResult getDatasetPage(DatasetPageReqVO pageReqVO) { return datasetMapper.selectPage(pageReqVO); } @Override - public List queryAll () { + public List queryAll() { /*List datasetDOS0 = datasetMapper.selectList(new LambdaQueryWrapper().eq(DatasetDO::getType, DataConstants.dataTypePrivate)); List datasetRespVOS0 = BeanUtils.toBean(datasetDOS0, DatasetRespVO.class); List datasetDOS1 = datasetMapper.selectList(new LambdaQueryWrapper().eq(DatasetDO::getType, DataConstants.dataTypePublic)); @@ -404,7 +404,7 @@ public class DatasetServiceImpl implements DatasetService { return root; } - public void readZipFile (List zipFiles, List datasetFiles) { + public void readZipFile(List zipFiles, List datasetFiles) { DatasetFilesSaveReqVO datasetFilesSaveReqVO = datasetFiles.get(0); List res = new ArrayList<>(); zipFiles.forEach(datasetFilesDO -> { @@ -446,7 +446,7 @@ public class DatasetServiceImpl implements DatasetService { parseFile(res); } - public void readTarGzFile (List tarGzFiles, List datasetFiles) { + public void readTarGzFile(List tarGzFiles, List datasetFiles) { DatasetFilesSaveReqVO datasetFilesSaveReqVO = datasetFiles.get(0); List res = new ArrayList<>(); tarGzFiles.forEach(datasetFilesDO -> { @@ -494,7 +494,7 @@ public class DatasetServiceImpl implements DatasetService { } // 暂时先不用 - public void readTarFile (List tarFiles, List datasetFiles) { + public void readTarFile(List tarFiles, List datasetFiles) { DatasetFilesSaveReqVO datasetFilesSaveReqVO = datasetFiles.get(0); List res = new ArrayList<>(); tarFiles.forEach(datasetFilesDO -> { @@ -537,7 +537,7 @@ public class DatasetServiceImpl implements DatasetService { } - public void readJsonFile (List jsonFiles) { + public void readJsonFile(List jsonFiles) { jsonFiles.forEach(datasetFilesDO -> { HttpURLConnection connection = DataSetReadFileUtils.readFile(datasetFilesDO.getDatasetFileUrl()); if (connection != null) { @@ -547,32 +547,8 @@ public class DatasetServiceImpl implements DatasetService { while ((line = in.readLine()) != null) { content.append(line); } - // 使用Jackson解析 Json 字符串为List对象 - // 使用Jackson解析 Json 字符串为List对象 - ObjectMapper mapper = new ObjectMapper(); - // 使用 TypeReference 解析 JSON 字符串为 List - List jsonList = mapper.readValue(content.toString(), new TypeReference>() { - }); - jsonList.forEach( - dataJsonTemplate -> { - List answers = dataJsonTemplate.getAnswers(); - DatasetQuestionDO datasetQuestionDO = BeanUtils.toBean(dataJsonTemplate, DatasetQuestionDO.class); - datasetQuestionDO.setDatasetId(datasetFilesDO.getDatasetId()); - datasetQuestionDO.setDatasetFilesId(datasetFilesDO.getId()); - datasetQuestionDO.setStatus(CollectionUtils.isNotEmpty(answers) ? 2 : 0); - datasetQuestionMapper.insert(datasetQuestionDO); - if (CollectionUtils.isNotEmpty(answers)) { - for (String answer : answers) { - DatasetAnswerDO datasetAnswerDO = new DatasetAnswerDO(); - datasetAnswerDO.setDatasetId(datasetFilesDO.getDatasetId()); - datasetAnswerDO.setDatasetFilesId(datasetFilesDO.getId()); - datasetAnswerDO.setQuestionId(datasetQuestionDO.getId()); - datasetAnswerDO.setAnswer(answer); - datasetAnswerMapper.insert(datasetAnswerDO); - } - } - } - ); + // 解析JSON数据 + jsonParsing(content, datasetFilesDO); } catch (Exception e) { throw exception(new ErrorCode(11000, "请正确上传json格式得数据!!!")); } finally { @@ -582,31 +558,68 @@ public class DatasetServiceImpl implements DatasetService { }); } + private void jsonParsing(StringBuilder content, DatasetFilesDO datasetFilesDO) throws JsonProcessingException { + // 使用Jackson解析 Json 字符串为List对象 + // 使用Jackson解析 Json 字符串为List对象 + ObjectMapper mapper = new ObjectMapper(); + // 使用 TypeReference 解析 JSON 字符串为 List + List jsonList = mapper.readValue(content.toString(), new TypeReference>() { + }); + jsonList.forEach( + dataJsonTemplate -> { + List answers = dataJsonTemplate.getAnswers(); + DatasetQuestionDO datasetQuestionDO = BeanUtils.toBean(dataJsonTemplate, DatasetQuestionDO.class); + datasetQuestionDO.setDatasetId(datasetFilesDO.getDatasetId()); + datasetQuestionDO.setDatasetFilesId(datasetFilesDO.getId()); + datasetQuestionDO.setStatus(CollectionUtils.isNotEmpty(answers) ? 2 : 0); + datasetQuestionMapper.insert(datasetQuestionDO); + if (CollectionUtils.isNotEmpty(answers)) { + for (String answer : answers) { + DatasetAnswerDO datasetAnswerDO = new DatasetAnswerDO(); + datasetAnswerDO.setDatasetId(datasetFilesDO.getDatasetId()); + datasetAnswerDO.setDatasetFilesId(datasetFilesDO.getId()); + datasetAnswerDO.setQuestionId(datasetQuestionDO.getId()); + datasetAnswerDO.setAnswer(answer); + datasetAnswerMapper.insert(datasetAnswerDO); + } + } + } + ); + + } + /** * txt文本数据 * * @param txtFiles */ - public void readTxtFile (List txtFiles) { + public void readTxtFile(List txtFiles) { txtFiles.forEach(datasetFilesDO -> { List newContent = new ArrayList<>(); HttpURLConnection connection = DataSetReadFileUtils.readFile(datasetFilesDO.getDatasetFileUrl()); if (connection != null) { try (BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream(), "UTF-8"))) { - String inputLine; - while ((inputLine = in.readLine()) != null) { - inputLine = inputLine.trim(); // 去除行首和行尾的空白字符 - if (!inputLine.isEmpty()) { - DatasetQuestionDO datasetQuestionDO = new DatasetQuestionDO();// 检查是否为空行 - datasetQuestionDO.setDatasetId(datasetFilesDO.getDatasetId()); - datasetQuestionDO.setDatasetFilesId(datasetFilesDO.getId()); - datasetQuestionDO.setQuestion(inputLine); - datasetQuestionMapper.insert(datasetQuestionDO); - } + StringBuilder content = new StringBuilder(); + String line; + while ((line = in.readLine()) != null) { + content.append(line); } + // 使用Jackson解析 JSON + jsonParsing(content, datasetFilesDO); +// String inputLine; +// while ((inputLine = in.readLine()) != null) { +// inputLine = inputLine.trim(); // 去除行首和行尾的空白字符 +// if (!inputLine.isEmpty()) { +// DatasetQuestionDO datasetQuestionDO = new DatasetQuestionDO();// 检查是否为空行 +// datasetQuestionDO.setDatasetId(datasetFilesDO.getDatasetId()); +// datasetQuestionDO.setDatasetFilesId(datasetFilesDO.getId()); +// datasetQuestionDO.setQuestion(inputLine); +// datasetQuestionMapper.insert(datasetQuestionDO); +// } +// } } catch (Exception e) { - e.printStackTrace(); + throw exception(new ErrorCode(11001, "请正确上传txt格式得数据!!!")); } finally { connection.disconnect(); } @@ -619,7 +632,7 @@ public class DatasetServiceImpl implements DatasetService { * * @param csvFiles csv文件 */ - private void readCsvFile (List csvFiles) { + private void readCsvFile(List csvFiles) { csvFiles.forEach(datasetFilesDO -> { try { @@ -668,7 +681,7 @@ public class DatasetServiceImpl implements DatasetService { * * @param xlsxFiles */ - public void readXlsxFile (List xlsxFiles) { + public void readXlsxFile(List xlsxFiles) { xlsxFiles.forEach(datasetFilesDO -> { Workbook sheets = DataSetReadFileUtils.readXlsxFromUrl(datasetFilesDO.getDatasetFileUrl()); if (sheets != null) { @@ -716,7 +729,7 @@ public class DatasetServiceImpl implements DatasetService { }); } - public void parseFile (List datasetFiles) { + public void parseFile(List datasetFiles) { List insertDatasetFiles = BeanUtils.toBean(datasetFiles, DatasetFilesDO.class); datasetFilesMapper.insertBatch(insertDatasetFiles, 100); diff --git a/yudao-module-llm/yudao-module-llm-biz/src/main/resources/file/dataset_example/dataset_example_txt.txt b/yudao-module-llm/yudao-module-llm-biz/src/main/resources/file/dataset_example/dataset_example_txt.txt index eabde8ccb..8ed7e7a5e 100644 --- a/yudao-module-llm/yudao-module-llm-biz/src/main/resources/file/dataset_example/dataset_example_txt.txt +++ b/yudao-module-llm/yudao-module-llm-biz/src/main/resources/file/dataset_example/dataset_example_txt.txt @@ -1 +1,23 @@ -请根据下面的新闻生成摘要, 内容如下:新华社受权于18日全文播发修改后的《中华人民共和国立法法》,修改后的立法法分为“总则”“法律”“行政法规”“地方性法规、自治条例和单行条例、规章”“适用与备案审查”“附则”等6章,共计105条。\n生成摘要如下: [["修改后的立法法全文公布"]] \ No newline at end of file +[ + { + "system": "你是一个专业的新闻摘要撰写助手,擅长使用简洁明了的语言来提炼核心信息。", + "question": "据路透社报道,俄罗斯经济发展部部长AlexeiUlyukayev当地时间周六(1月31日)表示,俄经济发展部已向政府提交了2015年度经济发展指标最新预测,此次预测是基于原油年平均价格为每桶50美元,而去年12月份的预测基于原油年平均价格为每桶80美元", + "answers": [ + "俄罗斯预计今年国内GDP将萎缩3%" + ] + }, + { + "system": "你是一个专业的新闻摘要撰写助手,擅长使用简洁明了的语言来提炼核心信息。", + "question": "1973年一桩“奸污”谜案,将符福山的人生划成了对等的两半:前40年,他是人民教师;后40年,他被三女生揭发奸污,遭除名并一生背负辱名。40年后3个“被奸污”女生终承认真相:为能被推荐上高中,3人是受人蛊惑,作伪证诬告“遭奸污”。", + "answers": [ + "海南教师被诬告奸污3女生背负辱名40年" + ] + }, + { + "system": "你是一个专业的新闻摘要撰写助手,擅长使用简洁明了的语言来提炼核心信息。", + "question": "8日白天,海南北部地区阴天有小阵雨,南部地区多云。9、10日全岛多云。11日,一股较强冷空气来袭,全岛阴天为主,气温下降明显。岛民们未来一周看不见太阳了,要记得保暖防寒哦~南海君最不喜欢湿冷的天气了,你呢?", + "answers": [ + "11日较强冷空气再袭海南" + ] + } +]