Add multimodal dataset features

This commit is contained in:
baggio19852005 2025-09-27 15:44:45 +08:00
parent b1c3b28c3b
commit 6604a24e30
27 changed files with 708 additions and 41 deletions

View File

@ -19,12 +19,22 @@
</properties>
<dependencies>
<dependency>
<groupId>org.springframework</groupId>
<artifactId>spring-test</artifactId>
<!-- If this dependency is used in production code, remove <scope>test</scope> -->
<!-- <scope>test</scope> -->
</dependency>
<dependency>
<groupId>cn.iocoder.boot</groupId>
<artifactId>yudao-module-llm-api</artifactId>
<version>${revision}</version>
</dependency>
<dependency>
<groupId>cn.iocoder.boot</groupId>
<artifactId>yudao-module-infra-biz</artifactId>
<version>2.3.0-jdk8-SNAPSHOT</version>
</dependency>
<!-- Web 相关 -->
<dependency>

View File

@ -5,4 +5,5 @@ public class DataConstants {
// Personal (private) dataset flag
public static final int dataTypePrivate = 0;
public static final int dataTypePublic = 1;
}

View File

@ -0,0 +1,10 @@
package cn.iocoder.yudao.module.llm.constant;
public class DatasetMoreModalConstants {
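// Assumed ZIP layout (inferred from the parsing code in DatasetServiceImpl): a train.json describing
// the conversations plus an images/ folder holding the pictures it references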
public static final String train_jsonfile = "train.json";
public static final String images_path = "images/";
public static final int BUFFER_SIZE = 4096; // buffer size for reading ZIP entries
}

View File

@ -50,6 +50,13 @@ public class DatasetController {
return success(datasetService.createDataset(createReqVO));
}
@PostMapping("/createDatasetMoreModal")
@Operation(summary = "创建数据集")
// @PreAuthorize("@ss.hasPermission('llm:dataset:create')")
public CommonResult<Long> createDatasetMoreModal (@Valid @RequestBody DatasetSaveReqVO createReqVO) {
return success(datasetService.createDatasetMoreModal(createReqVO));
}
@PutMapping("/update")
@Operation(summary = "更新数据集")
// @PreAuthorize("@ss.hasPermission('llm:dataset:update')")
@ -66,6 +73,14 @@ public class DatasetController {
return success(list);
}
@GetMapping("/all/{type}")
@Operation(summary = "查询所有数据集接口")
// @PreAuthorize("@ss.hasPermission('llm:dataset:query')")
public CommonResult<List<DatasetTreeNode>> queryAllByBaseModelType (@PathVariable("type") Integer type) {
List<DatasetTreeNode> list = datasetService.queryAllByBaseModelType(type);
return success(list);
}
@DeleteMapping("/delete")
@Operation(summary = "删除数据集")
@Parameter(name = "id", description = "编号", required = true)

View File

@ -1,11 +1,9 @@
package cn.iocoder.yudao.module.llm.controller.admin.dataset;
import cn.iocoder.yudao.module.llm.controller.admin.dataset.vo.DatasetAnswerRespVO;
import cn.iocoder.yudao.module.llm.controller.admin.dataset.vo.DatasetQuestionPageReqVO;
import cn.iocoder.yudao.module.llm.controller.admin.dataset.vo.DatasetQuestionRespVO;
import cn.iocoder.yudao.module.llm.controller.admin.dataset.vo.DatasetQuestionSaveReqVO;
import cn.iocoder.yudao.module.llm.controller.admin.dataset.vo.*;
import cn.iocoder.yudao.module.llm.dal.dataobject.dataset.DatasetQuestionDO;
import cn.iocoder.yudao.module.llm.service.dataset.DatasetQuestionService;
import cn.iocoder.yudao.module.llm.service.dataset.DatasetService;
import org.apache.poi.hssf.usermodel.HSSFCellStyle;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
@ -49,6 +47,9 @@ public class DatasetQuestionController {
@Resource
private DatasetQuestionService datasetQuestionService;
@Resource
private DatasetService datasetService;
@PutMapping("data-anno")
@Operation(summary = "保存标注接口")
@ -71,6 +72,10 @@ public class DatasetQuestionController {
@ApiAccessLog(operateType = EXPORT)
public void exportDatasetFilesExcel(@Valid DatasetQuestionPageReqVO pageReqVO,
HttpServletResponse response) throws IOException {
// DatasetRespVO dataset = datasetService.getDataset(pageReqVO.getDatasetId());
// if(dataset!=null&&dataset.getStatus()!=2){
// throw new RuntimeException("Only datasets whose status is completed can be exported");
// }
HSSFWorkbook template = new HSSFWorkbook();
HSSFSheet sheet = template.createSheet();
// Create a cell style with vertical centering
@ -94,29 +99,31 @@ public class DatasetQuestionController {
String system = item.getSystem();
String question = item.getQuestion();
List<DatasetAnswerRespVO> datasetAnswerRespVO = item.getDatasetAnswerRespVO();
List<String> collect = datasetAnswerRespVO.stream().map(DatasetAnswerRespVO::getAnswer).collect(Collectors.toList());
if (collect.size() == 0){
row = sheet.createRow(count);
row.createCell(0).setCellValue(system);
row.getCell(0).setCellStyle(cellStyle);
row.createCell(1).setCellValue(question);
row.getCell(1).setCellStyle(cellStyle);
row.createCell(2).setCellValue("");
row.getCell(2).setCellStyle(cellStyle);
id.add(count);
count++;
}else {
for (String s : collect) {
if(datasetAnswerRespVO!=null&&datasetAnswerRespVO.size()>0){
List<String> collect = datasetAnswerRespVO.stream().map(DatasetAnswerRespVO::getAnswer).collect(Collectors.toList());
if (collect.size() == 0){
row = sheet.createRow(count);
row.createCell(0).setCellValue(system);
row.getCell(0).setCellStyle(cellStyle);
row.createCell(1).setCellValue(question);
row.getCell(1).setCellStyle(cellStyle);
row.createCell(2).setCellValue(s);
row.createCell(2).setCellValue("");
row.getCell(2).setCellStyle(cellStyle);
id.add(count);
count++;
}else {
for (String s : collect) {
row = sheet.createRow(count);
row.createCell(0).setCellValue(system);
row.getCell(0).setCellStyle(cellStyle);
row.createCell(1).setCellValue(question);
row.getCell(1).setCellStyle(cellStyle);
row.createCell(2).setCellValue(s);
row.getCell(2).setCellStyle(cellStyle);
count++;
}
id.add(count-1);
}
id.add(count-1);
}
}
// Merge cells that contain identical content

View File

@ -49,4 +49,6 @@ public class DatasetPageReqVO extends PageParam {
@Schema(description = "标注进度", example = "20")
private Integer annotateProgress;
private Integer datasetParentType;
}

View File

@ -40,4 +40,7 @@ public class DatasetQuestionRespVO {
@Schema(description = "标注内容")
private List<DatasetAnswerRespVO> datasetAnswerRespVO;
@Schema(description = "问题对应的图片")
private List<String> imagesList;
}

View File

@ -46,5 +46,7 @@ public class DatasetSaveReqVO {
@Schema(description = "数据集数据文件", example = "[]")
private List<DatasetFilesSaveReqVO> datasetFiles;
@Schema(description = "数据集父类型(1文本数据据,2多模态数据集)", example = "1")
private Integer datasetParentType;
}

View File

@ -0,0 +1,23 @@
package cn.iocoder.yudao.module.llm.controller.admin.dataset.vo;
import lombok.Data;
import java.io.Serializable;
import java.util.List;
import java.util.Map;
import java.util.Objects;
@Data
public class ZipProcessingResultVo implements Serializable {
private String trainJson;
private List<Map<String, String>> imagesList;
public ZipProcessingResultVo(String trainJson, List<Map<String, String>> imagesList){
this.imagesList = imagesList;
this.trainJson = trainJson;
}
}

View File

@ -157,6 +157,14 @@ public class ModelServiceController {
return success(true);
}
@PutMapping("/startDatasetMoreModal")
@Operation(summary = "启动模型")
// @PreAuthorize("@ss.hasPermission('llm:base-model:update')")
public CommonResult<Boolean> startDatasetMoreModal(@Valid @RequestBody ModelServiceSaveReqVO updateReqVO) {
modelServiceService.startDatasetMoreModal(updateReqVO);
return success(true);
}
@PutMapping("/disable")
@Operation(summary = "禁用模型")
// @PreAuthorize("@ss.hasPermission('llm:base-model:update')")

View File

@ -78,4 +78,8 @@ public class ModelServiceRespVO {
@Schema(description = "api秘钥")
private String BaseApiKey;
@Schema(description = "模型类型")
@ExcelProperty("模型类型")
private String modelType;
}

View File

@ -44,4 +44,6 @@ public class DatasetAnswerDO extends BaseDO {
*/
private String answer;
private String answerFrom;
}

View File

@ -74,5 +74,7 @@ public class DatasetDO extends BaseDO {
private String fileUrl;
private Integer datasetParentType;
}

View File

@ -0,0 +1,48 @@
package cn.iocoder.yudao.module.llm.dal.dataobject.dataset;
import cn.iocoder.yudao.framework.mybatis.core.dataobject.BaseDO;
import com.baomidou.mybatisplus.annotation.KeySequence;
import com.baomidou.mybatisplus.annotation.TableId;
import com.baomidou.mybatisplus.annotation.TableName;
import lombok.*;
/**
* Dataset image DO
*
* @author 华大大模型
*/
@TableName("llm_dataset_images")
@KeySequence("llm_dataset_images_seq") // key sequence for Oracle, PostgreSQL, Kingbase, DB2 and H2; can be omitted for MySQL and similar databases
@Data
@EqualsAndHashCode(callSuper = true)
@ToString(callSuper = true)
@Builder
@NoArgsConstructor
@AllArgsConstructor
public class DatasetImagesDO extends BaseDO {
/**
* Primary key ID
*/
@TableId
private Long id;
/**
* Dataset ID
*/
private Long datasetId;
/**
* Data length
*/
private Long dataLength;
/**
* ID of the row in the dataset file table
*/
private Long datasetFile;
/**
* Image file URL
*/
private String datasetImageUrl;
private String datasetImageName;
}

View File

@ -0,0 +1,41 @@
package cn.iocoder.yudao.module.llm.dal.dataobject.dataset;
import cn.iocoder.yudao.framework.mybatis.core.dataobject.BaseDO;
import com.baomidou.mybatisplus.annotation.KeySequence;
import com.baomidou.mybatisplus.annotation.TableId;
import com.baomidou.mybatisplus.annotation.TableName;
import lombok.*;
/**
* Dataset question-answer image association DO
*
* @author 华大大模型
*/
@TableName("llm_dataset_question_answer_image")
@KeySequence("llm_dataset_question_answer_image_seq") // key sequence for Oracle, PostgreSQL, Kingbase, DB2 and H2; can be omitted for MySQL and similar databases
@Data
@EqualsAndHashCode(callSuper = true)
@ToString(callSuper = true)
@Builder
@NoArgsConstructor
@AllArgsConstructor
public class DatasetQuestionAnswerImageDO extends BaseDO {
/**
* Primary key ID
*/
@TableId
private Long id;
/**
* Dataset ID
*/
private Long datasetId;
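// questionId / answerId / dataImageId reference the question, answer and image rows; imageUrl is a denormalized copy of the image address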
private Long questionId;
private Long answerId;
private Long dataImageId;
private String imageUrl;
}

View File

@ -46,4 +46,6 @@ public class DatasetQuestionDO extends BaseDO {
@TableField("`system`")
private String system;
private String questionFrom;
}

View File

@ -0,0 +1,30 @@
package cn.iocoder.yudao.module.llm.dal.mysql.dataset;
import cn.iocoder.yudao.framework.common.pojo.PageResult;
import cn.iocoder.yudao.framework.mybatis.core.mapper.BaseMapperX;
import cn.iocoder.yudao.framework.mybatis.core.query.LambdaQueryWrapperX;
import cn.iocoder.yudao.module.llm.controller.admin.dataset.vo.DatasetFilesPageReqVO;
import cn.iocoder.yudao.module.llm.dal.dataobject.dataset.DatasetFilesDO;
import cn.iocoder.yudao.module.llm.dal.dataobject.dataset.DatasetImagesDO;
import org.apache.ibatis.annotations.Mapper;
/**
* Dataset image Mapper
*
* @author 华大大模型
*/
@Mapper
public interface DatasetImagesMapper extends BaseMapperX<DatasetImagesDO> {
// default PageResult<DatasetFilesDO> selectPage(DatasetFilesPageReqVO reqVO) {
// return selectPage(reqVO, new LambdaQueryWrapperX<DatasetFilesDO>()
// .eqIfPresent(DatasetFilesDO::getDatasetId, reqVO.getDatasetId())
// .eqIfPresent(DatasetFilesDO::getDataLength, reqVO.getDataLength())
// .eqIfPresent(DatasetFilesDO::getDatasetFile, reqVO.getDatasetFile())
// .eqIfPresent(DatasetFilesDO::getDatasetFileUrl, reqVO.getDatasetFileUrl())
// .betweenIfPresent(DatasetFilesDO::getCreateTime, reqVO.getCreateTime())
// .orderByDesc(DatasetFilesDO::getId));
// }
}

View File

@ -30,6 +30,7 @@ public interface DatasetMapper extends BaseMapperX<DatasetDO> {
.eqIfPresent(DatasetDO::getDatasetFile, reqVO.getDatasetFile())
.eqIfPresent(DatasetDO::getDatasetType, reqVO.getDatasetType())
.eqIfPresent(DatasetDO::getDatasetFileUrl, reqVO.getDatasetFileUrl())
.eq(DatasetDO::getDatasetParentType,reqVO.getDatasetParentType())
.betweenIfPresent(DatasetDO::getCreateTime, reqVO.getCreateTime())
.orderByDesc(DatasetDO::getId));
}

View File

@ -0,0 +1,27 @@
package cn.iocoder.yudao.module.llm.dal.mysql.dataset;
import cn.iocoder.yudao.framework.mybatis.core.mapper.BaseMapperX;
import cn.iocoder.yudao.module.llm.dal.dataobject.dataset.DatasetImagesDO;
import cn.iocoder.yudao.module.llm.dal.dataobject.dataset.DatasetQuestionAnswerImageDO;
import org.apache.ibatis.annotations.Mapper;
/**
* Dataset question-answer image Mapper
*
* @author 华大大模型
*/
@Mapper
public interface DatasetQuestionAnswerImageMapper extends BaseMapperX<DatasetQuestionAnswerImageDO> {
// default PageResult<DatasetFilesDO> selectPage(DatasetFilesPageReqVO reqVO) {
// return selectPage(reqVO, new LambdaQueryWrapperX<DatasetFilesDO>()
// .eqIfPresent(DatasetFilesDO::getDatasetId, reqVO.getDatasetId())
// .eqIfPresent(DatasetFilesDO::getDataLength, reqVO.getDataLength())
// .eqIfPresent(DatasetFilesDO::getDatasetFile, reqVO.getDatasetFile())
// .eqIfPresent(DatasetFilesDO::getDatasetFileUrl, reqVO.getDatasetFileUrl())
// .betweenIfPresent(DatasetFilesDO::getCreateTime, reqVO.getCreateTime())
// .orderByDesc(DatasetFilesDO::getId));
// }
}

View File

@ -292,7 +292,7 @@ public class BaseModelServiceImpl implements BaseModelService {
@Override
public List<BaseModelDO> listBaseModels() {
LambdaQueryWrapper<BaseModelDO> select = new LambdaQueryWrapper<BaseModelDO>()
.select(BaseModelDO::getId, BaseModelDO::getModelName, BaseModelDO::getTheTuningName);
.select(BaseModelDO::getId, BaseModelDO::getModelName, BaseModelDO::getTheTuningName,BaseModelDO::getModelType);
List<BaseModelDO> selects = baseModelMapper.selectList(select);

View File

@ -2,12 +2,16 @@ package cn.iocoder.yudao.module.llm.service.dataset;
import cn.iocoder.yudao.module.llm.controller.admin.dataset.vo.*;
import cn.iocoder.yudao.module.llm.dal.dataobject.dataset.DatasetAnswerDO;
import cn.iocoder.yudao.module.llm.dal.dataobject.dataset.DatasetDO;
import cn.iocoder.yudao.module.llm.dal.dataobject.dataset.DatasetQuestionAnswerImageDO;
import cn.iocoder.yudao.module.llm.dal.dataobject.dataset.DatasetQuestionDO;
import cn.iocoder.yudao.module.llm.dal.mysql.dataset.DatasetAnswerMapper;
import cn.iocoder.yudao.module.llm.dal.mysql.dataset.DatasetMapper;
import cn.iocoder.yudao.module.llm.dal.mysql.dataset.DatasetQuestionAnswerImageMapper;
import cn.iocoder.yudao.module.llm.dal.mysql.dataset.DatasetQuestionMapper;
import com.baomidou.mybatisplus.core.conditions.query.LambdaQueryWrapper;
import com.baomidou.mybatisplus.core.toolkit.CollectionUtils;
import com.baomidou.mybatisplus.core.toolkit.Wrappers;
import jodd.util.StringUtil;
import org.springframework.stereotype.Service;
@ -43,6 +47,9 @@ public class DatasetQuestionServiceImpl implements DatasetQuestionService {
@Resource
private DatasetMapper datasetMapper;
@Resource
private DatasetQuestionAnswerImageMapper datasetQuestionAnswerImageMapper;
@Override
public Long createDatasetQuestion(DatasetQuestionSaveReqVO createReqVO) {
// Insert
@ -83,6 +90,9 @@ public class DatasetQuestionServiceImpl implements DatasetQuestionService {
@Override
public PageResult<DatasetQuestionRespVO> getDatasetQuestionPage(DatasetQuestionPageReqVO pageReqVO) {
PageResult<DatasetQuestionDO> datasetQuestionDOPageResult = datasetQuestionMapper.selectPage(pageReqVO);
Long datasetId = pageReqVO.getDatasetId();
DatasetDO datasetDO = datasetMapper.selectById(datasetId);
Integer datasetParentType = datasetDO.getDatasetParentType();
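// Parent type 2 marks a multimodal dataset (see DatasetSaveReqVO); only then are the image URLs linked to each question attached below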
PageResult<DatasetQuestionRespVO> result = BeanUtils.toBean(datasetQuestionDOPageResult, DatasetQuestionRespVO.class);
if (CollectionUtils.isNotEmpty(result.getList())) {
// result.getList().forEach(item -> {
@ -102,7 +112,17 @@ public class DatasetQuestionServiceImpl implements DatasetQuestionService {
Map<Long, List<DatasetAnswerRespVO>> collect1 = respVOS.stream().collect(Collectors.groupingBy(DatasetAnswerRespVO::getQuestionId));
list.forEach(item -> {
item.setDatasetAnswerRespVO(collect1.get(item.getId()));
if (datasetParentType != null && datasetParentType == 2) {
LambdaQueryWrapper<DatasetQuestionAnswerImageDO> imagewrapper = new LambdaQueryWrapper<DatasetQuestionAnswerImageDO>()
.eq(DatasetQuestionAnswerImageDO::getQuestionId, item.getId())
.eq(DatasetQuestionAnswerImageDO::getDatasetId,item.getDatasetId());
List<DatasetQuestionAnswerImageDO> datasetQuestionAnswerImageDOList = datasetQuestionAnswerImageMapper.selectList(imagewrapper);
List<String> imageUrlList = datasetQuestionAnswerImageDOList.stream().map(DatasetQuestionAnswerImageDO::getImageUrl).collect(Collectors.toList());
item.setImagesList(imageUrlList);
}
});
}
return result;
}

View File

@ -8,6 +8,7 @@ import cn.iocoder.yudao.module.llm.controller.admin.dataset.vo.*;
import cn.iocoder.yudao.module.llm.dal.dataobject.dataset.DatasetDO;
import cn.iocoder.yudao.framework.common.pojo.PageResult;
import cn.iocoder.yudao.framework.common.pojo.PageParam;
import org.springframework.transaction.annotation.Transactional;
/**
* Dataset Service interface
@ -24,6 +25,9 @@ public interface DatasetService {
*/
Long createDataset(@Valid DatasetSaveReqVO createReqVO);
@Transactional
Long createDatasetMoreModal(DatasetSaveReqVO createReqVO);
/**
* 更新数据集
*
@ -57,4 +61,5 @@ public interface DatasetService {
List<DatasetTreeNode> queryAll();
List<DatasetTreeNode> queryAllByBaseModelType(Integer type);
}

View File

@ -8,17 +8,13 @@ import cn.iocoder.yudao.framework.common.exception.util.ServiceExceptionUtil;
import cn.iocoder.yudao.framework.common.pojo.PageResult;
import cn.iocoder.yudao.framework.common.util.object.BeanUtils;
import cn.iocoder.yudao.module.infra.api.file.FileApi;
import cn.iocoder.yudao.module.infra.service.file.FileService;
import cn.iocoder.yudao.module.llm.constant.DataConstants;
import cn.iocoder.yudao.module.llm.constant.DatasetMoreModalConstants;
import cn.iocoder.yudao.module.llm.controller.admin.dataset.dto.DataJsonTemplate;
import cn.iocoder.yudao.module.llm.controller.admin.dataset.vo.*;
import cn.iocoder.yudao.module.llm.dal.dataobject.dataset.DatasetAnswerDO;
import cn.iocoder.yudao.module.llm.dal.dataobject.dataset.DatasetDO;
import cn.iocoder.yudao.module.llm.dal.dataobject.dataset.DatasetFilesDO;
import cn.iocoder.yudao.module.llm.dal.dataobject.dataset.DatasetQuestionDO;
import cn.iocoder.yudao.module.llm.dal.mysql.dataset.DatasetAnswerMapper;
import cn.iocoder.yudao.module.llm.dal.mysql.dataset.DatasetFilesMapper;
import cn.iocoder.yudao.module.llm.dal.mysql.dataset.DatasetMapper;
import cn.iocoder.yudao.module.llm.dal.mysql.dataset.DatasetQuestionMapper;
import cn.iocoder.yudao.module.llm.dal.dataobject.dataset.*;
import cn.iocoder.yudao.module.llm.dal.mysql.dataset.*;
import cn.iocoder.yudao.module.llm.enums.DatasetStatusEnum;
import cn.iocoder.yudao.module.llm.service.finetuningtask.FineTuningTaskService;
import cn.iocoder.yudao.module.llm.service.modelassesstaskauto.ModelAssessTaskAutoService;
@ -26,6 +22,8 @@ import cn.iocoder.yudao.module.llm.service.modelassesstaskmanual.ModelAssessTask
import cn.iocoder.yudao.module.llm.service.modelassesstaskmanualbackup.ModelAssessTaskManualBackupService;
import cn.iocoder.yudao.module.llm.utils.DataSetReadFileUtils;
import cn.iocoder.yudao.module.llm.utils.vo.CsvDataSetVO;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.baomidou.mybatisplus.core.conditions.query.LambdaQueryWrapper;
import com.baomidou.mybatisplus.core.toolkit.CollectionUtils;
import com.baomidou.mybatisplus.core.toolkit.StringUtils;
@ -41,6 +39,7 @@ import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Workbook;
import org.springframework.mock.web.MockMultipartFile;
import org.springframework.stereotype.Service;
import org.springframework.transaction.annotation.Transactional;
import org.springframework.validation.annotation.Validated;
@ -49,9 +48,12 @@ import javax.annotation.Resource;
import java.io.*;
import java.net.HttpURLConnection;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.nio.charset.StandardCharsets;
import java.nio.file.Paths;
import java.time.LocalDateTime;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
@ -92,6 +94,20 @@ public class DatasetServiceImpl implements DatasetService {
@Resource
private ModelAssessTaskManualBackupService modelAssessTaskManualBackupService;
@Resource
private DatasetImagesMapper datasetImagesMapper;
@Resource
private FileService fileService;
@Resource
private DatasetQuestionAnswerImageMapper datasetQuestionAnswerImageMapper;
// Captures the file extension (everything after the last dot)
private static final Pattern IMAGE_FILE_EXTENSION_PATTERN = Pattern.compile("\\.([^.]+)$");
// Image paths inside train.json carry an "image/" prefix
private static final String JSON_IMAGE_PREFIX = "image/";
private static long getFileContentLength(File file) throws IOException {
FileInputStream fis = new FileInputStream(file);
byte[] buffer = new byte[1024];
@ -164,6 +180,318 @@ public class DatasetServiceImpl implements DatasetService {
return dataset.getId();
}
/**
* Create a multimodal dataset
*/
@Override
@Transactional
public Long createDatasetMoreModal(DatasetSaveReqVO createReqVO) {
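// Flow: validate the name, insert the dataset row, persist the uploaded files, parse the ZIP package
// into questions/answers/images, then derive the question count, annotation progress and status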
// Validate
validateDatasetNameExists(createReqVO);
if (createReqVO.getType() == null) {
createReqVO.setType(0);
}
List<DatasetFilesSaveReqVO> datasetFiles = createReqVO.getDatasetFiles();
// A multimodal dataset must carry at least one data file (the ZIP package)
if (CollectionUtils.isEmpty(datasetFiles)) {
throw new ServiceException(new ErrorCode(20000, "Dataset files must not be empty"));
}
// Insert
DatasetDO dataset = BeanUtils.toBean(createReqVO, DatasetDO.class);
datasetMapper.insert(dataset);
if (CollectionUtils.isNotEmpty(datasetFiles)) {
datasetFiles.stream().forEach(
datasetFilesSaveReqVO -> {
datasetFilesSaveReqVO.setDatasetId(dataset.getId());
}
);
// parseFile(datasetFiles);
resolveZipFileDatasetMoreModal(datasetFiles);
Long count = datasetQuestionMapper.selectCount(new LambdaQueryWrapper<DatasetQuestionDO>()
.eq(DatasetQuestionDO::getDatasetId, dataset.getId()));
if (count <= 0) {
throw new ServiceException(new ErrorCode(20000, "The dataset must contain at least one question"));
}
dataset.setDataLength(count);
Long annoCount = datasetQuestionMapper.selectCount(new LambdaQueryWrapper<DatasetQuestionDO>()
.eq(DatasetQuestionDO::getDatasetId, dataset.getId())
.eq(DatasetQuestionDO::getStatus, 2));
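// annotateProgress = annotated questions / total questions; status values assumed to mirror the text-dataset flow: 0 = not annotated, 1 = in progress, 2 = fully annotated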
double ratio = count == 0 ? 0 : ((double) annoCount / count) * 100;
Integer formattedRatio = ratio == 0 ? 0 : (int) ratio;
Integer status = formattedRatio == 100 ? 2 : 1;
if (formattedRatio != null) {
dataset.setAnnotateProgress(formattedRatio);
}
if (annoCount == 0) {
status = 0;
}
// if (dataset.getDatasetType() == 2) {
// if (status != 2) {
// throw new ServiceException(new ErrorCode(
// 20000, "评估数据集只能上传标注完成的数据"));
// }
// } else {
// if (dataset.getStatus() != status) {
// throw new ServiceException(new ErrorCode(
// 20000, "数据集标注状态错误!应该是【" + DatasetStatusEnum.getStatusByName(status) + ""));
// }
// }
dataset.setStatus(status);
datasetMapper.updateById(dataset);
}
return dataset.getId();
}
// Parse a multimodal dataset ZIP package
public void resolveZipFileDatasetMoreModal(List<DatasetFilesSaveReqVO> datasetFiles){
List<DatasetFilesDO> insertDatasetFiles = BeanUtils.toBean(datasetFiles, DatasetFilesDO.class);
datasetFilesMapper.insertBatch(insertDatasetFiles, 100);
List<DatasetFilesDO> zipFiles = insertDatasetFiles.stream()
.filter(datasetFilesDO -> datasetFilesDO.getDatasetFileUrl().toLowerCase().endsWith(".zip"))
.collect(Collectors.toList());
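// Each ZIP is fetched via its stored URL and processed as a stream; other attachments are only recorded above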
// List<Map<>>
zipFiles.forEach(datasetFilesDO ->{
HttpURLConnection connection = DataSetReadFileUtils.readFile(datasetFilesDO.getDatasetFileUrl());
List<Map<String,Object>> questionAnswerList=new ArrayList<>();
if (connection != null) {
try {
InputStream inputStream = connection.getInputStream();
ZipProcessingResultVo zipProcessingResultVo = processZipFileStream(inputStream);
String trainJson = zipProcessingResultVo.getTrainJson();
JSONArray jsonArray = JSONArray.parseArray(trainJson);
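// Assumed train.json structure (inferred from the parsing below), roughly:
// [ { "conversations": [ {"from": "human", "value": "<question>"}, {"from": "gpt", "value": "<answer>"} ],
//     "images": ["image/img1.jpg"] }, ... ]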
if(jsonArray!=null&&jsonArray.size()>0){
for (int i=0;i<jsonArray.size();i++){
JSONObject jsonObject = jsonArray.getJSONObject(i);
JSONArray conversationsJsonArray = jsonObject.getJSONArray("conversations");
JSONArray imagesJsonArray = jsonObject.getJSONArray("images");
// Entries in conversationsJsonArray alternate: even indexes (0-based) are questions, odd indexes the matching answers
for(int j=0;j<conversationsJsonArray.size();j++){
Map<String,Object> quesionanswermap=new HashMap<>();
JSONObject conversationsJson=conversationsJsonArray.getJSONObject(j);
if(j%2==0){
DatasetQuestionDO qdo=new DatasetQuestionDO();
qdo.setDatasetId(datasetFiles.get(0).getDatasetId());
qdo.setDatasetFilesId(datasetFilesDO.getId());
String question=conversationsJson.getString("value");
String questionFrom=conversationsJson.getString("from");
qdo.setQuestion(question);
qdo.setCreateTime(LocalDateTime.now());
qdo.setQuestionFrom(questionFrom);
datasetQuestionMapper.insert(qdo);
// Read the matching answer entry (the next element)
JSONObject conversationsAnswerJson=conversationsJsonArray.getJSONObject(j+1);
DatasetAnswerDO ado=new DatasetAnswerDO();
ado.setDatasetId(datasetFiles.get(0).getDatasetId());
ado.setDatasetFilesId(datasetFilesDO.getId());
String answer=conversationsAnswerJson.getString("value");
String answerfrom=conversationsAnswerJson.getString("from");
ado.setAnswer(answer);
ado.setAnswerFrom(answerfrom);
ado.setCreateTime(LocalDateTime.now());
ado.setQuestionId(qdo.getId());
datasetAnswerMapper.insert(ado);
quesionanswermap.put("questionid",qdo.getId());
quesionanswermap.put("answerid",ado.getId());
quesionanswermap.put("images",imagesJsonArray);
questionAnswerList.add(quesionanswermap);
}
}
}
}
// Persist the uploaded image URLs
List<Map<String, String>> imagesList = zipProcessingResultVo.getImagesList();
List<DatasetImagesDO> datasetimagesdolist=new ArrayList<>();
List<DatasetQuestionAnswerImageDO> datasetQuestionAnswerImageDOList=new ArrayList<>();
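// Link every uploaded image to each question/answer pair whose "images" array references it;
// paths inside train.json carry the "image/" prefix while the upload map only keeps the bare file name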
imagesList.forEach(url ->{
DatasetImagesDO ido=new DatasetImagesDO();
ido.setDatasetId(datasetFilesDO.getDatasetId());
ido.setDatasetFile(datasetFilesDO.getDatasetFile());
ido.setDatasetImageUrl(url.get("url").toString());
ido.setCreateTime(LocalDateTime.now());
// ido.setDatasetImageName()
// datasetimagesdolist.add(ido);
datasetImagesMapper.insert(ido);
// Collect rows for the llm_dataset_question_answer_image table
for(int i=0;i<questionAnswerList.size();i++){
Map<String, Object> map = questionAnswerList.get(i);
Long questionid=Long.parseLong(map.get("questionid").toString());
Long answerid=Long.parseLong(map.get("answerid").toString());
JSONArray imagesJsonArray= (JSONArray) map.get("images");
String imagename=url.get("imagename");
for (int k=0;k<imagesJsonArray.size();k++){
if(imagesJsonArray.get(k).toString().equals(JSON_IMAGE_PREFIX+imagename)){
DatasetQuestionAnswerImageDO data=new DatasetQuestionAnswerImageDO();
data.setDatasetId(datasetFilesDO.getDatasetId());
data.setDataImageId(ido.getId());
data.setQuestionId(questionid);
data.setAnswerId(answerid);
data.setImageUrl(url.get("url").toString());
datasetQuestionAnswerImageDOList.add(data);
}
}
}
});
datasetQuestionAnswerImageMapper.insertBatch(datasetQuestionAnswerImageDOList);
// datasetImagesMapper.insertBatch(datasetimagesdolist);
} catch (Exception e) {
throw exception(new ErrorCode(
11000, "请正确上传zip格式得数据"));
} finally {
connection.disconnect();
}
}
});
}
public ZipProcessingResultVo processZipFileStream(InputStream zipInputStream) throws IOException {
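// Walks the ZIP stream once: captures the train.json content and uploads every image entry,
// returning the JSON text plus one {url, imagename} map per uploaded image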
String trainJsonlContent = null;
List<Map<String,String>> uploadedImageUrls = new ArrayList<>();
// try-with-resources ensures the ZipInputStream is closed
try (ZipInputStream zis = new ZipInputStream(zipInputStream, StandardCharsets.UTF_8)) { // assume ZIP entry names are UTF-8 encoded
ZipEntry entry;
// Iterate over every entry in the ZIP file
while ((entry = zis.getNextEntry()) != null) {
String entryName = entry.getName();
// Skip directory entries
if (entry.isDirectory()) {
zis.closeEntry();
continue;
}
String jsonfilename=entryName.split("/")[entryName.split("/").length-1];
// 1. Read the train.json content
if (DatasetMoreModalConstants.train_jsonfile.equals(jsonfilename)) {
trainJsonlContent = readEntryContentAsString(zis);
System.out.println("成功读取 train.jsonl 内容,大小: " + (trainJsonlContent != null ? trainJsonlContent.length() : 0) + " 字符。");
}
// 2. Handle image files bundled in the archive
else if (isImageFile(entryName)) {
Map<String,String> map=new HashMap<>();
// Extract the bare file name (e.g. img1.jpg)
String filename = Paths.get(entryName).getFileName().toString();
String contentType = getContentType(filename);
// Entry size; may be -1 when not recorded in the ZIP header
long imageSize = entry.getSize();
// Note: both branches below read the entry fully into memory and upload it through fileService;
// the only difference is the warning logged when the size is unknown
String imageUrl;
if (imageSize == -1) {
System.err.println("Warning: Image entry " + entryName + " has unknown size (-1). Reading to ByteArrayOutputStream first for upload.");
byte[] imageData = readEntryContentAsBytes(zis); // read into memory
imageSize = imageData.length;
InputStream tempStream = new java.io.ByteArrayInputStream(imageData);
MockMultipartFile file = new MockMultipartFile("file", filename, contentType, tempStream);
imageUrl = fileService.createFile(file.getOriginalFilename(), "", IoUtil.readBytes(file.getInputStream()));
} else {
byte[] imageData = readEntryContentAsBytes(zis);
InputStream tempStream = new java.io.ByteArrayInputStream(imageData);
MockMultipartFile file = new MockMultipartFile("file", filename, contentType, tempStream);
// Upload the buffered entry bytes through the infra file service
imageUrl = fileService.createFile(file.getOriginalFilename(), "", IoUtil.readBytes(file.getInputStream()));
}
map.put("url",imageUrl);
map.put("imagename",filename);
uploadedImageUrls.add(map);
System.out.println("成功上传图片: " + filename + "URL: " + imageUrl);
}
zis.closeEntry(); // close the current entry before reading the next one
}
} catch (IOException e) {
System.err.println("Failed to process the ZIP stream: " + e.getMessage());
throw e; // rethrow so the caller can handle it
}
// Optional sanity checks, depending on business requirements:
// if (trainJsonlContent == null) {
// System.out.println("Warning: train.json was not found in the ZIP file");
// // throw new IOException("train.json is missing.");
// }
// if (uploadedImageUrls.isEmpty()) {
// System.out.println("Warning: no images were found in the ZIP file");
// }
return new ZipProcessingResultVo(trainJsonlContent, uploadedImageUrls);
}
private String readEntryContentAsString(ZipInputStream zis) throws IOException {
ByteArrayOutputStream baos = new ByteArrayOutputStream();
byte[] buffer = new byte[DatasetMoreModalConstants.BUFFER_SIZE];
int len;
while ((len = zis.read(buffer)) != -1) {
baos.write(buffer, 0, len);
}
return baos.toString(StandardCharsets.UTF_8.name()); // assume train.json is UTF-8 encoded
}
/**
* Reads the full content of the current entry from the ZipInputStream as a byte array.
* Mainly used when ZipEntry.getSize() returns -1, or when the upload service cannot consume a stream of unknown length.
*/
private byte[] readEntryContentAsBytes(ZipInputStream zis) throws IOException {
ByteArrayOutputStream baos = new ByteArrayOutputStream();
byte[] buffer = new byte[DatasetMoreModalConstants.BUFFER_SIZE];
int len;
while ((len = zis.read(buffer)) != -1) {
baos.write(buffer, 0, len);
}
return baos.toByteArray();
}
/**
* Checks whether the file name has a common image extension
*/
private boolean isImageFile(String filename) {
String lowerCaseFilename = filename.toLowerCase();
return lowerCaseFilename.endsWith(".jpg") ||
lowerCaseFilename.endsWith(".jpeg") ||
lowerCaseFilename.endsWith(".png") ||
lowerCaseFilename.endsWith(".gif") ||
lowerCaseFilename.endsWith(".bmp") ||
lowerCaseFilename.endsWith(".webp");
}
/**
* Derives the MIME content type from the file name extension
*/
private String getContentType(String filename) {
Matcher matcher = IMAGE_FILE_EXTENSION_PATTERN.matcher(filename.toLowerCase());
if (matcher.find()) {
String extension = matcher.group(1);
switch (extension) {
case "jpg":
case "jpeg": return "image/jpeg";
case "png": return "image/png";
case "gif": return "image/gif";
case "bmp": return "image/bmp";
case "webp": return "image/webp";
}
}
return "application/octet-stream"; // 默认MIME类型
}
@Override
@Transactional
public void updateDataset(DatasetSaveReqVO updateReqVO) {
@ -364,8 +692,10 @@ public class DatasetServiceImpl implements DatasetService {
public DatasetRespVO getDataset(Long id) {
DatasetDO datasetDO = datasetMapper.selectById(id);
DatasetRespVO datasetRespVO = BeanUtils.toBean(datasetDO, DatasetRespVO.class);
Integer datasetParentType = datasetDO.getDatasetParentType();
List<DatasetFilesDO> datasetFilesDOS = datasetFilesMapper.selectList(new LambdaQueryWrapper<DatasetFilesDO>().eq(DatasetFilesDO::getDatasetId, id));
datasetRespVO.setDatasetFiles(BeanUtils.toBean(datasetFilesDOS, DatasetFilesRespVO.class));
/*List<DatasetQuestionDO> datasetQuestionDO = datasetQuestionMapper.selectList(new LambdaQueryWrapper<DatasetQuestionDO>().eq(DatasetQuestionDO::getDatasetId, id));
List<DatasetQuestionRespVO> datasetQuestionRespVOS = BeanUtils.toBean(datasetQuestionDO, DatasetQuestionRespVO.class);
datasetRespVO.setDatasetQuestionRespVOS(datasetQuestionRespVOS);*/
@ -387,7 +717,41 @@ public class DatasetServiceImpl implements DatasetService {
result.add(datasetRespVOS0);
result.add(datasetRespVOS1);*/
List<DatasetDO> datasetDOS = datasetMapper.selectList(new LambdaQueryWrapper<DatasetDO>()
.eq(DatasetDO::getStatus, 2)); // only datasets whose annotation is completed
.eq(DatasetDO::getStatus, 2)
); // only datasets whose annotation is completed
// Create two root nodes, one for each dataset type
DatasetTreeNode privateRoot = new DatasetTreeNode(DataConstants.dataTypePrivate);
DatasetTreeNode publicRoot = new DatasetTreeNode(DataConstants.dataTypePublic);
for (DatasetDO datasetDO : datasetDOS) {
DatasetRespVO datasetRespVO = BeanUtils.toBean(datasetDO, DatasetRespVO.class);
// Place the dataset under the root matching its type field
if (datasetRespVO.getType() == DataConstants.dataTypePrivate) {
privateRoot.getChildren().add(datasetRespVO);
} else if (datasetRespVO.getType() == DataConstants.dataTypePublic) {
publicRoot.getChildren().add(datasetRespVO);
}
}
List<DatasetTreeNode> root = new ArrayList<>();
root.add(privateRoot);
root.add(publicRoot);
return root;
}
@Override
public List<DatasetTreeNode> queryAllByBaseModelType(Integer type) {
/*List<DatasetDO> datasetDOS0 = datasetMapper.selectList(new LambdaQueryWrapper<DatasetDO>().eq(DatasetDO::getType, DataConstants.dataTypePrivate));
List<DatasetRespVO> datasetRespVOS0 = BeanUtils.toBean(datasetDOS0, DatasetRespVO.class);
List<DatasetDO> datasetDOS1 = datasetMapper.selectList(new LambdaQueryWrapper<DatasetDO>().eq(DatasetDO::getType, DataConstants.dataTypePublic));
List<DatasetRespVO> datasetRespVOS1 = BeanUtils.toBean(datasetDOS1, DatasetRespVO.class);
List<List<DatasetRespVO>> result = new ArrayList<>();
result.add(datasetRespVOS0);
result.add(datasetRespVOS1);*/
List<DatasetDO> datasetDOS = datasetMapper.selectList(new LambdaQueryWrapper<DatasetDO>()
.eq(DatasetDO::getStatus, 2)
.eq(DatasetDO::getDatasetParentType,type)
); // completed datasets of the requested parent type
// Create two root nodes, one for each dataset type
DatasetTreeNode privateRoot = new DatasetTreeNode(DataConstants.dataTypePrivate);
@ -554,7 +918,8 @@ public class DatasetServiceImpl implements DatasetService {
// Parse the JSON data
jsonParsing(content, datasetFilesDO);
} catch (Exception e) {
throw exception(new ErrorCode(11000, "Please upload data in valid JSON format"));
e.printStackTrace();
// throw exception(new ErrorCode(11000, "Please upload data in valid JSON format"));
} finally {
connection.disconnect();
}

View File

@ -92,6 +92,8 @@ public interface ModelServiceService {
void startTheModel(ModelServiceSaveReqVO updateReqVO);
void startDatasetMoreModal(ModelServiceSaveReqVO updateReqVO);
void disableTheModel(ModelServiceSaveReqVO updateReqVO);
void inspectTheApplication(ModelServiceSaveReqVO updateReqVO);

View File

@ -1,28 +1,23 @@
package cn.iocoder.yudao.module.llm.service.modelservice;
import cn.hutool.json.JSONUtil;
import cn.iocoder.yudao.framework.common.exception.ErrorCode;
import cn.iocoder.yudao.framework.common.pojo.PageResult;
import cn.iocoder.yudao.framework.common.util.collection.CollectionUtils;
import cn.iocoder.yudao.framework.common.util.http.HttpUtils;
import cn.iocoder.yudao.framework.common.util.object.BeanUtils;
import cn.iocoder.yudao.module.llm.controller.admin.basemodel.BaseModelController;
import cn.iocoder.yudao.module.llm.controller.admin.modelservice.vo.ModelServicePageReqVO;
import cn.iocoder.yudao.module.llm.controller.admin.modelservice.vo.ModelServiceRespVO;
import cn.iocoder.yudao.module.llm.controller.admin.modelservice.vo.ModelServiceSaveReqVO;
import cn.iocoder.yudao.module.llm.dal.dataobject.application.ApplicationDO;
import cn.iocoder.yudao.module.llm.dal.dataobject.basemodel.BaseModelDO;
import cn.iocoder.yudao.module.llm.dal.dataobject.finetuningtask.FineTuningTaskDO;
import cn.iocoder.yudao.module.llm.dal.dataobject.modelservice.ModelServiceDO;
import cn.iocoder.yudao.module.llm.dal.dataobject.servername.ServerNameDO;
import cn.iocoder.yudao.module.llm.dal.mysql.basemodel.BaseModelMapper;
import cn.iocoder.yudao.module.llm.dal.mysql.finetuningtask.FineTuningTaskMapper;
import cn.iocoder.yudao.module.llm.dal.mysql.modelservice.ModelServiceMapper;
import cn.iocoder.yudao.module.llm.framework.backend.config.LLMBackendProperties;
import cn.iocoder.yudao.module.llm.service.application.ApplicationService;
import cn.iocoder.yudao.module.llm.service.async.AsyncModelServiceService;
import cn.iocoder.yudao.module.llm.service.basemodel.BaseModelService;
import cn.iocoder.yudao.module.llm.service.basemodel.BaseModelTaskService;
import cn.iocoder.yudao.module.llm.service.http.ModelService;
import cn.iocoder.yudao.module.llm.service.http.TrainHttpService;
import cn.iocoder.yudao.module.llm.service.modelassesstaskauto.ModelAssessTaskAutoService;
@ -33,8 +28,6 @@ import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONObject;
import com.baomidou.mybatisplus.core.conditions.query.LambdaQueryWrapper;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.ObjectUtils;
import org.apache.commons.lang3.StringUtils;
import org.springframework.context.annotation.Lazy;
import org.springframework.stereotype.Service;
import org.springframework.validation.annotation.Validated;
@ -343,6 +336,7 @@ public class ModelServiceServiceImpl implements ModelServiceService {
item.setFineTuningTaskName(fineTuningTaskDO.getModelName());
item.setBaseApiUrl(fineTuningTaskDO.getApiUrl());
item.setBaseApiKey(fineTuningTaskDO.getApiKey());
item.setModelType(fineTuningTaskDO.getModelType()==null?"":fineTuningTaskDO.getModelType());
}
});
}
@ -474,6 +468,25 @@ public class ModelServiceServiceImpl implements ModelServiceService {
}
}
@Override
public void startDatasetMoreModal(ModelServiceSaveReqVO updateReqVO) {
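// Assumed deployment flow: look up the base model referenced by the request, POST {"model": <modelName>}
// to the configured deploy endpoint, then mark this model service as started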
try {
Long fineTuningTask = updateReqVO.getFineTuningTask();
BaseModelDO byAigcId = baseModelService.getById(fineTuningTask);
Map<String,String> map = new HashMap<>();
map.put("model",byAigcId.getModelName());
log.info("开始请求", llmBackendProperties.getDeployModel());
String resStr = HttpUtils.post(llmBackendProperties.getDeployModel(), null,JSON.toJSONString(map));
log.info(" unActive:{}", resStr);
ModelServiceDO updateObj = BeanUtils.toBean(updateReqVO, ModelServiceDO.class);
updateObj.setStatus(1);
updateObj.setNumber(1);
modelServiceMapper.updateById(updateObj);
}catch (Exception e){
log.error("启动基础模型状态时发生异常: {}", e.getMessage(), e);
}
}
@Override
public void disableTheModel(ModelServiceSaveReqVO updateReqVO) {
try {

View File

@ -0,0 +1,12 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE mapper PUBLIC "-//mybatis.org//DTD Mapper 3.0//EN" "http://mybatis.org/dtd/mybatis-3-mapper.dtd">
<mapper namespace="cn.iocoder.yudao.module.llm.dal.mysql.dataset.DatasetImagesMapper">
<!--
In most cases, use the generated Mapper interface directly for CRUD operations.
Only write SQL in this XML for cases it cannot cover, such as multi-table joins.
The code generator only produces the bare Mapper XML; the MybatisX plugin is recommended for generating queries.
Documentation: https://www.iocoder.cn/MyBatis/x-plugins/
-->
</mapper>

View File

@ -0,0 +1,12 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE mapper PUBLIC "-//mybatis.org//DTD Mapper 3.0//EN" "http://mybatis.org/dtd/mybatis-3-mapper.dtd">
<mapper namespace="cn.iocoder.yudao.module.llm.dal.mysql.dataset.DatasetQuestionAnswerImageMapper">
<!--
In most cases, use the generated Mapper interface directly for CRUD operations.
Only write SQL in this XML for cases it cannot cover, such as multi-table joins.
The code generator only produces the bare Mapper XML; the MybatisX plugin is recommended for generating queries.
Documentation: https://www.iocoder.cn/MyBatis/x-plugins/
-->
</mapper>