中台添加数据集相关功能
Some checks failed
Java CI with Maven / build (11) (push) Has been cancelled
Java CI with Maven / build (17) (push) Has been cancelled
Java CI with Maven / build (8) (push) Has been cancelled
yudao-ui-admin CI / build (14.x) (push) Has been cancelled
yudao-ui-admin CI / build (16.x) (push) Has been cancelled

This commit is contained in:
baggio19852005 2025-10-16 18:34:18 +08:00
parent f1acce8447
commit 526283f605
67 changed files with 5185 additions and 3 deletions

View File

@ -152,6 +152,12 @@
<groupId>org.springframework</groupId>
<artifactId>spring-webflux</artifactId>
</dependency>
<dependency>
<groupId>cn.iocoder.boot</groupId>
<artifactId>yudao-module-mdpf-biz</artifactId>
<version>2.3.0-jdk8-SNAPSHOT</version>
</dependency>
</dependencies>
</project>

View File

@ -12,6 +12,8 @@ import cn.iocoder.yudao.module.llm.service.dataset.DatasetFilesService;
import cn.iocoder.yudao.module.llm.service.dataset.vo.AigcDatasetVo;
import cn.iocoder.yudao.module.llm.service.http.TrainHttpService;
import cn.iocoder.yudao.module.llm.service.http.vo.AigcDatasetFileRespV0;
import cn.iocoder.yudao.module.mdpf.controller.dataset.vo.PlatformDatasetQuestionRespVO;
import cn.iocoder.yudao.module.mdpf.dal.dataobject.dataset.DataSetMiddleDO;
import com.alibaba.fastjson.JSONObject;
import com.baomidou.mybatisplus.core.toolkit.StringUtils;
import com.baomidou.mybatisplus.core.toolkit.Wrappers;
@ -184,4 +186,94 @@ public class AsyncDataSetService {
}
}
/**
 * Serializes the questions of a middle-platform dataset into a JSON-lines payload
 * (one {@link AigcDatasetVo} object per line), uploads it through {@code trainHttpService}
 * and records the returned file id and rebuilt URL on the dataset via {@code datasetMapper}.
 *
 * @param hostUrl             host URL passed to the upload call and used as prefix of the stored URL
 * @param datasetDO           dataset whose id and name are used for the file name and the mapper updates
 * @param datasetQuestionList questions to export, one output line per element
 * @return the part of the uploaded file URL starting at {@code "/uploads"}; an empty string when the
 *         upload returns no response, the returned URL is missing or has no {@code "/uploads"} segment,
 *         or an {@link IOException} occurs
 */
public String JsonFileWriteFineMiddle (String hostUrl, DataSetMiddleDO datasetDO, List<PlatformDatasetQuestionRespVO> datasetQuestionList) {
    try {
        log.info("开始生成 JSON 文件并上传数据集ID: {}", datasetDO.getId());
        // Build the AigcDatasetVo list
        log.debug("正在构建 AigcDatasetVo 列表...");
        List<AigcDatasetVo> aigcDatasetVoList = new ArrayList<>();
        for (PlatformDatasetQuestionRespVO dataSource : datasetQuestionList) {
            aigcDatasetVoList.add(toMiddleAigcDatasetVo(dataSource));
        }
        log.debug("AigcDatasetVo 列表构建完成。记录数量: {}", aigcDatasetVoList.size());
        // Convert the list to JSON text, one object per line
        log.debug("正在将 AigcDatasetVo 列表转换为 JSON 字符串...");
        ObjectMapper mapper = new ObjectMapper();
        StringBuilder sb = new StringBuilder();
        for (AigcDatasetVo aigcDatasetVo : aigcDatasetVoList) {
            sb.append(mapper.writeValueAsString(aigcDatasetVo)).append("\n");
        }
        // Encode explicitly as UTF-8: the platform default charset may not be able to
        // represent the (typically Chinese) dataset text and would corrupt the upload.
        log.debug("正在将 JSON 字符串转换为输入流...");
        byte[] payload = sb.toString().getBytes(java.nio.charset.StandardCharsets.UTF_8);
        InputStream inputStream = new ByteArrayInputStream(payload);
        // Upload the file
        log.info("正在上传 JSON 文件...");
        String fileName = datasetDO.getDatasetName() + "new" + datasetDO.getId() + ".json";
        AigcDatasetFileRespV0 aigcDatasetFileRespV0 = trainHttpService.AigcUploadFile(new HashMap<>(), hostUrl, inputStream, fileName);
        if (aigcDatasetFileRespV0 == null) {
            log.error("文件上传失败。数据集ID: {}", datasetDO.getId());
            return "";
        }
        log.debug("文件上传成功。文件ID: {}", aigcDatasetFileRespV0.getFileId());
        // Store the uploaded file id as the dataset's job id
        log.debug("正在更新数据集的 Job ID...");
        datasetMapper.setJobid(datasetDO.getId(), aigcDatasetFileRespV0.getFileId());
        log.info("hostUrl:{}", hostUrl);
        String s3Url = aigcDatasetFileRespV0.getS3Url();
        log.info("s3Url:{}", s3Url);
        // Locate "/uploads"; a missing URL is handled like a URL without that segment
        // instead of escaping as an uncaught NullPointerException.
        int uploadsIndex = (s3Url == null) ? -1 : s3Url.indexOf("/uploads");
        if (uploadsIndex == -1) {
            log.error("s3Url 中未找到 '/uploads' 路径");
            return "";
        }
        // Keep "/uploads" and everything after it
        String uploadsPath = s3Url.substring(uploadsIndex);
        log.info("uploadsPath: {}", uploadsPath);
        // Rebuild the full URL against the given host and persist it
        String newUrl = hostUrl + uploadsPath;
        log.info("newUrl: {}", newUrl);
        datasetMapper.setUrl(datasetDO.getId(), newUrl);
        // The value returned to the caller is the host-relative path, i.e. uploadsPath itself
        log.info("JSON 文件生成并上传成功。返回结果: {}", uploadsPath);
        return uploadsPath;
    } catch (IOException e) {
        log.error("生成或上传 JSON 文件时发生异常。数据集ID: {}", datasetDO.getId(), e);
        return "";
    }
}

/** Maps one dataset question to an upload record: system prompt, question text and first answer. */
private AigcDatasetVo toMiddleAigcDatasetVo(PlatformDatasetQuestionRespVO dataSource) {
    AigcDatasetVo aigcDatasetVo = new AigcDatasetVo();
    aigcDatasetVo.setInstruction(blankToEmptyText(dataSource.getSystem()));
    aigcDatasetVo.setInput(blankToEmptyText(dataSource.getQuestion()));
    // Only the first answer is exported; a question without answers gets an empty output
    if (!CollectionUtils.isAnyEmpty(dataSource.getDatasetAnswerRespVO())) {
        aigcDatasetVo.setOutput(blankToEmptyText(dataSource.getDatasetAnswerRespVO().get(0).getAnswer()));
    } else {
        aigcDatasetVo.setOutput("");
    }
    return aigcDatasetVo;
}

/** Returns {@code value} unchanged when it is not blank, otherwise the empty string. */
private static String blankToEmptyText(String value) {
    return StringUtils.isNotBlank(value) ? value : "";
}
}

View File

@ -17,8 +17,13 @@ import cn.iocoder.yudao.module.llm.service.http.FineTuningTaskHttpService;
import cn.iocoder.yudao.module.llm.service.http.TrainHttpService;
import cn.iocoder.yudao.module.llm.service.http.vo.AigcFineTuningCreateReqVO;
import cn.iocoder.yudao.module.llm.service.http.vo.AigcFineTuningCreateRespVO;
import cn.iocoder.yudao.module.mdpf.controller.dataset.vo.PlatformDatasetQuestionRespVO;
import cn.iocoder.yudao.module.mdpf.dal.dataobject.dataset.DataSetMiddleDO;
import cn.iocoder.yudao.module.mdpf.service.dataset.DataSetMiddleService;
import cn.iocoder.yudao.module.mdpf.service.dataset.PlatformDatasetQuestionService;
import lombok.extern.slf4j.Slf4j;
import org.jetbrains.annotations.Nullable;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.context.annotation.Lazy;
import org.springframework.scheduling.annotation.Async;
@ -58,12 +63,121 @@ public class AsyncFineTuningTaskService {
// Value of the "spring.profiles.active" property; used as the first segment of the
// fine-tuning job suffix built in createTuning.
@Value("${spring.profiles.active}")
private String active;
// Looks up the middle-platform dataset referenced by a fine-tuning task.
// NOTE(review): @Lazy presumably breaks a circular bean dependency — confirm.
@Autowired
@Lazy
private DataSetMiddleService dataSetMiddleService;
// Supplies the question list of a middle-platform dataset for the JSON export.
// NOTE(review): @Lazy presumably used for the same reason as above — confirm.
@Autowired
@Lazy
private PlatformDatasetQuestionService platformDatasetQuestionService;
/**
 * Creates a fine-tuning job on the model platform for the given task (runs asynchronously).
 *
 * <p>Steps: resolve the target server, build the request, load the base model and the
 * middle-platform dataset, export and upload the dataset questions as a JSON file, call the
 * fine-tuning endpoint matching the base model type, then persist job id / status / deploy
 * count on the task. Returns silently when no server can be resolved.
 *
 * @param fineTuningTask task to submit; its deploy count is incremented in place
 * @throws RuntimeException when the base model or dataset does not exist, when the dataset
 *                          file could not be generated/uploaded, or when any step fails
 *                          (the failure is logged and rethrown)
 */
@Async
public void createTuning (FineTuningTaskDO fineTuningTask) {
    log.info("异步创建。 开始创建微调任务,请求参数: {}", fineTuningTask);
    try {
        log.info("开始创建微调任务任务ID: {}", fineTuningTask.getId());
        ServerNameDO serverNameDO = getServerNameDO(fineTuningTask);
        if (serverNameDO == null) {
            return;
        }
        // Build the fine-tuning request
        log.debug("正在构建微调任务请求对象...");
        AigcFineTuningCreateReqVO req = getAigcFineTuningCreateReqVO(fineTuningTask);
        // Load the base model. Its model type is required further down to pick the
        // endpoint, so a missing model is rejected here instead of failing later with
        // a NullPointerException after the dataset has already been uploaded.
        log.debug("正在查询基础模型信息模型ID: {}", fineTuningTask.getBaseModelId());
        BaseModelDO baseModelDO = baseModelMapper.selectById(fineTuningTask.getBaseModelId());
        if (baseModelDO == null) {
            log.error("未找到基础模型信息模型ID: {}", fineTuningTask.getBaseModelId());
            throw new RuntimeException("基础模型信息不存在");
        }
        req.setModel(baseModelDO.getAigcModelName());
        log.debug("基础模型信息设置成功。模型名称: {}", baseModelDO.getAigcModelName());
        // Load the middle-platform dataset
        Long datasetId = fineTuningTask.getDataset();
        log.debug("正在查询数据集信息数据集ID: {}", datasetId);
        DataSetMiddleDO dataset = dataSetMiddleService.getOne(datasetId);
        if (dataset == null) {
            log.error("未找到数据集信息数据集ID: {}", datasetId);
            throw new RuntimeException("数据集信息不存在");
        }
        log.debug("数据集信息查询成功。数据集名称: {}", dataset.getDatasetName());
        // Load the dataset questions
        log.debug("正在查询数据集问题列表数据集ID: {}", dataset.getId());
        List<PlatformDatasetQuestionRespVO> datasetQuestionList = platformDatasetQuestionService.getDatasetQuestionList(dataset.getId());
        log.debug("数据集问题列表查询成功。问题数量: {}", datasetQuestionList.size());
        // Export the questions as a JSON file and upload it
        log.debug("正在生成 JSON 文件并获取文件 URL...");
        String fileUrl = dataSetService.JsonFileWriteFineMiddle(serverNameDO.getHost(), dataset, datasetQuestionList);
        // The export reports failure with an empty string; do not submit a job without data
        if (fileUrl == null || fileUrl.isEmpty()) {
            log.error("数据集文件生成或上传失败。任务ID: {}", fineTuningTask.getId());
            throw new RuntimeException("数据集文件生成或上传失败");
        }
        req.setDataset(fileUrl);
        log.info("JSON 文件生成成功。文件 URL: {}", fileUrl);
        // Bump the deploy count (treated as 0 when unset)
        int newDeployCount = Optional.ofNullable(fineTuningTask.getDeployCount())
                .orElse(0) + 1;
        fineTuningTask.setDeployCount(newDeployCount);
        // Suffix = <active profile>-<task id>-<deploy count>
        req.setSuffix(active + "-" + fineTuningTask.getId() + "-" + newDeployCount);
        log.info("请求参数设置完成。后缀: {}", req.getSuffix());
        // Call the model service; model type "1" uses finetuningCreate, any other type
        // uses finetuningCreateModal
        log.info("正在调用模型服务创建微调任务...");
        AigcFineTuningCreateRespVO resp;
        String modelType = baseModelDO.getModelType();
        if ("1".equals(modelType)) {
            resp = fineTuningTaskHttpService.finetuningCreate(new HashMap<>(), serverNameDO.getHost(), req);
        } else {
            resp = fineTuningTaskHttpService.finetuningCreateModal(new HashMap<>(), serverNameDO.getHost(), req);
        }
        // Build the partial update for the task row
        FineTuningTaskDO updateObj = new FineTuningTaskDO();
        updateObj.setId(fineTuningTask.getId());
        updateObj.setDeployCount(newDeployCount);
        if (resp != null && resp.getId() != 0) {
            updateObj.setJobId(resp.getJobId());
            updateObj.setStatus(FinetuningTaskStatusEnum.WAITING.getStatus());
            updateObj.setJobModelName(resp.getFineTunedModel());
            updateObj.setTrainLog(resp.getTrainLog());
            updateObj.setMergeLogPath(resp.getMergeLogPath());
            log.info("微调任务创建成功。任务ID: {}, 任务模型名称: {} , 任务状态: {}", fineTuningTask.getId(), resp.getFineTunedModel(), FinetuningTaskStatusEnum.WAITING.getStatus());
        } else {
            updateObj.setStatus(FinetuningTaskStatusEnum.FAILED.getStatus());
            log.error("微调任务创建失败。任务ID: {}", fineTuningTask.getId());
        }
        // Persist the update
        log.debug("正在更新数据库中的任务状态...");
        fineTuningTaskMapper.updateById(updateObj);
        log.info("数据库更新完成。任务ID: {}", fineTuningTask.getId());
    } catch (Exception e) {
        log.error("创建微调任务时发生异常。任务ID: {}", fineTuningTask.getId(), e);
        throw e;
    }
}
@Async
public void createTuningMiddleData (FineTuningTaskDO fineTuningTask) {
// 记录开始创建任务的日志
log.info("异步创建。 开始创建微调任务,请求参数: {}", fineTuningTask);
try {
log.info("开始创建微调任务任务ID: {}", fineTuningTask.getId());
@ -113,7 +227,7 @@ public class AsyncFineTuningTaskService {
// 设置部署次数
int newDeployCount = Optional.ofNullable(fineTuningTask.getDeployCount())
.orElse(0) + 1;
.orElse(0) + 1;
fineTuningTask.setDeployCount(newDeployCount);
// 设置后缀
@ -125,7 +239,7 @@ public class AsyncFineTuningTaskService {
AigcFineTuningCreateRespVO resp=null;
String modelType = baseModelDO.getModelType();
if("1".equals(modelType)){
resp = fineTuningTaskHttpService.finetuningCreate(new HashMap<>(), serverNameDO.getHost(), req);
resp = fineTuningTaskHttpService.finetuningCreate(new HashMap<>(), serverNameDO.getHost(), req);
}else{
resp = fineTuningTaskHttpService.finetuningCreateModal(new HashMap<>(), serverNameDO.getHost(), req);
}

View File

@ -96,7 +96,8 @@ public class FineTuningTaskServiceImpl implements FineTuningTaskService {
// 异步调用模型服务创建调优任务
log.debug("正在异步调用模型服务,创建微调任务...");
asyncFineTuningTaskService.createTuning(fineTuningTask);
// asyncFineTuningTaskService.createTuning(fineTuningTask);
asyncFineTuningTaskService.createTuningMiddleData(fineTuningTask);
log.info("已成功发起异步微调任务创建。任务ID: {}", fineTuningTask.getId());
// 返回任务ID

View File

@ -0,0 +1,27 @@
package cn.iocoder.module.mdpf.enums;
import cn.hutool.core.util.ObjUtil;
import lombok.AllArgsConstructor;
import lombok.Getter;
import java.util.Arrays;
/**
 * Status values of a middle-platform dataset, each pairing a display name with an
 * integer status code (0 = "未标注", 1 = "进行中", 2 = "已完成").
 */
@Getter
@AllArgsConstructor
public enum DatasetStatusMiddleEnum {

    NOPENDING("未标注", 0),
    RUNNING("进行中", 1),
    SUCCESS("已完成", 2);

    /** Display name of the status. */
    private final String name;
    /** Integer code of the status. */
    private final Integer status;

    /** Status codes of all constants, in declaration order. */
    public static final int[] ARRAYS = Arrays.stream(values()).mapToInt(DatasetStatusMiddleEnum::getStatus).toArray();

    /**
     * Looks up the display name for a status code.
     * Despite its name, this method maps status to name, not the other way round.
     *
     * @param status status code to look up
     * @return the display name of the first constant with that code, or {@code null} when none matches
     */
    public static String getStatusByName(Integer status) {
        return Arrays.stream(values())
                .filter(candidate -> ObjUtil.equal(candidate.getStatus(), status))
                .map(DatasetStatusMiddleEnum::getName)
                .findFirst()
                .orElse(null);
    }
}

View File

@ -0,0 +1,129 @@
package cn.iocoder.module.mdpf.enums;
import cn.iocoder.yudao.framework.common.exception.ErrorCode;
// Error code constants of the module: each constant pairs an integer code with a user-facing message.
// NOTE(review): several constants below share the same integer code, so they cannot be told apart
// by code alone; the affected entries are marked individually. Renumbering changes the values seen
// by API clients, so coordinate before fixing.
// NOTE(review): literals such as 10014_1 use '_' only as a digit separator, i.e. 10014_1 == 100141.
public interface ErrorCodeConstants {
ErrorCode KNOWLEDGE_BASE_NOT_EXISTS = new ErrorCode(10001, "知识库不存在");
ErrorCode DATASET_NOT_EXISTS = new ErrorCode(10002, "数据集不存在");
ErrorCode MODEL_SERVICE_NOT_EXISTS = new ErrorCode(10003, "模型服务不存在");
ErrorCode LABEL_NOT_EXISTS = new ErrorCode(10004,"标签不存在");
ErrorCode FINE_TUNING_TASK_NOT_EXISTS = new ErrorCode(10005, "微调任务不存在");
ErrorCode APPLICATION_NOT_EXISTS = new ErrorCode(10006, "应用名称服务不存在");
ErrorCode DATA_REFLUX_DATA_NOT_EXISTS = new ErrorCode(10007, "数据回流 —— 数据不存在");
ErrorCode DATA_REFLUX_CONFIG_NOT_EXISTS = new ErrorCode(10008, "数据回流不存在");
ErrorCode MODEL_ASSESS_TASK_MANUAL_NOT_EXISTS = new ErrorCode(10009, "人工评估不存在");
ErrorCode MODEL_ASSESS_DIMENSION_NOT_EXISTS = new ErrorCode(10010, "评估维度不存在");
ErrorCode MODEL_ASSESS_TASK_DIMENSION_NOT_EXISTS = new ErrorCode(10011, "人工评估维度不存在");
// NOTE(review): the next three constants carry the identical message although two of them are
// named *_STOPLIST_* — looks copy-pasted; confirm the intended wording.
ErrorCode MODEL_ASSESS_TASK_AUTO_NOT_EXISTS = new ErrorCode(10012, "自动评估维度不存在");
ErrorCode MODEL_ASSESS_TASK_STOPLIST_NOT_EXISTS = new ErrorCode(10013, "自动评估维度不存在");
ErrorCode MODEL_ASSESS_STOPLIST_NOT_EXISTS = new ErrorCode(10014, "自动评估维度不存在");
// Code value is 100141 (underscore is a digit separator).
ErrorCode THE_AUTO_EVALUATE_DIMENSION_IS_IN_USE = new ErrorCode(10014_1, "自动评估维度正在使用中不可删除");
ErrorCode LEARNING_RESOURCES_NOT_EXISTS = new ErrorCode(10015, "学习资源不存在");
ErrorCode LEARNING_RESOURCES_FILE_URL_NOT_NULL = new ErrorCode(10016, "文件地址不能为空");
// Code value is 100161.
ErrorCode VIDEO_COVER_IMAGE_EMPTY = new ErrorCode(10016_1, "视频封面图不能为空");
// Disabled earlier definitions; the active constants with the same names are declared
// further down with codes 10023-10025.
/*
ErrorCode DATASET_FILES_NOT_EXISTS = new ErrorCode(10016, "数据集文件资源不存在");
ErrorCode DATASET_QUESTION_NOT_EXISTS = new ErrorCode(10017, "数据集标准问题不存在");
ErrorCode DATASET_ANSWER_NOT_EXISTS = new ErrorCode(10018, "数据集标准问题答案不存在");
*/
// Prompt template codes; 100_1000 == 1001000, 101_1000 == 1011000.
ErrorCode PROMPT_TEMPLATES_NOT_EXISTS = new ErrorCode(100_1000, "模板信息不存在");
ErrorCode PROMPT_TEMPLATES_EXISTS = new ErrorCode(100_1001, "模板信息已存在");
ErrorCode PROMPT_TEMPLATESBACKUP_EXISTS = new ErrorCode(100_1002, "模板信息已备份");
ErrorCode PROMPT_TEMPLATES_BACKUP_NOT_EXISTS = new ErrorCode(101_1000, "Prompt模板备份不存在");
// NOTE(review): the next two constants share code 10017.
ErrorCode PROMPT_TEMPLATES_APPLICATIONS_NOT_EXISTS = new ErrorCode(10017, "模板信息不存在");
ErrorCode PROMPT_TEMPLATES_TAGS_NOT_EXISTS = new ErrorCode(10017, "模板信息不存在");
ErrorCode FINE_TUNING_LOSS_NOT_EXISTS = new ErrorCode(10018, "损失记录不存在");
ErrorCode FINE_TUNING_NOT_EXISTS = new ErrorCode(10019, "模型微调不存在");
ErrorCode DATA_PROCESS_TASK_NOT_EXISTS = new ErrorCode(10020, "数据处理任务不存在");
ErrorCode CONVERSATION_NOT_EXISTS = new ErrorCode(10021, "大模型对话记录不存在");
ErrorCode BASE_MODEL_NOT_EXISTS = new ErrorCode(10022, "基座模型不存在");
ErrorCode DATASET_ANSWER_NOT_EXISTS = new ErrorCode(10023, "数据集数据问题标注内容不存在");
ErrorCode DATASET_FILES_NOT_EXISTS = new ErrorCode(10024, "数据集数据文件不存在");
ErrorCode DATASET_QUESTION_NOT_EXISTS = new ErrorCode(10025, "数据集数据问题不存在");
ErrorCode KNOWLEDGE_DOCUMENTS_NOT_EXISTS = new ErrorCode(10026, "知识库文档不存在");
ErrorCode KNOWLEDGE_DOCUMENTS_CHUNKS_NOT_EXISTS = new ErrorCode(10027, "知识库文档块不存在");
ErrorCode KNOWLEDGE_DOCUMENTS_CHUNKS_VECTORIZED_NOT_EXISTS = new ErrorCode(10028, "向量化存储不存在");
ErrorCode TRAINING_NOT_EXISTS = new ErrorCode(10029, "训练不存在");
ErrorCode MODEL_COMPLETIONS_ERROR = new ErrorCode(10030, "模型推理失败");
ErrorCode MANUAL_MODEL_ANSWER_NOT_EXISTS = new ErrorCode(10031, "模型评估人工评估信息不存在");
ErrorCode MANUAL_MODEL_ANNO_NOT_EXISTS = new ErrorCode(10032, "模型评估人工评估标注信息不存在");
ErrorCode MODEL_ASSESS_TASK_MANUAL_BACKUP_NOT_EXISTS = new ErrorCode(10033, "人工评估备份不存在");
// NOTE(review): same code (100_1002) as PROMPT_TEMPLATESBACKUP_EXISTS above.
ErrorCode MODEL_ASSESS_TASK_MANUAL_BACKUP_EXISTS = new ErrorCode(100_1002, "人工评估已备份");
// NOTE(review): code 10034 is also used by PARSE_CSV_ERROR below.
ErrorCode DATASET_NAME_EXISTS = new ErrorCode(10034, "数据集名称重复");
ErrorCode MODEL_ASSESS_TASK_MANUAL_NAME_EXISTS = new ErrorCode(10035, "模型评估任务人工评估名称重复");
ErrorCode PROMPT_TEMPLATES_APPLICATIONS_BACKUP_NOT_EXISTS = new ErrorCode(10036, "模板信息不存在");
ErrorCode PROMPT_TEMPLATES_TAGS_BACKUP_NOT_EXISTS = new ErrorCode(10037, "模板信息不存在");
// NOTE(review): the *_NAME_NOT_EXISTS constants below are named "not exists" but their
// messages say the name already exists; the names are kept because callers reference them.
ErrorCode DATA_PROCESS_TASK_NAME_NOT_EXISTS = new ErrorCode(10038, "数据处理任务名称已存在");
ErrorCode FINE_TUNING_TASK_NAME_NOT_EXISTS = new ErrorCode(10039, "模型调优任务名称已存在");
// NOTE(review): the next two constants share code 10040.
ErrorCode LEARNING_RESOURCES_NAME_NOT_EXISTS = new ErrorCode(10040, "学习资源标题名称已存在");
ErrorCode KNOWLEDGE_BASE_NAME_NOT_EXISTS = new ErrorCode(10040, "知识库名称已存在");
// Chunking validation errors; code values are 100401, 100402 and 100403.
ErrorCode CHUNK_SIZE_MUST_BE_GREATER_THAN_ZERO = new ErrorCode(10040_1, "分块大小必须大于 0");
ErrorCode CHUNK_OVERLAP_MUST_BE_GREATER_THAN_OR_EQUAL_TO_ZERO = new ErrorCode(10040_2, "分块重叠必须大于或等于 0");
ErrorCode CHUNK_OVERLAP_MUST_BE_LESS_THAN_CHUNK_SIZE = new ErrorCode(10040_3, "分块重叠必须小于分块大小");
ErrorCode APPLICATION_NAME_NOT_EXISTS = new ErrorCode(10041, "应用中心名称已存在");
// NOTE(review): identifier looks like a typo of MODEL_SERVICE_NAME_*; code 10042 is unused.
ErrorCode MODEL_SERVIC_ENAME_NOT_EXISTS = new ErrorCode(10043, "模型名称已存在");
ErrorCode OPTIMIZE_PROMPT_NOT_EXISTS = new ErrorCode(10044, "优化后信息不存在");
ErrorCode LABEL_NAME_EXISTS = new ErrorCode(10045, "标签名称重复");
// NOTE(review): code 10034 duplicates DATASET_NAME_EXISTS above.
ErrorCode PARSE_CSV_ERROR = new ErrorCode(10034, "请正确上传csv格式得数据");
ErrorCode QUESTION_NOT_EXISTS = new ErrorCode(10046, "数据集信息不完整,无法进行评估");
ErrorCode BASE_MODEL_NAME_EXISTS = new ErrorCode(10047, "基座模型名称重复");
ErrorCode SERVER_NAME_NOT_EXISTS = new ErrorCode(10048, "服务器主机名称不存在");
ErrorCode SERVER_NAME_URL_ERROR = new ErrorCode(10049, "主机地址URL格式不正确");
// Validation errors for the middle-platform dataset endpoints: missing file / missing dataset id.
ErrorCode SET_FILE_MIDDLE_NOT_EXISTS = new ErrorCode(10050, "文件不能为空");
ErrorCode SET_MIDDLE_NOT_EXISTS = new ErrorCode(10051, "数据集ID不能为空");
}

View File

@ -54,5 +54,37 @@
<groupId>cn.iocoder.boot</groupId>
<artifactId>yudao-spring-boot-starter-excel</artifactId>
</dependency>
<!-- 添加mongodb相关-->
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-data-mongodb</artifactId>
</dependency>
<dependency>
<groupId>com.github.houbb</groupId>
<artifactId>opencc4j</artifactId>
<version>1.8.1</version>
</dependency>
<dependency>
<groupId>com.github.houbb</groupId>
<artifactId>sensitive-word</artifactId>
<version>0.24.0</version>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>org.apache.opennlp</groupId>
<artifactId>opennlp-tools</artifactId>
<version>1.9.3</version>
</dependency>
<dependency>
<groupId>com.hankcs</groupId>
<artifactId>hanlp</artifactId>
<version>portable-1.3.4</version>
</dependency>
<dependency>
<groupId>cn.iocoder.boot</groupId>
<artifactId>yudao-module-infra-biz</artifactId>
<version>2.3.0-jdk8-SNAPSHOT</version>
</dependency>
</dependencies>
</project>

View File

@ -0,0 +1,98 @@
package cn.iocoder.yudao.module.mdpf.controller.dataset;
import cn.iocoder.yudao.module.mdpf.controller.dataset.vo.DataSetFileMiddlePageReqVO;
import cn.iocoder.yudao.module.mdpf.controller.dataset.vo.DataSetFileMiddleRespVO;
import cn.iocoder.yudao.module.mdpf.controller.dataset.vo.DataSetFileMiddleSaveReqVO;
import cn.iocoder.yudao.module.mdpf.dal.dataobject.dataset.DataSetFileMiddleDO;
import cn.iocoder.yudao.module.mdpf.service.dataset.DataSetFileMiddleService;
import org.springframework.web.bind.annotation.*;
import org.springframework.validation.annotation.Validated;
import org.springframework.security.access.prepost.PreAuthorize;
import io.swagger.v3.oas.annotations.tags.Tag;
import io.swagger.v3.oas.annotations.Parameter;
import io.swagger.v3.oas.annotations.Operation;
import java.util.*;
import java.io.IOException;
import cn.iocoder.yudao.framework.common.pojo.PageParam;
import cn.iocoder.yudao.framework.common.pojo.PageResult;
import cn.iocoder.yudao.framework.common.pojo.CommonResult;
import cn.iocoder.yudao.framework.common.util.object.BeanUtils;
import static cn.iocoder.yudao.framework.common.pojo.CommonResult.success;
import cn.iocoder.yudao.framework.excel.core.util.ExcelUtils;
import cn.iocoder.yudao.framework.apilog.core.annotation.ApiAccessLog;
import javax.annotation.Resource;
import javax.servlet.http.HttpServletResponse;
import javax.validation.Valid;
import static cn.iocoder.yudao.framework.apilog.core.enums.OperateTypeEnum.*;
@Tag(name = "管理后台 - 数据集对应的文件地址")
@RestController
@RequestMapping("/data/set-file-middle")
@Validated
public class DataSetFileMiddleController {
@Resource
private DataSetFileMiddleService setFileMiddleService;
@PostMapping("/create")
@Operation(summary = "创建数据集对应的文件地址")
@PreAuthorize("@ss.hasPermission('data:set-file-middle:create')")
public CommonResult<Long> createSetFileMiddle(@Valid @RequestBody DataSetFileMiddleSaveReqVO createReqVO) {
return success(setFileMiddleService.createSetFileMiddle(createReqVO));
}
@PutMapping("/update")
@Operation(summary = "更新数据集对应的文件地址")
@PreAuthorize("@ss.hasPermission('data:set-file-middle:update')")
public CommonResult<Boolean> updateSetFileMiddle(@Valid @RequestBody DataSetFileMiddleSaveReqVO updateReqVO) {
setFileMiddleService.updateSetFileMiddle(updateReqVO);
return success(true);
}
@DeleteMapping("/delete")
@Operation(summary = "删除数据集对应的文件地址")
@Parameter(name = "id", description = "编号", required = true)
@PreAuthorize("@ss.hasPermission('data:set-file-middle:delete')")
public CommonResult<Boolean> deleteSetFileMiddle(@RequestParam("id") Long id) {
setFileMiddleService.deleteSetFileMiddle(id);
return success(true);
}
@GetMapping("/get")
@Operation(summary = "获得数据集对应的文件地址")
@Parameter(name = "id", description = "编号", required = true, example = "1024")
@PreAuthorize("@ss.hasPermission('data:set-file-middle:query')")
public CommonResult<DataSetFileMiddleRespVO> getSetFileMiddle(@RequestParam("id") Long id) {
DataSetFileMiddleDO setFileMiddle = setFileMiddleService.getSetFileMiddle(id);
return success(BeanUtils.toBean(setFileMiddle, DataSetFileMiddleRespVO.class));
}
@GetMapping("/page")
@Operation(summary = "获得数据集对应的文件地址分页")
@PreAuthorize("@ss.hasPermission('data:set-file-middle:query')")
public CommonResult<PageResult<DataSetFileMiddleRespVO>> getSetFileMiddlePage(@Valid DataSetFileMiddlePageReqVO pageReqVO) {
PageResult<DataSetFileMiddleDO> pageResult = setFileMiddleService.getSetFileMiddlePage(pageReqVO);
return success(BeanUtils.toBean(pageResult, DataSetFileMiddleRespVO.class));
}
@GetMapping("/export-excel")
@Operation(summary = "导出数据集对应的文件地址 Excel")
@PreAuthorize("@ss.hasPermission('data:set-file-middle:export')")
@ApiAccessLog(operateType = EXPORT)
public void exportSetFileMiddleExcel(@Valid DataSetFileMiddlePageReqVO pageReqVO,
HttpServletResponse response) throws IOException {
pageReqVO.setPageSize(PageParam.PAGE_SIZE_NONE);
List<DataSetFileMiddleDO> list = setFileMiddleService.getSetFileMiddlePage(pageReqVO).getList();
// 导出 Excel
ExcelUtils.write(response, "数据集对应的文件地址.xls", "数据", DataSetFileMiddleRespVO.class,
BeanUtils.toBean(list, DataSetFileMiddleRespVO.class));
}
}

View File

@ -0,0 +1,121 @@
package cn.iocoder.yudao.module.mdpf.controller.dataset;
import cn.iocoder.yudao.module.mdpf.controller.dataset.vo.*;
import cn.iocoder.yudao.module.mdpf.dal.dataobject.dataset.DataSetMiddleDO;
import cn.iocoder.yudao.module.mdpf.service.dataset.DataSetMiddleService;
import org.springframework.web.bind.annotation.*;
import org.springframework.validation.annotation.Validated;
import org.springframework.security.access.prepost.PreAuthorize;
import io.swagger.v3.oas.annotations.tags.Tag;
import io.swagger.v3.oas.annotations.Parameter;
import io.swagger.v3.oas.annotations.Operation;
import java.util.*;
import java.io.IOException;
import cn.iocoder.yudao.framework.common.pojo.PageParam;
import cn.iocoder.yudao.framework.common.pojo.PageResult;
import cn.iocoder.yudao.framework.common.pojo.CommonResult;
import cn.iocoder.yudao.framework.common.util.object.BeanUtils;
import static cn.iocoder.yudao.framework.common.pojo.CommonResult.success;
import cn.iocoder.yudao.framework.excel.core.util.ExcelUtils;
import cn.iocoder.yudao.framework.apilog.core.annotation.ApiAccessLog;
import static cn.iocoder.yudao.framework.apilog.core.enums.OperateTypeEnum.*;
import javax.annotation.Resource;
import javax.annotation.security.PermitAll;
import javax.validation.Valid;
/**
 * REST controller for middle-platform datasets under {@code /data/data-set-middle}:
 * create, update, delete, two single-record lookups, paging and a plain list query.
 * All work is delegated to {@link DataSetMiddleService}.
 *
 * <p>NOTE(review): every {@code @PreAuthorize} on this controller is commented out, so no
 * per-endpoint permission expression is applied here — confirm this is intentional.
 */
@Tag(name = "管理后台 - 中台中的数据集")
@RestController
@RequestMapping("/data/data-set-middle")
@Validated
public class DataSetMiddleController {

    @Resource
    private DataSetMiddleService setMiddleService;

    /** Creates a dataset from the request body and returns the id produced by the service. */
    @PostMapping("/create")
    @Operation(summary = "创建中台中的数据集")
    // @PreAuthorize("@ss.hasPermission('data:set-middle:create')")
    public CommonResult<Long> createSetMiddle(@Valid @RequestBody DataSetMiddleSaveReqVO createReqVO) {
        Long createdId = setMiddleService.createSetMiddle(createReqVO);
        return success(createdId);
    }

    /** Updates a dataset from the request body; always answers {@code true} on normal return. */
    @PutMapping("/update")
    @Operation(summary = "更新中台中的数据集")
    // @PreAuthorize("@ss.hasPermission('data:set-middle:update')")
    public CommonResult<Boolean> updateSetMiddle(@Valid @RequestBody DataSetMiddleSaveReqVO updateReqVO) {
        setMiddleService.updateSetMiddle(updateReqVO);
        return success(Boolean.TRUE);
    }

    /** Deletes the dataset with the given id; always answers {@code true} on normal return. */
    @DeleteMapping("/delete")
    @Operation(summary = "删除中台中的数据集")
    @Parameter(name = "id", description = "编号", required = true)
    // @PreAuthorize("@ss.hasPermission('data:set-middle:delete')")
    public CommonResult<Boolean> deleteSetMiddle(@RequestParam("id") Long id) {
        setMiddleService.deleteSetMiddle(id);
        return success(Boolean.TRUE);
    }

    /** Fetches the dataset entity by id and returns it converted to its response VO. */
    @GetMapping("/get")
    @Operation(summary = "获得中台中的数据集")
    @Parameter(name = "id", description = "编号", required = true, example = "1024")
    // @PreAuthorize("@ss.hasPermission('data:set-middle:query')")
    public CommonResult<DataSetMiddleRespVO> getSetMiddle(@RequestParam("id") Long id) {
        DataSetMiddleDO found = setMiddleService.getSetMiddle(id);
        DataSetMiddleRespVO body = BeanUtils.toBean(found, DataSetMiddleRespVO.class);
        return success(body);
    }

    /** Returns the response VO produced by the service's {@code getOneInfo}, copied into a fresh VO. */
    @GetMapping("/getOneInfo")
    @Operation(summary = "获得中台中的数据集")
    @Parameter(name = "id", description = "编号", required = true, example = "1024")
    // @PreAuthorize("@ss.hasPermission('data:set-middle:query')")
    public CommonResult<DataSetMiddleRespVO> getOneInfo(@RequestParam("id") Long id) {
        DataSetMiddleRespVO detail = setMiddleService.getOneInfo(id);
        DataSetMiddleRespVO body = BeanUtils.toBean(detail, DataSetMiddleRespVO.class);
        return success(body);
    }

    /** Returns one page of datasets matching the query, converted to response VOs. */
    @GetMapping("/page")
    @Operation(summary = "获得中台中的数据集分页")
    // @PreAuthorize("@ss.hasPermission('data:set-middle:query')")
    public CommonResult<PageResult<DataSetMiddleRespVO>> getSetMiddlePage(@Valid DataSetMiddlePageReqVO pageReqVO) {
        PageResult<DataSetMiddleDO> doPage = setMiddleService.getSetMiddlePage(pageReqVO);
        PageResult<DataSetMiddleRespVO> voPage = BeanUtils.toBean(doPage, DataSetMiddleRespVO.class);
        return success(voPage);
    }

    /** Lists dataset entities for the parent type carried in the query object (entities are returned as-is). */
    @GetMapping("/getAllList")
    @Operation(summary = "获得中台中的数据集")
    public CommonResult<List<DataSetMiddleDO>> getDataSetMiddleList(DataSetMiddlePageReqVO pageReqVO){
        List<DataSetMiddleDO> matches = setMiddleService.getDataSetMiddleList(pageReqVO.getDatasetParentType());
        return success(matches);
    }

    // Disabled endpoints, kept for reference:
    // @GetMapping("getdataseturl")
    // @PermitAll
    // public CommonResult<String> getDataSetUrl(@RequestParam("datasetid") Long id,@RequestParam("hostUrl") String hostUrl){
    //
    // String dataSetUrl = setMiddleService.getDataSetUrl(id,hostUrl);
    // return success(dataSetUrl);
    // }
    // @GetMapping("/export-excel")
    // @Operation(summary = "导出中台中的数据集 Excel")
    // @PreAuthorize("@ss.hasPermission('data:set-middle:export')")
    // @ApiAccessLog(operateType = EXPORT)
    // public void exportSetMiddleExcel(@Valid DataSetMiddlePageReqVO pageReqVO,
    // HttpServletResponse response) throws IOException {
    // pageReqVO.setPageSize(PageParam.PAGE_SIZE_NONE);
    // List<DataSetMiddleDO> list = setMiddleService.getSetMiddlePage(pageReqVO).getList();
    // // export Excel
    // ExcelUtils.write(response, "中台中的数据集.xls", "数据", DataSetMiddleRespVO.class,
    // BeanUtils.toBean(list, DataSetMiddleRespVO.class));
    // }
}

View File

@ -0,0 +1,92 @@
package cn.iocoder.yudao.module.mdpf.controller.dataset;
import cn.iocoder.yudao.framework.apilog.core.annotation.ApiAccessLog;
import cn.iocoder.yudao.framework.common.pojo.CommonResult;
import cn.iocoder.yudao.framework.common.pojo.PageParam;
import cn.iocoder.yudao.framework.common.pojo.PageResult;
import cn.iocoder.yudao.framework.common.util.object.BeanUtils;
import cn.iocoder.yudao.framework.excel.core.util.ExcelUtils;
import cn.iocoder.yudao.module.mdpf.controller.dataset.vo.PlatformDatasetAnswerPageReqVO;
import cn.iocoder.yudao.module.mdpf.controller.dataset.vo.PlatformDatasetAnswerRespVO;
import cn.iocoder.yudao.module.mdpf.controller.dataset.vo.PlatformDatasetAnswerSaveReqVO;
import cn.iocoder.yudao.module.mdpf.dal.dataobject.dataset.PlatformDatasetAnswerDO;
import cn.iocoder.yudao.module.mdpf.service.dataset.PlatformDatasetAnswerService;
import io.swagger.v3.oas.annotations.Operation;
import io.swagger.v3.oas.annotations.Parameter;
import org.springframework.security.access.prepost.PreAuthorize;
import org.springframework.web.bind.annotation.*;
import javax.annotation.Resource;
import javax.servlet.http.HttpServletResponse;
import javax.validation.Valid;
import java.io.IOException;
import java.util.List;
import static cn.iocoder.yudao.framework.apilog.core.enums.OperateTypeEnum.EXPORT;
import static cn.iocoder.yudao.framework.common.pojo.CommonResult.success;
/**
 * Handler methods for dataset question-annotation ("answer") records: create, update,
 * delete, get, page and Excel export, all delegating to {@link PlatformDatasetAnswerService}.
 *
 * <p>The class-level {@code @Tag}/{@code @RestController}/{@code @RequestMapping}/{@code @Validated}
 * annotations are commented out below, so as written no controller stereotype is active on
 * this class.
 */
/*@Tag(name = "管理后台 - 数据集数据问题标注内容")
@RestController
@RequestMapping("/llm/dataset-answer")
@Validated*/
public class PlatformDatasetAnswerController {

    @Resource
    private PlatformDatasetAnswerService platformDatasetAnswerService;

    /** Creates an answer record from the request body and returns the id produced by the service. */
    @PostMapping("/create")
    @Operation(summary = "创建数据集数据问题标注内容")
    @PreAuthorize("@ss.hasPermission('llm:dataset-answer:create')")
    public CommonResult<Long> createDatasetAnswer(@Valid @RequestBody PlatformDatasetAnswerSaveReqVO createReqVO) {
        Long createdId = platformDatasetAnswerService.createDatasetAnswer(createReqVO);
        return success(createdId);
    }

    /** Updates an answer record from the request body; always answers {@code true} on normal return. */
    @PutMapping("/update")
    @Operation(summary = "更新数据集数据问题标注内容")
    @PreAuthorize("@ss.hasPermission('llm:dataset-answer:update')")
    public CommonResult<Boolean> updateDatasetAnswer(@Valid @RequestBody PlatformDatasetAnswerSaveReqVO updateReqVO) {
        platformDatasetAnswerService.updateDatasetAnswer(updateReqVO);
        return success(Boolean.TRUE);
    }

    /** Deletes the answer record with the given id; always answers {@code true} on normal return. */
    @DeleteMapping("/delete")
    @Operation(summary = "删除数据集数据问题标注内容")
    @Parameter(name = "id", description = "编号", required = true)
    @PreAuthorize("@ss.hasPermission('llm:dataset-answer:delete')")
    public CommonResult<Boolean> deleteDatasetAnswer(@RequestParam("id") Long id) {
        platformDatasetAnswerService.deleteDatasetAnswer(id);
        return success(Boolean.TRUE);
    }

    /** Fetches one answer record by id and returns it converted to its response VO. */
    @GetMapping("/get")
    @Operation(summary = "获得数据集数据问题标注内容")
    @Parameter(name = "id", description = "编号", required = true, example = "1024")
    @PreAuthorize("@ss.hasPermission('llm:dataset-answer:query')")
    public CommonResult<PlatformDatasetAnswerRespVO> getDatasetAnswer(@RequestParam("id") Long id) {
        PlatformDatasetAnswerDO found = platformDatasetAnswerService.getDatasetAnswer(id);
        PlatformDatasetAnswerRespVO body = BeanUtils.toBean(found, PlatformDatasetAnswerRespVO.class);
        return success(body);
    }

    /** Returns one page of answer records matching the query, converted to response VOs. */
    @GetMapping("/page")
    @Operation(summary = "获得数据集数据问题标注内容分页")
    @PreAuthorize("@ss.hasPermission('llm:dataset-answer:query')")
    public CommonResult<PageResult<PlatformDatasetAnswerRespVO>> getDatasetAnswerPage(@Valid PlatformDatasetAnswerPageReqVO pageReqVO) {
        PageResult<PlatformDatasetAnswerDO> doPage = platformDatasetAnswerService.getDatasetAnswerPage(pageReqVO);
        PageResult<PlatformDatasetAnswerRespVO> voPage = BeanUtils.toBean(doPage, PlatformDatasetAnswerRespVO.class);
        return success(voPage);
    }

    /**
     * Writes all answer records matching the query to the HTTP response as an Excel sheet.
     * The page size is overridden with {@code PageParam.PAGE_SIZE_NONE} before querying.
     *
     * @throws IOException declared for the Excel write
     */
    @GetMapping("/export-excel")
    @Operation(summary = "导出数据集数据问题标注内容 Excel")
    @PreAuthorize("@ss.hasPermission('llm:dataset-answer:export')")
    @ApiAccessLog(operateType = EXPORT)
    public void exportDatasetAnswerExcel(@Valid PlatformDatasetAnswerPageReqVO pageReqVO,
                                         HttpServletResponse response) throws IOException {
        pageReqVO.setPageSize(PageParam.PAGE_SIZE_NONE);
        PageResult<PlatformDatasetAnswerDO> everything = platformDatasetAnswerService.getDatasetAnswerPage(pageReqVO);
        List<PlatformDatasetAnswerDO> rows = everything.getList();
        // Convert to response VOs, then stream the sheet
        List<PlatformDatasetAnswerRespVO> sheetRows = BeanUtils.toBean(rows, PlatformDatasetAnswerRespVO.class);
        ExcelUtils.write(response, "数据集数据问题标注内容.xls", "数据", PlatformDatasetAnswerRespVO.class, sheetRows);
    }
}

View File

@ -0,0 +1,93 @@
package cn.iocoder.yudao.module.mdpf.controller.dataset;
import cn.iocoder.yudao.framework.apilog.core.annotation.ApiAccessLog;
import cn.iocoder.yudao.framework.common.pojo.CommonResult;
import cn.iocoder.yudao.framework.common.pojo.PageParam;
import cn.iocoder.yudao.framework.common.pojo.PageResult;
import cn.iocoder.yudao.framework.common.util.object.BeanUtils;
import cn.iocoder.yudao.framework.excel.core.util.ExcelUtils;
import cn.iocoder.yudao.module.mdpf.controller.dataset.vo.PlatformDatasetFilesPageReqVO;
import cn.iocoder.yudao.module.mdpf.controller.dataset.vo.PlatformDatasetFilesRespVO;
import cn.iocoder.yudao.module.mdpf.controller.dataset.vo.PlatformDatasetFilesSaveReqVO;
import cn.iocoder.yudao.module.mdpf.dal.dataobject.dataset.PlatformDatasetFilesDO;
import cn.iocoder.yudao.module.mdpf.service.dataset.PlatformDatasetFilesService;
import io.swagger.v3.oas.annotations.Operation;
import io.swagger.v3.oas.annotations.Parameter;
import org.springframework.security.access.prepost.PreAuthorize;
import org.springframework.web.bind.annotation.*;
import javax.annotation.Resource;
import javax.servlet.http.HttpServletResponse;
import javax.validation.Valid;
import java.io.IOException;
import java.util.List;
import static cn.iocoder.yudao.framework.apilog.core.enums.OperateTypeEnum.EXPORT;
import static cn.iocoder.yudao.framework.common.pojo.CommonResult.success;
/*
@Tag(name = "管理后台 - 数据集数据文件")
@RestController
@RequestMapping("/llm/dataset-files")
@Validated*/
/**
 * Create / update / delete / query / export endpoints for dataset data files.
 * Every handler delegates to {@link PlatformDatasetFilesService} and wraps the
 * outcome in a {@link CommonResult}.
 *
 * <p>NOTE(review): the class-level {@code @Tag}, {@code @RestController},
 * {@code @RequestMapping} and {@code @Validated} annotations are commented out
 * immediately above this class, so the class as written declares no controller
 * stereotype and no base path.
 */
public class PlatformDatasetFilesController {

    @Resource
    private PlatformDatasetFilesService platformDatasetFilesService;

    /**
     * Creates a dataset data file.
     *
     * @param createReqVO validated request body
     * @return success result carrying the value returned by the service
     */
    @PostMapping("/create")
    @Operation(summary = "创建数据集数据文件")
    @PreAuthorize("@ss.hasPermission('llm:dataset-files:create')")
    public CommonResult<Long> createDatasetFiles(@Valid @RequestBody PlatformDatasetFilesSaveReqVO createReqVO) {
        Long createdId = platformDatasetFilesService.createDatasetFiles(createReqVO);
        return success(createdId);
    }

    /**
     * Updates a dataset data file.
     *
     * @param updateReqVO validated request body
     * @return success result carrying {@code true}
     */
    @PutMapping("/update")
    @Operation(summary = "更新数据集数据文件")
    @PreAuthorize("@ss.hasPermission('llm:dataset-files:update')")
    public CommonResult<Boolean> updateDatasetFiles(@Valid @RequestBody PlatformDatasetFilesSaveReqVO updateReqVO) {
        platformDatasetFilesService.updateDatasetFiles(updateReqVO);
        return success(true);
    }

    /**
     * Deletes a dataset data file.
     *
     * @param id identifier passed to the service
     * @return success result carrying {@code true}
     */
    @DeleteMapping("/delete")
    @Operation(summary = "删除数据集数据文件")
    @Parameter(name = "id", description = "编号", required = true)
    @PreAuthorize("@ss.hasPermission('llm:dataset-files:delete')")
    public CommonResult<Boolean> deleteDatasetFiles(@RequestParam("id") Long id) {
        platformDatasetFilesService.deleteDatasetFiles(id);
        return success(true);
    }

    /**
     * Fetches one dataset data file and converts it to its response VO.
     *
     * @param id identifier passed to the service
     * @return success result carrying the converted response VO
     */
    @GetMapping("/get")
    @Operation(summary = "获得数据集数据文件")
    @Parameter(name = "id", description = "编号", required = true, example = "1024")
    @PreAuthorize("@ss.hasPermission('llm:dataset-files:query')")
    public CommonResult<PlatformDatasetFilesRespVO> getDatasetFiles(@RequestParam("id") Long id) {
        PlatformDatasetFilesDO found = platformDatasetFilesService.getDatasetFiles(id);
        PlatformDatasetFilesRespVO respVO = BeanUtils.toBean(found, PlatformDatasetFilesRespVO.class);
        return success(respVO);
    }

    /**
     * Fetches one page of dataset data files and converts it to response VOs.
     *
     * @param pageReqVO validated paging / query parameters
     * @return success result carrying the converted page
     */
    @GetMapping("/page")
    @Operation(summary = "获得数据集数据文件分页")
    @PreAuthorize("@ss.hasPermission('llm:dataset-files:query')")
    public CommonResult<PageResult<PlatformDatasetFilesRespVO>> getDatasetFilesPage(@Valid PlatformDatasetFilesPageReqVO pageReqVO) {
        PageResult<PlatformDatasetFilesDO> doPage = platformDatasetFilesService.getDatasetFilesPage(pageReqVO);
        return success(BeanUtils.toBean(doPage, PlatformDatasetFilesRespVO.class));
    }

    /**
     * Writes the rows matching the query to the HTTP response as an Excel file.
     *
     * @param pageReqVO validated query parameters; its page size is overwritten
     *                  with {@code PageParam.PAGE_SIZE_NONE} before querying
     * @param response  servlet response the file is written to
     * @throws IOException declared for the Excel write
     */
    @GetMapping("/export-excel")
    @Operation(summary = "导出数据集数据文件 Excel")
    @PreAuthorize("@ss.hasPermission('llm:dataset-files:export')")
    @ApiAccessLog(operateType = EXPORT)
    public void exportDatasetFilesExcel(@Valid PlatformDatasetFilesPageReqVO pageReqVO,
                                        HttpServletResponse response) throws IOException {
        pageReqVO.setPageSize(PageParam.PAGE_SIZE_NONE);
        List<PlatformDatasetFilesDO> rows = platformDatasetFilesService.getDatasetFilesPage(pageReqVO).getList();
        // Export to Excel
        List<PlatformDatasetFilesRespVO> exportRows = BeanUtils.toBean(rows, PlatformDatasetFilesRespVO.class);
        ExcelUtils.write(response, "数据集数据文件.xls", "数据", PlatformDatasetFilesRespVO.class, exportRows);
    }
}

View File

@ -0,0 +1,191 @@
package cn.iocoder.yudao.module.mdpf.controller.dataset;
import cn.iocoder.yudao.framework.apilog.core.annotation.ApiAccessLog;
import cn.iocoder.yudao.framework.common.pojo.CommonResult;
import cn.iocoder.yudao.framework.common.pojo.PageParam;
import cn.iocoder.yudao.framework.common.pojo.PageResult;
import cn.iocoder.yudao.module.mdpf.controller.dataset.vo.PlatformDatasetAnswerRespVO;
import cn.iocoder.yudao.module.mdpf.controller.dataset.vo.PlatformDatasetQuestionPageReqVO;
import cn.iocoder.yudao.module.mdpf.controller.dataset.vo.PlatformDatasetQuestionRespVO;
import cn.iocoder.yudao.module.mdpf.controller.dataset.vo.PlatformDatasetQuestionSaveReqVO;
import cn.iocoder.yudao.module.mdpf.service.dataset.PlatformDatasetAnswerService;
import cn.iocoder.yudao.module.mdpf.service.dataset.PlatformDatasetQuestionService;
import io.swagger.v3.oas.annotations.Operation;
import io.swagger.v3.oas.annotations.tags.Tag;
import org.apache.poi.hssf.usermodel.HSSFCellStyle;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.ss.usermodel.HorizontalAlignment;
import org.apache.poi.ss.usermodel.VerticalAlignment;
import org.apache.poi.ss.util.CellRangeAddress;
import org.springframework.validation.annotation.Validated;
import org.springframework.web.bind.annotation.*;
import javax.annotation.Resource;
import javax.servlet.http.HttpServletResponse;
import javax.validation.Valid;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.stream.Collectors;
import static cn.iocoder.yudao.framework.apilog.core.enums.OperateTypeEnum.EXPORT;
import static cn.iocoder.yudao.framework.common.pojo.CommonResult.success;
/**
 * Endpoints for dataset questions and their annotations (answers): saving
 * annotations, paging questions, and exporting question/answer rows to an
 * {@code .xls} workbook.
 *
 * <p>NOTE(review): the {@code @PreAuthorize} checks on every endpoint are
 * commented out, so no permission expression is declared on these handlers.
 * Confirm whether that is intentional before release.
 */
@Tag(name = "管理后台 - 数据集数据问题+标注")
@RestController
@RequestMapping("/platform/dataset-question")
@Validated
public class PlatformDatasetQuestionController {

    /** Column index of the "system" value in the exported sheet. */
    private static final int COL_SYSTEM = 0;
    /** Column index of the "question" value in the exported sheet. */
    private static final int COL_QUESTION = 1;

    @Resource
    private PlatformDatasetQuestionService platformDatasetQuestionService;
    @Resource
    private PlatformDatasetAnswerService platformDatasetAnswerService;

    /**
     * Saves annotation data for a batch of questions.
     *
     * @param updateReqVOS request body: the questions (with their answers) to save
     * @return success result carrying {@code true}
     */
    @PutMapping("data-anno")
    @Operation(summary = "保存标注接口")
//    @PreAuthorize("@ss.hasPermission('llm:dataset-question:anno')")
    public CommonResult<Boolean> updateDatasetQuestionDataAnno(@Valid @RequestBody List<PlatformDatasetQuestionSaveReqVO> updateReqVOS) {
        platformDatasetQuestionService.updateDatasetQuestionDataAnno(updateReqVOS);
        return success(true);
    }

    /**
     * Returns one page of dataset questions.
     *
     * @param pageReqVO validated paging / query parameters
     * @return success result carrying the page returned by the service
     */
    @GetMapping("/page")
    @Operation(summary = "获得数据集数据问题分页")
//    @PreAuthorize("@ss.hasPermission('llm:dataset-question:query')")
    public CommonResult<PageResult<PlatformDatasetQuestionRespVO>> getDatasetQuestionPage(@Valid PlatformDatasetQuestionPageReqVO pageReqVO) {
        PageResult<PlatformDatasetQuestionRespVO> pageResult = platformDatasetQuestionService.getDatasetQuestionPage(pageReqVO);
        return success(pageResult);
    }

    /**
     * Exports the questions matching the query as an {@code .xls} workbook with
     * the columns {@code system}, {@code question}, {@code answer}.
     *
     * <p>One row is written per answer. A question without any answer is written
     * as a single row with an empty answer cell. When a question spans several
     * rows, its system cells and its question cells are merged vertically, each
     * column on its own, so both values remain visible.
     *
     * @param pageReqVO validated query parameters; its page size is overwritten
     *                  with {@code PageParam.PAGE_SIZE_NONE} before querying
     * @param response  servlet response the workbook is written to
     * @throws IOException if writing the workbook to the response fails
     */
    @GetMapping("/export-excel")
    @Operation(summary = "导出数据集数据文件 Excel")
//    @PreAuthorize("@ss.hasPermission('llm:dataset-files:export')")
    @ApiAccessLog(operateType = EXPORT)
    public void exportDatasetFilesExcel(@Valid PlatformDatasetQuestionPageReqVO pageReqVO,
                                        HttpServletResponse response) throws IOException {
        // TODO(review): a guard allowing export only for datasets in the "completed"
        // state was sketched here earlier but is disabled; confirm whether it is needed.

        // try-with-resources closes the workbook even when building or writing fails.
        try (HSSFWorkbook workbook = new HSSFWorkbook()) {
            HSSFSheet sheet = workbook.createSheet();
            // Shared style: centered both vertically and horizontally.
            HSSFCellStyle centered = workbook.createCellStyle();
            centered.setVerticalAlignment(VerticalAlignment.CENTER);
            centered.setAlignment(HorizontalAlignment.CENTER);

            int nextRow = 0;
            writeRow(sheet, nextRow++, centered, "system", "question", "answer");

            pageReqVO.setPageSize(PageParam.PAGE_SIZE_NONE);
            List<PlatformDatasetQuestionRespVO> questions =
                    platformDatasetQuestionService.getDatasetQuestionPage(pageReqVO).getList();
            // NOTE: the legacy .xls format caps a sheet at 65536 rows; very large
            // datasets would make createRow fail.
            for (PlatformDatasetQuestionRespVO item : questions) {
                String system = item.getSystem();
                String question = item.getQuestion();
                List<PlatformDatasetAnswerRespVO> answers = item.getDatasetAnswerRespVO();

                int firstRow = nextRow;
                if (answers == null || answers.isEmpty()) {
                    // Keep unanswered questions in the export instead of dropping them.
                    writeRow(sheet, nextRow++, centered, system, question, "");
                } else {
                    for (PlatformDatasetAnswerRespVO answer : answers) {
                        writeRow(sheet, nextRow++, centered, system, question, answer.getAnswer());
                    }
                }

                int lastRow = nextRow - 1;
                if (lastRow > firstRow) {
                    // Merge the repeated cells per column; a single two-column region
                    // would keep only the top-left (system) value and hide the question.
                    sheet.addMergedRegion(new CellRangeAddress(firstRow, lastRow, COL_SYSTEM, COL_SYSTEM));
                    sheet.addMergedRegion(new CellRangeAddress(firstRow, lastRow, COL_QUESTION, COL_QUESTION));
                }
            }

            // Export to Excel. Failures propagate through the declared IOException
            // rather than being printed and swallowed.
            response.setCharacterEncoding("UTF-8");
            response.setContentType("application/vnd.ms-excel");
            workbook.write(response.getOutputStream());
            response.getOutputStream().close();
        }
    }

    /**
     * Creates row {@code rowIndex} and fills its first three cells with the given
     * values, applying {@code style} to each cell.
     */
    private static void writeRow(HSSFSheet sheet, int rowIndex, HSSFCellStyle style,
                                 String system, String question, String answer) {
        HSSFRow row = sheet.createRow(rowIndex);
        String[] values = {system, question, answer};
        for (int col = 0; col < values.length; col++) {
            row.createCell(col).setCellValue(values[col]);
            row.getCell(col).setCellStyle(style);
        }
    }
}

View File

@ -0,0 +1,12 @@
package cn.iocoder.yudao.module.mdpf.controller.dataset.dto;
import lombok.Data;
import java.util.List;
/**
 * Plain data holder grouping a system text, a question and a list of answers.
 *
 * <p>NOTE(review): the name suggests this is the JSON template for one dataset
 * entry; no usage is visible here, so confirm against the callers.
 */
@Data
public class PlatformDataJsonTemplate {
// System text of the entry.
private String system;
// Question text of the entry.
private String question;
// Answers belonging to the question.
private List<String> answers;
}

View File

@ -0,0 +1,21 @@
package cn.iocoder.yudao.module.mdpf.controller.dataset.vo;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
/**
 * Data holder describing a file: identifiers, name, type, size and counters.
 *
 * <p>NOTE(review): the field set looks like a file-upload response from an
 * external dataset/training service (compare {@code AigcDatasetFileRespV0} in
 * the llm module) — confirm against the HTTP client that populates it.
 * Field order defines the {@code @AllArgsConstructor} parameter order.
 */
@Data
@AllArgsConstructor
@NoArgsConstructor
public class AigcDatasetFileMiddleRespV0 {
// Creation timestamp, carried as a string (format not shown here).
private String createdAt;
// File identifier, carried as a string.
private String fileId;
private String fileType;
private String filename;
// Presumably the number of lines in the file — TODO confirm.
private Integer lineCount;
private String purpose;
// Presumably the object-storage (S3) URL of the file — TODO confirm.
private String s3Url;
// File size; unit not shown here.
private Integer size;
private Integer tenantId;
// Presumably the token count of the file content — TODO confirm.
private Integer tokenCount;
}

View File

@ -0,0 +1,15 @@
package cn.iocoder.yudao.module.mdpf.controller.dataset.vo;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
/**
 * Data holder for one instruction / input / output triple.
 *
 * <p>NOTE(review): appears to mirror {@code AigcDatasetVo} from the llm module,
 * i.e. one training sample — confirm against the code that serializes it.
 * Field order defines the {@code @AllArgsConstructor} parameter order.
 */
@Data
@AllArgsConstructor
@NoArgsConstructor
public class AigcDatasetMiddleVo {
private String instruction;
private String input;
private String output;
}

View File

@ -0,0 +1,34 @@
package cn.iocoder.yudao.module.mdpf.controller.dataset.vo;
import lombok.*;
import java.util.*;
import io.swagger.v3.oas.annotations.media.Schema;
import cn.iocoder.yudao.framework.common.pojo.PageParam;
import org.springframework.format.annotation.DateTimeFormat;
import java.time.LocalDateTime;
import static cn.iocoder.yudao.framework.common.util.date.DateUtils.FORMAT_YEAR_MONTH_DAY_HOUR_MINUTE_SECOND;
/**
 * Page request VO for dataset file addresses. Extends {@link PageParam} and
 * adds the query fields below (presumably applied as filters — confirm in the
 * mapper/service).
 */
@Schema(description = "管理后台 - 数据集对应的文件地址分页 Request VO")
@Data
@EqualsAndHashCode(callSuper = true)
@ToString(callSuper = true)
public class DataSetFileMiddlePageReqVO extends PageParam {
// ID of the dataset the file belongs to.
@Schema(description = "对应数据集的id", example = "8353")
private Long dataSetId;
// Upload address (URL) of the dataset file.
@Schema(description = "数据集文件对应的上传地址", example = "https://www.iocoder.cn")
private String dataSetFileUrl;
// Dataset file type: 0 = data file, 1 = image, 2 = video.
@Schema(description = "数据集文件类型0数据文件1图片2视频", example = "2")
private String dataSetFileType;
// Creation time, bound as an array (presumably [start, end] — confirm).
@Schema(description = "创建时间")
@DateTimeFormat(pattern = FORMAT_YEAR_MONTH_DAY_HOUR_MINUTE_SECOND)
private LocalDateTime[] createTime;
// Name of the dataset file.
@Schema(description = "数据集文件名称", example = "赵六")
private String datasetFileName;
}

View File

@ -0,0 +1,39 @@
package cn.iocoder.yudao.module.mdpf.controller.dataset.vo;
import io.swagger.v3.oas.annotations.media.Schema;
import lombok.*;
import java.util.*;
import org.springframework.format.annotation.DateTimeFormat;
import java.time.LocalDateTime;
import com.alibaba.excel.annotation.*;
/**
 * Response VO for a dataset file address. Every field carries an
 * {@code @ExcelProperty} header, and the class is marked
 * {@code @ExcelIgnoreUnannotated}.
 */
@Schema(description = "管理后台 - 数据集对应的文件地址 Response VO")
@Data
@ExcelIgnoreUnannotated
public class DataSetFileMiddleRespVO {
// Primary key.
@Schema(description = "主键ID", requiredMode = Schema.RequiredMode.REQUIRED, example = "22402")
@ExcelProperty("主键ID")
private Long id;
// ID of the dataset the file belongs to.
@Schema(description = "对应数据集的id", example = "8353")
@ExcelProperty("对应数据集的id")
private Long dataSetId;
// Upload address (URL) of the dataset file.
@Schema(description = "数据集文件对应的上传地址", example = "https://www.iocoder.cn")
@ExcelProperty("数据集文件对应的上传地址")
private String dataSetFileUrl;
// Dataset file type: 0 = data file, 1 = image, 2 = video.
@Schema(description = "数据集文件类型0数据文件1图片2视频", example = "2")
@ExcelProperty("数据集文件类型0数据文件1图片2视频")
private String dataSetFileType;
// Creation time.
@Schema(description = "创建时间", requiredMode = Schema.RequiredMode.REQUIRED)
@ExcelProperty("创建时间")
private LocalDateTime createTime;
// Name of the dataset file.
@Schema(description = "数据集文件名称", example = "赵六")
@ExcelProperty("数据集文件名称")
private String datasetFileName;
}

View File

@ -0,0 +1,26 @@
package cn.iocoder.yudao.module.mdpf.controller.dataset.vo;
import io.swagger.v3.oas.annotations.media.Schema;
import lombok.*;
import java.util.*;
/**
 * Create / update request VO for a dataset file address.
 *
 * <p>NOTE(review): {@code id} is documented as required in the schema but has
 * no validation constraint; presumably it is absent on create — confirm.
 */
@Schema(description = "管理后台 - 数据集对应的文件地址新增/修改 Request VO")
@Data
public class DataSetFileMiddleSaveReqVO {
// Primary key.
@Schema(description = "主键ID", requiredMode = Schema.RequiredMode.REQUIRED, example = "22402")
private Long id;
// ID of the dataset the file belongs to.
@Schema(description = "对应数据集的id", example = "8353")
private Long dataSetId;
// Upload address (URL) of the dataset file.
@Schema(description = "数据集文件对应的上传地址", example = "https://www.iocoder.cn")
private String dataSetFileUrl;
// Dataset file type: 0 = data file, 1 = image, 2 = video.
@Schema(description = "数据集文件类型0数据文件1图片2视频", example = "2")
private String dataSetFileType;
// Name of the dataset file.
@Schema(description = "数据集文件名称", example = "赵六")
private String datasetFileName;
}

View File

@ -0,0 +1,61 @@
package cn.iocoder.yudao.module.mdpf.controller.dataset.vo;
import lombok.*;
import java.util.*;
import io.swagger.v3.oas.annotations.media.Schema;
import cn.iocoder.yudao.framework.common.pojo.PageParam;
import org.springframework.format.annotation.DateTimeFormat;
import java.time.LocalDateTime;
import static cn.iocoder.yudao.framework.common.util.date.DateUtils.FORMAT_YEAR_MONTH_DAY_HOUR_MINUTE_SECOND;
/**
 * Page request VO for middle-platform datasets. Extends {@link PageParam} and
 * adds the query fields below (presumably applied as filters — confirm in the
 * mapper/service).
 */
@Schema(description = "管理后台 - 中台中的数据集分页 Request VO")
@Data
@EqualsAndHashCode(callSuper = true)
@ToString(callSuper = true)
public class DataSetMiddlePageReqVO extends PageParam {
// Dataset name.
@Schema(description = "数据集名称", example = "张三")
private String datasetName;
// Dataset source.
@Schema(description = "数据集来源")
private String datasetSource;
// Cleaning status: 0 = not cleaned, 1 = cleaned.
@Schema(description = "清洗状态0未清洗1已经清洗", example = "2")
private Integer cleanStatus;
// Annotation status: 0 = not finished, 1 = in progress, 2 = finished.
@Schema(description = "数据标注状态0未完成1进行中2已完成", example = "1")
private Integer markStatus;
// Parent type: 1 = text dataset, 2 = multimodal dataset.
@Schema(description = "数据集父类型(1文本数据集2多模态数据集)", example = "2")
private Integer datasetParentType;
// Remark.
@Schema(description = "备注", example = "你猜")
private String remark;
// Creation time, bound as an array (presumably [start, end] — confirm).
@Schema(description = "创建时间")
@DateTimeFormat(pattern = FORMAT_YEAR_MONTH_DAY_HOUR_MINUTE_SECOND)
private LocalDateTime[] createTime;
// Dataset kind: 0 = ordinary, 1 = official.
@Schema(description = "数据集类型 0 普通 1 官方", example = "1")
private Integer type;
// Dataset description.
@Schema(description = "数据集描述")
private String datasetIntro;
// Dataset purpose: 1 = training dataset, 2 = evaluation dataset.
@Schema(description = "数据集类型1-训练数据集、2-评估数据集)", example = "2")
private Integer datasetType;
// Dataset category; values come from dictionaries llm_dataset_category_1 / llm_dataset_category_2.
@Schema(description = "数据集类型使用字典llm_dataset_category_1、llm_dataset_category_2")
private Integer datasetCategory;
// Data length.
@Schema(description = "数据长度")
private Long dataLength;
// Annotation progress (unit/scale not shown here).
@Schema(description = "标注进度")
private Integer annotateProgress;
// ID of the corresponding data in MongoDB.
@Schema(description = "对应mongodb中的数据iD", example = "12642")
private Long mongoId;
}

View File

@ -0,0 +1,78 @@
package cn.iocoder.yudao.module.mdpf.controller.dataset.vo;
import io.swagger.v3.oas.annotations.media.Schema;
import lombok.*;
import java.util.*;
import org.springframework.format.annotation.DateTimeFormat;
import java.time.LocalDateTime;
import com.alibaba.excel.annotation.*;
/**
 * Response VO for a middle-platform dataset. Fields annotated with
 * {@code @ExcelProperty} carry an Excel header; {@code datasetFiles} has none
 * and the class is marked {@code @ExcelIgnoreUnannotated}.
 */
@Schema(description = "管理后台 - 中台中的数据集 Response VO")
@Data
@ExcelIgnoreUnannotated
public class DataSetMiddleRespVO {
// Primary key.
@Schema(description = "主键ID", requiredMode = Schema.RequiredMode.REQUIRED, example = "31179")
@ExcelProperty("主键ID")
private Long id;
// Dataset name.
@Schema(description = "数据集名称", example = "张三")
@ExcelProperty("数据集名称")
private String datasetName;
// Dataset source.
@Schema(description = "数据集来源")
@ExcelProperty("数据集来源")
private String datasetSource;
// Cleaning status: 0 = not cleaned, 1 = cleaned.
@Schema(description = "清洗状态0未清洗1已经清洗", example = "2")
@ExcelProperty("清洗状态0未清洗1已经清洗")
private Integer cleanStatus;
// Annotation status: 0 = not finished, 1 = in progress, 2 = finished.
@Schema(description = "数据标注状态0未完成1进行中2已完成", example = "1")
@ExcelProperty("数据标注状态0未完成1进行中2已完成")
private Integer markStatus;
// Parent type: 1 = text dataset, 2 = multimodal dataset.
@Schema(description = "数据集父类型(1文本数据集2多模态数据集)", example = "2")
@ExcelProperty("数据集父类型(1文本数据集2多模态数据集)")
private Integer datasetParentType;
// Remark.
@Schema(description = "备注", example = "你猜")
@ExcelProperty("备注")
private String remark;
// Creation time.
@Schema(description = "创建时间", requiredMode = Schema.RequiredMode.REQUIRED)
@ExcelProperty("创建时间")
private LocalDateTime createTime;
// Dataset kind: 0 = ordinary, 1 = official.
@Schema(description = "数据集类型 0 普通 1 官方", example = "1")
@ExcelProperty("数据集类型 0 普通 1 官方")
private Integer type;
// Dataset description.
@Schema(description = "数据集描述")
@ExcelProperty("数据集描述")
private String datasetIntro;
// Dataset purpose: 1 = training dataset, 2 = evaluation dataset.
@Schema(description = "数据集类型1-训练数据集、2-评估数据集)", example = "2")
@ExcelProperty("数据集类型1-训练数据集、2-评估数据集)")
private Integer datasetType;
// Dataset category; values come from dictionaries llm_dataset_category_1 / llm_dataset_category_2.
@Schema(description = "数据集类型使用字典llm_dataset_category_1、llm_dataset_category_2")
@ExcelProperty("数据集类型使用字典llm_dataset_category_1、llm_dataset_category_2")
private Integer datasetCategory;
// Data length.
@Schema(description = "数据长度")
@ExcelProperty("数据长度")
private Long dataLength;
// Annotation progress (unit/scale not shown here).
@Schema(description = "标注进度")
@ExcelProperty("标注进度")
private Integer annotateProgress;
// ID of the corresponding data in MongoDB.
@Schema(description = "对应mongodb中的数据iD", example = "12642")
@ExcelProperty("对应mongodb中的数据iD")
private Long mongoId;
// Data files of the dataset; carries no @ExcelProperty header.
@Schema(description = "数据集数据文件", example = "[]")
private List<PlatformDatasetFilesRespVO> datasetFiles;
}

View File

@ -0,0 +1,55 @@
package cn.iocoder.yudao.module.mdpf.controller.dataset.vo;
import io.swagger.v3.oas.annotations.media.Schema;
import lombok.*;
import java.util.*;
/**
 * Create / update request VO for a middle-platform dataset.
 *
 * <p>NOTE(review): no field carries a validation constraint, and {@code id} is
 * documented as required in the schema only; presumably it is absent on
 * create — confirm.
 */
@Schema(description = "管理后台 - 中台中的数据集新增/修改 Request VO")
@Data
public class DataSetMiddleSaveReqVO {
// Primary key.
@Schema(description = "主键ID", requiredMode = Schema.RequiredMode.REQUIRED, example = "31179")
private Long id;
// Dataset name.
@Schema(description = "数据集名称", example = "张三")
private String datasetName;
// Dataset source.
@Schema(description = "数据集来源")
private String datasetSource;
// Cleaning status: 0 = not cleaned, 1 = cleaned.
@Schema(description = "清洗状态0未清洗1已经清洗", example = "2")
private Integer cleanStatus;
// Annotation status: 0 = not finished, 1 = in progress, 2 = finished.
@Schema(description = "数据标注状态0未完成1进行中2已完成", example = "1")
private Integer markStatus;
// Parent type: 1 = text dataset, 2 = multimodal dataset.
@Schema(description = "数据集父类型(1文本数据集2多模态数据集)", example = "2")
private Integer datasetParentType;
// Remark.
@Schema(description = "备注", example = "你猜")
private String remark;
// Dataset kind: 0 = ordinary, 1 = official.
@Schema(description = "数据集类型 0 普通 1 官方", example = "1")
private Integer type;
// Dataset description.
@Schema(description = "数据集描述")
private String datasetIntro;
// Dataset purpose: 1 = training dataset, 2 = evaluation dataset.
@Schema(description = "数据集类型1-训练数据集、2-评估数据集)", example = "2")
private Integer datasetType;
// Dataset category; values come from dictionaries llm_dataset_category_1 / llm_dataset_category_2.
@Schema(description = "数据集类型使用字典llm_dataset_category_1、llm_dataset_category_2")
private Integer datasetCategory;
// Data length.
@Schema(description = "数据长度")
private Long dataLength;
// Annotation progress (unit/scale not shown here).
@Schema(description = "标注进度")
private Integer annotateProgress;
// ID of the corresponding data in MongoDB.
@Schema(description = "对应mongodb中的数据iD", example = "12642")
private Long mongoId;
// Files attached to the dataset, one string-to-string map per file.
// NOTE(review): the expected map keys are not documented here (no @Schema) — confirm with the service.
private List<Map<String,String>> filesList;
}

View File

@ -0,0 +1,36 @@
package cn.iocoder.yudao.module.mdpf.controller.dataset.vo;
import cn.iocoder.yudao.framework.common.pojo.PageParam;
import io.swagger.v3.oas.annotations.media.Schema;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.ToString;
import org.springframework.format.annotation.DateTimeFormat;
import java.time.LocalDateTime;
import static cn.iocoder.yudao.framework.common.util.date.DateUtils.FORMAT_YEAR_MONTH_DAY_HOUR_MINUTE_SECOND;
/**
 * Page request VO for dataset question annotations (answers). Extends
 * {@link PageParam} and adds the query fields below (presumably applied as
 * filters — confirm in the mapper/service).
 */
@Schema(description = "管理后台 - 数据集数据问题标注内容分页 Request VO")
@Data
@EqualsAndHashCode(callSuper = true)
@ToString(callSuper = true)
public class PlatformDatasetAnswerPageReqVO extends PageParam {
// Dataset ID.
@Schema(description = "数据集ID", example = "31073")
private Long datasetId;
// Data file ID.
@Schema(description = "数据文件ID", example = "21597")
private Long datasetFilesId;
// Question ID.
@Schema(description = "问题ID", example = "23725")
private Long questionId;
// Annotation content.
@Schema(description = "标注内容")
private String answer;
// Creation time, bound as an array (presumably [start, end] — confirm).
@Schema(description = "创建时间")
@DateTimeFormat(pattern = FORMAT_YEAR_MONTH_DAY_HOUR_MINUTE_SECOND)
private LocalDateTime[] createTime;
}

View File

@ -0,0 +1,39 @@
package cn.iocoder.yudao.module.mdpf.controller.dataset.vo;
import com.alibaba.excel.annotation.ExcelIgnoreUnannotated;
import com.alibaba.excel.annotation.ExcelProperty;
import io.swagger.v3.oas.annotations.media.Schema;
import lombok.Data;
import java.time.LocalDateTime;
/**
 * Response VO for one annotation (answer) of a dataset question. Every field
 * carries an {@code @ExcelProperty} header.
 */
@Schema(description = "管理后台 - 数据集数据问题标注内容 Response VO")
@Data
@ExcelIgnoreUnannotated
public class PlatformDatasetAnswerRespVO {
// NOTE(review): labelled "dataset question ID", yet this is the id of the answer
// record while questionId below holds the question — the label looks copied; confirm.
@Schema(description = "数据集问题ID", requiredMode = Schema.RequiredMode.REQUIRED, example = "32153")
@ExcelProperty("数据集问题ID")
private Long id;
// Dataset ID.
@Schema(description = "数据集ID", requiredMode = Schema.RequiredMode.REQUIRED, example = "31073")
@ExcelProperty("数据集ID")
private Long datasetId;
// Data file ID.
@Schema(description = "数据文件ID", example = "21597")
@ExcelProperty("数据文件ID")
private Long datasetFilesId;
// Question ID.
@Schema(description = "问题ID", example = "23725")
@ExcelProperty("问题ID")
private Long questionId;
// Annotation content.
@Schema(description = "标注内容")
@ExcelProperty("标注内容")
private String answer;
// Creation time.
@Schema(description = "创建时间", requiredMode = Schema.RequiredMode.REQUIRED)
@ExcelProperty("创建时间")
private LocalDateTime createTime;
}

View File

@ -0,0 +1,28 @@
package cn.iocoder.yudao.module.mdpf.controller.dataset.vo;
import io.swagger.v3.oas.annotations.media.Schema;
import lombok.Data;
import javax.validation.constraints.NotNull;
/**
 * Create / update request VO for one annotation (answer) of a dataset question.
 * Only {@code datasetId} carries a validation constraint.
 */
@Schema(description = "管理后台 - 数据集数据问题标注内容新增/修改 Request VO")
@Data
public class PlatformDatasetAnswerSaveReqVO {
// NOTE(review): labelled "dataset question ID" although questionId exists
// separately below — presumably the answer record's own id; confirm.
@Schema(description = "数据集问题ID", requiredMode = Schema.RequiredMode.REQUIRED, example = "32153")
private Long id;
// Dataset ID; must not be null.
@Schema(description = "数据集ID", requiredMode = Schema.RequiredMode.REQUIRED, example = "31073")
@NotNull(message = "数据集ID不能为空")
private Long datasetId;
// Data file ID.
@Schema(description = "数据文件ID", example = "21597")
private Long datasetFilesId;
// Question ID.
@Schema(description = "问题ID", example = "23725")
private Long questionId;
// Annotation content.
@Schema(description = "标注内容")
private String answer;
}

View File

@ -0,0 +1,38 @@
package cn.iocoder.yudao.module.mdpf.controller.dataset.vo;
import cn.iocoder.yudao.framework.common.pojo.PageParam;
import io.swagger.v3.oas.annotations.media.Schema;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.ToString;
import org.springframework.format.annotation.DateTimeFormat;
import java.time.LocalDateTime;
import static cn.iocoder.yudao.framework.common.util.date.DateUtils.FORMAT_YEAR_MONTH_DAY_HOUR_MINUTE_SECOND;
/**
 * Page request VO for dataset data files. Extends {@link PageParam} and adds
 * the query fields below (presumably applied as filters — confirm in the
 * mapper/service).
 */
@Schema(description = "管理后台 - 数据集数据文件分页 Request VO")
@Data
@EqualsAndHashCode(callSuper = true)
@ToString(callSuper = true)
public class PlatformDatasetFilesPageReqVO extends PageParam {
// Dataset ID.
@Schema(description = "数据集ID", example = "8530")
private Long datasetId;
// Data length.
@Schema(description = "数据长度")
private Long dataLength;
// ID of the data file in the file table.
@Schema(description = "数据文件文件表的ID")
private Long datasetFile;
// File URL.
@Schema(description = "文件URL地址", example = "https://www.iocoder.cn")
private String datasetFileUrl;
// Creation time, bound as an array (presumably [start, end] — confirm).
@Schema(description = "创建时间")
@DateTimeFormat(pattern = FORMAT_YEAR_MONTH_DAY_HOUR_MINUTE_SECOND)
private LocalDateTime[] createTime;
// File name (the schema example is a URL, which looks copied from the field above).
@Schema(description = "文件名称", example = "https://www.iocoder.cn")
private String datasetFileName;
}

View File

@ -0,0 +1,45 @@
package cn.iocoder.yudao.module.mdpf.controller.dataset.vo;
import com.alibaba.excel.annotation.ExcelIgnoreUnannotated;
import com.alibaba.excel.annotation.ExcelProperty;
import io.swagger.v3.oas.annotations.media.Schema;
import lombok.Data;
import java.time.LocalDateTime;
import java.util.List;
/**
 * Response VO for a dataset data file.
 *
 * <p>Fields annotated with {@code @ExcelProperty} carry an Excel header;
 * {@code datasetFileName} and {@code datasetFiles} have none and the class is
 * marked {@code @ExcelIgnoreUnannotated}.
 */
@Schema(description = "管理后台 - 数据集数据文件 Response VO")
@Data
@ExcelIgnoreUnannotated
public class PlatformDatasetFilesRespVO {

    // Primary key of the file record. The label used to be "数据集ID", which
    // duplicated the header of datasetId below and produced two identically
    // titled columns in the export.
    @Schema(description = "数据集文件ID", requiredMode = Schema.RequiredMode.REQUIRED, example = "31018")
    @ExcelProperty("数据集文件ID")
    private Long id;

    // ID of the dataset the file belongs to.
    @Schema(description = "数据集ID", requiredMode = Schema.RequiredMode.REQUIRED, example = "8530")
    @ExcelProperty("数据集ID")
    private Long datasetId;

    // Data length.
    @Schema(description = "数据长度")
    @ExcelProperty("数据长度")
    private Long dataLength;

    // ID of the data file in the file table.
    @Schema(description = "数据文件文件表的ID")
    @ExcelProperty("数据文件文件表的ID")
    private Long datasetFile;

    // File URL.
    @Schema(description = "文件URL地址", example = "https://www.iocoder.cn")
    @ExcelProperty("文件URL地址")
    private String datasetFileUrl;

    // Creation time.
    @Schema(description = "创建时间", requiredMode = Schema.RequiredMode.REQUIRED)
    @ExcelProperty("创建时间")
    private LocalDateTime createTime;

    // File name.
    @Schema(description = "文件名称", example = "https://www.iocoder.cn")
    private String datasetFileName;

    // NOTE(review): a list of this same type nested inside each file VO; the
    // purpose is not visible here — confirm it is intended.
    @Schema(description = "数据集数据文件", example = "[]")
    private List<PlatformDatasetFilesRespVO> datasetFiles;
}

View File

@ -0,0 +1,28 @@
package cn.iocoder.yudao.module.mdpf.controller.dataset.vo;
import io.swagger.v3.oas.annotations.media.Schema;
import lombok.Data;
/**
 * Create / update request VO for a dataset data file. No field carries an
 * active validation constraint (the {@code @NotNull} on {@code datasetId} is
 * commented out).
 */
@Schema(description = "管理后台 - 数据集数据文件新增/修改 Request VO")
@Data
public class PlatformDatasetFilesSaveReqVO {
// Primary key of the dataset file.
@Schema(description = "数据集文件主键ID", requiredMode = Schema.RequiredMode.REQUIRED, example = "31018")
private Long id;
// Dataset ID; documented as required, but the constraint below is disabled.
@Schema(description = "数据集ID", requiredMode = Schema.RequiredMode.REQUIRED, example = "8530")
// @NotNull(message = "数据集ID不能为空")
private Long datasetId;
// Data length.
@Schema(description = "数据长度")
private Long dataLength;
// ID of the data file in the file table.
@Schema(description = "数据文件文件表的ID")
private Long datasetFile;
// File URL.
@Schema(description = "文件URL地址", example = "https://www.iocoder.cn")
private String datasetFileUrl;
// File name.
@Schema(description = "文件名称", example = "https://www.iocoder.cn")
private String datasetFileName;
}

View File

@ -0,0 +1,38 @@
package cn.iocoder.yudao.module.mdpf.controller.dataset.vo;
import cn.iocoder.yudao.framework.common.pojo.PageParam;
import io.swagger.v3.oas.annotations.media.Schema;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.ToString;
import org.springframework.format.annotation.DateTimeFormat;
import javax.validation.constraints.NotNull;
import java.time.LocalDateTime;
import static cn.iocoder.yudao.framework.common.util.date.DateUtils.FORMAT_YEAR_MONTH_DAY_HOUR_MINUTE_SECOND;
/**
 * Page request VO for dataset questions. Extends {@link PageParam};
 * {@code datasetId} is mandatory, the remaining fields are optional query
 * fields (presumably applied as filters — confirm in the mapper/service).
 */
@Schema(description = "管理后台 - 数据集数据问题分页 Request VO")
@Data
@EqualsAndHashCode(callSuper = true)
@ToString(callSuper = true)
public class PlatformDatasetQuestionPageReqVO extends PageParam {
// Dataset ID; must not be null.
@Schema(description = "数据集ID", example = "15672")
@NotNull(message = "数据集ID不能为空")
private Long datasetId;
// Data file ID.
@Schema(description = "数据文件ID", example = "23062")
private Long datasetFilesId;
// Question content.
@Schema(description = "问题内容")
private String question;
// Annotation status; values come from dictionary llm_dataset_mark_status.
@Schema(description = "标注状态使用字典llm_dataset_mark_status", example = "1")
private Integer status;
// Creation time, bound as an array (presumably [start, end] — confirm).
@Schema(description = "创建时间")
@DateTimeFormat(pattern = FORMAT_YEAR_MONTH_DAY_HOUR_MINUTE_SECOND)
private LocalDateTime[] createTime;
}

View File

@ -0,0 +1,47 @@
package cn.iocoder.yudao.module.mdpf.controller.dataset.vo;
import com.alibaba.excel.annotation.ExcelIgnoreUnannotated;
import com.alibaba.excel.annotation.ExcelProperty;
import io.swagger.v3.oas.annotations.media.Schema;
import lombok.Data;
import java.time.LocalDateTime;
import java.util.List;
/**
 * Response VO for a dataset question together with its system text, its
 * annotations (answers) and its images. Only the fields annotated with
 * {@code @ExcelProperty} carry an Excel header.
 */
@Schema(description = "管理后台 - 数据集数据问题 Response VO")
@Data
@ExcelIgnoreUnannotated
public class PlatformDatasetQuestionRespVO {
// ID of the dataset question.
@Schema(description = "数据集问题ID", requiredMode = Schema.RequiredMode.REQUIRED, example = "394")
@ExcelProperty("数据集问题ID")
private Long id;
// Dataset ID.
@Schema(description = "数据集ID", requiredMode = Schema.RequiredMode.REQUIRED, example = "15672")
@ExcelProperty("数据集ID")
private Long datasetId;
// Data file ID.
@Schema(description = "数据文件ID", example = "23062")
@ExcelProperty("数据文件ID")
private Long datasetFilesId;
// Question content.
@Schema(description = "问题内容")
@ExcelProperty("问题内容")
private String question;
// Annotation status; values come from dictionary llm_dataset_mark_status.
@Schema(description = "标注状态使用字典llm_dataset_mark_status", example = "1")
@ExcelProperty("标注状态使用字典llm_dataset_mark_status")
private Integer status;
// Creation time.
@Schema(description = "创建时间", requiredMode = Schema.RequiredMode.REQUIRED)
@ExcelProperty("创建时间")
private LocalDateTime createTime;
// System identity (system text of the question).
@Schema(description = "系统身份")
private String system;
// Annotations (answers) of the question.
@Schema(description = "标注内容")
private List<PlatformDatasetAnswerRespVO> datasetAnswerRespVO;
// Images belonging to the question, as strings (presumably URLs — confirm).
@Schema(description = "问题对应的图片")
private List<String> imagesList;
}

View File

@ -0,0 +1,34 @@
package cn.iocoder.yudao.module.mdpf.controller.dataset.vo;
import io.swagger.v3.oas.annotations.media.Schema;
import lombok.Data;
import javax.validation.constraints.NotNull;
import java.util.List;
/**
 * Create / update request VO for a dataset question including its annotations
 * (answers). Only {@code datasetId} carries a validation constraint.
 */
@Schema(description = "管理后台 - 数据集数据问题新增/修改 Request VO")
@Data
public class PlatformDatasetQuestionSaveReqVO {
// ID of the dataset question.
@Schema(description = "数据集问题ID", requiredMode = Schema.RequiredMode.REQUIRED, example = "394")
private Long id;
// Dataset ID; must not be null.
@Schema(description = "数据集ID", requiredMode = Schema.RequiredMode.REQUIRED, example = "15672")
@NotNull(message = "数据集ID不能为空")
private Long datasetId;
// Data file ID.
@Schema(description = "数据文件ID", example = "23062")
private Long datasetFilesId;
// Question content.
@Schema(description = "问题内容")
private String question;
// Annotation status; values come from dictionary llm_dataset_mark_status.
@Schema(description = "标注状态使用字典llm_dataset_mark_status", example = "1")
private Integer status;
// System identity (system text of the question).
@Schema(description = "系统身份")
private String system;
// Annotations (answers) to save. The field keeps the "RespVO" name although it holds save-request items.
// NOTE(review): the list is not annotated with @Valid, so the constraints declared on
// PlatformDatasetAnswerSaveReqVO are presumably not cascaded — confirm whether that is intended.
@Schema(description = "标注内容")
private List<PlatformDatasetAnswerSaveReqVO> datasetAnswerRespVO;
}

View File

@ -0,0 +1,59 @@
package cn.iocoder.yudao.module.mdpf.dal.dataobject.dataset;
import lombok.*;
import java.math.BigDecimal;
import java.util.*;
import java.time.LocalDateTime;
import java.time.LocalDateTime;
import com.baomidou.mybatisplus.annotation.*;
import cn.iocoder.yudao.framework.mybatis.core.dataobject.BaseDO;
/**
 * Data object for the file address belonging to a dataset, mapped to table
 * {@code data_set_file_middle}.
 *
 * @author Administrator
 */
@TableName("data_set_file_middle")
@KeySequence("data_set_file_middle_seq") // Primary-key sequence for Oracle, PostgreSQL, Kingbase, DB2 and H2; may be omitted for MySQL and similar databases.
@Data
@EqualsAndHashCode(callSuper = true)
@ToString(callSuper = true)
@Builder
@NoArgsConstructor
@AllArgsConstructor
public class DataSetFileMiddleDO extends BaseDO {
/**
 * Primary key ID
 */
@TableId
private Long id;
/**
 * ID of the dataset this file belongs to
 */
private Long dataSetId;
/**
 * Upload address of the dataset file
 */
private String dataSetFileUrl;
/**
 * Dataset file type: 0 = data file, 1 = image, 2 = video
 */
private String dataSetFileType;
/**
 * Name of the dataset file
 */
private String datasetFileName;
// NOTE(review): the fields below carry no description in the source; the hints are
// inferred from their names only and should be confirmed against the table definition.
// Presumably URL / ID / extension / name of the original (pre-cleaning) file,
// the cleaned text with its hash, a quality score, a token count and the cleaning time.
private String sourceFileUrl;
private Long sourceFileId;
private String cleanedText;
private String cleanedTextHash;
private BigDecimal qualityScore;
private Integer tokenCount;
private LocalDateTime cleanTime;
private String sourceFileExtension;
private String sourceFileName;
}

View File

@ -0,0 +1,83 @@
package cn.iocoder.yudao.module.mdpf.dal.dataobject.dataset;
import lombok.*;
import java.util.*;
import java.time.LocalDateTime;
import java.time.LocalDateTime;
import com.baomidou.mybatisplus.annotation.*;
import cn.iocoder.yudao.framework.mybatis.core.dataobject.BaseDO;
/**
 * Data object for a middle-platform dataset, mapped to table
 * {@code data_set_middle}.
 *
 * @author Administrator
 */
@TableName("data_set_middle")
@KeySequence("data_set_middle_seq") // Primary-key sequence for Oracle, PostgreSQL, Kingbase, DB2 and H2; may be omitted for MySQL and similar databases.
@Data
@EqualsAndHashCode(callSuper = true)
@ToString(callSuper = true)
@Builder
@NoArgsConstructor
@AllArgsConstructor
public class DataSetMiddleDO extends BaseDO {
/**
 * Primary key ID
 */
@TableId
private Long id;
/**
 * Dataset name
 */
private String datasetName;
/**
 * Dataset source
 */
private String datasetSource;
/**
 * Cleaning status: 0 = not cleaned, 1 = cleaned
 */
private Integer cleanStatus;
/**
 * Annotation status: 0 = not finished, 1 = in progress, 2 = finished
 */
private Integer markStatus;
/**
 * Parent type: 1 = text dataset, 2 = multimodal dataset
 */
private Integer datasetParentType;
/**
 * Remark
 */
private String remark;
/**
 * Dataset kind: 0 = ordinary, 1 = official
 */
private Integer type;
/**
 * Dataset description
 */
private String datasetIntro;
/**
 * Dataset purpose: 1 = training dataset, 2 = evaluation dataset
 */
private Integer datasetType;
/**
 * Dataset category; values come from dictionaries llm_dataset_category_1 / llm_dataset_category_2
 */
private Integer datasetCategory;
/**
 * Data length
 */
private Long dataLength;
/**
 * Annotation progress
 */
private Integer annotateProgress;
/**
 * ID of the corresponding data in MongoDB
 */
private Long mongoId;
}

View File

@ -0,0 +1,48 @@
package cn.iocoder.yudao.module.mdpf.dal.dataobject.dataset;
import cn.iocoder.yudao.framework.mybatis.core.dataobject.BaseDO;
import com.baomidou.mybatisplus.annotation.KeySequence;
import com.baomidou.mybatisplus.annotation.TableId;
import com.baomidou.mybatisplus.annotation.TableName;
import lombok.*;
/**
* Data object (DO) for the annotation content (answer) of a dataset question.
* Mapped to table {@code platform_dataset_answer}.
*
* @author Huada LLM
*/
@TableName("platform_dataset_answer")
@KeySequence("platform_dataset_answer_seq") // Primary-key sequence for Oracle, PostgreSQL, Kingbase, DB2 and H2; may be omitted for MySQL and similar databases
@Data
@EqualsAndHashCode(callSuper = true)
@ToString(callSuper = true)
@Builder
@NoArgsConstructor
@AllArgsConstructor
public class PlatformDatasetAnswerDO extends BaseDO {
/**
* Primary key of this answer record (the original comment labelled it "dataset question ID")
*/
@TableId
private Long id;
/**
* Dataset ID
*/
private Long datasetId;
/**
* Data file ID
*/
private Long datasetFilesId;
/**
* Question ID
*/
private Long questionId;
/**
* Annotation content
*/
private String answer;
// NOTE(review): undocumented in the original; presumably the origin of the answer — confirm.
private String answerFrom;
}

View File

@ -0,0 +1,48 @@
package cn.iocoder.yudao.module.mdpf.dal.dataobject.dataset;
import cn.iocoder.yudao.framework.mybatis.core.dataobject.BaseDO;
import com.baomidou.mybatisplus.annotation.KeySequence;
import com.baomidou.mybatisplus.annotation.TableId;
import com.baomidou.mybatisplus.annotation.TableName;
import lombok.*;
/**
* Data object (DO) for a data file of a dataset.
* Mapped to table {@code platform_dataset_files}.
*
* @author Huada LLM
*/
@TableName("platform_dataset_files")
@KeySequence("platform_dataset_files_seq") // Primary-key sequence for Oracle, PostgreSQL, Kingbase, DB2 and H2; may be omitted for MySQL and similar databases
@Data
@EqualsAndHashCode(callSuper = true)
@ToString(callSuper = true)
@Builder
@NoArgsConstructor
@AllArgsConstructor
public class PlatformDatasetFilesDO extends BaseDO {
/**
* Primary key of this file record (the original comment labelled it "dataset ID")
*/
@TableId
private Long id;
/**
* Dataset ID
*/
private Long datasetId;
/**
* Data length
*/
private Long dataLength;
/**
* ID of the data file in the file table
*/
private Long datasetFile;
/**
* URL of the file
*/
private String datasetFileUrl;
// NOTE(review): undocumented in the original; presumably the display name of the file — confirm.
private String datasetFileName;
}

View File

@ -0,0 +1,51 @@
package cn.iocoder.yudao.module.mdpf.dal.dataobject.dataset;
import cn.iocoder.yudao.framework.mybatis.core.dataobject.BaseDO;
import com.baomidou.mybatisplus.annotation.KeySequence;
import com.baomidou.mybatisplus.annotation.TableField;
import com.baomidou.mybatisplus.annotation.TableId;
import com.baomidou.mybatisplus.annotation.TableName;
import lombok.*;
/**
* Data object (DO) for a question contained in a dataset.
* Mapped to table {@code platform_dataset_question}.
*
* @author Huada LLM
*/
@TableName("platform_dataset_question")
@KeySequence("platform_dataset_question_seq") // Primary-key sequence for Oracle, PostgreSQL, Kingbase, DB2 and H2; may be omitted for MySQL and similar databases
@Data
@EqualsAndHashCode(callSuper = true)
@ToString(callSuper = true)
@Builder
@NoArgsConstructor
@AllArgsConstructor
public class PlatformDatasetQuestionDO extends BaseDO {
/**
* Dataset question ID (primary key)
*/
@TableId
private Long id;
/**
* Dataset ID
*/
private Long datasetId;
/**
* Data file ID
*/
private Long datasetFilesId;
/**
* Question content
*/
private String question;
/**
* Annotation status; values come from dictionary llm_dataset_mark_status
*/
private Integer status;
// The column name is back-quoted in the mapping, presumably because "system"
// collides with a reserved word in the target database — confirm.
@TableField("`system`")
private String system;
// NOTE(review): undocumented in the original; presumably the origin of the question — confirm.
private String questionFrom;
}

View File

@ -0,0 +1,32 @@
package cn.iocoder.yudao.module.mdpf.dal.mapper.dataset;
import java.util.*;
import cn.iocoder.yudao.framework.common.pojo.PageResult;
import cn.iocoder.yudao.framework.mybatis.core.query.LambdaQueryWrapperX;
import cn.iocoder.yudao.framework.mybatis.core.mapper.BaseMapperX;
import cn.iocoder.yudao.module.mdpf.controller.dataset.vo.DataSetFileMiddlePageReqVO;
import cn.iocoder.yudao.module.mdpf.dal.dataobject.dataset.DataSetFileMiddleDO;
import com.baomidou.dynamic.datasource.annotation.DS;
import org.apache.ibatis.annotations.Mapper;
/**
 * MyBatis mapper for {@link DataSetFileMiddleDO}, annotated to use the {@code slave} data source.
 *
 * @author admin
 */
@Mapper
@DS("slave")
public interface DataSetFileMiddleMapper extends BaseMapperX<DataSetFileMiddleDO> {

    /**
     * Runs a paged query over dataset file records, ordered by ID descending.
     * Filters are added through the {@code *IfPresent} wrapper helpers using the values
     * carried by the request.
     *
     * @param reqVO paging parameters and filter values
     * @return the matching page of records
     */
    default PageResult<DataSetFileMiddleDO> selectPage(DataSetFileMiddlePageReqVO reqVO) {
        LambdaQueryWrapperX<DataSetFileMiddleDO> criteria = new LambdaQueryWrapperX<>();
        criteria.eqIfPresent(DataSetFileMiddleDO::getDataSetId, reqVO.getDataSetId());
        criteria.eqIfPresent(DataSetFileMiddleDO::getDataSetFileUrl, reqVO.getDataSetFileUrl());
        criteria.eqIfPresent(DataSetFileMiddleDO::getDataSetFileType, reqVO.getDataSetFileType());
        criteria.betweenIfPresent(DataSetFileMiddleDO::getCreateTime, reqVO.getCreateTime());
        criteria.likeIfPresent(DataSetFileMiddleDO::getDatasetFileName, reqVO.getDatasetFileName());
        criteria.orderByDesc(DataSetFileMiddleDO::getId);
        return selectPage(reqVO, criteria);
    }
}

View File

@ -0,0 +1,45 @@
package cn.iocoder.yudao.module.mdpf.dal.mapper.dataset;
import java.util.*;
import cn.iocoder.yudao.framework.common.pojo.PageResult;
import cn.iocoder.yudao.framework.mybatis.core.query.LambdaQueryWrapperX;
import cn.iocoder.yudao.framework.mybatis.core.mapper.BaseMapperX;
import cn.iocoder.yudao.module.mdpf.controller.dataset.vo.DataSetMiddlePageReqVO;
import cn.iocoder.yudao.module.mdpf.dal.dataobject.dataset.DataSetMiddleDO;
import org.apache.ibatis.annotations.Mapper;
import org.apache.ibatis.annotations.Param;
import org.apache.ibatis.annotations.Update;
/**
 * MyBatis mapper for {@link DataSetMiddleDO} (datasets of the middle platform).
 *
 * @author admin
 */
@Mapper
public interface DataSetMiddleMapper extends BaseMapperX<DataSetMiddleDO> {

    /**
     * Runs a paged query over datasets, ordered by ID descending.
     * Filters are added through the {@code *IfPresent} wrapper helpers using the values
     * carried by the request; the dataset name is matched with LIKE, all other fields with
     * equality, and the creation time with a range.
     *
     * @param reqVO paging parameters and filter values
     * @return the matching page of datasets
     */
    default PageResult<DataSetMiddleDO> selectPage(DataSetMiddlePageReqVO reqVO) {
        LambdaQueryWrapperX<DataSetMiddleDO> criteria = new LambdaQueryWrapperX<>();
        criteria.likeIfPresent(DataSetMiddleDO::getDatasetName, reqVO.getDatasetName());
        criteria.eqIfPresent(DataSetMiddleDO::getDatasetSource, reqVO.getDatasetSource());
        criteria.eqIfPresent(DataSetMiddleDO::getCleanStatus, reqVO.getCleanStatus());
        criteria.eqIfPresent(DataSetMiddleDO::getMarkStatus, reqVO.getMarkStatus());
        criteria.eqIfPresent(DataSetMiddleDO::getDatasetParentType, reqVO.getDatasetParentType());
        criteria.eqIfPresent(DataSetMiddleDO::getRemark, reqVO.getRemark());
        criteria.betweenIfPresent(DataSetMiddleDO::getCreateTime, reqVO.getCreateTime());
        criteria.eqIfPresent(DataSetMiddleDO::getType, reqVO.getType());
        criteria.eqIfPresent(DataSetMiddleDO::getDatasetIntro, reqVO.getDatasetIntro());
        criteria.eqIfPresent(DataSetMiddleDO::getDatasetType, reqVO.getDatasetType());
        criteria.eqIfPresent(DataSetMiddleDO::getDatasetCategory, reqVO.getDatasetCategory());
        criteria.eqIfPresent(DataSetMiddleDO::getDataLength, reqVO.getDataLength());
        criteria.eqIfPresent(DataSetMiddleDO::getAnnotateProgress, reqVO.getAnnotateProgress());
        criteria.eqIfPresent(DataSetMiddleDO::getMongoId, reqVO.getMongoId());
        criteria.orderByDesc(DataSetMiddleDO::getId);
        return selectPage(reqVO, criteria);
    }

    /**
     * Sets the annotation progress and the status of one dataset row.
     * <p>
     * NOTE(review): the statement writes a column named {@code status}, but
     * {@link DataSetMiddleDO} declares no {@code status} property (only {@code cleanStatus}
     * and {@code markStatus}). Verify that the column exists in {@code data_set_middle}.
     *
     * @param formattedRatio new value for {@code annotate_progress}
     * @param datasetId      ID of the dataset row to update
     * @param status         new value for {@code status}
     */
    @Update("update data_set_middle set annotate_progress = #{formattedRatio},status=#{status} where id = #{datasetId}")
    void updateProcess(@Param("formattedRatio") Integer formattedRatio,
                       @Param("datasetId") Long datasetId,
                       @Param("status") Integer status);
}

View File

@ -0,0 +1,40 @@
package cn.iocoder.yudao.module.mdpf.dal.mapper.dataset;
import cn.iocoder.yudao.framework.common.pojo.PageResult;
import cn.iocoder.yudao.framework.mybatis.core.mapper.BaseMapperX;
import cn.iocoder.yudao.framework.mybatis.core.query.LambdaQueryWrapperX;
import cn.iocoder.yudao.module.mdpf.controller.dataset.vo.PlatformDatasetAnswerPageReqVO;
import cn.iocoder.yudao.module.mdpf.controller.dataset.vo.PlatformDatasetAnswerRespVO;
import cn.iocoder.yudao.module.mdpf.dal.dataobject.dataset.PlatformDatasetAnswerDO;
import com.baomidou.dynamic.datasource.annotation.DS;
import org.apache.ibatis.annotations.Delete;
import org.apache.ibatis.annotations.Mapper;
import org.apache.ibatis.annotations.Param;
import java.util.List;
/**
 * MyBatis mapper for {@link PlatformDatasetAnswerDO} (annotation content of dataset questions),
 * annotated to use the {@code slave} data source.
 *
 * @author Huada LLM
 */
@Mapper
@DS("slave")
public interface PlatformDatasetAnswerMapper extends BaseMapperX<PlatformDatasetAnswerDO> {

    /**
     * Runs a paged query over answers, ordered by ID descending.
     *
     * @param reqVO paging parameters and filter values
     * @return the matching page of answers
     */
    default PageResult<PlatformDatasetAnswerDO> selectPage(PlatformDatasetAnswerPageReqVO reqVO) {
        LambdaQueryWrapperX<PlatformDatasetAnswerDO> criteria = new LambdaQueryWrapperX<>();
        criteria.eqIfPresent(PlatformDatasetAnswerDO::getDatasetId, reqVO.getDatasetId());
        criteria.eqIfPresent(PlatformDatasetAnswerDO::getDatasetFilesId, reqVO.getDatasetFilesId());
        criteria.eqIfPresent(PlatformDatasetAnswerDO::getQuestionId, reqVO.getQuestionId());
        criteria.eqIfPresent(PlatformDatasetAnswerDO::getAnswer, reqVO.getAnswer());
        criteria.betweenIfPresent(PlatformDatasetAnswerDO::getCreateTime, reqVO.getCreateTime());
        criteria.orderByDesc(PlatformDatasetAnswerDO::getId);
        return selectPage(reqVO, criteria);
    }

    /**
     * Issues a SQL DELETE for every answer row of the given dataset.
     *
     * @param datasetPostId ID of the dataset whose answers are removed
     */
    @Delete("delete from platform_dataset_answer where dataset_id = #{datasetPostId}")
    void deleteTrue(@Param("datasetPostId") Long datasetPostId);

    /**
     * Looks up answers for a collection of IDs.
     * NOTE(review): the SQL is not declared here (presumably in a mapper XML); judging by the
     * method name, {@code collected} holds question IDs — confirm.
     *
     * @param collected IDs used to select the answers
     * @return the matching answers
     */
    List<PlatformDatasetAnswerRespVO> getAnswersToYourQuestions(@Param("collected") List<Long> collected);

    /**
     * Deletes an answer.
     * NOTE(review): the SQL is not declared here (presumably in a mapper XML), so which column
     * {@code id} is matched against cannot be told from this interface — confirm.
     *
     * @param id ID passed to the delete statement
     */
    void deleteTheAnswer(@Param("id") Long id);
}

View File

@ -0,0 +1,31 @@
package cn.iocoder.yudao.module.mdpf.dal.mapper.dataset;
import cn.iocoder.yudao.framework.common.pojo.PageResult;
import cn.iocoder.yudao.framework.mybatis.core.mapper.BaseMapperX;
import cn.iocoder.yudao.framework.mybatis.core.query.LambdaQueryWrapperX;
import cn.iocoder.yudao.module.mdpf.controller.dataset.vo.PlatformDatasetFilesPageReqVO;
import cn.iocoder.yudao.module.mdpf.dal.dataobject.dataset.PlatformDatasetFilesDO;
import com.baomidou.dynamic.datasource.annotation.DS;
import org.apache.ibatis.annotations.Mapper;
/**
 * MyBatis mapper for {@link PlatformDatasetFilesDO} (data files of a dataset),
 * annotated to use the {@code slave} data source.
 *
 * @author Huada LLM
 */
@Mapper
@DS("slave")
public interface PlatformDatasetFilesMapper extends BaseMapperX<PlatformDatasetFilesDO> {

    /**
     * Runs a paged query over dataset files, ordered by ID descending.
     *
     * @param reqVO paging parameters and filter values
     * @return the matching page of dataset files
     */
    default PageResult<PlatformDatasetFilesDO> selectPage(PlatformDatasetFilesPageReqVO reqVO) {
        LambdaQueryWrapperX<PlatformDatasetFilesDO> criteria = new LambdaQueryWrapperX<>();
        criteria.eqIfPresent(PlatformDatasetFilesDO::getDatasetId, reqVO.getDatasetId());
        criteria.eqIfPresent(PlatformDatasetFilesDO::getDataLength, reqVO.getDataLength());
        criteria.eqIfPresent(PlatformDatasetFilesDO::getDatasetFile, reqVO.getDatasetFile());
        criteria.eqIfPresent(PlatformDatasetFilesDO::getDatasetFileUrl, reqVO.getDatasetFileUrl());
        criteria.betweenIfPresent(PlatformDatasetFilesDO::getCreateTime, reqVO.getCreateTime());
        criteria.orderByDesc(PlatformDatasetFilesDO::getId);
        return selectPage(reqVO, criteria);
    }
}

View File

@ -0,0 +1,41 @@
package cn.iocoder.yudao.module.mdpf.dal.mapper.dataset;
import cn.iocoder.yudao.framework.common.pojo.PageResult;
import cn.iocoder.yudao.framework.mybatis.core.mapper.BaseMapperX;
import cn.iocoder.yudao.framework.mybatis.core.query.LambdaQueryWrapperX;
import cn.iocoder.yudao.module.mdpf.controller.dataset.vo.PlatformDatasetQuestionPageReqVO;
import cn.iocoder.yudao.module.mdpf.controller.dataset.vo.PlatformDatasetQuestionRespVO;
import cn.iocoder.yudao.module.mdpf.dal.dataobject.dataset.PlatformDatasetQuestionDO;
import com.baomidou.dynamic.datasource.annotation.DS;
import org.apache.ibatis.annotations.Delete;
import org.apache.ibatis.annotations.Mapper;
import org.apache.ibatis.annotations.Param;
import java.util.List;
/**
 * MyBatis mapper for {@link PlatformDatasetQuestionDO} (questions of a dataset),
 * annotated to use the {@code slave} data source.
 *
 * @author Huada LLM
 */
@Mapper
@DS("slave")
public interface PlatformDatasetQuestionMapper extends BaseMapperX<PlatformDatasetQuestionDO> {

    /**
     * Runs a paged query over questions, ordered by ID descending.
     *
     * @param reqVO paging parameters and filter values
     * @return the matching page of questions
     */
    default PageResult<PlatformDatasetQuestionDO> selectPage(PlatformDatasetQuestionPageReqVO reqVO) {
        return selectPage(reqVO, new LambdaQueryWrapperX<PlatformDatasetQuestionDO>()
                .eqIfPresent(PlatformDatasetQuestionDO::getDatasetId, reqVO.getDatasetId())
                .eqIfPresent(PlatformDatasetQuestionDO::getDatasetFilesId, reqVO.getDatasetFilesId())
                .eqIfPresent(PlatformDatasetQuestionDO::getQuestion, reqVO.getQuestion())
                .eqIfPresent(PlatformDatasetQuestionDO::getStatus, reqVO.getStatus())
                .betweenIfPresent(PlatformDatasetQuestionDO::getCreateTime, reqVO.getCreateTime())
                .orderByDesc(PlatformDatasetQuestionDO::getId));
    }

    /**
     * Issues a SQL DELETE for every question row of the given dataset.
     * <p>
     * The statement previously targeted {@code platform_dataset_answer}, an apparent copy-paste
     * from the answer mapper that left question rows behind; it now targets this mapper's own
     * table, {@code platform_dataset_question}.
     *
     * @param datasetPostId ID of the dataset whose questions are removed
     */
    @Delete("delete from platform_dataset_question where dataset_id = #{datasetPostId}")
    void deleteTrue(@Param("datasetPostId") Long datasetPostId);

    /**
     * Lists the questions of a dataset.
     * NOTE(review): the SQL is not declared here (presumably in a mapper XML) — confirm the
     * exact selection criteria there.
     *
     * @param datasetId ID of the dataset
     * @return the questions returned by the mapped statement
     */
    List<PlatformDatasetQuestionRespVO> getAListOfIssues(@Param("datasetId") Long datasetId);

    /**
     * Deletes a question.
     * NOTE(review): the SQL is not declared here (presumably in a mapper XML), so which column
     * {@code id} is matched against cannot be told from this interface — confirm.
     *
     * @param id ID passed to the delete statement
     */
    void deleteTheIssue(@Param("id") Long id);
}

View File

@ -0,0 +1,43 @@
package cn.iocoder.yudao.module.mdpf.dal.mongo;
import cn.iocoder.yudao.framework.mybatis.core.dataobject.BaseDO;
import com.baomidou.mybatisplus.annotation.KeySequence;
import com.baomidou.mybatisplus.annotation.TableId;
import com.baomidou.mybatisplus.annotation.TableName;
import lombok.*;
/**
* Dataset file entry as embedded in the MongoDB dataset document
* (it is the element type of {@code DataSetMiddleMongoDO.dataSetFileList}).
*
* @author admin
*/
@Data
@EqualsAndHashCode(callSuper = true)
@ToString(callSuper = true)
@Builder
@NoArgsConstructor
@AllArgsConstructor
public class DataSetFileMiddleMongoDO extends BaseDO {
/**
* Primary key ID
*/
private Long id;
/**
* ID of the dataset this file belongs to
*/
private Long dataSetId;
/**
* Upload address (URL) of the dataset file
*/
private String dataSetFileUrl;
/**
* Dataset file type: 0 = data file, 1 = image, 2 = video
*/
private String dataSetFileType;
/**
* Dataset file name
*/
private String datasetFileName;
}

View File

@ -0,0 +1,82 @@
package cn.iocoder.yudao.module.mdpf.dal.mongo;
import cn.iocoder.yudao.framework.mybatis.core.dataobject.BaseDO;
import lombok.*;
import org.springframework.data.annotation.Id;
import org.springframework.data.mongodb.core.mapping.Document;
import java.util.List;
/**
 * MongoDB document for a dataset of the middle platform, stored in collection {@code data_set}.
 * <p>
 * The annotation previously read {@code @Document(collation = "data_set")}. {@code collation}
 * configures string-comparison rules (a locale), not the collection name, so {@code "data_set"}
 * was being interpreted as a collation locale. The intended attribute is {@code collection}.
 *
 * @author admin
 */
@Data
@EqualsAndHashCode(callSuper = true)
@ToString(callSuper = true)
@Builder
@NoArgsConstructor
@AllArgsConstructor
@Document(collection = "data_set")
public class DataSetMiddleMongoDO extends BaseDO {
    /**
     * Primary key ID
     */
    @Id
    private Long id;
    /**
     * Dataset name
     */
    private String dataSetName;
    /**
     * Dataset source
     */
    private String dataSetSource;
    /**
     * Cleaning status: 0 = not cleaned, 1 = cleaned
     */
    private Integer cleanStatus;
    /**
     * Annotation status: 0 = not finished, 1 = in progress, 2 = finished
     */
    private Integer markStatus;
    /**
     * Parent type of the dataset: 1 = text dataset, 2 = multimodal dataset
     */
    private Integer datasetParentType;
    /**
     * Remark
     */
    private String remark;
    /**
     * Dataset kind: 0 = ordinary, 1 = official
     */
    private Integer type;
    /**
     * Dataset description
     */
    private String datasetIntro;
    /**
     * Dataset type: 1 = training dataset, 2 = evaluation dataset
     */
    private Integer datasetType;
    /**
     * Dataset category; values come from dictionaries llm_dataset_category_1 and llm_dataset_category_2
     */
    private Integer datasetCategory;
    /**
     * Data length
     */
    private Long dataLength;
    /**
     * Annotation progress
     */
    private Integer annotateProgress;
    /**
     * Files that belong to this dataset, embedded in the document
     */
    private List<DataSetFileMiddleMongoDO> dataSetFileList;
}

View File

@ -0,0 +1,11 @@
package cn.iocoder.yudao.module.mdpf.dal.mongorepository;
import cn.iocoder.yudao.module.mdpf.dal.mongo.DataSetMiddleMongoDO;
import org.springframework.data.mongodb.repository.MongoRepository;
import org.springframework.stereotype.Repository;
// Spring Data MongoDB repository for DataSetMiddleMongoDO documents.
// NOTE(review): the ID type argument is String, while DataSetMiddleMongoDO declares its
// @Id field as Long — ID-based repository methods (findById, deleteById, ...) would take a
// String here. Confirm which type is intended before relying on them.
@Repository
public interface DataSetMiddleRepository extends MongoRepository<DataSetMiddleMongoDO,String> {
// Re-declares the inherited insert with the concrete entity type; no behavior is added.
@Override
DataSetMiddleMongoDO insert(DataSetMiddleMongoDO entity);
}

View File

@ -0,0 +1,74 @@
package cn.iocoder.yudao.module.mdpf.factory.datset;
import cn.iocoder.yudao.module.mdpf.factory.datset.IFileParserStrategy;
import lombok.extern.slf4j.Slf4j;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import javax.annotation.PostConstruct;
/**
 * Factory that hands out the {@link IFileParserStrategy} responsible for a file extension.
 * <p>
 * All strategy beans are injected by Spring; after injection, {@link #init()} indexes them by
 * each extension they report via {@link IFileParserStrategy#getSupportedExtensions()}.
 * Extensions are normalised to lower case with {@link Locale#ROOT}, so lookups are
 * case-insensitive and independent of the JVM default locale.
 */
@Component
@Slf4j
public class FileParserStrategyFactory {

    /** Every {@link IFileParserStrategy} bean, injected by Spring. */
    private final List<IFileParserStrategy> allStrategies;

    /** Lower-cased extension -> strategy; filled once in {@link #init()}. */
    private final Map<String, IFileParserStrategy> strategyMap = new HashMap<>();

    @Autowired
    public FileParserStrategyFactory(List<IFileParserStrategy> allStrategies) {
        this.allStrategies = allStrategies;
    }

    /**
     * Registers every injected strategy under each extension it declares.
     * A strategy without extensions is skipped with a warning, a {@code null} extension entry is
     * ignored, and when two strategies claim the same extension the later one wins (a warning
     * is logged).
     */
    @PostConstruct // runs once dependency injection has completed
    public void init() {
        log.info("Initializing FileParserStrategyFactory, registering strategies...");
        for (IFileParserStrategy strategy : allStrategies) {
            String strategyName = strategy.getClass().getSimpleName();
            List<String> supportedExtensions = strategy.getSupportedExtensions();
            if (supportedExtensions == null || supportedExtensions.isEmpty()) {
                log.warn("Strategy {} does not declare any supported extensions. It will not be registered.",
                        strategyName);
                continue;
            }
            for (String extension : supportedExtensions) {
                if (extension == null) {
                    // A null entry would otherwise abort application start-up with an NPE.
                    log.warn("Strategy {} declares a null extension. It will be ignored.", strategyName);
                    continue;
                }
                String lowerCaseExtension = extension.toLowerCase(Locale.ROOT);
                IFileParserStrategy previous = strategyMap.get(lowerCaseExtension);
                if (previous != null) {
                    log.warn("Duplicate file parser strategy registered for extension '{}'. Overwriting with {}. " +
                                    "Previous strategy was {}.",
                            lowerCaseExtension,
                            strategyName,
                            previous.getClass().getSimpleName());
                }
                strategyMap.put(lowerCaseExtension, strategy);
                log.info("Registered strategy {} for extension '{}'.", strategyName, lowerCaseExtension);
            }
        }
        log.info("FileParserStrategyFactory initialization complete. Total {} strategies registered for {} extensions.",
                allStrategies.size(), strategyMap.size());
    }

    /**
     * Returns the strategy registered for a file extension (case-insensitive map lookup).
     *
     * @param fileExtension file extension, e.g. {@code "json"}
     * @return the strategy registered for that extension
     * @throws IllegalArgumentException if {@code fileExtension} is {@code null} or no strategy
     *                                  is registered for it
     */
    public IFileParserStrategy getStrategy(String fileExtension) {
        if (fileExtension == null) {
            throw new IllegalArgumentException("No file parser strategy found for extension: null");
        }
        IFileParserStrategy strategy = strategyMap.get(fileExtension.toLowerCase(Locale.ROOT));
        if (strategy == null) {
            throw new IllegalArgumentException("No file parser strategy found for extension: " + fileExtension);
        }
        return strategy;
    }
}

View File

@ -0,0 +1,21 @@
package cn.iocoder.yudao.module.mdpf.factory.datset;
import cn.iocoder.yudao.module.mdpf.controller.dataset.vo.DataSetMiddleSaveReqVO;
import cn.iocoder.yudao.module.mdpf.dal.dataobject.dataset.DataSetFileMiddleDO;
import cn.iocoder.yudao.module.mdpf.dal.dataobject.dataset.PlatformDatasetFilesDO;
import java.io.File;
import java.io.InputStream;
import java.time.LocalDateTime;
import java.util.List;
import java.util.Map;
/**
 * Strategy for handling one kind of dataset file. Implementations are looked up by file
 * extension.
 */
public interface IFileParserStrategy {

    /**
     * Tells whether this strategy handles the given extension.
     *
     * @param fileExtension file extension to test
     * @return {@code true} if this strategy handles it
     */
    boolean supports(String fileExtension);

    /**
     * Parses a file into a list of records, each represented as a map.
     *
     * @param file                   the file to parse
     * @param inputStream            stream over the file content
     * @param datasetMetaId          identifier of the dataset metadata the file belongs to
     * @param originalMinioPath      original storage path of the file
     * @param processTime            time of processing
     * @param additionalMetadata     extra metadata supplied by the caller
     * @param platformDatasetFilesDO dataset file record the content belongs to
     * @return the parsed records
     */
    List<Map<String, Object>> parseFileContentToString(File file,
                                                       InputStream inputStream,
                                                       String datasetMetaId,
                                                       String originalMinioPath,
                                                       LocalDateTime processTime,
                                                       Map<String, Object> additionalMetadata,
                                                       PlatformDatasetFilesDO platformDatasetFilesDO);

    /**
     * Lists the file extensions this strategy handles.
     *
     * @return the supported extensions
     */
    List<String> getSupportedExtensions();

    /**
     * Produces the stored file record for the cleaned records of one source file.
     *
     * @param cleanedFileRecordsForOneFile cleaned records that originate from one source file
     * @param createReqVO                  dataset save request the records belong to
     * @param fileid                       ID of the source file
     * @param filename                     file name to store under
     * @param extendFilename               extension of the source file
     * @param url                          URL of the source file
     * @return the resulting file record
     */
    DataSetFileMiddleDO createFileToMiIO(List<DataSetFileMiddleDO> cleanedFileRecordsForOneFile,
                                         DataSetMiddleSaveReqVO createReqVO,
                                         Long fileid,
                                         String filename,
                                         String extendFilename,
                                         String url);
}

View File

@ -0,0 +1,181 @@
package cn.iocoder.yudao.module.mdpf.factory.datset;
import cn.iocoder.yudao.framework.common.util.object.BeanUtils;
import cn.iocoder.yudao.module.infra.service.file.FileService;
import cn.iocoder.yudao.module.mdpf.controller.dataset.dto.PlatformDataJsonTemplate;
import cn.iocoder.yudao.module.mdpf.controller.dataset.vo.DataSetMiddleSaveReqVO;
import cn.iocoder.yudao.module.mdpf.dal.dataobject.dataset.DataSetFileMiddleDO;
import cn.iocoder.yudao.module.mdpf.dal.dataobject.dataset.PlatformDatasetAnswerDO;
import cn.iocoder.yudao.module.mdpf.dal.dataobject.dataset.PlatformDatasetFilesDO;
import cn.iocoder.yudao.module.mdpf.dal.dataobject.dataset.PlatformDatasetQuestionDO;
import cn.iocoder.yudao.module.mdpf.dal.mapper.dataset.PlatformDatasetAnswerMapper;
import cn.iocoder.yudao.module.mdpf.dal.mapper.dataset.PlatformDatasetQuestionMapper;
import cn.iocoder.yudao.module.mdpf.util.ParserUtils;
import com.baomidou.mybatisplus.core.toolkit.CollectionUtils;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.node.ArrayNode;
import com.fasterxml.jackson.databind.node.ObjectNode;
import lombok.extern.slf4j.Slf4j;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.time.LocalDateTime;
import java.util.*;
import java.util.stream.Collectors;
import com.fasterxml.jackson.core.type.TypeReference;
import org.springframework.util.StringUtils;
/**
 * {@link IFileParserStrategy} for JSON files.
 * <p>
 * The whole document is returned as one text segment. When the root node is a JSON array, its
 * elements are additionally persisted as dataset questions and answers.
 */
@Component
@Slf4j
public class JsonFileParser implements IFileParserStrategy {

    @Autowired
    private ObjectMapper objectMapper;
    @Autowired
    private ParserUtils parserUtils;
    @Autowired
    private FileService fileService;
    @Autowired
    private PlatformDatasetAnswerMapper platformDatasetAnswerMapper;
    @Autowired
    private PlatformDatasetQuestionMapper platformDatasetQuestionMapper;

    /**
     * Tells whether the extension is {@code json}. The comparison ignores case, matching the
     * case-insensitive lookup performed by the strategy factory.
     */
    @Override
    public boolean supports(String fileExtension) {
        return "json".equalsIgnoreCase(fileExtension);
    }

    /**
     * Reads the entire JSON document and returns it as a single segment.
     * <ul>
     *   <li>object root: its properties are merged into the segment metadata;</li>
     *   <li>array root: stored under {@code json_root_array_content} and each element is written
     *       to the question/answer tables;</li>
     *   <li>scalar root: stored under {@code json_root_value_content}.</li>
     * </ul>
     * Any failure is logged and reported as a single {@code json_parsing_error} segment instead
     * of being thrown. Rows inserted before the failure are not rolled back by this method.
     *
     * @param file                   unused by this implementation
     * @param inputStream            JSON content; when {@code null} an empty list is returned
     * @param datasetMetaId          dataset metadata identifier, used for the segment and logging
     * @param originalMinioPath      original storage path, used for the segment and logging
     * @param processTime            processing time recorded on the segment
     * @param additionalMetadata     optional extra metadata copied into the segment metadata
     * @param platformDatasetFilesDO file record supplying dataset and file IDs for inserted rows
     * @return a single-element list (content or error segment), or an empty list without a stream
     */
    @Override
    public List<Map<String, Object>> parseFileContentToString(File file, InputStream inputStream, String datasetMetaId,
                                                              String originalMinioPath, LocalDateTime processTime,
                                                              Map<String, Object> additionalMetadata,
                                                              PlatformDatasetFilesDO platformDatasetFilesDO) {
        if (inputStream == null) {
            log.error("Input stream is required for JSON file parsing for datasetMetaId: {}", datasetMetaId);
            return Collections.emptyList();
        }
        Map<String, Object> baseMetadata =
                additionalMetadata != null ? additionalMetadata : Collections.<String, Object>emptyMap();
        try {
            // Read the whole file into a tree, then serialise it back as the segment text.
            JsonNode rootNode = objectMapper.readTree(inputStream);
            String extractedText = objectMapper.writeValueAsString(rootNode);

            Map<String, Object> segmentMetadata = new HashMap<>(baseMetadata);
            if (rootNode.isObject()) {
                // Object root: expose its properties directly as metadata entries.
                segmentMetadata.putAll(objectMapper.convertValue(rootNode, new TypeReference<Map<String, Object>>() {}));
            } else if (rootNode.isArray()) {
                // Array root: keep it under a dedicated key and persist its elements.
                segmentMetadata.put("json_root_array_content",
                        objectMapper.convertValue(rootNode, new TypeReference<List<Object>>() {}));
                jsonParsing(rootNode, platformDatasetFilesDO);
            } else if (rootNode.isValueNode()) {
                // Scalar root (string, number, boolean, ...): keep its text form.
                segmentMetadata.put("json_root_value_content", rootNode.asText());
            } else {
                // Anything else is recorded as unsupported together with its raw form.
                segmentMetadata.put("json_root_content_type", "unsupported");
                segmentMetadata.put("json_root_content_raw", rootNode.toString());
            }
            log.info("Parsed entire JSON file as a single segment for datasetMetaId: {}.", datasetMetaId);
            Map<String, Object> singleSegment = parserUtils.createSegmentMap(
                    datasetMetaId, originalMinioPath, "json", extractedText,
                    segmentMetadata, processTime, "json_full_file_segment");
            return Collections.singletonList(singleSegment);
        } catch (Exception e) {
            log.error("Failed to parse JSON file content as single segment for datasetMetaId: {} (MinIO: {}): {}",
                    datasetMetaId, originalMinioPath, e.getMessage(), e);
            String errorMessage = "Failed to parse entire JSON file as single segment: " + e.getMessage();
            Map<String, Object> errorMetadata = new HashMap<>(baseMetadata);
            errorMetadata.put("error", errorMessage);
            return Collections.singletonList(parserUtils.createSegmentMap(
                    datasetMetaId, originalMinioPath, "json", errorMessage,
                    errorMetadata, processTime, "json_parsing_error"));
        }
    }

    @Override
    public List<String> getSupportedExtensions() {
        return Arrays.asList("json");
    }

    /**
     * Serialises the cleaned records of one source file to pretty-printed JSON, stores the bytes
     * through {@link FileService} and returns the first record updated with the stored file URL,
     * the source extension and the source URL.
     * <p>
     * The list is validated before anything is uploaded; previously an empty list caused a file
     * to be stored and the call to fail afterwards.
     *
     * @param cleanedFileRecordsForOneFile cleaned records of one source file; must not be empty
     * @param createReqVO                  dataset save request (not used by this implementation)
     * @param fileid                       ID of the source file (not used by this implementation)
     * @param filename                     name under which the JSON file is stored
     * @param extendFilename               extension of the source file
     * @param url                          URL of the source file
     * @return the first record of the list, updated in place
     * @throws IllegalArgumentException if the record list is {@code null} or empty
     * @throws IllegalStateException    if the records cannot be serialised to JSON
     */
    @Override
    public DataSetFileMiddleDO createFileToMiIO(List<DataSetFileMiddleDO> cleanedFileRecordsForOneFile,
                                                DataSetMiddleSaveReqVO createReqVO, Long fileid, String filename,
                                                String extendFilename, String url) {
        if (cleanedFileRecordsForOneFile == null || cleanedFileRecordsForOneFile.isEmpty()) {
            throw new IllegalArgumentException("cleanedFileRecordsForOneFile must contain at least one record");
        }
        try {
            String aggregatedJson = objectMapper.writerWithDefaultPrettyPrinter()
                    .writeValueAsString(cleanedFileRecordsForOneFile);
            byte[] jsonBytes = aggregatedJson.getBytes("UTF-8");
            String fileurl = fileService.createFile(filename, "", jsonBytes);

            DataSetFileMiddleDO dataSetFileMiddleDO = cleanedFileRecordsForOneFile.get(0);
            dataSetFileMiddleDO.setDataSetFileUrl(fileurl);
            dataSetFileMiddleDO.setSourceFileExtension(extendFilename);
            dataSetFileMiddleDO.setSourceFileUrl(url);
            return dataSetFileMiddleDO;
        } catch (JsonProcessingException | UnsupportedEncodingException e) {
            throw new IllegalStateException("Failed to serialise cleaned records for file " + filename, e);
        }
    }

    /**
     * Converts a JSON array into {@link PlatformDataJsonTemplate} entries and inserts one question
     * row per entry plus one answer row per answer of that entry.
     * <p>
     * The question status is set to 2 when the entry carries answers and to 0 otherwise
     * (presumably "finished" / "not finished" in dictionary llm_dataset_mark_status — confirm).
     *
     * @param jsonNode       array node holding the entries
     * @param datasetFilesDO file record supplying the dataset ID and file ID for the new rows
     * @throws IOException if the node cannot be mapped onto the template type
     */
    private void jsonParsing(JsonNode jsonNode, PlatformDatasetFilesDO datasetFilesDO) throws IOException {
        List<PlatformDataJsonTemplate> jsonList = objectMapper.readValue(
                objectMapper.treeAsTokens(jsonNode), new TypeReference<List<PlatformDataJsonTemplate>>() {});
        for (PlatformDataJsonTemplate dataJsonTemplate : jsonList) {
            List<String> answers = dataJsonTemplate.getAnswers();
            boolean hasAnswers = CollectionUtils.isNotEmpty(answers);

            PlatformDatasetQuestionDO datasetQuestionDO = BeanUtils.toBean(dataJsonTemplate, PlatformDatasetQuestionDO.class);
            datasetQuestionDO.setDatasetId(datasetFilesDO.getDatasetId());
            datasetQuestionDO.setDatasetFilesId(datasetFilesDO.getId());
            datasetQuestionDO.setStatus(hasAnswers ? 2 : 0);
            platformDatasetQuestionMapper.insert(datasetQuestionDO);

            if (!hasAnswers) {
                continue;
            }
            for (String answer : answers) {
                PlatformDatasetAnswerDO datasetAnswerDO = new PlatformDatasetAnswerDO();
                datasetAnswerDO.setDatasetId(datasetFilesDO.getDatasetId());
                datasetAnswerDO.setDatasetFilesId(datasetFilesDO.getId());
                // The question ID is read back from the entity after its insert.
                datasetAnswerDO.setQuestionId(datasetQuestionDO.getId());
                datasetAnswerDO.setAnswer(answer);
                platformDatasetAnswerMapper.insert(datasetAnswerDO);
            }
        }
    }
}

View File

@ -0,0 +1,37 @@
package cn.iocoder.yudao.module.mdpf.factory.datset;
import cn.iocoder.yudao.module.mdpf.controller.dataset.vo.DataSetMiddleSaveReqVO;
import cn.iocoder.yudao.module.mdpf.dal.dataobject.dataset.DataSetFileMiddleDO;
import cn.iocoder.yudao.module.mdpf.dal.dataobject.dataset.PlatformDatasetFilesDO;
import org.springframework.stereotype.Component;
import java.io.File;
import java.io.InputStream;
import java.time.LocalDateTime;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
/**
 * {@link IFileParserStrategy} registered for ZIP archives.
 * Parsing and upload are not implemented here: both operations return {@code null}.
 */
@Component
public class ZIPFileParser implements IFileParserStrategy {

    /** The extension this strategy answers for. */
    private static final String ZIP_EXTENSION = "zip";

    /** Returns {@code true} only for the exact string {@code "zip"}. */
    @Override
    public boolean supports(String fileExtension) {
        return ZIP_EXTENSION.equals(fileExtension);
    }

    /** Not implemented; always returns {@code null}. */
    @Override
    public List<Map<String, Object>> parseFileContentToString(File file,
                                                              InputStream inputStream,
                                                              String datasetMetaId,
                                                              String originalMinioPath,
                                                              LocalDateTime processTime,
                                                              Map<String, Object> additionalMetadata,
                                                              PlatformDatasetFilesDO platformDatasetFilesDO) {
        return null;
    }

    /** Returns a one-element list containing {@code "zip"}. */
    @Override
    public List<String> getSupportedExtensions() {
        return Arrays.asList(ZIP_EXTENSION);
    }

    /** Not implemented; always returns {@code null}. */
    @Override
    public DataSetFileMiddleDO createFileToMiIO(List<DataSetFileMiddleDO> cleanedFileRecordsForOneFile,
                                                DataSetMiddleSaveReqVO createReqVO,
                                                Long fileid,
                                                String filename,
                                                String extendFilename,
                                                String url) {
        return null;
    }
}

View File

@ -0,0 +1,12 @@
package cn.iocoder.yudao.module.mdpf.framework.config;
import org.springframework.context.annotation.Configuration;
import org.springframework.data.mongodb.repository.config.EnableMongoRepositories;
/**
 * Spring configuration that enables Spring Data MongoDB repository scanning for the package
 * {@code cn.iocoder.yudao.module.mdpf.dal.mongorepository}.
 */
@Configuration
@EnableMongoRepositories(basePackages = "cn.iocoder.yudao.module.mdpf.dal.mongorepository")
public class MdpfMongoRepositoryConfiguration {
    // Intentionally empty: the class exists only to carry the annotations above.
}

View File

@ -0,0 +1,65 @@
package cn.iocoder.yudao.module.mdpf.service.dataset;
import java.util.*;
import cn.iocoder.yudao.module.mdpf.controller.dataset.vo.DataSetFileMiddlePageReqVO;
import cn.iocoder.yudao.module.mdpf.controller.dataset.vo.DataSetFileMiddleSaveReqVO;
import cn.iocoder.yudao.module.mdpf.dal.dataobject.dataset.DataSetFileMiddleDO;
import cn.iocoder.yudao.framework.common.pojo.PageResult;
import cn.iocoder.yudao.framework.common.pojo.PageParam;
import com.baomidou.dynamic.datasource.annotation.DS;
import javax.validation.Valid;
/**
* Service interface for dataset file address records.
*
* @author admin
*/
public interface DataSetFileMiddleService {
/**
* Creates a dataset file address record.
*
* @param createReqVO creation data
* @return ID of the created record
*/
Long createSetFileMiddle(@Valid DataSetFileMiddleSaveReqVO createReqVO);
/**
* Creates a dataset file address record from a data object; annotated to use the
* data source named "slave".
*
* @param createReqVO data object to persist
* @return ID of the created record
*/
@DS("slave")
Long createSetFileMiddle(DataSetFileMiddleDO createReqVO);
/**
* Updates a dataset file address record.
*
* @param updateReqVO update data
*/
void updateSetFileMiddle(@Valid DataSetFileMiddleSaveReqVO updateReqVO);
/**
* Updates a dataset file address record; annotated to use the data source named "slave".
* NOTE(review): how this differs from updateSetFileMiddle is not visible in this interface —
* see the implementation.
*
* @param updateReqVO update data
*/
@DS("slave")
void updateDataSetFileMiddle(DataSetFileMiddleSaveReqVO updateReqVO);
/**
* Deletes a dataset file address record.
*
* @param id ID of the record
*/
void deleteSetFileMiddle(Long id);
/**
* Gets a dataset file address record.
*
* @param id ID of the record
* @return the dataset file address record
*/
DataSetFileMiddleDO getSetFileMiddle(Long id);
/**
* Gets a page of dataset file address records.
*
* @param pageReqVO paging query
* @return page of dataset file address records
*/
PageResult<DataSetFileMiddleDO> getSetFileMiddlePage(DataSetFileMiddlePageReqVO pageReqVO);
}

View File

@ -0,0 +1,79 @@
package cn.iocoder.yudao.module.mdpf.service.dataset;
import java.util.*;
import cn.iocoder.yudao.framework.common.pojo.PageResult;
import cn.iocoder.yudao.framework.common.pojo.PageParam;
import cn.iocoder.yudao.module.mdpf.controller.dataset.vo.DataSetFileMiddleSaveReqVO;
import cn.iocoder.yudao.module.mdpf.controller.dataset.vo.DataSetMiddlePageReqVO;
import cn.iocoder.yudao.module.mdpf.controller.dataset.vo.DataSetMiddleRespVO;
import cn.iocoder.yudao.module.mdpf.controller.dataset.vo.DataSetMiddleSaveReqVO;
import cn.iocoder.yudao.module.mdpf.dal.dataobject.dataset.DataSetMiddleDO;
import com.baomidou.dynamic.datasource.annotation.DS;
import javax.validation.Valid;
/**
 * Service interface for datasets of the middle platform.
 * <p>
 * Methods annotated with {@code @DS("slave")} are routed to the data source named
 * {@code slave}.
 *
 * @author admin
 */
public interface DataSetMiddleService {

    /**
     * Creates a dataset from a data object.
     * <p>
     * The data source name was previously misspelled as {@code "salve"}; it now matches the
     * {@code "slave"} name used by every other method of this interface.
     *
     * @param createReqVO data object to persist
     * @return ID of the created dataset
     */
    @DS("slave")
    Long createDataSetMiddle(DataSetMiddleDO createReqVO);

    /**
     * Creates a dataset of the middle platform.
     *
     * @param createReqVO creation data
     * @return ID of the created dataset
     */
    Long createSetMiddle(@Valid DataSetMiddleSaveReqVO createReqVO);

    /**
     * Updates a dataset on the {@code slave} data source.
     * NOTE(review): how this differs from {@link #updateSetMiddle} is not visible in this
     * interface — see the implementation.
     *
     * @param updateReqVO update data
     */
    @DS("slave")
    void updataDataSetMiddle(DataSetMiddleSaveReqVO updateReqVO);

    /**
     * Updates a dataset of the middle platform.
     *
     * @param updateReqVO update data
     */
    void updateSetMiddle(@Valid DataSetMiddleSaveReqVO updateReqVO);

    /**
     * Deletes a dataset of the middle platform.
     *
     * @param id ID of the dataset
     */
    void deleteSetMiddle(Long id);

    /**
     * Gets a dataset of the middle platform.
     *
     * @param id ID of the dataset
     * @return the dataset
     */
    DataSetMiddleDO getSetMiddle(Long id);

    /**
     * Gets the response view of one dataset from the {@code slave} data source.
     *
     * @param id ID of the dataset
     * @return the dataset as a response VO
     */
    @DS("slave")
    DataSetMiddleRespVO getOneInfo(Long id);

    /**
     * Gets a page of datasets of the middle platform.
     *
     * @param pageReqVO paging query
     * @return page of datasets
     */
    PageResult<DataSetMiddleDO> getSetMiddlePage(DataSetMiddlePageReqVO pageReqVO);

    /**
     * Gets one dataset from the {@code slave} data source.
     *
     * @param datasetid ID of the dataset
     * @return the dataset
     */
    @DS("slave")
    DataSetMiddleDO getOne(Long datasetid);

    /**
     * Lists datasets from the {@code slave} data source for a parent type.
     *
     * @param datasetParentType dataset parent type used as the selection argument
     * @return the datasets
     */
    @DS("slave")
    List<DataSetMiddleDO> getDataSetMiddleList(Integer datasetParentType);

    /**
     * Updates the annotation progress and status of a dataset on the {@code slave} data source.
     *
     * @param formattedRatio new annotation progress value
     * @param datasetId      ID of the dataset
     * @param status         new status value
     */
    @DS("slave")
    void updateProcess(Integer formattedRatio, Long datasetId, Integer status);
    // String getDataSetUrl(Long datasetId,String hostUrl);
}

View File

@ -0,0 +1,55 @@
package cn.iocoder.yudao.module.mdpf.service.dataset;
import cn.iocoder.yudao.framework.common.pojo.PageResult;
import cn.iocoder.yudao.module.mdpf.controller.dataset.vo.PlatformDatasetAnswerPageReqVO;
import cn.iocoder.yudao.module.mdpf.controller.dataset.vo.PlatformDatasetAnswerSaveReqVO;
import cn.iocoder.yudao.module.mdpf.dal.dataobject.dataset.PlatformDatasetAnswerDO;
import javax.validation.Valid;
/**
* Service interface for the annotation content (answers) of dataset questions.
*
* @author Huada LLM
*/
public interface PlatformDatasetAnswerService {
/**
* Creates an annotation content record for a dataset question.
*
* @param createReqVO creation data
* @return ID of the created record
*/
Long createDatasetAnswer(@Valid PlatformDatasetAnswerSaveReqVO createReqVO);
/**
* Updates an annotation content record of a dataset question.
*
* @param updateReqVO update data
*/
void updateDatasetAnswer(@Valid PlatformDatasetAnswerSaveReqVO updateReqVO);
/**
* Deletes an annotation content record of a dataset question.
*
* @param id ID of the record
*/
void deleteDatasetAnswer(Long id);
/**
* Gets an annotation content record of a dataset question.
*
* @param id ID of the record
* @return the annotation content record
*/
PlatformDatasetAnswerDO getDatasetAnswer(Long id);
/**
* Gets a page of annotation content records of dataset questions.
*
* @param pageReqVO paging query
* @return page of annotation content records
*/
PageResult<PlatformDatasetAnswerDO> getDatasetAnswerPage(PlatformDatasetAnswerPageReqVO pageReqVO);
}

View File

@ -0,0 +1,59 @@
package cn.iocoder.yudao.module.mdpf.service.dataset;
import cn.iocoder.yudao.framework.common.pojo.PageResult;
import cn.iocoder.yudao.module.mdpf.controller.dataset.vo.PlatformDatasetFilesPageReqVO;
import cn.iocoder.yudao.module.mdpf.controller.dataset.vo.PlatformDatasetFilesSaveReqVO;
import cn.iocoder.yudao.module.mdpf.dal.dataobject.dataset.PlatformDatasetFilesDO;
import com.baomidou.mybatisplus.core.conditions.query.LambdaQueryWrapper;
import javax.validation.Valid;
import java.util.List;
/**
 * Service interface for the data files of a dataset.
 *
 * @author 华大大模型 (Huada Large Model)
 */
public interface PlatformDatasetFilesService {
/**
 * Creates a dataset file record from a request object.
 *
 * @param createReqVO data of the record to create
 * @return ID of the record
 */
Long createDatasetFiles(@Valid PlatformDatasetFilesSaveReqVO createReqVO);
/**
 * Creates a dataset file record from an already populated data object.
 *
 * @param platformDatasetFilesDO data object to store
 * @return ID of the record
 */
Long createDatasetFiles( PlatformDatasetFilesDO platformDatasetFilesDO);
/**
 * Updates a dataset file record.
 *
 * @param updateReqVO data of the record to update
 */
void updateDatasetFiles(@Valid PlatformDatasetFilesSaveReqVO updateReqVO);
/**
 * Deletes a dataset file record.
 *
 * @param id ID of the record
 */
void deleteDatasetFiles(Long id);
/**
 * Gets a dataset file record.
 *
 * @param id ID of the record
 * @return the dataset file record
 */
PlatformDatasetFilesDO getDatasetFiles(Long id);
/**
 * Gets a page of dataset file records.
 *
 * @param pageReqVO paging query
 * @return page of dataset file records
 */
PageResult<PlatformDatasetFilesDO> getDatasetFilesPage(PlatformDatasetFilesPageReqVO pageReqVO);
/**
 * Lists the dataset file records selected by the given query wrapper.
 *
 * <p>NOTE(review): the wrapper is declared as a raw type; presumably callers are expected to
 * pass a {@code LambdaQueryWrapper<PlatformDatasetFilesDO>} - confirm before adding generics.
 *
 * @param query query conditions
 * @return matching dataset file records
 */
List<PlatformDatasetFilesDO> selectList(LambdaQueryWrapper query);
}

View File

@ -0,0 +1,71 @@
package cn.iocoder.yudao.module.mdpf.service.dataset;
import cn.iocoder.yudao.framework.common.pojo.PageResult;
import cn.iocoder.yudao.module.mdpf.controller.dataset.vo.PlatformDatasetQuestionPageReqVO;
import cn.iocoder.yudao.module.mdpf.controller.dataset.vo.PlatformDatasetQuestionRespVO;
import cn.iocoder.yudao.module.mdpf.controller.dataset.vo.PlatformDatasetQuestionSaveReqVO;
import cn.iocoder.yudao.module.mdpf.dal.dataobject.dataset.PlatformDatasetQuestionDO;
import com.baomidou.mybatisplus.core.conditions.query.LambdaQueryWrapper;
import javax.validation.Valid;
import java.util.List;
/**
 * Service interface for the questions (data items) of a dataset.
 *
 * @author 华大大模型 (Huada Large Model)
 */
public interface PlatformDatasetQuestionService {
/**
 * Creates a dataset question.
 *
 * @param createReqVO data of the question to create
 * @return ID of the question
 */
Long createDatasetQuestion (@Valid PlatformDatasetQuestionSaveReqVO createReqVO);
/**
 * Updates a dataset question.
 *
 * @param updateReqVO data of the question to update
 */
void updateDatasetQuestion (@Valid PlatformDatasetQuestionSaveReqVO updateReqVO);
/**
 * Deletes a dataset question.
 *
 * @param id ID of the question
 */
void deleteDatasetQuestion (Long id);
/**
 * Gets a dataset question.
 *
 * @param id ID of the question
 * @return the dataset question
 */
PlatformDatasetQuestionDO getDatasetQuestion (Long id);
/**
 * Gets a page of dataset questions.
 *
 * @param pageReqVO paging query
 * @return page of dataset questions
 */
PageResult<PlatformDatasetQuestionRespVO> getDatasetQuestionPage (PlatformDatasetQuestionPageReqVO pageReqVO);
/**
 * Processes a batch of question save requests.
 *
 * <p>NOTE(review): judging by the name this presumably stores annotation results for the given
 * questions - confirm against the implementation.
 *
 * @param updateReqVOS question save requests to process
 */
void updateDatasetQuestionDataAnno (List<PlatformDatasetQuestionSaveReqVO> updateReqVOS);
/**
 * Gets the list of questions of a dataset.
 *
 * @param datasetId ID of the dataset
 * @return questions of the dataset
 */
List<PlatformDatasetQuestionRespVO> getDatasetQuestionList (Long datasetId);
/**
 * Returns a count for the given dataset ID.
 *
 * <p>NOTE(review): presumably the number of questions in the dataset - confirm against the
 * implementation.
 *
 * @param datasetid ID of the dataset
 * @return the count
 */
Long getCountByDataSetId(Long datasetid);
/**
 * Returns a count for the given query wrapper.
 *
 * <p>NOTE(review): presumably the number of questions matching the wrapper; the wrapper is
 * declared as a raw type - confirm the expected entity type.
 *
 * @param query query conditions
 * @return the count
 */
Long getCountByDatasetid(LambdaQueryWrapper query);
}

View File

@ -0,0 +1,96 @@
package cn.iocoder.yudao.module.mdpf.service.dataset.impl;
import cn.iocoder.module.mdpf.enums.ErrorCodeConstants;
import cn.iocoder.yudao.module.mdpf.controller.dataset.vo.DataSetFileMiddlePageReqVO;
import cn.iocoder.yudao.module.mdpf.controller.dataset.vo.DataSetFileMiddleSaveReqVO;
import cn.iocoder.yudao.module.mdpf.dal.dataobject.dataset.DataSetFileMiddleDO;
import cn.iocoder.yudao.module.mdpf.dal.mapper.dataset.DataSetFileMiddleMapper;
import cn.iocoder.yudao.module.mdpf.service.dataset.DataSetFileMiddleService;
import com.baomidou.dynamic.datasource.annotation.DS;
import org.springframework.stereotype.Service;
import org.springframework.validation.annotation.Validated;
import cn.iocoder.yudao.framework.common.pojo.PageResult;
import cn.iocoder.yudao.framework.common.util.object.BeanUtils;
import javax.annotation.Resource;
import static cn.iocoder.yudao.framework.common.exception.util.ServiceExceptionUtil.exception;
/**
 * Service implementation for the file-address records that belong to a dataset.
 *
 * @author 管理员 (administrator)
 */
@Service
@Validated
public class DataSetFileMiddleServiceImpl implements DataSetFileMiddleService {

    @Resource
    private DataSetFileMiddleMapper setFileMiddleMapper;

    /**
     * Converts the request into a {@link DataSetFileMiddleDO} and returns the ID held by the
     * converted object.
     *
     * @param createReqVO creation data
     * @return ID read from the converted data object
     */
    @Override
    @DS("slave")
    public Long createSetFileMiddle(DataSetFileMiddleSaveReqVO createReqVO) {
        // NOTE(review): the mapper insert is disabled for this overload, so nothing is persisted
        // here and the returned ID is presumably whatever the request carried - confirm intent.
        final DataSetFileMiddleDO converted = BeanUtils.toBean(createReqVO, DataSetFileMiddleDO.class);
        return converted.getId();
    }

    /**
     * Inserts the given data object through the mapper.
     *
     * @param createReqVO data object to insert
     * @return ID read from the data object after the insert
     */
    @Override
    @DS("slave")
    public Long createSetFileMiddle(DataSetFileMiddleDO createReqVO) {
        final DataSetFileMiddleDO row = createReqVO;
        setFileMiddleMapper.insert(row);
        return row.getId();
    }

    /**
     * Updates a record by ID after checking that it exists.
     *
     * @param updateReqVO update data, including the ID of the record
     */
    @Override
    public void updateSetFileMiddle(DataSetFileMiddleSaveReqVO updateReqVO) {
        validateSetFileMiddleExists(updateReqVO.getId());
        setFileMiddleMapper.updateById(BeanUtils.toBean(updateReqVO, DataSetFileMiddleDO.class));
    }

    /**
     * Updates a record by ID without a prior existence check.
     *
     * @param updateReqVO update data, including the ID of the record
     */
    @Override
    @DS("slave")
    public void updateDataSetFileMiddle(DataSetFileMiddleSaveReqVO updateReqVO) {
        setFileMiddleMapper.updateById(BeanUtils.toBean(updateReqVO, DataSetFileMiddleDO.class));
    }

    /**
     * Deletes a record by ID after checking that it exists.
     *
     * @param id ID of the record
     */
    @Override
    public void deleteSetFileMiddle(Long id) {
        validateSetFileMiddleExists(id);
        setFileMiddleMapper.deleteById(id);
    }

    /**
     * Throws the {@code SET_FILE_MIDDLE_NOT_EXISTS} service exception when no record has the given ID.
     *
     * @param id ID of the record to look up
     */
    private void validateSetFileMiddleExists(Long id) {
        final DataSetFileMiddleDO existing = setFileMiddleMapper.selectById(id);
        if (existing != null) {
            return;
        }
        throw exception(ErrorCodeConstants.SET_FILE_MIDDLE_NOT_EXISTS);
    }

    /**
     * Loads a record by ID.
     *
     * @param id ID of the record
     * @return the mapper's result for that ID
     */
    @Override
    public DataSetFileMiddleDO getSetFileMiddle(Long id) {
        final DataSetFileMiddleDO found = setFileMiddleMapper.selectById(id);
        return found;
    }

    /**
     * Loads a page of records.
     *
     * @param pageReqVO paging query
     * @return page returned by the mapper
     */
    @Override
    public PageResult<DataSetFileMiddleDO> getSetFileMiddlePage(DataSetFileMiddlePageReqVO pageReqVO) {
        final PageResult<DataSetFileMiddleDO> page = setFileMiddleMapper.selectPage(pageReqVO);
        return page;
    }
}

View File

@ -0,0 +1,440 @@
package cn.iocoder.yudao.module.mdpf.service.dataset.impl;
import cn.hutool.core.util.IdUtil;
import cn.iocoder.module.mdpf.enums.DatasetStatusMiddleEnum;
import cn.iocoder.module.mdpf.enums.ErrorCodeConstants;
import cn.iocoder.yudao.framework.common.exception.ErrorCode;
import cn.iocoder.yudao.framework.common.exception.ServiceException;
import cn.iocoder.yudao.module.infra.service.file.FileService;
import cn.iocoder.yudao.module.mdpf.controller.dataset.vo.*;
import cn.iocoder.yudao.module.mdpf.dal.dataobject.dataset.DataSetFileMiddleDO;
import cn.iocoder.yudao.module.mdpf.dal.dataobject.dataset.DataSetMiddleDO;
import cn.iocoder.yudao.module.mdpf.dal.dataobject.dataset.PlatformDatasetFilesDO;
import cn.iocoder.yudao.module.mdpf.dal.dataobject.dataset.PlatformDatasetQuestionDO;
import cn.iocoder.yudao.module.mdpf.dal.mapper.dataset.DataSetFileMiddleMapper;
import cn.iocoder.yudao.module.mdpf.dal.mapper.dataset.DataSetMiddleMapper;
import cn.iocoder.yudao.module.mdpf.dal.mongorepository.DataSetMiddleRepository;
import cn.iocoder.yudao.module.mdpf.dal.mongo.DataSetMiddleMongoDO;
import cn.iocoder.yudao.module.mdpf.factory.datset.FileParserStrategyFactory;
import cn.iocoder.yudao.module.mdpf.factory.datset.IFileParserStrategy;
import cn.iocoder.yudao.module.mdpf.service.dataset.DataSetFileMiddleService;
import cn.iocoder.yudao.module.mdpf.service.dataset.DataSetMiddleService;
import cn.iocoder.yudao.module.mdpf.service.dataset.PlatformDatasetFilesService;
import cn.iocoder.yudao.module.mdpf.service.dataset.PlatformDatasetQuestionService;
import cn.iocoder.yudao.module.mdpf.util.HttpURLConnectionUtil;
import cn.iocoder.yudao.module.mdpf.util.TextProcessor;
import com.baomidou.dynamic.datasource.annotation.DS;
import com.baomidou.mybatisplus.core.conditions.query.LambdaQueryWrapper;
import com.baomidou.mybatisplus.core.toolkit.CollectionUtils;
import com.baomidou.mybatisplus.core.toolkit.Wrappers;
import com.fasterxml.jackson.databind.ObjectMapper;
import lombok.extern.slf4j.Slf4j;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.context.annotation.Lazy;
import org.springframework.scheduling.annotation.Async;
import org.springframework.stereotype.Service;
import org.springframework.validation.annotation.Validated;
import cn.iocoder.yudao.framework.common.pojo.PageResult;
import cn.iocoder.yudao.framework.common.util.object.BeanUtils;
import javax.annotation.Resource;
import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.time.LocalDateTime;
import java.util.*;
import static cn.iocoder.yudao.framework.common.exception.util.ServiceExceptionUtil.exception;
/**
* 中台中的数据集 Service 实现类
*
* @author 管理员
*/
@Service
@Validated
@Slf4j
public class DataSetMiddleServiceImpl implements DataSetMiddleService {
@Resource
private DataSetMiddleMapper setMiddleMapper;
@Autowired
private DataSetMiddleRepository dataSetMiddleRepository;
@Autowired
private FileParserStrategyFactory fileParserStrategyFactory;
@Autowired
private ObjectMapper objectMapper;
@Autowired
private TextProcessor textProcessor;
@Autowired
private DataSetFileMiddleMapper dataSetFileMiddleMapper;
@Autowired
private FileService fileService;
@Autowired
private DataSetFileMiddleService dataSetFileMiddleService;
@Autowired
@Lazy
private DataSetMiddleService dataSetMiddleService;
@Autowired
private PlatformDatasetFilesService platformDatasetFilesService;
@Autowired
private PlatformDatasetQuestionService platformDatasetQuestionService;
/**
 * Inserts the given dataset row through the mapper.
 *
 * @param createReqVO dataset data object to insert
 * @return ID read from the data object after the insert
 */
@Override
@DS("slave")
public Long createDataSetMiddle(DataSetMiddleDO createReqVO) {
    final DataSetMiddleDO row = createReqVO;
    setMiddleMapper.insert(row);
    return row.getId();
}
/**
 * Creates a dataset: stores the submitted form in MongoDB, inserts the dataset row, then runs
 * the file cleaning step.
 *
 * <p>NOTE(review): {@code dataProcessClean} is annotated {@code @Async} but is invoked on
 * {@code this}; with Spring's proxy-based async support such a self-invocation presumably runs
 * synchronously - confirm whether that is intended.
 *
 * @param createReqVO creation data; its ID is set to the ID of the new dataset row
 * @return ID of the new dataset row
 */
@Override
public Long createSetMiddle(DataSetMiddleSaveReqVO createReqVO) {
    DataSetMiddleDO row = BeanUtils.toBean(createReqVO, DataSetMiddleDO.class);
    DataSetMiddleMongoDO mongoDoc = BeanUtils.toBean(createReqVO, DataSetMiddleMongoDO.class);

    // Store the raw submission in MongoDB under a freshly generated snowflake ID.
    mongoDoc.setId(IdUtil.getSnowflake(1, 1).nextId());
    dataSetMiddleRepository.insert(mongoDoc);

    // Link the dataset row to the MongoDB document; cleaning and marking both start at 0.
    row.setMongoId(mongoDoc.getId());
    row.setCleanStatus(0);
    row.setMarkStatus(0);

    // Insert through the injected service reference rather than the mapper directly.
    dataSetMiddleService.createDataSetMiddle(row);

    createReqVO.setId(row.getId());
    dataProcessClean(createReqVO);
    return row.getId();
}
/**
 * Registers, parses and cleans every uploaded file of a dataset, then validates the annotation
 * state and marks the dataset as cleaned.
 *
 * <p>Steps:
 * <ol>
 *   <li>reject an empty file list before anything is processed;</li>
 *   <li>process each file (see {@code cleanSingleDatasetFile});</li>
 *   <li>count the dataset's questions and the ones with status 2, derive the annotation
 *       progress (integer percentage) and the resulting status (0, 1 or 2);</li>
 *   <li>check that status against the dataset type / declared mark status;</li>
 *   <li>set {@code cleanStatus} to 1 and update the dataset.</li>
 * </ol>
 *
 * <p>NOTE(review): {@code createSetMiddle} calls this method on {@code this}; with proxy-based
 * {@code @Async} that call presumably runs synchronously - confirm.
 *
 * @param createReqVO dataset request; must carry the dataset ID and the uploaded file list
 * @throws ServiceException if the file list is empty, the dataset has no questions, or the
 *                          annotation status does not match what the dataset declares
 * @throws RuntimeException if a file cannot be read (wraps the {@link IOException})
 */
@Async
public void dataProcessClean(DataSetMiddleSaveReqVO createReqVO) {
    List<Map<String, String>> filesList = createReqVO.getFilesList();
    // Validate up front: previously this check ran after the list had already been iterated,
    // so a null list failed with a NullPointerException instead of this error.
    if (CollectionUtils.isEmpty(filesList)) {
        throw new ServiceException(new ErrorCode(
                20000, "数据集文件不能为空"));
    }
    for (Map<String, String> fileMap : filesList) {
        cleanSingleDatasetFile(createReqVO, fileMap);
    }

    Long count = platformDatasetQuestionService.getCountByDatasetid(new LambdaQueryWrapper<PlatformDatasetQuestionDO>()
            .eq(PlatformDatasetQuestionDO::getDatasetId, createReqVO.getId()));
    if (count <= 0) {
        throw new ServiceException(new ErrorCode(20000, "数据集问题不能为空"));
    }
    createReqVO.setDataLength(count);

    Long annoCount = platformDatasetQuestionService.getCountByDatasetid(new LambdaQueryWrapper<PlatformDatasetQuestionDO>()
            .eq(PlatformDatasetQuestionDO::getDatasetId, createReqVO.getId())
            .eq(PlatformDatasetQuestionDO::getStatus, 2));

    // Integer arithmetic keeps the percentage exact; the former floating-point computation
    // could truncate one too low (e.g. 29 of 100 yielded 28).
    Integer formattedRatio = (int) (annoCount * 100 / count);
    createReqVO.setAnnotateProgress(formattedRatio);

    // 0 = nothing annotated, 2 = fully annotated, 1 = anything in between.
    Integer status;
    if (annoCount == 0) {
        status = 0;
    } else if (formattedRatio == 100) {
        status = 2;
    } else {
        status = 1;
    }

    if (createReqVO.getDatasetType() == 2) {
        if (status != 2) {
            throw new ServiceException(new ErrorCode(
                    20000, "评估数据集只能上传标注完成的数据"));
        }
    } else {
        // Value comparison; '!=' on two boxed Integers would compare references.
        if (!Objects.equals(createReqVO.getMarkStatus(), status)) {
            throw new ServiceException(new ErrorCode(
                    20000, "数据集标注状态错误!应该是【" + DatasetStatusMiddleEnum.getStatusByName(status) + "】"));
        }
    }

    // Every file has been processed: flag the dataset as cleaned.
    createReqVO.setCleanStatus(1);
    dataSetMiddleService.updataDataSetMiddle(createReqVO);
}

/**
 * Registers one uploaded file for the dataset, then parses and cleans its content and stores
 * the resulting cleaned-file record.
 *
 * @param createReqVO dataset request that owns the file
 * @param fileMap     file descriptor with the keys {@code filename}, {@code id} and {@code url}
 * @throws RuntimeException if reading the file content fails (wraps the {@link IOException})
 */
private void cleanSingleDatasetFile(DataSetMiddleSaveReqVO createReqVO, Map<String, String> fileMap) {
    String filename = fileMap.get("filename");
    Long fileid = Long.parseLong(fileMap.get("id"));
    // The extension is everything after the last '.' of the file name.
    String extendFilename = filename.substring(filename.lastIndexOf(".") + 1);
    String url = fileMap.get("url");

    HttpURLConnection connection = HttpURLConnectionUtil.readFile(url);
    try {
        // Record the file in the dataset-files table whether or not it can be downloaded.
        PlatformDatasetFilesDO platformDatasetFilesDO = new PlatformDatasetFilesDO();
        platformDatasetFilesDO.setDatasetFile(fileid);
        platformDatasetFilesDO.setDatasetFileUrl(url);
        platformDatasetFilesDO.setDatasetId(createReqVO.getId());
        platformDatasetFilesDO.setCreateTime(LocalDateTime.now());
        platformDatasetFilesDO.setDatasetFileName(filename);
        platformDatasetFilesService.createDatasetFiles(platformDatasetFilesDO);

        if (connection == null) {
            return;
        }
        // try-with-resources closes the download stream, which was previously leaked.
        try (InputStream inputStream = connection.getInputStream()) {
            IFileParserStrategy parserStrategy = fileParserStrategyFactory.getStrategy(extendFilename);
            List<Map<String, Object>> extractedSegments = parserStrategy.parseFileContentToString(
                    null, inputStream, "", "", null, new HashMap<>(), platformDatasetFilesDO);
            if (extractedSegments == null || extractedSegments.isEmpty()) {
                log.warn("Parser returned no segments for file '{}' (URL: {}) in DataSetMiddleMongoDO ID {}. Skipping cleaning for this file.",
                        filename, url, fileid);
                return;
            }
            log.info("Successfully received {} segments from parser for file '{}'. Proceeding to clean.", extractedSegments.size(), filename);

            // Clean and quality-check the raw segments parsed from this file.
            List<DataSetFileMiddleDO> cleanedRecords = textProcessor.cleanAndEvaluateList(
                    extractedSegments,
                    createReqVO.getId(),
                    fileid
            );
            if (!cleanedRecords.isEmpty()) {
                log.info("Successfully cleaned {} high-quality records from file '{}'.", cleanedRecords.size(), filename);
            } else {
                // Logs the dataset ID; the file ID was passed here by mistake before.
                log.warn("No high-quality cleaned records found for file '{}' after cleaning. (Dataset ID: {})", filename, createReqVO.getId());
            }
            DataSetFileMiddleDO dataSetFileMiddleDO = parserStrategy.createFileToMiIO(
                    cleanedRecords, createReqVO, fileid, filename, extendFilename, url);
            dataSetFileMiddleService.createSetFileMiddle(dataSetFileMiddleDO);
        } catch (IOException e) {
            throw new RuntimeException("Failed to read dataset file '" + filename + "' from " + url, e);
        }
    } finally {
        if (connection != null) {
            connection.disconnect();
        }
    }
}
/**
 * Ad-hoc check of the extension extraction: prints the text after the last '.' of a sample
 * file name.
 *
 * @param args unused
 */
public static void main(String[] args) {
    final String sample = "adsfdsa.234.erterter.pdf";
    final int lastDot = sample.lastIndexOf('.');
    System.out.println(sample.substring(lastDot + 1));
}
/**
 * Placeholder with an empty body; calling it currently does nothing.
 */
public void insertSetMiddle(){
}
/**
 * Converts the request to a {@link DataSetMiddleDO} and updates the row by ID, without a prior
 * existence check.
 *
 * @param updateReqVO update data, including the ID of the dataset
 */
@Override
@DS("slave")
public void updataDataSetMiddle(DataSetMiddleSaveReqVO updateReqVO) {
    setMiddleMapper.updateById(BeanUtils.toBean(updateReqVO, DataSetMiddleDO.class));
}
/**
 * Updates a dataset row by ID after checking that it exists.
 *
 * @param updateReqVO update data, including the ID of the dataset
 */
@Override
@DS("slave")
public void updateSetMiddle(DataSetMiddleSaveReqVO updateReqVO) {
    // Fail first when the target row is missing.
    validateSetMiddleExists(updateReqVO.getId());
    setMiddleMapper.updateById(BeanUtils.toBean(updateReqVO, DataSetMiddleDO.class));
}
/**
 * Deletes a dataset row by ID after checking that it exists.
 *
 * @param id ID of the dataset
 */
@Override
public void deleteSetMiddle(Long id) {
    // Fail first when the target row is missing.
    validateSetMiddleExists(id);
    setMiddleMapper.deleteById(id);
}
/**
 * Throws a service exception when no dataset row has the given ID.
 *
 * <p>NOTE(review): the error code used is {@code SET_FILE_MIDDLE_NOT_EXISTS}, which by its name
 * looks like it belongs to the dataset-file records - confirm it is the intended code here.
 *
 * @param id ID of the dataset to look up
 */
private void validateSetMiddleExists(Long id) {
    final DataSetMiddleDO existing = setMiddleMapper.selectById(id);
    if (existing != null) {
        return;
    }
    throw exception(ErrorCodeConstants.SET_FILE_MIDDLE_NOT_EXISTS);
}
/**
 * Loads a dataset row by ID.
 *
 * @param id ID of the dataset
 * @return the mapper's result for that ID
 */
@Override
@DS("slave")
public DataSetMiddleDO getSetMiddle(Long id) {
    final DataSetMiddleDO found = setMiddleMapper.selectById(id);
    return found;
}
/**
 * Loads a dataset together with the file records attached to it.
 *
 * @param id ID of the dataset
 * @return dataset details including its files
 * @throws ServiceException if no dataset has the given ID (previously this surfaced as a
 *                          {@link NullPointerException})
 */
@Override
@DS("slave")
public DataSetMiddleRespVO getOneInfo(Long id) {
    DataSetMiddleDO datasetDO = dataSetMiddleService.getOne(id);
    if (datasetDO == null) {
        // Same error code that validateSetMiddleExists uses for a missing dataset.
        throw exception(ErrorCodeConstants.SET_FILE_MIDDLE_NOT_EXISTS);
    }
    DataSetMiddleRespVO datasetRespVO = BeanUtils.toBean(datasetDO, DataSetMiddleRespVO.class);

    // Attach the files registered for this dataset.
    List<PlatformDatasetFilesDO> datasetFilesDOS = platformDatasetFilesService.selectList(
            new LambdaQueryWrapper<PlatformDatasetFilesDO>().eq(PlatformDatasetFilesDO::getDatasetId, id));
    datasetRespVO.setDatasetFiles(BeanUtils.toBean(datasetFilesDOS, PlatformDatasetFilesRespVO.class));
    return datasetRespVO;
}
/**
 * Loads a page of dataset rows.
 *
 * @param pageReqVO paging query
 * @return page returned by the mapper
 */
@Override
@DS("slave")
public PageResult<DataSetMiddleDO> getSetMiddlePage(DataSetMiddlePageReqVO pageReqVO) {
    final PageResult<DataSetMiddleDO> page = setMiddleMapper.selectPage(pageReqVO);
    return page;
}
/**
 * Loads a dataset row by ID.
 *
 * @param datasetid ID of the dataset
 * @return the mapper's result for that ID
 */
@Override
@DS("slave")
public DataSetMiddleDO getOne(Long datasetid) {
    final DataSetMiddleDO dataset = setMiddleMapper.selectById(datasetid);
    return dataset;
}
/**
 * Lists the dataset rows whose mark status is 2 and whose deleted flag is 0.
 *
 * <p>NOTE(review): the filter on the parent type is disabled, so {@code datasetParentType} is
 * currently ignored - confirm whether that is intended.
 *
 * @param datasetParentType parent type of the datasets; not used by the current query
 * @return matching dataset rows
 */
@Override
@DS("slave")
public List<DataSetMiddleDO> getDataSetMiddleList(Integer datasetParentType) {
    final LambdaQueryWrapper<DataSetMiddleDO> criteria = new LambdaQueryWrapper<DataSetMiddleDO>()
            .eq(DataSetMiddleDO::getMarkStatus, 2)
            .eq(DataSetMiddleDO::getDeleted, 0);
    return setMiddleMapper.selectList(criteria);
}
/**
 * Forwards the given progress, dataset ID and status to the mapper's {@code updateProcess}.
 *
 * @param formattedRatio progress value to pass on
 * @param datasetId      ID of the dataset
 * @param status         status value to pass on
 */
@Override
@DS("slave")
public void updateProcess(Integer formattedRatio, Long datasetId, Integer status) {
    setMiddleMapper.updateProcess(formattedRatio, datasetId, status);
}
// @Override
// public String getDataSetUrl(Long datasetId,String hostUrl){
// DataSetMiddleDO dataset = dataSetMiddleService.getOne(datasetId);
// if (dataset == null) {
// log.error("未找到数据集信息数据集ID: {}", datasetId);
// throw new RuntimeException("数据集信息不存在");
// }
// log.debug("数据集信息查询成功。数据集名称: {}", dataset.getDatasetName());
//
// // 查询数据集问题列表
// log.debug("正在查询数据集问题列表数据集ID: {}", dataset.getId());
// List<PlatformDatasetQuestionRespVO> datasetQuestionList = platformDatasetQuestionService.getDatasetQuestionList(dataset.getId());
// log.debug("数据集问题列表查询成功。问题数量: {}", datasetQuestionList.size());
//
// // 将数据集信息转换为 DO 对象
// log.debug("正在转换数据集信息为 DO 对象...");
//
// // 生成 JSON 文件并获取文件 URL
// log.debug("正在生成 JSON 文件并获取文件 URL...");
// String fileUrl = JsonFileWriteFine(hostUrl, dataset, datasetQuestionList);
// return fileUrl;
// }
// public String JsonFileWriteFine (String hostUrl, DataSetMiddleDO datasetDO, List<PlatformDatasetQuestionRespVO> datasetQuestionList) {
// try {
// log.info("开始生成 JSON 文件并上传数据集ID: {}", datasetDO.getId());
//
// // 构建 AigcDatasetVo 列表
// log.debug("正在构建 AigcDatasetVo 列表...");
// List<AigcDatasetMiddleVo> aigcDatasetVoList = new ArrayList<>();
// for (PlatformDatasetQuestionRespVO dataSource : datasetQuestionList) {
// AigcDatasetMiddleVo aigcDatasetVo = new AigcDatasetMiddleVo();
// aigcDatasetVo.setInstruction(StringUtils.isNotBlank(dataSource.getSystem()) ? dataSource.getSystem() : "");
// aigcDatasetVo.setInput(StringUtils.isNotBlank(dataSource.getQuestion()) ? dataSource.getQuestion() : "");
//
// // 检查答案列表是否为空
// if (!CollectionUtils.isAnyEmpty(dataSource.getDatasetAnswerRespVO())) {
// aigcDatasetVo.setOutput(StringUtils.isNotBlank(dataSource.getDatasetAnswerRespVO().get(0).getAnswer()) ?
// dataSource.getDatasetAnswerRespVO().get(0).getAnswer() : "");
// } else {
// aigcDatasetVo.setOutput("");
// }
// aigcDatasetVoList.add(aigcDatasetVo);
// }
// log.debug("AigcDatasetVo 列表构建完成。记录数量: {}", aigcDatasetVoList.size());
//
// // AigcDatasetVo 列表转换为 JSON 字符串
// log.debug("正在将 AigcDatasetVo 列表转换为 JSON 字符串...");
// ObjectMapper mapper = new ObjectMapper();
// StringBuilder sb = new StringBuilder();
// for (AigcDatasetMiddleVo aigcDatasetVo : aigcDatasetVoList) {
// String json = mapper.writeValueAsString(aigcDatasetVo);
// sb.append(json).append("\n");
// }
//
// // JSON 字符串转换为输入流
// log.debug("正在将 JSON 字符串转换为输入流...");
// InputStream inputStream = new ByteArrayInputStream(sb.toString().getBytes());
//
// // 上传文件
// log.info("正在上传 JSON 文件...");
// String fileName = datasetDO.getDatasetName() + "new" + datasetDO.getId() + ".json";
// AigcDatasetFileMiddleRespV0 aigcDatasetFileRespV0 = trainHttpService.AigcUploadFile(new HashMap<>(), hostUrl, inputStream, fileName);
//
// if (aigcDatasetFileRespV0 != null) {
// log.debug("文件上传成功。文件ID: {}", aigcDatasetFileRespV0.getFileId());
//
// // 更新数据集的 Job ID
// log.debug("正在更新数据集的 Job ID...");
// datasetMapper.setJobid(datasetDO.getId(), aigcDatasetFileRespV0.getFileId());
//
// log.info("hostUrl:{}", hostUrl);
// // 更新数据集的 URL
// String s3Url = aigcDatasetFileRespV0.getS3Url();
// log.info("s3Url:{}", s3Url);
//
// // int lastIndex = s3Url.lastIndexOf("/storage");
// // String url = s3Url.substring(lastIndex + 1);
// // log.info("url:{}", url);
// // 找到 "/uploads" 的位置
// int uploadsIndex = s3Url.indexOf("/uploads");
// if (uploadsIndex == -1) {
// log.error("s3Url 中未找到 '/uploads' 路径");
// return "";
// }
//
// // 提取 "/uploads" 及之后的部分
// String uploadsPath = s3Url.substring(uploadsIndex);
// log.info("uploadsPath: {}", uploadsPath);
//
// // 构建新的完整 URL
// String newUrl = hostUrl + uploadsPath;
// log.info("newUrl: {}", newUrl);
// datasetMapper.setUrl(datasetDO.getId(), newUrl);
//
// // 返回结果
// String result = newUrl.substring(hostUrl.length());
// log.info("JSON 文件生成并上传成功。返回结果: {}", result);
//
// return result;
// } else {
// log.error("文件上传失败。数据集ID: {}", datasetDO.getId());
// return "";
// }
//
// } catch (IOException e) {
// log.error("生成或上传 JSON 文件时发生异常。数据集ID: {}", datasetDO.getId(), e);
// return "";
// }
// }
}

View File

@ -0,0 +1,70 @@
package cn.iocoder.yudao.module.mdpf.service.dataset.impl;
import cn.iocoder.yudao.framework.common.pojo.PageResult;
import cn.iocoder.yudao.framework.common.util.object.BeanUtils;
import cn.iocoder.yudao.module.mdpf.controller.dataset.vo.PlatformDatasetAnswerPageReqVO;
import cn.iocoder.yudao.module.mdpf.controller.dataset.vo.PlatformDatasetAnswerSaveReqVO;
import cn.iocoder.yudao.module.mdpf.dal.dataobject.dataset.PlatformDatasetAnswerDO;
import cn.iocoder.yudao.module.mdpf.dal.mapper.dataset.PlatformDatasetAnswerMapper;
import cn.iocoder.yudao.module.mdpf.service.dataset.PlatformDatasetAnswerService;
import org.springframework.stereotype.Service;
import org.springframework.validation.annotation.Validated;
import javax.annotation.Resource;
/**
 * Service implementation for the annotation content (answers) attached to dataset questions.
 *
 * @author 华大大模型 (Huada Large Model)
 */
@Service
@Validated
public class PlatformDatasetAnswerServiceImpl implements PlatformDatasetAnswerService {

    @Resource
    private PlatformDatasetAnswerMapper platformDatasetAnswerMapper;

    /**
     * Converts the request to a data object and inserts it.
     *
     * @param createReqVO creation data
     * @return ID read from the data object after the insert
     */
    @Override
    public Long createDatasetAnswer(PlatformDatasetAnswerSaveReqVO createReqVO) {
        final PlatformDatasetAnswerDO row = BeanUtils.toBean(createReqVO, PlatformDatasetAnswerDO.class);
        platformDatasetAnswerMapper.insert(row);
        return row.getId();
    }

    /**
     * Updates a record by ID after checking that it exists.
     *
     * @param updateReqVO update data, including the ID of the record
     */
    @Override
    public void updateDatasetAnswer(PlatformDatasetAnswerSaveReqVO updateReqVO) {
        validateDatasetAnswerExists(updateReqVO.getId());
        platformDatasetAnswerMapper.updateById(BeanUtils.toBean(updateReqVO, PlatformDatasetAnswerDO.class));
    }

    /**
     * Deletes a record by ID after checking that it exists.
     *
     * @param id ID of the record
     */
    @Override
    public void deleteDatasetAnswer(Long id) {
        validateDatasetAnswerExists(id);
        platformDatasetAnswerMapper.deleteById(id);
    }

    /**
     * Throws a {@link RuntimeException} when no record has the given ID.
     *
     * @param id ID of the record to look up
     */
    private void validateDatasetAnswerExists(Long id) {
        final PlatformDatasetAnswerDO existing = platformDatasetAnswerMapper.selectById(id);
        if (existing != null) {
            return;
        }
        throw new RuntimeException("数据集回答不存在");
    }

    /**
     * Loads a record by ID.
     *
     * @param id ID of the record
     * @return the mapper's result for that ID
     */
    @Override
    public PlatformDatasetAnswerDO getDatasetAnswer(Long id) {
        final PlatformDatasetAnswerDO found = platformDatasetAnswerMapper.selectById(id);
        return found;
    }

    /**
     * Loads a page of records.
     *
     * @param pageReqVO paging query
     * @return page returned by the mapper
     */
    @Override
    public PageResult<PlatformDatasetAnswerDO> getDatasetAnswerPage(PlatformDatasetAnswerPageReqVO pageReqVO) {
        final PageResult<PlatformDatasetAnswerDO> page = platformDatasetAnswerMapper.selectPage(pageReqVO);
        return page;
    }
}

View File

@ -0,0 +1,83 @@
package cn.iocoder.yudao.module.mdpf.service.dataset.impl;
import cn.iocoder.yudao.framework.common.pojo.PageResult;
import cn.iocoder.yudao.framework.common.util.object.BeanUtils;
import cn.iocoder.yudao.module.mdpf.controller.dataset.vo.PlatformDatasetFilesPageReqVO;
import cn.iocoder.yudao.module.mdpf.controller.dataset.vo.PlatformDatasetFilesSaveReqVO;
import cn.iocoder.yudao.module.mdpf.dal.dataobject.dataset.PlatformDatasetFilesDO;
import cn.iocoder.yudao.module.mdpf.dal.mapper.dataset.PlatformDatasetFilesMapper;
import cn.iocoder.yudao.module.mdpf.service.dataset.PlatformDatasetFilesService;
import com.baomidou.mybatisplus.core.conditions.query.LambdaQueryWrapper;
import org.springframework.stereotype.Service;
import org.springframework.validation.annotation.Validated;
import javax.annotation.Resource;
import java.util.List;
/**
 * Service implementation for the data files of a dataset.
 *
 * @author 华大大模型 (Huada Large Model)
 */
@Service
@Validated
public class PlatformDatasetFilesServiceImpl implements PlatformDatasetFilesService {

    @Resource
    private PlatformDatasetFilesMapper platformDatasetFilesMapper;

    /**
     * Converts the request to a data object and inserts it.
     *
     * @param createReqVO creation data
     * @return ID read from the data object after the insert
     */
    @Override
    public Long createDatasetFiles(PlatformDatasetFilesSaveReqVO createReqVO) {
        final PlatformDatasetFilesDO row = BeanUtils.toBean(createReqVO, PlatformDatasetFilesDO.class);
        platformDatasetFilesMapper.insert(row);
        return row.getId();
    }

    /**
     * Inserts an already populated data object.
     *
     * @param platformDatasetFilesDO data object to insert
     * @return ID read from the data object after the insert
     */
    @Override
    public Long createDatasetFiles(PlatformDatasetFilesDO platformDatasetFilesDO) {
        final PlatformDatasetFilesDO row = platformDatasetFilesDO;
        platformDatasetFilesMapper.insert(row);
        return row.getId();
    }

    /**
     * Updates a record by ID after checking that it exists.
     *
     * @param updateReqVO update data, including the ID of the record
     */
    @Override
    public void updateDatasetFiles(PlatformDatasetFilesSaveReqVO updateReqVO) {
        validateDatasetFilesExists(updateReqVO.getId());
        platformDatasetFilesMapper.updateById(BeanUtils.toBean(updateReqVO, PlatformDatasetFilesDO.class));
    }

    /**
     * Deletes a record by ID after checking that it exists.
     *
     * @param id ID of the record
     */
    @Override
    public void deleteDatasetFiles(Long id) {
        validateDatasetFilesExists(id);
        platformDatasetFilesMapper.deleteById(id);
    }

    /**
     * Throws a {@link RuntimeException} when no record has the given ID.
     *
     * @param id ID of the record to look up
     */
    private void validateDatasetFilesExists(Long id) {
        final PlatformDatasetFilesDO existing = platformDatasetFilesMapper.selectById(id);
        if (existing != null) {
            return;
        }
        throw new RuntimeException("数据集文件不存在");
    }

    /**
     * Loads a record by ID.
     *
     * @param id ID of the record
     * @return the mapper's result for that ID
     */
    @Override
    public PlatformDatasetFilesDO getDatasetFiles(Long id) {
        final PlatformDatasetFilesDO found = platformDatasetFilesMapper.selectById(id);
        return found;
    }

    /**
     * Loads a page of records.
     *
     * @param pageReqVO paging query
     * @return page returned by the mapper
     */
    @Override
    public PageResult<PlatformDatasetFilesDO> getDatasetFilesPage(PlatformDatasetFilesPageReqVO pageReqVO) {
        final PageResult<PlatformDatasetFilesDO> page = platformDatasetFilesMapper.selectPage(pageReqVO);
        return page;
    }

    /**
     * Passes the query wrapper straight to the mapper's {@code selectList}.
     *
     * @param query query conditions (raw type, as declared by the interface)
     * @return records returned by the mapper
     */
    @Override
    public List<PlatformDatasetFilesDO> selectList(LambdaQueryWrapper query) {
        return platformDatasetFilesMapper.selectList(query);
    }
}

View File

@ -0,0 +1,246 @@
package cn.iocoder.yudao.module.mdpf.service.dataset.impl;
import cn.iocoder.yudao.framework.common.pojo.PageResult;
import cn.iocoder.yudao.framework.common.util.object.BeanUtils;
import cn.iocoder.yudao.module.mdpf.controller.dataset.vo.*;
import cn.iocoder.yudao.module.mdpf.dal.dataobject.dataset.DataSetMiddleDO;
import cn.iocoder.yudao.module.mdpf.dal.dataobject.dataset.PlatformDatasetAnswerDO;
import cn.iocoder.yudao.module.mdpf.dal.dataobject.dataset.PlatformDatasetQuestionDO;
import cn.iocoder.yudao.module.mdpf.dal.mapper.dataset.DataSetMiddleMapper;
import cn.iocoder.yudao.module.mdpf.dal.mapper.dataset.PlatformDatasetAnswerMapper;
import cn.iocoder.yudao.module.mdpf.dal.mapper.dataset.PlatformDatasetQuestionMapper;
import cn.iocoder.yudao.module.mdpf.service.dataset.DataSetMiddleService;
import cn.iocoder.yudao.module.mdpf.service.dataset.PlatformDatasetQuestionService;
import com.baomidou.mybatisplus.core.conditions.query.LambdaQueryWrapper;
import com.baomidou.mybatisplus.core.toolkit.CollectionUtils;
import com.baomidou.mybatisplus.core.toolkit.Wrappers;
import jodd.util.StringUtil;
import org.springframework.context.annotation.Lazy;
import org.springframework.stereotype.Service;
import org.springframework.validation.annotation.Validated;
import javax.annotation.Resource;
import java.util.*;
import java.util.stream.Collectors;
/**
* 数据集数据问题 Service 实现类
*
* @author 华大大模型
*/
@Service
@Validated
public class PlatformDatasetQuestionServiceImpl implements PlatformDatasetQuestionService {
@Resource
private PlatformDatasetQuestionMapper platformDatasetQuestionMapper;
@Resource
private PlatformDatasetAnswerMapper platformDatasetAnswerMapper;
@Resource
private DataSetMiddleMapper dataSetMiddleMapper;
@Resource
@Lazy
private DataSetMiddleService dataSetMiddleService;
// @Resource
// private DatasetQuestionAnswerImageMapper datasetQuestionAnswerImageMapper;
/**
 * Converts the request to a data object and inserts it.
 *
 * @param createReqVO creation data
 * @return ID read from the data object after the insert
 */
@Override
public Long createDatasetQuestion(PlatformDatasetQuestionSaveReqVO createReqVO) {
    final PlatformDatasetQuestionDO row = BeanUtils.toBean(createReqVO, PlatformDatasetQuestionDO.class);
    platformDatasetQuestionMapper.insert(row);
    return row.getId();
}
/**
 * Updates a question by ID after checking that it exists.
 *
 * @param updateReqVO update data, including the ID of the question
 */
@Override
public void updateDatasetQuestion(PlatformDatasetQuestionSaveReqVO updateReqVO) {
    // Fail first when the target row is missing.
    validateDatasetQuestionExists(updateReqVO.getId());
    platformDatasetQuestionMapper.updateById(BeanUtils.toBean(updateReqVO, PlatformDatasetQuestionDO.class));
}
/**
 * Deletes a question by ID after checking that it exists.
 *
 * @param id ID of the question
 */
@Override
public void deleteDatasetQuestion(Long id) {
    // Fail first when the target row is missing.
    validateDatasetQuestionExists(id);
    platformDatasetQuestionMapper.deleteById(id);
}
/**
 * Throws a {@link RuntimeException} when no question has the given ID.
 *
 * @param id ID of the question to look up
 */
private void validateDatasetQuestionExists(Long id) {
    final PlatformDatasetQuestionDO existing = platformDatasetQuestionMapper.selectById(id);
    if (existing != null) {
        return;
    }
    throw new RuntimeException("数据不存在");
}
/**
 * Loads a question by ID.
 *
 * @param id ID of the question
 * @return the mapper's result for that ID
 */
@Override
public PlatformDatasetQuestionDO getDatasetQuestion(Long id) {
    final PlatformDatasetQuestionDO found = platformDatasetQuestionMapper.selectById(id);
    return found;
}
/**
 * Loads a page of questions and attaches to each question the answers that reference it.
 *
 * <p>All answers of the page are fetched with one {@code IN} query and grouped by question ID.
 * A question without answers receives an empty list, matching {@code getDatasetQuestionList}.
 *
 * @param pageReqVO paging query
 * @return page of questions with their answers
 */
@Override
public PageResult<PlatformDatasetQuestionRespVO> getDatasetQuestionPage(PlatformDatasetQuestionPageReqVO pageReqVO) {
    PageResult<PlatformDatasetQuestionDO> questionPage = platformDatasetQuestionMapper.selectPage(pageReqVO);
    // The dataset lookup that used to happen here was removed: its result was never used, it
    // cost a query per call and it threw a NullPointerException when the dataset was missing.
    PageResult<PlatformDatasetQuestionRespVO> result = BeanUtils.toBean(questionPage, PlatformDatasetQuestionRespVO.class);
    List<PlatformDatasetQuestionRespVO> questions = result.getList();
    if (CollectionUtils.isEmpty(questions)) {
        return result;
    }

    // Fetch the answers of every question on the page in a single query.
    Set<Long> questionIds = questions.stream()
            .map(PlatformDatasetQuestionRespVO::getId)
            .collect(Collectors.toSet());
    List<PlatformDatasetAnswerDO> answerDOs = platformDatasetAnswerMapper.selectList(
            new LambdaQueryWrapper<PlatformDatasetAnswerDO>()
                    .in(PlatformDatasetAnswerDO::getQuestionId, questionIds));
    Map<Long, List<PlatformDatasetAnswerRespVO>> answersByQuestion =
            BeanUtils.toBean(answerDOs, PlatformDatasetAnswerRespVO.class).stream()
                    .collect(Collectors.groupingBy(PlatformDatasetAnswerRespVO::getQuestionId));

    questions.forEach(question -> question.setDatasetAnswerRespVO(
            answersByQuestion.getOrDefault(question.getId(), Collections.emptyList())));
    return result;
}
/**
 * Gets every question of a dataset together with its answers.
 *
 * <p>Answers are fetched in batches of 1000 question IDs so that a single {@code IN} clause
 * does not grow too long. A question without answers receives an empty list.
 *
 * @param datasetId ID of the dataset
 * @return questions of the dataset with their answers attached
 */
@Override
public List<PlatformDatasetQuestionRespVO> getDatasetQuestionList(Long datasetId) {
    final List<PlatformDatasetQuestionRespVO> questions = platformDatasetQuestionMapper.getAListOfIssues(datasetId);
    if (CollectionUtils.isEmpty(questions)) {
        return questions;
    }

    // Collect the question IDs in result order.
    final List<Long> questionIds = new ArrayList<>(questions.size());
    for (PlatformDatasetQuestionRespVO question : questions) {
        questionIds.add(question.getId());
    }

    // Query the answers one slice of IDs at a time.
    final int chunkSize = 1000;
    final List<PlatformDatasetAnswerRespVO> answers = new ArrayList<>();
    int from = 0;
    while (from < questionIds.size()) {
        final int to = Math.min(from + chunkSize, questionIds.size());
        answers.addAll(platformDatasetAnswerMapper.getAnswersToYourQuestions(questionIds.subList(from, to)));
        from = to;
    }

    // Index the answers by question ID so that the final pass is a plain lookup.
    final Map<Long, List<PlatformDatasetAnswerRespVO>> answersByQuestion = new HashMap<>();
    answers.forEach(answer ->
            answersByQuestion.computeIfAbsent(answer.getQuestionId(), key -> new ArrayList<>()).add(answer));

    for (PlatformDatasetQuestionRespVO question : questions) {
        question.setDatasetAnswerRespVO(
                answersByQuestion.getOrDefault(question.getId(), Collections.emptyList()));
    }
    return questions;
}
/**
 * Saves the annotation results of a batch of questions and refreshes the annotation
 * progress of their dataset.
 *
 * <p>For every question in the request its answers are inserted (no ID yet) or updated
 * (ID present), the question status is set to 2 as soon as one of its answers has
 * non-blank text, and the question row is updated. Answers that are stored for the
 * submitted questions but are missing from the request are deleted afterwards. Finally
 * the percentage of status-2 questions in the dataset is passed to
 * {@code dataSetMiddleService.updateProcess}, together with status 2 when the percentage
 * is 100 and status 1 otherwise.
 *
 * <p>The dataset is taken from the first request element. A null or empty request is a
 * no-op (it used to fail on {@code updateReqVOS.get(0)}).
 *
 * @param updateReqVOS questions, including their answers, to save
 */
@Override
public void updateDatasetQuestionDataAnno(List<PlatformDatasetQuestionSaveReqVO> updateReqVOS) {
    if (updateReqVOS == null || updateReqVOS.isEmpty()) {
        return;
    }
    // All progress bookkeeping below refers to the dataset of the first element.
    Long datasetId = updateReqVOS.get(0).getDatasetId();

    // IDs of every answer that is part of the request (inserted or updated).
    List<Long> ids = new ArrayList<>();
    for (PlatformDatasetQuestionSaveReqVO updateReqVO : updateReqVOS) {
        PlatformDatasetQuestionDO datasetQuestionDO = BeanUtils.toBean(updateReqVO, PlatformDatasetQuestionDO.class);
        List<PlatformDatasetAnswerSaveReqVO> datasetAnswerSaveReqVO = updateReqVO.getDatasetAnswerRespVO();
        List<PlatformDatasetAnswerDO> datasetAnswerDOS = BeanUtils.toBean(datasetAnswerSaveReqVO, PlatformDatasetAnswerDO.class);
        if (CollectionUtils.isNotEmpty(datasetAnswerDOS)) {
            for (PlatformDatasetAnswerDO datasetAnswerDO : datasetAnswerDOS) {
                if (StringUtil.isNotBlank(datasetAnswerDO.getAnswer())) {
                    datasetQuestionDO.setStatus(2);
                }
                if (datasetAnswerDO.getId() == null) {
                    platformDatasetAnswerMapper.insert(datasetAnswerDO);
                } else {
                    platformDatasetAnswerMapper.updateById(datasetAnswerDO);
                }
                // After insert the ID is read back from the entity.
                ids.add(datasetAnswerDO.getId());
            }
        }
        platformDatasetQuestionMapper.updateById(datasetQuestionDO);
    }

    // Delete stored answers of the submitted questions that the request no longer contains.
    List<Long> questionIds = updateReqVOS.stream()
            .map(PlatformDatasetQuestionSaveReqVO::getId)
            .collect(Collectors.toList());
    LambdaQueryWrapper<PlatformDatasetAnswerDO> queryWrapper = new LambdaQueryWrapper<>();
    queryWrapper.in(PlatformDatasetAnswerDO::getQuestionId, questionIds);
    List<PlatformDatasetAnswerDO> storedAnswers = platformDatasetAnswerMapper.selectList(queryWrapper);
    Set<Long> staleAnswerIds = storedAnswers.stream()
            .map(PlatformDatasetAnswerDO::getId)
            .collect(Collectors.toCollection(HashSet::new));
    staleAnswerIds.removeAll(new HashSet<>(ids));
    if (!staleAnswerIds.isEmpty()) {
        platformDatasetAnswerMapper.deleteBatchIds(new ArrayList<>(staleAnswerIds));
    }

    // Update the annotation progress: share of status-2 questions, truncated to a whole percent.
    LambdaQueryWrapper<PlatformDatasetQuestionDO> wrapper = new LambdaQueryWrapper<PlatformDatasetQuestionDO>()
            .eq(PlatformDatasetQuestionDO::getDatasetId, datasetId);
    Long sumCount = platformDatasetQuestionMapper.selectCount(wrapper);
    wrapper.eq(PlatformDatasetQuestionDO::getStatus, 2);
    Long annoCount = platformDatasetQuestionMapper.selectCount(wrapper);
    double ratio = sumCount == 0 ? 0 : ((double) annoCount / sumCount) * 100;
    Integer formattedRatio = (int) ratio;
    Integer status = formattedRatio == 100 ? 2 : 1;
    dataSetMiddleService.updateProcess(formattedRatio, datasetId, status);
}
/**
 * Counts the question rows whose dataset ID equals the given value.
 *
 * @param datasetid dataset ID used as the equality filter
 * @return the count reported by the question mapper
 */
@Override
public Long getCountByDataSetId(Long datasetid){
    LambdaQueryWrapper<PlatformDatasetQuestionDO> byDataset = Wrappers.<PlatformDatasetQuestionDO>lambdaQuery()
            .eq(PlatformDatasetQuestionDO::getDatasetId, datasetid);
    return platformDatasetQuestionMapper.selectCount(byDataset);
}
/**
 * Counts the question rows matched by a caller-supplied query wrapper.
 *
 * @param query wrapper handed unchanged to the question mapper
 * @return the count reported by the question mapper
 */
@Override
public Long getCountByDatasetid(LambdaQueryWrapper query){
    return platformDatasetQuestionMapper.selectCount(query);
}
}

View File

@ -0,0 +1,799 @@
package cn.iocoder.yudao.module.mdpf.util;
import com.github.houbb.opencc4j.util.ZhConverterUtil;
import com.github.houbb.sensitive.word.core.SensitiveWordHelper;
import lombok.extern.slf4j.Slf4j;
import java.nio.charset.StandardCharsets;
import java.text.DecimalFormat;
import java.time.Year;
import java.time.format.DateTimeFormatter;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
@Slf4j
public class DataProcessPlatformUtil {
/*
* ---------------------------------------------------------------
* 🔖 异常清洗配置
* ---------------------------------------------------------------
*/
/**
 * Removes non-visible characters in the code point ranges 0-31 (the C0 controls, which
 * include tab, line feed and carriage return) and 127-160 (DEL, the C1 controls and the
 * no-break space U+00A0).
 *
 * @param input text to clean; may be {@code null}
 * @return the text without those characters; {@code null} or empty input is returned unchanged
 */
public static String removeNonVisibleAsciiChars (String input) {
    // Null-safe like removeHtmlTags / removeEmojis instead of failing with an NPE.
    if (input == null || input.isEmpty()) {
        return input;
    }
    // A single character class covers both ranges.
    return input.replaceAll("[\\x00-\\x1F\\x7F-\\xA0]", "");
}
/**
 * Replaces Unicode space variants with the plain ASCII space.
 *
 * <p>Each of U+0020, U+00A0, U+1680, U+2000..U+200A, U+2028, U+2029, U+202F, U+205F and
 * U+3000 is replaced, one for one, by a single U+0020.
 *
 * @param input text to normalise; must not be {@code null}
 * @return the text with every listed space character replaced by {@code ' '}
 */
public static String convertUnicodeSpacesToNormalSpaces (String input) {
    Pattern unicodeSpaces = Pattern.compile(
            "[\\u0020\\u00A0\\u1680\\u2000-\\u200A\\u2028\\u2029\\u202F\\u205F\\u3000]");
    Matcher spaceMatcher = unicodeSpaces.matcher(input);
    return spaceMatcher.replaceAll(" ");
}
/**
 * Removes garbage and meaningless Unicode characters.
 *
 * <p>Deleted are all characters of the Unicode category "Other" ({@code \p{C}}) and all
 * space separators ({@code \p{Zs}}), except those matched by {@code \s} (ASCII space, tab,
 * line feed, vertical tab, form feed, carriage return), which are kept.
 *
 * <p>The characters are deleted directly. The earlier implementation first replaced them
 * with {@code '*'} and then deleted every {@code '*'}, which also removed asterisks that
 * were part of the original text.
 *
 * @param input text to clean; may be {@code null}
 * @return the cleaned text; {@code null} or empty input is returned unchanged
 */
public static String removeNonPrintableUnicodeChars (String input) {
    if (input == null || input.isEmpty()) {
        return input;
    }
    // (\p{C} ∪ \p{Zs}) ∩ not-\s ; the explicit \u0000 alternative is kept from the original pattern.
    String regex = "[\\p{C}\\p{Zs}&&[^\\s]]+|\\u0000";
    return input.replaceAll(regex, "");
}
/**
 * Converts Traditional Chinese text to Simplified Chinese.
 *
 * <p>The conversion is delegated to {@code ZhConverterUtil.toSimple}.
 *
 * @param input text to convert
 * @return the value returned by {@code ZhConverterUtil.toSimple(input)}
 */
public static String traditionalToSimplified (String input) {
    final String simplified = ZhConverterUtil.toSimple(input);
    return simplified;
}
// An HTML-like tag: '<', one or more characters other than '>', then '>'.
private static final String HTML_TAG_REGEX = "<[^>]+>";
/**
 * Strips markup tags such as {@code <html>}, {@code <div>} or {@code <p>} from a text.
 *
 * <p>Only the tags themselves are removed; the text between them is kept.
 *
 * @param input text that may contain tags; may be {@code null}
 * @return the text without tags; {@code null} or empty input is returned unchanged
 */
public static String removeHtmlTags (String input) {
    boolean nothingToStrip = input == null || input.isEmpty();
    return nothingToStrip ? input : input.replaceAll(HTML_TAG_REGEX, "");
}
// Simplified emoji matcher; it does not claim to cover every emoji Unicode defines.
// Ranges are written as code points because java.util.regex matches by code point: the
// previous class of lone surrogates (\uD83C-\uD83F) never matched a real surrogate pair,
// while its BMP ranges deleted CJK punctuation, fullwidth forms, Hangul and compatibility
// ideographs.
//   U+1F000..U+1FAFF  supplementary pictographs (emoticons, symbols, transport, flags, skin tones)
//   U+2600..U+27BF    miscellaneous symbols and dingbats
//   U+2B00..U+2BFF    miscellaneous symbols and arrows
// A matched symbol also takes any directly following variation selector-16 (U+FE0F) or
// zero-width joiner (U+200D) so that no invisible residue is left behind.
private static final String EMOJI_REGEX = "[\\x{1F000}-\\x{1FAFF}\\x{2600}-\\x{27BF}\\x{2B00}-\\x{2BFF}]"
        + "[\\x{FE0F}\\x{200D}]*";
// Compiled once; Pattern instances are immutable and safe to share.
private static final Pattern EMOJI_PATTERN = Pattern.compile(EMOJI_REGEX);
/**
 * Removes emoji from a text.
 *
 * <p>Ordinary text, including Chinese punctuation and fullwidth characters, is left
 * untouched.
 *
 * @param input text that may contain emoji; may be {@code null}
 * @return the text without the matched emoji; {@code null} or empty input is returned unchanged
 */
public static String removeEmojis (String input) {
    if (input == null || input.isEmpty()) {
        return input;
    }
    return EMOJI_PATTERN.matcher(input).replaceAll("");
}
// A "Chinese word" here is a maximal run of characters in U+4E00..U+9FFF.
private static final String CHINESE_WORD_REGEX = "[\\u4e00-\\u9fff]+";
/**
 * Counts the chars of {@code input} that lie in the Unicode blocks CJK Unified Ideographs,
 * CJK Compatibility Ideographs or CJK Unified Ideographs Extension A.
 *
 * <p>The text is examined char by char, so every counted character is exactly one char.
 */
private static int countChineseChars (String input) {
    int total = 0;
    for (int idx = 0; idx < input.length(); idx++) {
        Character.UnicodeBlock block = Character.UnicodeBlock.of(input.charAt(idx));
        boolean chinese = block == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS
                || block == Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS
                || block == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A;
        if (chinese) {
            total++;
        }
    }
    return total;
}
/*
 * ---------------------------------------------------------------
 * Filter configuration
 * ---------------------------------------------------------------
 */
/**
 * Extracts the runs of Chinese characters whose length lies within a range.
 *
 * <p>Every maximal run matching {@code CHINESE_WORD_REGEX} is taken as one word; a word is
 * kept when its Chinese character count is between {@code minChars} and {@code maxChars},
 * both inclusive.
 *
 * @param text     text to scan; must not be {@code null}
 * @param minChars smallest accepted character count
 * @param maxChars largest accepted character count
 * @return the accepted words in order of appearance
 */
public static List<String> filterWords (String text, int minChars, int maxChars) {
    Matcher runs = Pattern.compile(CHINESE_WORD_REGEX).matcher(text);
    List<String> accepted = new ArrayList<>();
    while (runs.find()) {
        String candidate = runs.group();
        int size = countChineseChars(candidate);
        if (size < minChars || size > maxChars) {
            continue;
        }
        accepted.add(candidate);
    }
    return accepted;
}
/**
 * Checks whether the character repetition rate of a document exceeds a threshold.
 *
 * <p>Whitespace, punctuation and symbols are removed first. Only Han characters are then
 * counted, and the rate is {@code (counted - distinct) / counted * 100}. The earlier
 * implementation divided by the number of all remaining characters while counting
 * distinct Han characters only, so every non-Han character was treated as a repetition
 * (plain Latin text always scored 100%).
 *
 * @param content   document text
 * @param threshold repetition threshold as a percentage, e.g. {@code 10} for 10%
 * @return {@code true} when the repetition rate is above the threshold (the document should
 *         be filtered out); {@code false} otherwise, and also for blank input, for fewer
 *         than 5 remaining characters and for text without Han characters
 */
public static boolean calculateCharacterRepetitionRate(String content, double threshold) {
    // Input check.
    if (content == null || content.trim().isEmpty()) {
        return false;
    }
    // Pre-processing: drop whitespace, punctuation and symbols.
    String processedContent = content
            .replaceAll("\\s+", "")
            .replaceAll("[\\pP\\pS]", "");
    // Short texts are not checked.
    if (processedContent.length() < 5) {
        return false;
    }
    // Frequency of every Han character, plus the number of characters counted.
    Map<Character, Integer> charCount = new HashMap<>();
    int totalChars = 0;
    for (char c : processedContent.toCharArray()) {
        if (isChineseCharacter(c)) {
            charCount.merge(c, 1, Integer::sum);
            totalChars++;
        }
    }
    // Nothing was counted, so there is no rate to compute (and no division by zero).
    if (totalChars == 0) {
        return false;
    }
    int repeatedChars = totalChars - charCount.size();
    // Expressed as a percentage (0-100) so it compares directly with the threshold.
    double repetitionPercent = (double) repeatedChars / totalChars * 100;
    log.info("总字数: {}", totalChars);
    log.info("重复字数: {}", repeatedChars);
    log.info("字重复率: {}%", String.format("%.2f", repetitionPercent));
    // Small tolerance for the floating point comparison.
    final double EPSILON = 0.0001;
    return repetitionPercent - threshold > EPSILON;
}
/**
 * Tells whether a char belongs to the Han script according to
 * {@link Character.UnicodeScript}.
 */
private static boolean isChineseCharacter(char c) {
    return Character.UnicodeScript.HAN == Character.UnicodeScript.of(c);
}
/**
 * Simple tokenizer: lower-cases the text and splits it at every run of non-word characters
 * (whitespace, punctuation, ...).
 *
 * <p>{@code \W} is evaluated with {@link Pattern#UNICODE_CHARACTER_CLASS}, so letters and
 * digits of any script count as word characters. With the ASCII-only default every CJK
 * character was a separator and Chinese text produced no tokens at all. Text without
 * separators between words (such as Chinese) yields one token per punctuation-delimited
 * run; this is not a word segmenter.
 *
 * @param text text to split; {@code null} yields an empty list
 * @return the non-empty tokens in order of appearance
 */
private static List<String> tokenize (String text) {
    List<String> tokens = new ArrayList<>();
    if (text == null || text.isEmpty()) {
        return tokens;
    }
    Pattern separators = Pattern.compile("\\W+", Pattern.UNICODE_CHARACTER_CLASS);
    // Locale.ROOT keeps the case folding independent of the JVM default locale.
    for (String word : separators.split(text.toLowerCase(Locale.ROOT))) {
        // split() produces an empty first element when the text starts with a separator.
        if (!word.isEmpty()) {
            tokens.add(word);
        }
    }
    return tokens;
}
/**
 * Checks whether the word repetition rate of a document exceeds a threshold.
 *
 * <p>The text is split with {@code tokenize}; every occurrence of a word beyond its first
 * one counts as repeated, and the rate is {@code repeated / total}.
 *
 * <p>NOTE(review): the rate is a fraction in [0, 1] and is compared with {@code threshold}
 * as is, whereas the character, special-character and sensitive-word checks of this class
 * work on percentages (0-100). Confirm which scale callers pass.
 *
 * @param content   document text
 * @param threshold repetition threshold on the same scale as the computed rate
 * @return {@code true} when the rate is above the threshold (the document should be
 *         filtered out); {@code false} otherwise, and also when the text is {@code null}
 *         or yields no tokens
 */
public static boolean calculateWordRepetitionRate (String content, double threshold) {
    if (content == null) {
        return false;
    }
    List<String> words = tokenize(content);
    int totalWords = words.size();
    if (totalWords == 0) {
        // Without tokens the rate would be 0/0 = NaN; state the outcome explicitly instead.
        return false;
    }
    // Sum of (count - 1) over all words equals total minus the number of distinct words.
    int repeatedWords = totalWords - new HashSet<>(words).size();
    double repetitionRate = (double) repeatedWords / totalWords;
    log.info("词重复率: {}", repetitionRate);
    log.info("阈值: {}", threshold);
    return repetitionRate > threshold;
}
// One character that is neither an ASCII letter or digit nor a Han character.
// Compiled once instead of on every call.
private static final Pattern SPECIAL_CHARACTER_PATTERN = Pattern.compile("[^a-zA-Z0-9\\p{Script=Han}]");
/**
 * Checks whether the share of special characters in a text exceeds a threshold.
 *
 * <p>All whitespace is removed first. Every remaining character that is not an ASCII
 * letter, an ASCII digit or a Han character counts as special; the rate is
 * {@code special / remaining * 100}.
 *
 * @param content   text to check
 * @param threshold threshold as a percentage in [0, 100], e.g. {@code 10} for 10%
 * @return {@code true} when the rate is above the threshold; {@code false} otherwise, and
 *         also for {@code null}, empty or whitespace-only text
 * @throws IllegalArgumentException when {@code threshold} is outside [0, 100] and the text
 *                                  is neither {@code null} nor empty
 */
public static boolean checkSpecialCharacterRate(String content, double threshold) {
    // Argument checks.
    if (content == null || content.isEmpty()) {
        log.warn("输入内容为空");
        return false;
    }
    if (threshold < 0 || threshold > 100) {
        // The range separator was missing from this message ("0100").
        throw new IllegalArgumentException("阈值必须是0~100之间的数值");
    }
    // Whitespace does not take part in the rate.
    String processedContent = content.replaceAll("\\s+", "");
    int totalCharCount = processedContent.length();
    if (totalCharCount == 0) {
        log.info("有效字符数为0");
        return false;
    }
    // Count the special characters.
    Matcher matcher = SPECIAL_CHARACTER_PATTERN.matcher(processedContent);
    int specialCharCount = 0;
    while (matcher.find()) {
        specialCharCount++;
    }
    double specialCharRatePercent = (double) specialCharCount / totalCharCount * 100;
    // Diagnostic output with two decimals.
    DecimalFormat df = new DecimalFormat("0.00");
    log.info("特殊字符检测结果: {}/{}={}% (阈值: {}%)",
            specialCharCount,
            totalCharCount,
            df.format(specialCharRatePercent),
            df.format(threshold));
    // Small tolerance for the floating point comparison.
    final double EPSILON = 1e-6;
    return specialCharRatePercent - threshold > EPSILON;
}
/**
 * Checks whether the share of sensitive words in a document exceeds a threshold.
 *
 * <p>Detection is delegated to {@code SensitiveWordHelper}. The rate is the summed length
 * of all reported words divided by the length of the whole text, as a percentage.
 *
 * @param content   document text
 * @param threshold threshold as a percentage in [0, 100]
 * @return {@code true} when the rate is above the threshold (the document should be
 *         filtered out); {@code false} otherwise, including when no sensitive word is found
 */
public static boolean checkSensitiveWordRate (String content, double threshold) {
    // TODO: sensitive-word is used for now; adjust here if the detection changes.
    if (!SensitiveWordHelper.contains(content)) {
        return false;
    }
    // All sensitive words found in the text.
    List<String> wordList = SensitiveWordHelper.findAll(content);
    log.info("返回所有敏感词====>>>>{}", wordList);
    // Characters covered by sensitive words, relative to the full text length
    // (whitespace included).
    int sensitiveWordLength = wordList.stream().mapToInt(String::length).sum();
    double specialCharRate = ((double) sensitiveWordLength / content.length()) * 100;
    log.info("敏感词字符率: {}", String.format("%.3f", specialCharRate));
    log.info("阈值: {}", threshold);
    return specialCharRate > threshold;
}
/*
* ---------------------------------------------------------------
* 🔖 去重配置
* ---------------------------------------------------------------
*/
/**
 * SimHash-based near-duplicate detection.
 *
 * <p>Documents are processed in ascending ID order. Each document that has not been
 * flagged yet is compared with every later unflagged document; a later document whose
 * similarity (as reported by {@code HammingUtils.getSimilarity}) is above the threshold is
 * flagged as a duplicate. The document with the smaller ID is therefore the one kept.
 *
 * @param contentMap documents to compare (key: document ID, value: text)
 * @param threshold  similarity threshold in [0, 1], e.g. {@code 0.8} for 80%
 * @return IDs of the documents to delete, in the order they were flagged; empty when
 *         {@code contentMap} is {@code null} or empty
 * @throws IllegalArgumentException when {@code threshold} is outside [0, 1]
 */
public static List<Long> similarityDeduplication(Map<Long, String> contentMap, double threshold) {
    // Argument checks.
    if (contentMap == null || contentMap.isEmpty()) {
        return Collections.emptyList();
    }
    if (threshold < 0 || threshold > 1) {
        // The range separator was missing from this message ("01").
        throw new IllegalArgumentException("相似度阈值必须在0~1之间");
    }
    long startTime = System.currentTimeMillis();
    // 1. Sort by document ID so the processing order is deterministic.
    LinkedHashMap<Long, String> sortedMap = contentMap.entrySet().stream()
            .sorted(Map.Entry.comparingByKey())
            .collect(Collectors.toMap(
                    Map.Entry::getKey,
                    Map.Entry::getValue,
                    (e1, e2) -> e1,
                    LinkedHashMap::new));
    // 2. Compute the fingerprints in parallel.
    Map<Long, String> simHashMap = sortedMap.entrySet().parallelStream()
            .collect(Collectors.toMap(
                    Map.Entry::getKey,
                    entry -> HammingUtils.getSimHash(entry.getValue()),
                    (e1, e2) -> e1,
                    LinkedHashMap::new));
    // 3. Pairwise comparison. A LinkedHashSet gives constant-time membership checks
    //    while keeping the order in which duplicates were found.
    Set<Long> duplicateKeys = new LinkedHashSet<>();
    List<Long> processedIds = new ArrayList<>(simHashMap.keySet());
    for (int i = 0; i < processedIds.size(); i++) {
        Long currentId = processedIds.get(i);
        if (duplicateKeys.contains(currentId)) {
            continue;
        }
        String hash1 = simHashMap.get(currentId);
        // Compare with later documents that are still unflagged.
        for (int j = i + 1; j < processedIds.size(); j++) {
            Long compareId = processedIds.get(j);
            if (duplicateKeys.contains(compareId)) {
                continue;
            }
            double similarity = HammingUtils.getSimilarity(hash1, simHashMap.get(compareId));
            // SLF4J only understands "{}", so the number is formatted before it is passed in.
            if (log.isDebugEnabled()) {
                log.debug("文档 {} 与 {} 的相似度: {}%",
                        currentId, compareId, String.format("%.2f", similarity * 100));
            }
            if (similarity > threshold) {
                duplicateKeys.add(compareId);
                log.info("标记为相似: {} ≈ {} (相似度: {}%)",
                        currentId, compareId, String.format("%.2f", similarity * 100));
            }
        }
    }
    // 4. Timing log.
    long cost = System.currentTimeMillis() - startTime;
    log.info("去重完成: 总数={}, 重复数={}, 耗时={}ms",
            contentMap.size(),
            duplicateKeys.size(),
            cost);
    return new ArrayList<>(duplicateKeys);
}
/*
* ---------------------------------------------------------------
* 🔖 去隐私配置
* ---------------------------------------------------------------
*/
// E-mail address: local part, '@', domain, '.', top-level domain of 2 to 6 letters.
private static final String EMAIL_REGEX =
        "[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,6}";
// Compiled form of EMAIL_REGEX.
private static final Pattern EMAIL_PATTERN = Pattern.compile(EMAIL_REGEX);
/** Deletes every e-mail address matched by {@code EMAIL_PATTERN} from the text. */
private static String removeEmails (String text) {
    return EMAIL_PATTERN.matcher(text).replaceAll("");
}
/**
 * Removes e-mail addresses from a text and logs the result at INFO level.
 *
 * @param content text to clean
 * @return the text with every e-mail address removed
 */
public static String processFile (String content) {
    final String withoutEmails = removeEmails(content);
    // The cleaned text is written to the log so the result can be inspected.
    log.info("去除电子邮件地址:{}", withoutEmails);
    return withoutEmails;
}
// IPv4 address: four dot-separated octets in 0..255.
// The look-arounds stop the pattern from biting a piece out of a longer dotted digit
// string (e.g. "1234.5.6.7890" used to lose "234.5.6.789"): the match must not be
// preceded by a digit or by "digit." and must not be followed by a digit or ".digit".
private static final String IPV4_REGEX =
        "(?<!\\d)(?<!\\d\\.)" +
        "(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\." +
        "(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\." +
        "(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\." +
        "(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)" +
        "(?!\\.?\\d)";
// IPv6 address in its full form only: eight colon-separated groups of 1 to 4 hex digits,
// e.g. 2001:0db8:85a3:0000:0000:8a2e:0370:7334. Compressed forms ("::") are not matched.
private static final String IPV6_REGEX =
        "([0-9a-fA-F]{1,4}:){7}([0-9a-fA-F]{1,4})";
// Compiled form of IPV4_REGEX.
private static final Pattern IPV4_PATTERN = Pattern.compile(IPV4_REGEX);
// Compiled form of IPV6_REGEX.
private static final Pattern IPV6_PATTERN = Pattern.compile(IPV6_REGEX);
/**
 * Removes IPv4 addresses and full-form IPv6 addresses from a text.
 *
 * @param text text to clean; may be {@code null}
 * @return the text without the matched addresses; {@code null} or empty input is returned
 *         unchanged
 */
public static String removeIPAddresses (String text) {
    if (text == null || text.isEmpty()) {
        return text;
    }
    String withoutIpv4 = IPV4_PATTERN.matcher(text).replaceAll("");
    return IPV6_PATTERN.matcher(withoutIpv4).replaceAll("");
}
/**
 * Mobile number: the digit 1 followed by ten more digits.
 */
private static final String MOBILE_REGEX = "1\\d{10}";
private static final Pattern MOBILE_PATTERN = Pattern.compile(MOBILE_REGEX);
/**
 * Domestic landline number: an optional 3- or 4-digit prefix with a hyphen, then 8 or 7
 * digits.
 */
private static final String DOMESTIC_PHONE_REGEX = "(\\d{4}-|\\d{3}-)?(\\d{8}|\\d{7})";
private static final Pattern DOMESTIC_PHONE_PATTERN = Pattern.compile(DOMESTIC_PHONE_REGEX);
/**
 * 400 / 800 service numbers: the prefix followed by two hyphen-separated groups of 3 or 4
 * digits.
 * NOTE(review): the 800 alternative is anchored with {@code ^} and therefore only matches
 * at the very start of the text, while the 400 alternative matches anywhere. Confirm
 * whether that difference is intended.
 */
private static final String PHONE_REGEX = "400(-\\d{3,4}){2}|^800(-\\d{3,4}){2}";
private static final Pattern PHONE_PATTERN = Pattern.compile(PHONE_REGEX);
/**
 * Hotline number: groups of 3 or 4 digits joined by hyphens.
 * NOTE(review): anchored with {@code ^...$} and compiled without MULTILINE, so it only
 * matches when the whole text is such a number. Confirm that this is intended for a
 * pattern used to strip numbers out of running text.
 */
private static final String HOTLINE_REGEX = "^\\d{3,4}(-\\d{3,4})+$";
private static final Pattern HOTLINE_PATTERN = Pattern.compile(HOTLINE_REGEX);
/**
 * Credit card number: a non-zero digit followed by 15 or 18 digits.
 * NOTE(review): anchored with {@code ^...$} like the hotline pattern, so it only matches
 * when the whole text is a card number.
 */
private static final String CREDIT_CARD_REGEX = "^([1-9]{1})(\\d{15}|\\d{18})$";
private static final Pattern CREDIT_CARD_PATTERN = Pattern.compile(CREDIT_CARD_REGEX);
/**
 * Hexadecimal hash: a run of 32 or of 24 hex digits. (A hex-encoded SHA-256 digest is 64
 * hex digits long and is matched as two consecutive 32-digit runs.)
 */
private static final String HASH_REGEX = "[a-fA-F0-9]{32}|[a-fA-F0-9]{24}";
private static final Pattern HASH_PATTERN = Pattern.compile(HASH_REGEX);
// Formatter for a bare year.
private static final DateTimeFormatter YEAR_FORMAT = DateTimeFormatter.ofPattern("yyyy");
// Years that must not be removed as identifiers: the current year plus the five years
// before and after it. Filled once when the class is initialised.
private static final Set<String> YEARS_TO_SKIP = new HashSet<>();
static {
    final int thisYear = Year.now().getValue();
    int offset = -5;
    while (offset <= 5) {
        YEARS_TO_SKIP.add(Integer.toString(thisYear + offset));
        offset++;
    }
}
/**
 * Removes numeric and alphanumeric identifiers from a text.
 *
 * <p>Three passes run in this order: phone numbers, credit card numbers, hexadecimal
 * hashes.
 *
 * <p>TODO: a further pass that removes the remaining numbers while sparing years and short
 * plain numbers is disabled because of a known bug.
 *
 * @param text text to clean
 * @return the text after the three removal passes
 */
public static String removeIdentifiers (String text) {
    return removeHashMatcher(removeCreditCard(removePhone(text)));
}
/**
 * Removes phone numbers from a text.
 *
 * <p>The patterns are applied one after another, each on the result of the previous one:
 * mobile, domestic landline, 400/800 service number, hotline.
 *
 * @param text text to clean
 * @return the text without the matched phone numbers
 */
private static String removePhone (String text) {
    Pattern[] phonePatterns = {MOBILE_PATTERN, DOMESTIC_PHONE_PATTERN, PHONE_PATTERN, HOTLINE_PATTERN};
    String remaining = text;
    for (Pattern phonePattern : phonePatterns) {
        remaining = phonePattern.matcher(remaining).replaceAll("");
    }
    return remaining;
}
/**
 * Removes whatever {@code CREDIT_CARD_PATTERN} matches from a text.
 *
 * @param text text to clean
 * @return the text without the matched card numbers
 */
private static String removeCreditCard (String text) {
    return CREDIT_CARD_PATTERN.matcher(text).replaceAll("");
}
/**
 * Removes whatever {@code HASH_PATTERN} matches (hexadecimal hashes) from a text.
 *
 * @param text text to clean
 * @return the text without the matched hashes
 */
private static String removeHashMatcher (String text) {
    return HASH_PATTERN.matcher(text).replaceAll("");
}
/**
 * Finds the start index of the first number in a text.
 *
 * <p>A number is a run of digits and hex letters (a-f, A-F), the same character set that
 * {@code findEndOfNumber} uses. The earlier implementation advanced to the end of the run
 * and returned the index of its last character, which contradicts the "start index"
 * contract.
 *
 * @param text text to scan
 * @return index of the first character of the first run, or -1 when there is none
 */
private static int findNextNumberToReplace (String text) {
    for (int i = 0; i < text.length(); i++) {
        char c = text.charAt(i);
        if (Character.isDigit(c) || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F')) {
            return i;
        }
    }
    return -1; // no number found
}
/**
 * Finds where a number ends.
 *
 * <p>Scans forward from {@code startIndex} and returns the index of the first character
 * that is neither a digit nor a hex letter (a-f, A-F), or the text length when the run
 * reaches the end of the text.
 */
private static int findEndOfNumber (String text, int startIndex) {
    int pos = startIndex;
    while (pos < text.length()) {
        char ch = text.charAt(pos);
        boolean partOfNumber = Character.isDigit(ch)
                || (ch >= 'a' && ch <= 'f')
                || (ch >= 'A' && ch <= 'F');
        if (!partOfNumber) {
            return pos;
        }
        pos++;
    }
    return text.length();
}
/**
 * Tells whether a string is one of the years that must be skipped.
 *
 * <p>The answer is membership in {@code YEARS_TO_SKIP}. The earlier version additionally
 * parsed the string as an int and as a {@code Year} into unused locals and turned any
 * exception into {@code false}; every entry of the set is the decimal form of a year, so
 * those parses could not change the result.
 *
 * @param str candidate string; {@code null} yields {@code false}
 * @return {@code true} when {@code str} is contained in {@code YEARS_TO_SKIP}
 */
private static boolean isYear (String str) {
    return YEARS_TO_SKIP.contains(str);
}
/**
 * Tells whether a string is a "simple number": the canonical decimal form (no sign, no
 * leading zeros) of an integer in 0..999999.
 */
private static boolean isSimpleNumber (String str) {
    final int parsed;
    try {
        parsed = Integer.parseInt(str);
    } catch (NumberFormatException notANumber) {
        return false;
    }
    if (parsed < 0 || parsed >= 1000000) {
        return false;
    }
    // Rejects forms such as "007" or "+5" that parse but are not canonical.
    return Integer.toString(parsed).equals(str);
}
/**
 * Manual smoke test: runs {@code removeIdentifiers} on a sample sentence and logs the
 * result.
 */
public static void main (String[] args) {
    final String sample = "Here are some identifiers: 123-456-7890, 1234567812345678, a1b2c3d4e5f6a1b2c3d4e5f6, 2023, and 987654.";
    log.info(removeIdentifiers(sample));
}
/**
 * Removes typical garbage characters from a text.
 *
 * <p>Two passes: first the control characters U+0000..U+001F and U+007F..U+009F together
 * with the replacement character U+FFFD, then the private-use range U+E000..U+F8FF.
 *
 * @param input text to clean; must not be {@code null}
 * @return the text without those characters
 */
public static String clean (String input) {
    return input
            .replaceAll("[\\x00-\\x1F\\x7F-\\x9F\\uFFFD]", "")
            .replaceAll("[\\uE000-\\uF8FF]", "");
}
/**
 * Re-encodes a text: the string is encoded to bytes with a candidate charset and those
 * bytes are decoded as UTF-8. The candidates are tried in the order UTF-8, GBK, Big5,
 * ISO-8859-1, and the first attempt that does not throw wins.
 *
 * <p>NOTE(review): the first candidate is UTF-8, and encoding a non-null string with UTF-8
 * is not expected to throw, so the later candidates look unreachable and the method
 * effectively round-trips the text through UTF-8. Confirm whether a real encoding
 * detection was intended.
 *
 * @param input text to convert
 * @return the re-decoded text, or {@code input} itself when every attempt throws (which is
 *         what happens for {@code null})
 */
public static String convertEncoding (String input) {
    final String[] candidates = {"UTF-8", "GBK", "Big5", "ISO-8859-1"};
    int next = 0;
    while (next < candidates.length) {
        try {
            return new String(input.getBytes(candidates[next]), StandardCharsets.UTF_8);
        } catch (Exception e) {
            // This candidate failed; move on to the next one.
            next++;
        }
    }
    return input;
}
}

View File

@ -0,0 +1,83 @@
package cn.iocoder.yudao.module.mdpf.util;
import com.hankcs.hanlp.HanLP;
import lombok.extern.slf4j.Slf4j;
import java.math.BigInteger;
import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;
import java.util.ArrayList;
import java.util.List;
@Slf4j
public class HammingUtils {
// ======================== 新增方法 ========================
/**
* 短文本处理逻辑按字符拆分
*/
private static List<String> handleShortText(String str) {
List<String> result = new ArrayList<>();
for (char c : str.toCharArray()) {
result.add(String.valueOf(c));
}
return result;
}
// ======================== 原始方法优化后 ========================
public static String getHash(String str) {
try {
MessageDigest md = MessageDigest.getInstance("MD5");
byte[] hash = md.digest(str.getBytes(StandardCharsets.UTF_8));
return new BigInteger(1, hash).toString(2);
} catch (Exception e) {
log.error("Hash计算失败: {}", e.getMessage());
return str; // 降级处理
}
}
public static String getSimHash(String str) {
int[] v = new int[128];
// 修复点调用已定义的handleShortText方法
List<String> keywords = str.length() < 200 ?
handleShortText(str) :
HanLP.extractKeyword(str, str.length());
for (int i = 0; i < keywords.size(); i++) {
String keywordHash = getHash(keywords.get(i));
// 补全128位
keywordHash = String.format("%128s", keywordHash)
.replace(' ', '0')
.substring(0, 128);
int weight = 10 - (i / (keywords.size() / 10));
for (int j = 0; j < 128; j++) {
v[j] += (keywordHash.charAt(j) == '1') ? weight : -weight;
}
}
StringBuilder simHash = new StringBuilder();
for (int bit : v) {
simHash.append(bit > 0 ? "1" : "0");
}
return simHash.toString();
}
public static int getHammingDistance(String hash1, String hash2) {
if (hash1.length() != hash2.length()) {
return -1;
}
int distance = 0;
for (int i = 0; i < hash1.length(); i++) {
if (hash1.charAt(i) != hash2.charAt(i)) {
distance++;
}
}
return distance;
}
public static double getSimilarity(String hash1, String hash2) {
int distance = getHammingDistance(hash1, hash2);
return 1.0 - (double) distance / 128; // 标准化到[0,1]
}
}

View File

@ -0,0 +1,24 @@
package cn.iocoder.yudao.module.mdpf.util;
import java.net.HttpURLConnection;
import java.net.URL;
/**
 * Helper for opening a remote file over HTTP.
 */
public class HttpURLConnectionUtil {
    /** Upper bound for establishing the connection, in milliseconds. */
    private static final int CONNECT_TIMEOUT_MILLIS = 10_000;
    /** Upper bound for a single blocking read, in milliseconds. */
    private static final int READ_TIMEOUT_MILLIS = 60_000;

    /**
     * Sends a GET request to the given URL and returns the open connection when the server
     * answers with HTTP 200.
     *
     * <p>The caller owns the returned connection: it reads the body from it and must
     * disconnect it afterwards. In every other case (malformed URL, non-HTTP URL, I/O
     * failure, any status other than 200) a message is printed, the connection is
     * released and {@code null} is returned.
     *
     * @param filePath URL of the file
     * @return the open connection, or {@code null} when the file could not be fetched
     */
    public static HttpURLConnection readFile (String filePath) {
        HttpURLConnection connection = null;
        try {
            URL url = new URL(filePath);
            connection = (HttpURLConnection) url.openConnection();
            connection.setRequestMethod("GET");
            // Without timeouts an unreachable or stalled server blocks the caller forever.
            connection.setConnectTimeout(CONNECT_TIMEOUT_MILLIS);
            connection.setReadTimeout(READ_TIMEOUT_MILLIS);
            int status = connection.getResponseCode();
            if (status == HttpURLConnection.HTTP_OK) {
                return connection;
            }
            System.out.println("Failed to fetch file. Server returned HTTP code: " + status);
        } catch (Exception e) {
            System.out.println("Error fetching file from URL: " + e.getMessage());
        }
        // Failure path, including exceptions raised after the connection was opened.
        if (connection != null) {
            connection.disconnect();
        }
        return null;
    }
}

View File

@ -0,0 +1,44 @@
package cn.iocoder.yudao.module.mdpf.util;
import com.fasterxml.jackson.databind.ObjectMapper;
import lombok.extern.slf4j.Slf4j;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import java.time.LocalDateTime;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import java.util.UUID;
/**
 * Shared helper for the file parsers; currently it builds the map that describes one
 * extracted text segment.
 */
@Component
@Slf4j
public class ParserUtils {
    @Autowired
    private ObjectMapper objectMapper;

    /**
     * Builds the map describing one extracted text segment together with its metadata.
     *
     * <p>The map carries the keys {@code id} (a freshly generated random UUID string),
     * {@code datasetMetaId}, {@code originalMinioPath}, {@code sourceFileExtension},
     * {@code extractedText}, {@code sourceSpecificMetadata}, {@code extractTime} and
     * {@code segmentType}. A {@code null} text is stored as the empty string and
     * {@code null} metadata as an empty map; all other values are stored as passed in.
     *
     * @return a new mutable map with the eight entries listed above
     */
    public Map<String, Object> createSegmentMap(String datasetMetaId, String originalMinioPath,
                                                String fileExtension, String extractedText,
                                                Map<String, Object> sourceSpecificMetadata, LocalDateTime processTime,
                                                String segmentType) {
        String text = extractedText;
        if (text == null) {
            text = "";
        }
        Map<String, Object> metadata = sourceSpecificMetadata;
        if (metadata == null) {
            metadata = Collections.emptyMap();
        }

        Map<String, Object> segment = new HashMap<>();
        // Temporary ID that identifies the segment while it is passed along.
        segment.put("id", UUID.randomUUID().toString());
        segment.put("datasetMetaId", datasetMetaId);
        segment.put("originalMinioPath", originalMinioPath);
        segment.put("sourceFileExtension", fileExtension);
        segment.put("extractedText", text);
        segment.put("sourceSpecificMetadata", metadata);
        segment.put("extractTime", processTime);
        segment.put("segmentType", segmentType);
        return segment;
    }
}

View File

@ -0,0 +1,158 @@
package cn.iocoder.yudao.module.mdpf.util;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* 文本清洗辅助工具类
*/
public class TextCleaningUtil {
// 简单HTML标签去除
private static final Pattern HTML_TAG_PATTERN = Pattern.compile("<[^>]*>");
// 简单Markdown格式去除 (粗体斜体链接图片等)
private static final Pattern MARKDOWN_PATTERN = Pattern.compile("(\\*\\*|__)(.*?)\\1|(\\*|_)(.*?)\\3|\\[(.*?)\\]\\((.*?)\\)|!\\((.*?)\\)\\[(.*?)\\]");
// 简单的邮箱和电话号码识别 (用于PII匿名化)
private static final Pattern EMAIL_PII_PATTERN = Pattern.compile("[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,6}");
private static final Pattern PHONE_PII_PATTERN = Pattern.compile("\\d{3}[-\\s]?\\d{3}[-\\s]?\\d{4}|\\(\\d{3}\\)[-\\s]?\\d{3}[-\\s]?\\d{4}");
/**
* 规范化空白字符将多个空格制表符换行符替换为单个空格并去除首尾空白
*/
public static String normalizeWhitespace(String text) {
if (text == null) return null;
return text.replaceAll("\\s+", " ").trim();
}
/**
* 规范化标点符号将全角标点转半角统一常见标点去除重复标点等
* 这是一个简化版实际可能需要更复杂的规则或第三方库
*/
public static String normalizePunctuation(String text) {
if (text == null) return null;
// 修复移除Java中不支持的命名参数 (target: replacement:)
String result = text.replace("", ",")
.replace("", ".")
.replace("", "!")
.replace("", "?");
// 修复String.replaceAll 不支持 lambda 表达式作为替换字符串
// 需要使用 Pattern Matcher 显式处理来保留重复标点的第一个字符
Pattern p = Pattern.compile("[\\.,!?;]{2,}"); // 匹配两个或更多连续的 .,!?; 标点符号
Matcher m = p.matcher(result);
StringBuffer sb = new StringBuffer(); // 用于构建替换后的字符串
while (m.find()) {
// 对于每个匹配项替换为该匹配项的第一个字符
m.appendReplacement(sb, Matcher.quoteReplacement(m.group().substring(0, 1)));
}
m.appendTail(sb); // 将匹配后的剩余部分追加到StringBuffer
return sb.toString();
}
/**
* 去除HTML标签
*/
public static String removeHtmlTags(String text) {
if (text == null) return null;
return HTML_TAG_PATTERN.matcher(text).replaceAll("");
}
/**
* 去除常见的Markdown格式
*/
public static String removeMarkdownFormatting(String text) {
if (text == null) return null;
return MARKDOWN_PATTERN.matcher(text).replaceAll("$2$4$5$7"); // 替换为捕获组中的内容
}
/**
* 简单匿名化 PII (个人身份信息)例如邮箱和电话号码
* 返回一个包含清洗后文本和是否包含 PII Map
*/
public static Map<String, Object> anonymizePii(String text) {
HashMap<String, Object> map = new HashMap<>();
boolean hasPii = false;
if (text == null) {
// 修正移除 'new *' 冗余行
return new HashMap<String, Object>() {{
put("text", null);
put("has_pii", false);
}};
}
Matcher emailMatcher = EMAIL_PII_PATTERN.matcher(text);
// 修正移除命名参数 'replacement:'
if (emailMatcher.find()) {
text = emailMatcher.replaceAll("[EMAIL_REDACTED]");
hasPii = true;
}
Matcher phoneMatcher = PHONE_PII_PATTERN.matcher(text);
// 修正移除命名参数 'replacement:'
if (phoneMatcher.find()) {
text = phoneMatcher.replaceAll("[PHONE_REDACTED]");
hasPii = true;
}
map.put("text",text);
map.put("has_pii",hasPii);
// 修正移除 'new *' 冗余行
return map;
}
/**
* 检查文本是否包含敏感词
*/
public static boolean containsSensitiveWords(String text, List<String> sensitiveWords) {
if (text == null || sensitiveWords == null || sensitiveWords.isEmpty()) {
return false;
}
String lowerText = text.toLowerCase();
for (String word : sensitiveWords) {
if (lowerText.contains(word.toLowerCase())) {
return true;
}
}
return false;
}
/**
 * Computes a simple quality score for the text (placeholder implementation).
 *
 * <p>The score is the share of letter/digit characters in the text plus
 * {@code length / 500}, capped at {@code 1.0}; i.e. longer text with a higher
 * alphanumeric share scores higher. A real implementation might rely on readability
 * indices (Flesch-Kincaid), grammar-checking APIs, content relevance, etc.
 *
 * @param text the text to score; may be {@code null}
 * @return a score in {@code [0.0, 1.0]}; {@code 0.0} for {@code null} or blank text
 */
public static Double calculateQualityScore(String text) {
    if (text == null || text.trim().isEmpty()) {
        return 0.0;
    }
    // The text is non-blank here, so its length is always > 0 and the division is safe.
    int length = text.length();
    long alphaNumericCount = text.chars().filter(Character::isLetterOrDigit).count();
    return Math.min(1.0, (double) alphaNumericCount / length + (double) length / 500.0);
}
/**
 * Roughly estimates the number of tokens in the text (placeholder implementation).
 *
 * <p>A real implementation would call the tokenizer of the target model
 * (SentencePiece, BPE, ...); here the text is simply split on runs of whitespace.
 *
 * @param text the text to measure; may be {@code null}
 * @return the number of whitespace-separated words; {@code 0} for {@code null} or blank text
 */
public static Integer countTokens(String text) {
    if (text == null) {
        return 0;
    }
    // Trim first: String.split keeps a leading empty string when the input starts
    // with a delimiter, which would inflate the count by one.
    String trimmed = text.trim();
    if (trimmed.isEmpty()) {
        return 0;
    }
    return trimmed.split("\\s+").length;
}
// TODO: 实现更复杂的文本处理功能例如
// - 语言检测 (使用 Apache Tika, Lingua 等库)
// - 关键词提取
// - 实体识别
// - 文本摘要
}

View File

@ -0,0 +1,127 @@
package cn.iocoder.yudao.module.mdpf.util;
import cn.iocoder.yudao.module.mdpf.dal.dataobject.dataset.DataSetFileMiddleDO;
import cn.iocoder.yudao.module.mdpf.util.TextCleaningUtil; // 此导入现在相对于新包是正确的
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import lombok.extern.slf4j.Slf4j;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import org.springframework.util.DigestUtils;
import org.springframework.util.StringUtils;
import java.math.BigDecimal;
import java.nio.charset.StandardCharsets;
import java.time.LocalDateTime;
import java.util.*;
import java.util.stream.Collectors;
/**
 * Text processor: deep-cleans raw text segments, evaluates their quality and converts
 * them into {@link DataSetFileMiddleDO} entities.
 */
@Component
@Slf4j
public class TextProcessor {

    // Injected but not used by any method of this class at the moment.
    @Autowired
    private ObjectMapper objectMapper;

    /**
     * Cleans a single raw text segment and evaluates its quality.
     *
     * <p>The segment map is read with the keys {@code originalMinioPath},
     * {@code sourceFileExtension}, {@code extractedText} and {@code sourceSpecificMetadata}
     * (itself a map read with the keys {@code dataSetFileUrl}, {@code dataSetFileType},
     * {@code datasetFileName} and {@code sourceFileName}). A missing metadata map is
     * treated as empty, leaving the corresponding entity fields {@code null}.
     *
     * @param rawSegmentMap raw text segment information (a map produced by a parsing strategy)
     * @param datasetId     ID of the associated DataSetMiddleDO (MySQL main table ID)
     * @param sourceFileId  ID of the associated DataSetMiddleMongoDO (MongoDB metadata ID)
     * @return the cleaned {@link DataSetFileMiddleDO}, or {@code null} when the segment is
     *         {@code null} or its extracted text is null/blank. The final length/quality
     *         filter is currently disabled, so low-scoring segments are still returned.
     */
    public DataSetFileMiddleDO cleanAndEvaluate(Map<String, Object> rawSegmentMap, Long datasetId, Long sourceFileId) {
        if (rawSegmentMap == null) {
            // A single malformed segment must not abort a whole batch with an NPE.
            log.warn("Skipping null segment for datasetId: {}, Source File ID: {}", datasetId, sourceFileId);
            return null;
        }

        // Extract the individual pieces of the raw segment from the map.
        String originalMinioPath = (String) rawSegmentMap.get("originalMinioPath");
        String fileExtension = (String) rawSegmentMap.get("sourceFileExtension");
        String extractedText = (String) rawSegmentMap.get("extractedText");

        @SuppressWarnings("unchecked") // the value is read as a Map<String, Object>, as before
        Map<String, Object> sourceSpecificMetadata =
                (Map<String, Object>) rawSegmentMap.get("sourceSpecificMetadata");
        if (sourceSpecificMetadata == null) {
            sourceSpecificMetadata = Collections.emptyMap();
        }

        // File-level fields carried inside sourceSpecificMetadata.
        String dataSetFileUrl = (String) sourceSpecificMetadata.get("dataSetFileUrl");
        String dataSetFileType = (String) sourceSpecificMetadata.get("dataSetFileType");
        String datasetFileName = (String) sourceSpecificMetadata.get("datasetFileName");
        String sourceFileName = (String) sourceSpecificMetadata.get("sourceFileName");

        if (!StringUtils.hasText(extractedText)) {
            log.warn("Skipping null or empty extracted text for datasetId: {}, Source File ID: {}", datasetId, sourceFileId);
            return null;
        }

        String cleanedText = extractedText;
        StringBuilder remarks = new StringBuilder(); // used for logging only, never persisted

        // --- Deep-cleaning steps (TextCleaningUtil) ---
        cleanedText = TextCleaningUtil.normalizeWhitespace(cleanedText);
        cleanedText = TextCleaningUtil.normalizePunctuation(cleanedText);
        cleanedText = TextCleaningUtil.removeHtmlTags(cleanedText);
        cleanedText = TextCleaningUtil.removeMarkdownFormatting(cleanedText);

        Map<String, Object> piiResult = TextCleaningUtil.anonymizePii(cleanedText);
        cleanedText = (String) piiResult.get("text");
        if ((Boolean) piiResult.get("has_pii")) {
            remarks.append("包含PII已匿名化; ");
        }

        // Sensitive-word filtering is currently disabled:
        // if (TextCleaningUtil.containsSensitiveWords(cleanedText,null)) {
        //     cleanedText = TextCleaningUtil.filterSensitiveWords(cleanedText);
        //     remarks.append("包含敏感词,已过滤; ");
        // }

        // --- Quality evaluation ---
        Double qualityScoreDouble = TextCleaningUtil.calculateQualityScore(cleanedText);
        BigDecimal qualityScore = BigDecimal.valueOf(qualityScoreDouble);
        Integer tokenCount = TextCleaningUtil.countTokens(cleanedText);

        DataSetFileMiddleDO cleanedFileDO = new DataSetFileMiddleDO();

        // IDs handed in by the caller.
        cleanedFileDO.setDataSetId(datasetId);
        cleanedFileDO.setSourceFileId(sourceFileId);

        // Fields taken from the file metadata.
        cleanedFileDO.setDataSetFileUrl(dataSetFileUrl);
        cleanedFileDO.setDataSetFileType(dataSetFileType);
        cleanedFileDO.setDatasetFileName(datasetFileName);
        cleanedFileDO.setSourceFileUrl(originalMinioPath); // originalMinioPath is stored as the source file URL
        cleanedFileDO.setSourceFileName(sourceFileName);
        cleanedFileDO.setSourceFileExtension(fileExtension);

        // Cleaned text and its derived metrics; the hash is the MD5 hex digest of the UTF-8 bytes.
        cleanedFileDO.setCleanedText(cleanedText);
        cleanedFileDO.setCleanedTextHash(DigestUtils.md5DigestAsHex(cleanedText.getBytes(StandardCharsets.UTF_8)));
        cleanedFileDO.setQualityScore(qualityScore);
        cleanedFileDO.setTokenCount(tokenCount);
        cleanedFileDO.setCleanTime(LocalDateTime.now());

        if (remarks.length() > 0) {
            log.debug("Cleaning remarks (datasetId: {}, sourceFileId: {}): {}", datasetId, sourceFileId, remarks);
        }

        // Final filter (text too short or quality score too low) is currently disabled:
        // if (!StringUtils.hasText(cleanedText) || tokenCount <= 10 || qualityScore.compareTo(BigDecimal.valueOf(0.2)) < 0) {
        //     log.warn("Filtered out text segment due to final quality check (datasetId: {}, sourceFileId: {}), remarks: {}", datasetId, sourceFileId, remarks.toString());
        //     return null; // null means the segment was filtered out
        // }
        return cleanedFileDO;
    }

    /**
     * Cleans a list of raw text segments.
     *
     * @param rawSegments  list of raw text segment maps; may be {@code null} or empty
     * @param datasetId    ID of the associated DataSetMiddleDO (MySQL main table ID)
     * @param sourceFileId ID of the associated DataSetMiddleMongoDO (MongoDB metadata ID)
     * @return the cleaned entities in input order, without the segments for which
     *         {@link #cleanAndEvaluate} returned {@code null}; an empty list when
     *         {@code rawSegments} is {@code null} or empty
     */
    public List<DataSetFileMiddleDO> cleanAndEvaluateList(List<Map<String, Object>> rawSegments, Long datasetId, Long sourceFileId) {
        if (rawSegments == null || rawSegments.isEmpty()) {
            return Collections.emptyList();
        }
        return rawSegments.stream()
                .map(rawSegmentMap -> cleanAndEvaluate(rawSegmentMap, datasetId, sourceFileId))
                .filter(Objects::nonNull) // drop skipped segments
                .collect(Collectors.toList());
    }
}

View File

@ -0,0 +1,12 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE mapper PUBLIC "-//mybatis.org//DTD Mapper 3.0//EN" "http://mybatis.org/dtd/mybatis-3-mapper.dtd">
<!--
    NOTE(review): this namespace points at cn.iocoder.yudao.module.data.dal.mysql..., while the
    other mapper XML files of this module use cn.iocoder.yudao.module.mdpf.dal.mapper... ;
    confirm it matches the fully-qualified name of the DataSetFileMiddleMapper interface.
-->
<mapper namespace="cn.iocoder.yudao.module.data.dal.mysql.datasetfilemiddle.DataSetFileMiddleMapper">
<!--
    In general, prefer the Mapper interface for CRUD operations.
    Write SQL in XML only for cases it cannot cover, such as multi-table join queries.
    The code generator currently only produces this Mapper XML file itself; the MybatisX
    plugin is recommended for generating queries.
    Documentation: https://www.iocoder.cn/MyBatis/x-plugins/
-->
</mapper>

View File

@ -0,0 +1,32 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE mapper PUBLIC "-//mybatis.org//DTD Mapper 3.0//EN" "http://mybatis.org/dtd/mybatis-3-mapper.dtd">
<mapper namespace="cn.iocoder.yudao.module.mdpf.dal.mapper.dataset.PlatformDatasetAnswerMapper">
<!--
    Physically deletes every platform_dataset_answer row whose dataset_id equals #{id}.
    NOTE(review): the select in this mapper filters on deleted = 0, which suggests the
    table also carries a logical-delete flag; confirm a hard delete is intended here.
-->
<delete id="deleteTheAnswer">
DELETE FROM platform_dataset_answer WHERE dataset_id = #{id}
</delete>
<!--
一般情况下,尽可能使用 Mapper 进行 CRUD 增删改查即可。
无法满足的场景,例如说多表关联查询,才使用 XML 编写 SQL。
代码生成器暂时只生成 Mapper XML 文件本身,更多推荐 MybatisX 快速开发插件来生成查询。
文档可见https://www.iocoder.cn/MyBatis/x-plugins/
-->
<!--
    Returns the non-deleted answers whose question_id is contained in the "collected"
    collection parameter.
    An empty or null collection would render "IN ()", which is invalid SQL, so in that
    case the query is forced to return no rows instead.
-->
<select id="getAnswersToYourQuestions"
        resultType="cn.iocoder.yudao.module.mdpf.controller.dataset.vo.PlatformDatasetAnswerRespVO">
    SELECT
        da.id,
        da.dataset_id,
        da.dataset_files_id,
        da.question_id,
        da.answer,
        da.create_time
    FROM platform_dataset_answer da
    WHERE da.deleted = 0
    <choose>
        <when test="collected != null and collected.size() > 0">
            AND da.question_id IN
            <foreach item="item" index="index" collection="collected"
                     open="(" separator="," close=")">
                #{item}
            </foreach>
        </when>
        <otherwise>
            AND 1 = 0
        </otherwise>
    </choose>
</select>
</mapper>

View File

@ -0,0 +1,30 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE mapper PUBLIC "-//mybatis.org//DTD Mapper 3.0//EN" "http://mybatis.org/dtd/mybatis-3-mapper.dtd">
<mapper namespace="cn.iocoder.yudao.module.mdpf.dal.mapper.dataset.PlatformDatasetQuestionMapper">
<!--
    Physically deletes every platform_dataset_question row whose dataset_id equals #{id}.
    NOTE(review): the select in this mapper filters on deleted = 0, which suggests the
    table also carries a logical-delete flag; confirm a hard delete is intended here.
-->
<delete id="deleteTheIssue">
DELETE FROM platform_dataset_question
WHERE dataset_id = #{id}
</delete>
<!--
一般情况下,尽可能使用 Mapper 进行 CRUD 增删改查即可。
无法满足的场景,例如说多表关联查询,才使用 XML 编写 SQL。
代码生成器暂时只生成 Mapper XML 文件本身,更多推荐 MybatisX 快速开发插件来生成查询。
文档可见https://www.iocoder.cn/MyBatis/x-plugins/
-->
<!--
    Returns the non-deleted questions that belong to the dataset #{datasetId}.
    The system column is back-quoted: written as 'system' it is a string literal, so every
    row would carry the constant text "system" instead of the stored value. Back-quotes are
    also required because SYSTEM is a reserved word in recent MySQL versions.
    NOTE(review): assumes platform_dataset_question has a column named system; confirm
    against the table definition.
-->
<select id="getAListOfIssues"
        resultType="cn.iocoder.yudao.module.mdpf.controller.dataset.vo.PlatformDatasetQuestionRespVO">
    SELECT
        id,
        dataset_id AS datasetId,
        dataset_files_id AS datasetFilesId,
        question,
        status,
        `system`,
        create_time AS createTime
    FROM platform_dataset_question
    WHERE deleted = 0
    AND dataset_id = #{datasetId}
</select>
</mapper>