[update] CSV 解析优化

This commit is contained in:
Liuyang 2025-01-14 13:11:16 +08:00
parent 3f7f62cd65
commit 4c55030083
2 changed files with 15 additions and 73 deletions

View File

@@ -1,5 +1,8 @@
package cn.iocoder.yudao.module.llm.utils;
import cn.hutool.core.text.csv.CsvReader;
import cn.hutool.core.text.csv.CsvUtil;
import cn.hutool.core.util.URLUtil;
import cn.iocoder.yudao.module.llm.dal.dataobject.dataset.DatasetQuestionDO;
import cn.iocoder.yudao.module.llm.utils.vo.CsvDataSetVO;
import com.opencsv.CSVParser;
@@ -52,87 +55,22 @@ public class DataSetReadFileUtils {
*/
// Reads the CSV at csvUrl and returns its rows as CsvDataSetVO objects.
// NOTE(review): this span is a diff rendering with the +/- markers stripped, so lines of the
// previous implementation and of the new one are interleaved (dataSetVos and url are each
// declared twice below). It is not compilable as shown.
//  - Previous lines: open the URL stream, read the header row with an opencsv reader, then map
//    every row whose length equals the header length through the "system" / "question" /
//    "answer" column indices; a column missing from the header yields an empty string.
//  - New lines: resolve the URL via URLUtil, open a BufferedReader in try-with-resources and let
//    the Hutool CsvReader map the rows onto CsvDataSetVO.
// NOTE(review): the signature still declares IOException and CsvValidationException, while the
// new lines rethrow IOException wrapped in RuntimeException — confirm which contract callers expect.
public static List<CsvDataSetVO> readParseCsv (String csvUrl) throws IOException, CsvValidationException {
List<CsvDataSetVO> dataSetVos = new ArrayList<>();
List<CsvDataSetVO> dataSetVos;
// Create the CSV reader
CSVReader csvReader = null;
try {
URL url = new URL(csvUrl);
// NOTE(review): no charset is passed to InputStreamReader, so decoding presumably falls back
// to the JVM default — confirm the expected encoding of the CSV files.
BufferedReader reader = new BufferedReader(new InputStreamReader(url.openStream()));
// Create the URL object
URL url = URLUtil.url(csvUrl);
// Create a CSV parser that uses a comma as the separator
CSVParser parser = new CSVParserBuilder().withSeparator(',').build();
// Build the CSV reader
csvReader = new CSVReaderBuilder(reader).withCSVParser(parser).build();
// Read the header row
String[] headers = csvReader.readNext();
String[] line;
while (true) {
try {
// Read the next data row
line = csvReader.readNext();
if (line == null) {
// No more rows: the data set has been read completely
break;
}
} catch (com.opencsv.exceptions.CsvValidationException e) {
// Surface a row-level read failure as an IOException, keeping the cause
throw new IOException("读取 CSV 行时发生错误", e);
}
// Only map the row when its length equals the header row's length; other rows are skipped
if (line.length == headers.length) {
// Index of the "system" column
int systemIndex = getIndex(headers, "system");
// Index of the "question" column
int questionIndex = getIndex(headers, "question");
// Index of the "answer" column
int answerIndex = getIndex(headers, "answer");
// Value of the "system" column (empty string when the column is absent)
String systemValue = systemIndex == -1? "" : line[systemIndex];
// Value of the "question" column (empty string when the column is absent)
String questionValue = questionIndex == -1? "" : line[questionIndex];
// Value of the "answer" column (empty string when the column is absent)
String answerValue = answerIndex == -1? "" : line[answerIndex];
// Build a CsvDataSetVO from the values located through the header indices
CsvDataSetVO dataSetVO = new CsvDataSetVO(systemValue, questionValue, answerValue);
// Add the object to the result list
dataSetVos.add(dataSetVO);
}
}
} finally {
if (csvReader != null) {
try {
// Close the CSV reader
csvReader.close();
} catch (IOException e) {
// Closing failed: log it instead of propagating
log.error("关闭CSV读取器时发生错误", e);
}
}
// New implementation: the reader is closed automatically by try-with-resources
try (BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(url.openStream()))) {
CsvReader reader = CsvUtil.getReader(bufferedReader);
// NOTE(review): bufferedReader is handed both to getReader(...) and to read(...) — confirm
// against the Hutool API that passing it twice is intended.
dataSetVos = reader.read(bufferedReader, CsvDataSetVO.class);
} catch (IOException e) {
throw new RuntimeException(e);
}
// Return the parsed objects
return dataSetVos;
}
/**
 * Locates a column by name within the header row.
 *
 * <p>The comparison is an exact, case-sensitive {@code equals} match and the first
 * matching position wins.
 *
 * @param headers    the header row cells
 * @param columnName the column name to look for
 * @return the zero-based position of the first matching header, or {@code -1} when none matches
 */
private static int getIndex (String[] headers, String columnName) {
    int position = 0;
    while (position < headers.length) {
        String header = headers[position];
        if (header.equals(columnName)) {
            return position;
        }
        position++;
    }
    return -1;
}
public static Workbook readXlsxFromUrl(String filePath) {
HttpURLConnection connection = readFile(filePath);

View File

@@ -1,5 +1,6 @@
package cn.iocoder.yudao.module.llm.utils.vo;
import cn.hutool.core.annotation.Alias;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
@@ -13,7 +14,10 @@ import lombok.ToString;
// Value object holding the three text fields of one data-set record: system, question, answer.
// NOTE(review): the @Alias names equal the field names; presumably they are what Hutool's
// CSV-to-bean mapping matches against the CSV header row — confirm against the Hutool docs.
@Data
@ToString
public class CsvDataSetVO {
/** Content of the "system" field. */
@Alias("system")
private String system;
/** Content of the "question" field. */
@Alias("question")
private String question;
/** Content of the "answer" field. */
@Alias("answer")
private String answer;
}