[update] CSV 解析优化
This commit is contained in:
parent
3f7f62cd65
commit
4c55030083
@ -1,5 +1,8 @@
|
||||
package cn.iocoder.yudao.module.llm.utils;
|
||||
|
||||
import cn.hutool.core.text.csv.CsvReader;
|
||||
import cn.hutool.core.text.csv.CsvUtil;
|
||||
import cn.hutool.core.util.URLUtil;
|
||||
import cn.iocoder.yudao.module.llm.dal.dataobject.dataset.DatasetQuestionDO;
|
||||
import cn.iocoder.yudao.module.llm.utils.vo.CsvDataSetVO;
|
||||
import com.opencsv.CSVParser;
|
||||
@ -52,87 +55,22 @@ public class DataSetReadFileUtils {
|
||||
*/
|
||||
// Reads the CSV located at csvUrl and returns its rows as a list of CsvDataSetVO.
// NOTE(review): this span looks like a rendered diff in which the removed (opencsv-based) and the
// added (Hutool-based) implementations are interleaved without +/- markers - for example dataSetVos
// and url are each declared twice - confirm against the repository before treating it as compilable.
public static List<CsvDataSetVO> readParseCsv (String csvUrl) throws IOException, CsvValidationException {
|
||||
|
||||
List<CsvDataSetVO> dataSetVos = new ArrayList<>();
|
||||
List<CsvDataSetVO> dataSetVos;
|
||||
|
||||
// Create the CSV reader
|
||||
CSVReader csvReader = null;
|
||||
try {
|
||||
URL url = new URL(csvUrl);
|
||||
// NOTE(review): no charset is passed to InputStreamReader, so the JVM default charset is used - confirm the files match it
BufferedReader reader = new BufferedReader(new InputStreamReader(url.openStream()));
|
||||
// Create the URL object
|
||||
URL url = URLUtil.url(csvUrl);
|
||||
|
||||
// Create the CSV parser, using a comma as the separator
|
||||
CSVParser parser = new CSVParserBuilder().withSeparator(',').build();
|
||||
// Build the CSV reader
|
||||
csvReader = new CSVReaderBuilder(reader).withCSVParser(parser).build();
|
||||
|
||||
// Read the header row
|
||||
String[] headers = csvReader.readNext();
|
||||
String[] line;
|
||||
|
||||
while (true) {
|
||||
try {
|
||||
// Read the next data row
|
||||
line = csvReader.readNext();
|
||||
if (line == null) {
|
||||
// End of the data set reached
|
||||
break;
|
||||
}
|
||||
} catch (com.opencsv.exceptions.CsvValidationException e) {
|
||||
// A failure while reading a row is rethrown as an IOException that keeps the cause
|
||||
throw new IOException("读取 CSV 行时发生错误", e);
|
||||
}
|
||||
|
||||
// Only rows with the same number of cells as the header row are converted; other rows are skipped
|
||||
if (line.length == headers.length) {
|
||||
// Index of the "system" column
|
||||
int systemIndex = getIndex(headers, "system");
|
||||
// Index of the "question" column
|
||||
int questionIndex = getIndex(headers, "question");
|
||||
// Index of the "answer" column
|
||||
int answerIndex = getIndex(headers, "answer");
|
||||
// Value of the "system" column, or "" when the header has no such column
|
||||
String systemValue = systemIndex == -1? "" : line[systemIndex];
|
||||
// Value of the "question" column, or "" when the header has no such column
|
||||
String questionValue = questionIndex == -1? "" : line[questionIndex];
|
||||
// Value of the "answer" column, or "" when the header has no such column
|
||||
String answerValue = answerIndex == -1? "" : line[answerIndex];
|
||||
// Build a CsvDataSetVO from the values located through the header indexes
|
||||
CsvDataSetVO dataSetVO = new CsvDataSetVO(systemValue, questionValue, answerValue);
|
||||
// Append the object to the result list
|
||||
dataSetVos.add(dataSetVO);
|
||||
}
|
||||
}
|
||||
} finally {
|
||||
if (csvReader != null) {
|
||||
try {
|
||||
// Close the CSV reader
|
||||
csvReader.close();
|
||||
|
||||
} catch (IOException e) {
|
||||
// A failure while closing the reader is only logged, not rethrown
|
||||
log.error("关闭CSV读取器时发生错误", e);
|
||||
}
|
||||
}
|
||||
// The stream opened from the URL is wrapped in try-with-resources, so it is closed when the block exits.
// NOTE(review): no charset is passed to InputStreamReader, so the JVM default charset is used - confirm the files match it
try (BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(url.openStream()))) {
|
||||
CsvReader reader = CsvUtil.getReader(bufferedReader);
|
||||
// presumably Hutool maps each row onto a CsvDataSetVO by header name - confirm against the CsvReader docs
dataSetVos = reader.read(bufferedReader, CsvDataSetVO.class);
|
||||
} catch (IOException e) {
|
||||
// NOTE(review): the IOException is wrapped in an unchecked RuntimeException even though the signature declares throws IOException
throw new RuntimeException(e);
|
||||
}
|
||||
|
||||
// Return the parsed objects
|
||||
return dataSetVos;
|
||||
}
|
||||
|
||||
/**
|
||||
* Finds the position of a column name within the header row.
|
||||
* The comparison uses String.equals, so only an exact, case-sensitive match counts, and the first match wins.
|
||||
* @param headers header row; a null element would raise a NullPointerException because equals is called on it
|
||||
* @param columnName column name to look for
|
||||
* @return zero-based index of the first matching header, or -1 when no header matches
|
||||
*/
|
||||
private static int getIndex (String[] headers, String columnName) {
|
||||
for (int i = 0; i < headers.length; i++) {
|
||||
if (headers[i].equals(columnName)) {
|
||||
return i;
|
||||
}
|
||||
}
|
||||
// No header equals columnName
return -1;
|
||||
}
|
||||
|
||||
public static Workbook readXlsxFromUrl(String filePath) {
|
||||
HttpURLConnection connection = readFile(filePath);
|
||||
|
@ -1,5 +1,6 @@
|
||||
package cn.iocoder.yudao.module.llm.utils.vo;
|
||||
|
||||
import cn.hutool.core.annotation.Alias;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
@ -13,7 +14,10 @@ import lombok.ToString;
|
||||
// Value object holding one data-set row made of three text fields: system, question and answer.
// It is the bean type passed to reader.read(bufferedReader, CsvDataSetVO.class) in readParseCsv.
// NOTE(review): lines above this point are hidden by the diff hunk, so further class-level
// annotations may exist - confirm against the full file.
@Data
|
||||
@ToString
|
||||
public class CsvDataSetVO {
|
||||
// presumably @Alias lets Hutool match this field to the CSV header named "system" - confirm
@Alias("system")
|
||||
private String system;
|
||||
// presumably matched to the CSV header named "question" - confirm
@Alias("question")
|
||||
private String question;
|
||||
// presumably matched to the CSV header named "answer" - confirm
@Alias("answer")
|
||||
private String answer;
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user