Merge remote-tracking branch 'origin/master'

This commit is contained in:
leon 2025-02-28 15:19:48 +08:00
commit 0a9ad7c2b7
5 changed files with 185 additions and 93 deletions

View File

@ -107,6 +107,24 @@
<artifactId>ok2curl</artifactId>
<version>0.4.5</version>
</dependency>
<dependency>
<groupId>com.vladsch.flexmark</groupId>
<artifactId>flexmark-all</artifactId>
<version>0.62.2</version>
</dependency>
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.15.3</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-scratchpad</artifactId>
<version>5.2.3</version>
</dependency>
</dependencies>
</project>

View File

@ -76,13 +76,6 @@ public class AsyncKnowledgeBase {
String extension = knowledge.getDocumentName().substring(lastIndex + 1).toLowerCase();
log.info("文档扩展名: {}", extension);
knowledgeEmbed(knowledge, knowledge.getKnowledgeBaseId());
// if ("txt".equals(extension)) {
// log.info("文档为 txt 文件,直接上传嵌入,文档 ID: {}", knowledge.getId());
// ragHttpService.embedUploadFile(regUploadReqVO);
// } else {
// log.info("文档为非 txt 文件,调用知识嵌入方法,文档 ID: {}", knowledge.getId());
// knowledgeEmbed(knowledge, knowledge.getKnowledgeBaseId());
// }
} else {
log.warn("文档无扩展名,跳过处理,文档 ID: {}", knowledge.getId());
}

View File

@ -17,6 +17,9 @@ import com.alibaba.fastjson.JSONException;
import com.alibaba.fastjson.JSONObject;
import com.baomidou.mybatisplus.core.toolkit.BeanUtils;
import com.google.gson.JsonArray;
import com.vladsch.flexmark.html.HtmlRenderer;
import com.vladsch.flexmark.parser.Parser;
import com.vladsch.flexmark.util.data.MutableDataSet;
import kong.unirest.HttpResponse;
import kong.unirest.Unirest;
import kong.unirest.UnirestException;
@ -28,6 +31,13 @@ import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.usermodel.Range;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
import org.apache.poi.xwpf.usermodel.XWPFRun;
import org.jetbrains.annotations.NotNull;
import org.jsoup.Jsoup;
import org.mozilla.universalchardet.UniversalDetector;
import org.springframework.stereotype.Service;
@ -35,9 +45,12 @@ import javax.annotation.Resource;
import java.io.*;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.StandardCopyOption;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.concurrent.TimeUnit;
import static cn.iocoder.yudao.framework.common.exception.util.ServiceExceptionUtil.exception;
@ -58,6 +71,9 @@ public class RagHttpService {
@Resource
private KnowledgeDocumentsMapper knowledgeDocumentsMapper;
// @Resource
// private String TEMP_BASE_PATH= System.getProperty("user.dir") + "/temp";
/**
* 最大重试次数
*/
@ -99,7 +115,7 @@ public class RagHttpService {
* @throws UnirestException 如果 Unirest 请求失败
* @throws IOException 如果发生 I/O 错误
*/
public void embedUploadFile(RegUploadReqVO ragUploadReqVO) throws UnirestException, IOException {
public void embedUploadFile (RegUploadReqVO ragUploadReqVO) throws UnirestException, IOException {
log.info("开始向量知识库文档上传流程");
// 根据 fileId 查询知识库文档
@ -242,13 +258,13 @@ public class RagHttpService {
log.info("向量知识库文档上传流程结束");
}
public void printLogs(){
for (int i = 0; i < 5; i++){
public void printLogs () {
for (int i = 0; i < 5; i++) {
log.info("===============================响应成功===============================");
}
}
public static String formatDuration(long durationMillis) {
public static String formatDuration (long durationMillis) {
long minutes = durationMillis / 60000;
long seconds = (durationMillis % 60000) / 1000;
long millis = durationMillis % 1000;
@ -361,7 +377,7 @@ public class RagHttpService {
* @param id 知识库ID
* @throws IOException 如果发生I/O错误
*/
public void knowledgeEmbed(KnowledgeRagEmbedReqVO reqVO, Long id) throws IOException {
public void knowledgeEmbed (KnowledgeRagEmbedReqVO reqVO, Long id) throws IOException {
log.info("开始知识库向量嵌入流程知识库ID: {}", id);
// 获取向量嵌入接口的URL
@ -389,10 +405,34 @@ public class RagHttpService {
log.info("更新文件状态为上传中文件ID: {}", fileId);
updateFileState(documents, KnowledgeStatusEnum.UPLOADING);
// 获取文件字节数组
log.info("开始获取文件字节数组文件URL: {}", fileUrl);
byte[] fileBytes = Objects.requireNonNull(getFileByte(fileUrl));
log.info("成功获取文件字节数组,文件大小: {} 字节", fileBytes.length);
// // 获取文件字节数组
// log.info("开始获取文件字节数组文件URL: {}", fileUrl);
// byte[] fileBytes = Objects.requireNonNull(getFileByte(fileUrl));
// log.info("成功获取文件字节数组,文件大小: {} 字节", fileBytes.length);
// 获取文件并存储到临时目录
log.info("开始下载文件文件URL: {}", fileUrl);
Path tempFilePath = downloadFileToTemp(fileUrl, fileName);
log.info("文件已下载到临时目录: {}", tempFilePath);
String fileSuffix = getFileSuffix(fileName);
if ("doc".equals(fileSuffix)) {
log.info("正在处理 doc 文件");
try {
tempFilePath= converterDocToDocx(tempFilePath.toString(), tempFilePath.toString().replace(".doc", ".docx"));
} catch (Exception e) {
throw new RuntimeException(e);
}
}
if ("md".equals(fileSuffix)) {
log.info("正在处理 md 文件");
try {
tempFilePath= converterMdToTxt(tempFilePath.toString(), tempFilePath.toString().replace(".md", ".docx"));
} catch (Exception e) {
throw new RuntimeException(e);
}
}
// 创建 OkHttpClient 实例
log.info("创建 OkHttpClient 实例,设置超时时间为 3 分钟");
@ -408,7 +448,7 @@ public class RagHttpService {
.setType(MultipartBody.FORM)
.addFormDataPart("file_id", fileId)
.addFormDataPart("file", fileName,
RequestBody.create(fileBytes, MediaType.parse(mediaType))
RequestBody.create(tempFilePath.toFile(), MediaType.parse(mediaType))
)
.build();
@ -462,49 +502,72 @@ public class RagHttpService {
} catch (IOException e) {
log.error("请求发生IO异常: {}", e.getMessage(), e);
handleFailure(documents, FILE_UPLOAD_FAILED_MSG, e);
} finally {
// 删除临时文件
try {
Files.deleteIfExists(tempFilePath);
log.info("临时文件已删除: {}", tempFilePath);
} catch (IOException e) {
log.error("删除临时文件失败: {}", e.getMessage(), e);
}
}
log.info("知识库向量嵌入流程结束知识库ID: {}", id);
}
/**
* 获取文件字节数组
* 下载文件到临时目录
*
* @param fileUrl 文件地址
* @return 文件字节数组
* @param fileUrl 文件地址
* @param fileName 文件名
* @return 临时文件路径
*/
public static byte[] getFileByte(String fileUrl) {
log.info("开始读取远程文件文件URL: {}", fileUrl);
try (InputStream inputStream = new URL(fileUrl).openStream();
ByteArrayOutputStream outputStream = new ByteArrayOutputStream()) {
public static Path downloadFileToTemp (String fileUrl, String fileName) {
try {
// 获取临时目录
Path tempDir = getSystemTempDir();
log.info("系统临时目录: {}", tempDir.toAbsolutePath());
byte[] buffer = new byte[1024];
int bytesRead;
int totalBytesRead = 0;
// 创建目录如果不存在
Files.createDirectories(tempDir);
log.info("临时目录已创建: {}", tempDir.toAbsolutePath());
while ((bytesRead = inputStream.read(buffer)) != -1) {
outputStream.write(buffer, 0, bytesRead);
totalBytesRead += bytesRead;
// 创建临时文件路径
Path tempFilePath = tempDir.resolve(fileName);
// 下载文件到临时目录
try (InputStream inputStream = new URL(fileUrl).openStream()) {
Files.copy(inputStream, tempFilePath, StandardCopyOption.REPLACE_EXISTING);
}
log.info("成功读取远程文件,文件大小: {} 字节", totalBytesRead);
return outputStream.toByteArray();
return tempFilePath;
} catch (IOException e) {
log.error("读取远程文件失败: {}", e.getMessage(), e);
throw exception(new ErrorCode(10001_001, "文件读取错误"));
log.error("下载文件到临时目录失败: {}", e.getMessage(), e);
throw new RuntimeException("文件下载错误");
}
}
/**
 * Returns the application's scratch directory, creating it on demand.
 *
 * <p>The directory is the {@code myapp_temp} child of the location named by the
 * {@code java.io.tmpdir} system property. Missing directories are created by
 * {@link Files#createDirectories}.
 *
 * @return the path returned by {@link Files#createDirectories} for that directory
 * @throws IOException if the directory cannot be created
 */
public static Path getSystemTempDir () throws IOException {
    return Files.createDirectories(
            Paths.get(System.getProperty("java.io.tmpdir"), "myapp_temp"));
}
/**
* 获取文件类型
*
* @param fileName 文件名
* @return 文件类型
*/
private static String getMediaType(String fileName) {
private static String getMediaType (String fileName) {
log.info("获取文件类型,文件名: {}", fileName);
String fileSuffix = fileName.substring(fileName.lastIndexOf(".") + 1);
String fileSuffix = getFileSuffix(fileName);
String mediaType;
switch (fileSuffix) {
case "pdf":
@ -526,6 +589,67 @@ public class RagHttpService {
log.info("文件类型: {}", mediaType);
return mediaType;
}
/**
 * Extracts the lower-cased extension of a file name.
 *
 * <p>The extension is the text after the last {@code '.'}. A name that contains
 * no dot has no extension and yields an empty string rather than the whole name,
 * so an extension-less file that happens to be called {@code "doc"} or
 * {@code "pdf"} is not mistaken for that file type. A name ending in a dot
 * also yields an empty string.
 *
 * <p>Lower-casing uses {@link java.util.Locale#ROOT} so the result does not depend
 * on the JVM default locale (under a Turkish locale {@code "GIF"} would otherwise
 * become {@code "gıf"} and fail every extension comparison).
 *
 * @param fileName file name to inspect; must not be {@code null}
 * @return the extension without the leading dot, in lower case, or {@code ""}
 *         when the name has no extension
 */
@NotNull
private static String getFileSuffix (String fileName) {
    int dotIndex = fileName.lastIndexOf('.');
    if (dotIndex < 0) {
        // No dot at all: report "no extension" instead of echoing the file name.
        return "";
    }
    return fileName.substring(dotIndex + 1).toLowerCase(java.util.Locale.ROOT);
}
/**
 * Converts a Markdown file into a plain-text file.
 *
 * <p>The Markdown is read as UTF-8, rendered to HTML with a default-configured
 * Flexmark parser/renderer, reduced to text with Jsoup and written back as UTF-8.
 * The output file is created, or truncated if it already exists.
 *
 * @param inputPath  path of the Markdown file to read
 * @param outputPath path of the text file to write
 * @return {@code outputPath} as a {@link Path}
 * @throws Exception if reading, converting or writing fails
 */
public static Path converterMdToTxt (String inputPath, String outputPath) throws Exception {
    // Load the Markdown source.
    byte[] rawMarkdown = Files.readAllBytes(Paths.get(inputPath));
    String markdown = new String(rawMarkdown, StandardCharsets.UTF_8);

    // Markdown -> HTML; parser and renderer share one (empty) option set.
    MutableDataSet flexmarkOptions = new MutableDataSet();
    Parser markdownParser = Parser.builder(flexmarkOptions).build();
    HtmlRenderer htmlRenderer = HtmlRenderer.builder(flexmarkOptions).build();
    String renderedHtml = htmlRenderer.render(markdownParser.parse(markdown));

    // HTML -> text. Jsoup's text() returns whitespace-normalized text.
    String extractedText = Jsoup.parse(renderedHtml).text();

    // Persist the extracted text as UTF-8 bytes.
    Path target = Paths.get(outputPath);
    Files.write(target, extractedText.getBytes(StandardCharsets.UTF_8));
    return target;
}
/**
 * Converts a legacy Word {@code .doc} file into a {@code .docx} file carrying the
 * same paragraph text.
 *
 * <p>Only the text of each paragraph is copied: every source paragraph becomes one
 * output paragraph holding a single run. Nothing else from the source document is
 * transferred by this method.
 *
 * <p>The source stream, the HWPF document and the XWPF document are declared in a
 * single try-with-resources statement, so each of them is closed on every exit
 * path, including a failure while parsing the source or writing the target.
 *
 * @param inputPath  path of the {@code .doc} file to read
 * @param outputPath path of the {@code .docx} file to write
 * @return {@code outputPath} as a {@link Path}
 * @throws Exception if reading, converting or writing fails
 */
public static Path converterDocToDocx(String inputPath, String outputPath) throws Exception {
    try (InputStream source = Files.newInputStream(Paths.get(inputPath));
         HWPFDocument doc = new HWPFDocument(source);
         XWPFDocument docx = new XWPFDocument()) {
        Range range = doc.getRange();
        int paragraphCount = range.numParagraphs();
        for (int i = 0; i < paragraphCount; i++) {
            // NOTE(review): the HWPF paragraph text may still carry its paragraph
            // terminator character - confirm whether it should be trimmed here.
            String text = range.getParagraph(i).text();

            // One DOCX paragraph with a single run per source paragraph.
            XWPFParagraph paragraph = docx.createParagraph();
            XWPFRun run = paragraph.createRun();
            run.setText(text);
        }

        // Write the assembled DOCX before the documents are closed.
        try (FileOutputStream out = new FileOutputStream(outputPath)) {
            docx.write(out);
        }
        return Paths.get(outputPath);
    }
}
/**
* 处理响应结果
*/
@ -567,8 +691,6 @@ public class RagHttpService {
throw new RuntimeException(errorMsg);
}
/**
* 修改知识库文档状态
*
@ -602,48 +724,4 @@ public class RagHttpService {
private KnowledgeDocumentsDO getKnowledgeDocuments (String fileId) {
    // Delegates to the injected mapper's selectById and returns its result unchanged.
    final KnowledgeDocumentsDO document = knowledgeDocumentsMapper.selectById(fileId);
    return document;
}
public static void main (String[] args) {
    // Manual smoke test: downloads one fixed sample document and posts it to the embed endpoint.
    OkHttpClient httpClient = new OkHttpClient();

    // Fixed sample values used for this run.
    final String embedEndpoint = "http://36.103.199.248:8123/embed";
    final String sampleFileId = "778899";
    final String sampleFileName = "docx23_副本.docx";
    final String sampleFileUrl = "http://xhllm.xinnuojinzhi.com/admin-api/infra/file/29/get/5533434c4ed6b58415c33db46a73be3abe121b0ab66f25fb1a9050a2a978fda2.docx";

    String contentType = getMediaType(sampleFileName);
    byte[] payload = Objects.requireNonNull(getFileByte(sampleFileUrl));
    log.info("URL: {}, fileId: {} ,fileName: {}, fileUrl: {}, mediaType: {} ", embedEndpoint, sampleFileId, sampleFileName, sampleFileUrl, contentType);

    // Multipart form: the file id plus the file content under its original name.
    MultipartBody.Builder formBuilder = new MultipartBody.Builder().setType(MultipartBody.FORM);
    formBuilder.addFormDataPart("file_id", sampleFileId);
    formBuilder.addFormDataPart("file", sampleFileName,
            RequestBody.create(payload, MediaType.parse(contentType)));
    RequestBody multipartBody = formBuilder.build();

    // POST request asking for a JSON reply.
    Request embedRequest = new Request.Builder()
            .url(embedEndpoint)
            .post(multipartBody)
            .addHeader("accept", "application/json")
            .build();

    // Execute synchronously and print the outcome.
    try (Response reply = httpClient.newCall(embedRequest).execute()) {
        if (!reply.isSuccessful()) {
            System.out.println("Request failed: " + reply.code() + " " + reply.message());
        } else {
            System.out.println("Request successful: " + reply.body().string());
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
}
}

View File

@ -3,16 +3,9 @@ package cn.iocoder.yudao.module.llm.utils;
import cn.hutool.core.text.csv.CsvReader;
import cn.hutool.core.text.csv.CsvUtil;
import cn.hutool.core.util.URLUtil;
import cn.iocoder.yudao.module.llm.dal.dataobject.dataset.DatasetQuestionDO;
import cn.iocoder.yudao.module.llm.utils.vo.CsvDataSetVO;
import com.opencsv.CSVParser;
import com.opencsv.CSVParserBuilder;
import com.opencsv.CSVReader;
import com.opencsv.CSVReaderBuilder;
import com.opencsv.exceptions.CsvValidationException;
import lombok.extern.slf4j.Slf4j;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.springframework.stereotype.Component;
@ -23,7 +16,6 @@ import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
@Slf4j

View File

@ -159,12 +159,23 @@
<artifactId>hanlp</artifactId>
<version>portable-1.3.4</version>
</dependency>
<dependency>
<groupId>com.vladsch.flexmark</groupId>
<artifactId>flexmark-all</artifactId>
<version>0.62.2</version>
</dependency>
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.10.3</version>
<version>1.15.3</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-scratchpad</artifactId>
<version>5.2.3</version>
</dependency>
</dependencies>
<build>