diff --git a/yudao-module-llm/yudao-module-llm-biz/pom.xml b/yudao-module-llm/yudao-module-llm-biz/pom.xml
index d5f728ca3..f8a8a4c5d 100644
--- a/yudao-module-llm/yudao-module-llm-biz/pom.xml
+++ b/yudao-module-llm/yudao-module-llm-biz/pom.xml
@@ -107,6 +107,24 @@
ok2curl
0.4.5
+
+
+
+ com.vladsch.flexmark
+ flexmark-all
+ 0.62.2
+
+
+ org.jsoup
+ jsoup
+ 1.15.3
+
+
+
+ org.apache.poi
+ poi-scratchpad
+ 5.2.3
+
diff --git a/yudao-module-llm/yudao-module-llm-biz/src/main/java/cn/iocoder/yudao/module/llm/service/async/AsyncKnowledgeBase.java b/yudao-module-llm/yudao-module-llm-biz/src/main/java/cn/iocoder/yudao/module/llm/service/async/AsyncKnowledgeBase.java
index 3124fa3a0..9dceee18d 100644
--- a/yudao-module-llm/yudao-module-llm-biz/src/main/java/cn/iocoder/yudao/module/llm/service/async/AsyncKnowledgeBase.java
+++ b/yudao-module-llm/yudao-module-llm-biz/src/main/java/cn/iocoder/yudao/module/llm/service/async/AsyncKnowledgeBase.java
@@ -76,13 +76,6 @@ public class AsyncKnowledgeBase {
String extension = knowledge.getDocumentName().substring(lastIndex + 1).toLowerCase();
log.info("文档扩展名: {}", extension);
knowledgeEmbed(knowledge, knowledge.getKnowledgeBaseId());
-// if ("txt".equals(extension)) {
-// log.info("文档为 txt 文件,直接上传嵌入,文档 ID: {}", knowledge.getId());
-// ragHttpService.embedUploadFile(regUploadReqVO);
-// } else {
-// log.info("文档为非 txt 文件,调用知识嵌入方法,文档 ID: {}", knowledge.getId());
-// knowledgeEmbed(knowledge, knowledge.getKnowledgeBaseId());
-// }
} else {
log.warn("文档无扩展名,跳过处理,文档 ID: {}", knowledge.getId());
}
diff --git a/yudao-module-llm/yudao-module-llm-biz/src/main/java/cn/iocoder/yudao/module/llm/service/http/RagHttpService.java b/yudao-module-llm/yudao-module-llm-biz/src/main/java/cn/iocoder/yudao/module/llm/service/http/RagHttpService.java
index 59d340c59..09ee4d04b 100644
--- a/yudao-module-llm/yudao-module-llm-biz/src/main/java/cn/iocoder/yudao/module/llm/service/http/RagHttpService.java
+++ b/yudao-module-llm/yudao-module-llm-biz/src/main/java/cn/iocoder/yudao/module/llm/service/http/RagHttpService.java
@@ -17,6 +17,9 @@ import com.alibaba.fastjson.JSONException;
import com.alibaba.fastjson.JSONObject;
import com.baomidou.mybatisplus.core.toolkit.BeanUtils;
import com.google.gson.JsonArray;
+import com.vladsch.flexmark.html.HtmlRenderer;
+import com.vladsch.flexmark.parser.Parser;
+import com.vladsch.flexmark.util.data.MutableDataSet;
import kong.unirest.HttpResponse;
import kong.unirest.Unirest;
import kong.unirest.UnirestException;
@@ -28,6 +31,13 @@ import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
+import org.apache.poi.hwpf.HWPFDocument;
+import org.apache.poi.hwpf.usermodel.Range;
+import org.apache.poi.xwpf.usermodel.XWPFDocument;
+import org.apache.poi.xwpf.usermodel.XWPFParagraph;
+import org.apache.poi.xwpf.usermodel.XWPFRun;
+import org.jetbrains.annotations.NotNull;
+import org.jsoup.Jsoup;
import org.mozilla.universalchardet.UniversalDetector;
import org.springframework.stereotype.Service;
@@ -35,9 +45,12 @@ import javax.annotation.Resource;
import java.io.*;
import java.net.URL;
import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.nio.file.StandardCopyOption;
import java.util.List;
import java.util.Map;
-import java.util.Objects;
import java.util.concurrent.TimeUnit;
import static cn.iocoder.yudao.framework.common.exception.util.ServiceExceptionUtil.exception;
@@ -58,6 +71,9 @@ public class RagHttpService {
@Resource
private KnowledgeDocumentsMapper knowledgeDocumentsMapper;
+ // @Resource
+ // private String TEMP_BASE_PATH= System.getProperty("user.dir") + "/temp";
+
/**
* 最大重试次数
*/
@@ -99,7 +115,7 @@ public class RagHttpService {
* @throws UnirestException 如果 Unirest 请求失败
* @throws IOException 如果发生 I/O 错误
*/
- public void embedUploadFile(RegUploadReqVO ragUploadReqVO) throws UnirestException, IOException {
+ public void embedUploadFile (RegUploadReqVO ragUploadReqVO) throws UnirestException, IOException {
log.info("开始向量知识库文档上传流程");
// 根据 fileId 查询知识库文档
@@ -242,13 +258,13 @@ public class RagHttpService {
log.info("向量知识库文档上传流程结束");
}
- public void printLogs(){
- for (int i = 0; i < 5; i++){
+ public void printLogs () {
+ for (int i = 0; i < 5; i++) {
log.info("===============================响应成功===============================");
}
}
- public static String formatDuration(long durationMillis) {
+ public static String formatDuration (long durationMillis) {
long minutes = durationMillis / 60000;
long seconds = (durationMillis % 60000) / 1000;
long millis = durationMillis % 1000;
@@ -361,7 +377,7 @@ public class RagHttpService {
* @param id 知识库ID
* @throws IOException 如果发生I/O错误
*/
- public void knowledgeEmbed(KnowledgeRagEmbedReqVO reqVO, Long id) throws IOException {
+ public void knowledgeEmbed (KnowledgeRagEmbedReqVO reqVO, Long id) throws IOException {
log.info("开始知识库向量嵌入流程,知识库ID: {}", id);
// 获取向量嵌入接口的URL
@@ -389,10 +405,34 @@ public class RagHttpService {
log.info("更新文件状态为上传中,文件ID: {}", fileId);
updateFileState(documents, KnowledgeStatusEnum.UPLOADING);
- // 获取文件字节数组
- log.info("开始获取文件字节数组,文件URL: {}", fileUrl);
- byte[] fileBytes = Objects.requireNonNull(getFileByte(fileUrl));
- log.info("成功获取文件字节数组,文件大小: {} 字节", fileBytes.length);
+ // // 获取文件字节数组
+ // log.info("开始获取文件字节数组,文件URL: {}", fileUrl);
+ // byte[] fileBytes = Objects.requireNonNull(getFileByte(fileUrl));
+ // log.info("成功获取文件字节数组,文件大小: {} 字节", fileBytes.length);
+
+ // 获取文件并存储到临时目录
+ log.info("开始下载文件,文件URL: {}", fileUrl);
+ Path tempFilePath = downloadFileToTemp(fileUrl, fileName);
+ log.info("文件已下载到临时目录: {}", tempFilePath);
+
+ String fileSuffix = getFileSuffix(fileName);
+ if ("doc".equals(fileSuffix)) {
+ log.info("正在处理 doc 文件");
+ try {
+ tempFilePath= converterDocToDocx(tempFilePath.toString(), tempFilePath.toString().replace(".doc", ".docx"));
+ } catch (Exception e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+ if ("md".equals(fileSuffix)) {
+ log.info("正在处理 md 文件");
+ try {
+ tempFilePath= converterMdToTxt(tempFilePath.toString(), tempFilePath.toString().replace(".md", ".docx"));
+ } catch (Exception e) {
+ throw new RuntimeException(e);
+ }
+ }
// 创建 OkHttpClient 实例
log.info("创建 OkHttpClient 实例,设置超时时间为 3 分钟");
@@ -408,7 +448,7 @@ public class RagHttpService {
.setType(MultipartBody.FORM)
.addFormDataPart("file_id", fileId)
.addFormDataPart("file", fileName,
- RequestBody.create(fileBytes, MediaType.parse(mediaType))
+ RequestBody.create(tempFilePath.toFile(), MediaType.parse(mediaType))
)
.build();
@@ -462,49 +502,72 @@ public class RagHttpService {
} catch (IOException e) {
log.error("请求发生IO异常: {}", e.getMessage(), e);
handleFailure(documents, FILE_UPLOAD_FAILED_MSG, e);
+ } finally {
+ // 删除临时文件
+ try {
+ Files.deleteIfExists(tempFilePath);
+ log.info("临时文件已删除: {}", tempFilePath);
+ } catch (IOException e) {
+ log.error("删除临时文件失败: {}", e.getMessage(), e);
+ }
}
log.info("知识库向量嵌入流程结束,知识库ID: {}", id);
}
/**
- * 获取文件字节数组
+ * Downloads a remote file into the application's temporary directory.
+ * <p>
+ * The target is {@code fileName} resolved against {@link #getSystemTempDir()}; an existing file
+ * at that path is overwritten. A name that would resolve outside the temporary directory
+ * (for example one containing {@code ../}) or onto the directory itself is rejected.
*
- * @param fileUrl 文件地址
- * @return 文件字节数组
+ * @param fileUrl URL of the file to download
+ * @param fileName name to give the downloaded file inside the temporary directory
+ * @return path of the downloaded temporary file
+ * @throws IllegalArgumentException if {@code fileName} does not denote a file inside the temporary directory
+ * @throws RuntimeException if the directory cannot be prepared or the download fails; the I/O error is kept as the cause
*/
- public static byte[] getFileByte(String fileUrl) {
- log.info("开始读取远程文件,文件URL: {}", fileUrl);
- try (InputStream inputStream = new URL(fileUrl).openStream();
- ByteArrayOutputStream outputStream = new ByteArrayOutputStream()) {
+ public static Path downloadFileToTemp (String fileUrl, String fileName) {
+ try {
+ // Locate the application's temporary directory.
+ Path tempDir = getSystemTempDir();
+ log.info("系统临时目录: {}", tempDir.toAbsolutePath());
- byte[] buffer = new byte[1024];
- int bytesRead;
- int totalBytesRead = 0;
+ // Make sure the directory exists (no-op when it already does).
+ Files.createDirectories(tempDir);
+ log.info("临时目录已创建: {}", tempDir.toAbsolutePath());
- while ((bytesRead = inputStream.read(buffer)) != -1) {
- outputStream.write(buffer, 0, bytesRead);
- totalBytesRead += bytesRead;
+ // Build the target path and refuse names that escape the temporary directory.
+ Path baseDir = tempDir.normalize();
+ Path tempFilePath = baseDir.resolve(fileName).normalize();
+ if (!tempFilePath.startsWith(baseDir) || tempFilePath.equals(baseDir)) {
+ throw new IllegalArgumentException("文件名非法: " + fileName);
+ }
+
+ // Stream the remote content straight to disk, replacing any previous file of the same name.
+ try (InputStream inputStream = new URL(fileUrl).openStream()) {
+ Files.copy(inputStream, tempFilePath, StandardCopyOption.REPLACE_EXISTING);
}
- log.info("成功读取远程文件,文件大小: {} 字节", totalBytesRead);
- return outputStream.toByteArray();
-
+ return tempFilePath;
} catch (IOException e) {
- log.error("读取远程文件失败: {}", e.getMessage(), e);
- throw exception(new ErrorCode(10001_001, "文件读取错误"));
+ log.error("下载文件到临时目录失败: {}", e.getMessage(), e);
+ // Keep the original I/O error as the cause so callers can diagnose the failure.
+ throw new RuntimeException("文件下载错误", e);
}
}
+ /**
+ * Returns the application's working directory under the JVM temporary directory, creating it if needed.
+ * <p>
+ * The location is the {@code java.io.tmpdir} system property joined with the fixed
+ * sub-directory name {@code myapp_temp}.
+ *
+ * @return the directory path, as returned by {@link Files#createDirectories}
+ * @throws IOException if the directory cannot be created
+ */
+ public static Path getSystemTempDir () throws IOException {
+ String sysTempDir = System.getProperty("java.io.tmpdir");
+
+ Path tempDir = Paths.get(sysTempDir, "myapp_temp");
+
+ return Files.createDirectories(tempDir);
+ }
+
/**
* 获取文件类型
*
* @param fileName 文件名
* @return 文件类型
*/
- private static String getMediaType(String fileName) {
+ private static String getMediaType (String fileName) {
log.info("获取文件类型,文件名: {}", fileName);
- String fileSuffix = fileName.substring(fileName.lastIndexOf(".") + 1);
+ String fileSuffix = getFileSuffix(fileName);
String mediaType;
switch (fileSuffix) {
case "pdf":
@@ -526,6 +589,67 @@ public class RagHttpService {
log.info("文件类型: {}", mediaType);
return mediaType;
}
+
+ /**
+ * Extracts the lower-cased extension of a file name.
+ * <p>
+ * The extension is everything after the last {@code "."}. When the name contains no dot the
+ * whole name is returned (lower-cased); when it ends with a dot the result is the empty string.
+ * Lower-casing uses {@link java.util.Locale#ROOT} so the result does not depend on the JVM's
+ * default locale (under a Turkish locale {@code "GIF"} would otherwise become {@code "gıf"}).
+ *
+ * @param fileName file name to inspect; must not be {@code null}
+ * @return the extension without the dot, in lower case
+ */
+ @NotNull
+ private static String getFileSuffix (String fileName) {
+     return fileName.substring(fileName.lastIndexOf(".") + 1).toLowerCase(java.util.Locale.ROOT);
+ }
+
+ /**
+ * Converts a Markdown file to plain text.
+ * <p>
+ * The input is read as UTF-8, rendered to HTML with Flexmark (default options), reduced to its
+ * text content with Jsoup, and written as UTF-8 to {@code outputPath}.
+ * <p>
+ * NOTE(review): the output is plain text regardless of the extension of {@code outputPath};
+ * the caller visible in this change passes a path ending in {@code .docx} — confirm that is intended.
+ *
+ * @param inputPath path of the Markdown file to read
+ * @param outputPath path of the text file to write
+ * @return {@code outputPath} as a {@link Path}
+ * @throws Exception if the input cannot be read or the output cannot be written
+ */
+ public static Path converterMdToTxt (String inputPath, String outputPath) throws Exception {
+ // Read the whole Markdown file, decoding explicitly as UTF-8.
+ String mdContent = new String(Files.readAllBytes(Paths.get(inputPath)), StandardCharsets.UTF_8);
+
+ // Render the Markdown to HTML with Flexmark; parser and renderer share one (empty) option set.
+ MutableDataSet options = new MutableDataSet();
+ Parser parser = Parser.builder(options).build();
+ HtmlRenderer renderer = HtmlRenderer.builder(options).build();
+ String html = renderer.render(parser.parse(mdContent));
+
+ // Let Jsoup parse the HTML and keep only its text content.
+ String plainText = Jsoup.parse(html).text();
+
+ // Write the text through an explicit UTF-8 writer.
+ // NOTE(review): presumably used instead of Files.writeString for pre-Java-11 compatibility — confirm.
+ Path path = Paths.get(outputPath);
+ try (Writer writer = new BufferedWriter(
+ new OutputStreamWriter(
+ Files.newOutputStream(path), StandardCharsets.UTF_8))) {
+ writer.write(plainText);
+ }
+ return path;
+ }
+
+ /**
+ * Converts a legacy Word {@code .doc} file into a {@code .docx} file carrying the same paragraph text.
+ * <p>
+ * Only the text of each source paragraph is copied, one run per paragraph; character and
+ * paragraph formatting are not transferred. The input stream and both document objects are
+ * closed before the method returns, including when the conversion fails.
+ *
+ * @param inputPath path of the {@code .doc} file to read
+ * @param outputPath path of the {@code .docx} file to write (created or overwritten)
+ * @return {@code outputPath} as a {@link Path}
+ * @throws Exception if the source cannot be read or parsed, or the target cannot be written
+ */
+ public static Path converterDocToDocx(String inputPath, String outputPath) throws Exception {
+     // The stream is declared separately so it is closed even if parsing the DOC fails;
+     // the DOCX document is closed too, which the previous version never did.
+     try (InputStream in = Files.newInputStream(Paths.get(inputPath));
+          HWPFDocument doc = new HWPFDocument(in);
+          XWPFDocument docx = new XWPFDocument()) {
+
+         // Copy the text of every source paragraph into its own DOCX paragraph.
+         Range range = doc.getRange();
+         int paragraphCount = range.numParagraphs();
+         for (int i = 0; i < paragraphCount; i++) {
+             XWPFRun run = docx.createParagraph().createRun();
+             run.setText(range.getParagraph(i).text());
+         }
+
+         // Persist the DOCX before the documents are closed.
+         try (FileOutputStream out = new FileOutputStream(outputPath)) {
+             docx.write(out);
+         }
+     }
+     return Paths.get(outputPath);
+ }
/**
* 处理响应结果
*/
@@ -567,8 +691,6 @@ public class RagHttpService {
throw new RuntimeException(errorMsg);
}
-
-
/**
* 修改知识库文档状态
*
@@ -602,48 +724,4 @@ public class RagHttpService {
private KnowledgeDocumentsDO getKnowledgeDocuments (String fileId) {
return knowledgeDocumentsMapper.selectById(fileId);
}
-
-
- public static void main (String[] args) {
- // 创建 OkHttpClient 实例
- OkHttpClient client = new OkHttpClient();
- String ragEmbed = "http://36.103.199.248:8123/embed";
- String fileId = "778899";
- String fileName = "docx23_副本.docx";
- String fileUrl = "http://xhllm.xinnuojinzhi.com/admin-api/infra/file/29/get/5533434c4ed6b58415c33db46a73be3abe121b0ab66f25fb1a9050a2a978fda2.docx";
- String mediaType = getMediaType(fileName);
- byte[] fileBytes = Objects.requireNonNull(getFileByte(fileUrl));
- log.info("URL: {}, fileId: {} ,fileName: {}, fileUrl: {}, mediaType: {} ", ragEmbed, fileId, fileName, fileUrl, mediaType);
-
- // 创建文件对象
- // File file = new File("/Users/yangliu/Documents/测试上传/测试上传 - new/docx1_副本.docx");
-
- // 创建 MultipartBody
- RequestBody requestBody = new MultipartBody.Builder()
- .setType(MultipartBody.FORM)
-
- .addFormDataPart("file_id", fileId)
- .addFormDataPart("file", fileName,
- RequestBody.create(fileBytes, MediaType.parse(mediaType)))
- .build();
-
- // 创建请求
- Request request = new Request.Builder()
- .url(ragEmbed)
- .post(requestBody)
- .addHeader("accept", "application/json")
- .build();
-
- // 发送请求
- try (Response response = client.newCall(request).execute()) {
- if (response.isSuccessful()) {
- System.out.println("Request successful: " + response.body().string());
- } else {
- System.out.println("Request failed: " + response.code() + " " + response.message());
- }
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
-
}
diff --git a/yudao-module-llm/yudao-module-llm-biz/src/main/java/cn/iocoder/yudao/module/llm/utils/DataSetReadFileUtils.java b/yudao-module-llm/yudao-module-llm-biz/src/main/java/cn/iocoder/yudao/module/llm/utils/DataSetReadFileUtils.java
index b996ce14a..5557c0cc6 100644
--- a/yudao-module-llm/yudao-module-llm-biz/src/main/java/cn/iocoder/yudao/module/llm/utils/DataSetReadFileUtils.java
+++ b/yudao-module-llm/yudao-module-llm-biz/src/main/java/cn/iocoder/yudao/module/llm/utils/DataSetReadFileUtils.java
@@ -3,16 +3,9 @@ package cn.iocoder.yudao.module.llm.utils;
import cn.hutool.core.text.csv.CsvReader;
import cn.hutool.core.text.csv.CsvUtil;
import cn.hutool.core.util.URLUtil;
-import cn.iocoder.yudao.module.llm.dal.dataobject.dataset.DatasetQuestionDO;
import cn.iocoder.yudao.module.llm.utils.vo.CsvDataSetVO;
-import com.opencsv.CSVParser;
-import com.opencsv.CSVParserBuilder;
-import com.opencsv.CSVReader;
-import com.opencsv.CSVReaderBuilder;
import com.opencsv.exceptions.CsvValidationException;
import lombok.extern.slf4j.Slf4j;
-import org.apache.poi.ss.usermodel.Cell;
-import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.springframework.stereotype.Component;
@@ -23,7 +16,6 @@ import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
-import java.util.ArrayList;
import java.util.List;
@Slf4j
diff --git a/yudao-server/pom.xml b/yudao-server/pom.xml
index b69297840..a533cab2d 100644
--- a/yudao-server/pom.xml
+++ b/yudao-server/pom.xml
@@ -159,12 +159,23 @@
hanlp
portable-1.3.4
+
+
+ com.vladsch.flexmark
+ flexmark-all
+ 0.62.2
+
org.jsoup
jsoup
- 1.10.3
+ 1.15.3
+
+ org.apache.poi
+ poi-scratchpad
+ 5.2.3
+