feat(llm): 支持 doc 和 md 文件转换

- 新增 doc 和 md 文件转换功能
- 实现了将 doc 文件转换为 docx 格式
- 实现了将 md 文件转换为 txt 格式
- 优化了文件下载和处理逻辑
2025-02-28 14:25:05 +08:00 · 2025-02-28 14:25:05 +08:00 · 962c31e540
commit 962c31e540
parent 1afe99cf0c
2 changed files with 155 additions and 84 deletions
--- a/yudao-module-llm/yudao-module-llm-biz/src/main/java/cn/iocoder/yudao/module/llm/service/async/AsyncKnowledgeBase.java
+++ b/yudao-module-llm/yudao-module-llm-biz/src/main/java/cn/iocoder/yudao/module/llm/service/async/AsyncKnowledgeBase.java
@ -76,13 +76,6 @@ public class AsyncKnowledgeBase {
                        String extension = knowledge.getDocumentName().substring(lastIndex + 1).toLowerCase();
                        log.info("文档扩展名: {}", extension);
                        knowledgeEmbed(knowledge, knowledge.getKnowledgeBaseId());
-//                        if ("txt".equals(extension)) {
-//                            log.info("文档为 txt 文件，直接上传嵌入，文档 ID: {}", knowledge.getId());
-//                            ragHttpService.embedUploadFile(regUploadReqVO);
-//                        } else {
-//                            log.info("文档为非 txt 文件，调用知识嵌入方法，文档 ID: {}", knowledge.getId());
-//                            knowledgeEmbed(knowledge, knowledge.getKnowledgeBaseId());
-//                        }
                    } else {
                        log.warn("文档无扩展名，跳过处理，文档 ID: {}", knowledge.getId());
                    }
--- a/yudao-module-llm/yudao-module-llm-biz/src/main/java/cn/iocoder/yudao/module/llm/service/http/RagHttpService.java
+++ b/yudao-module-llm/yudao-module-llm-biz/src/main/java/cn/iocoder/yudao/module/llm/service/http/RagHttpService.java
@ -17,6 +17,9 @@ import com.alibaba.fastjson.JSONException;
 import com.alibaba.fastjson.JSONObject;
 import com.baomidou.mybatisplus.core.toolkit.BeanUtils;
 import com.google.gson.JsonArray;
+import com.vladsch.flexmark.html.HtmlRenderer;
+import com.vladsch.flexmark.parser.Parser;
+import com.vladsch.flexmark.util.data.MutableDataSet;
 import kong.unirest.HttpResponse;
 import kong.unirest.Unirest;
 import kong.unirest.UnirestException;
@ -28,6 +31,13 @@ import org.apache.http.client.methods.CloseableHttpResponse;
 import org.apache.http.client.methods.HttpGet;
 import org.apache.http.impl.client.CloseableHttpClient;
 import org.apache.http.impl.client.HttpClients;
+import org.apache.poi.hwpf.HWPFDocument;
+import org.apache.poi.hwpf.usermodel.Range;
+import org.apache.poi.xwpf.usermodel.XWPFDocument;
+import org.apache.poi.xwpf.usermodel.XWPFParagraph;
+import org.apache.poi.xwpf.usermodel.XWPFRun;
+import org.jetbrains.annotations.NotNull;
+import org.jsoup.Jsoup;
 import org.mozilla.universalchardet.UniversalDetector;
 import org.springframework.stereotype.Service;

@ -35,9 +45,12 @@ import javax.annotation.Resource;
 import java.io.*;
 import java.net.URL;
 import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.nio.file.StandardCopyOption;
 import java.util.List;
 import java.util.Map;
-import java.util.Objects;
 import java.util.concurrent.TimeUnit;

 import static cn.iocoder.yudao.framework.common.exception.util.ServiceExceptionUtil.exception;
@ -58,6 +71,9 @@ public class RagHttpService {
    @Resource
    private KnowledgeDocumentsMapper knowledgeDocumentsMapper;

+    //    @Resource
+    //    private  String TEMP_BASE_PATH= System.getProperty("user.dir") + "/temp";
+
    /**
     * 最大重试次数
     */
@ -99,7 +115,7 @@ public class RagHttpService {
     * @throws UnirestException 如果 Unirest 请求失败
     * @throws IOException      如果发生 I/O 错误
     */
-    public void embedUploadFile(RegUploadReqVO ragUploadReqVO) throws UnirestException, IOException {
+    public void embedUploadFile (RegUploadReqVO ragUploadReqVO) throws UnirestException, IOException {
        log.info("开始向量知识库文档上传流程");

        // 根据 fileId 查询知识库文档
@ -242,13 +258,13 @@ public class RagHttpService {
        log.info("向量知识库文档上传流程结束");
    }

-    public void printLogs(){
-        for (int i = 0; i < 5; i++){
+    /**
+     * Writes one fixed banner line to the INFO log five times in a row.
+     */
+    public void printLogs () {
+        for (int i = 0; i < 5; i++) {
            log.info("===============================响应成功===============================");
        }
    }

-    public static String formatDuration(long durationMillis) {
+    public static String formatDuration (long durationMillis) {
        long minutes = durationMillis / 60000;
        long seconds = (durationMillis % 60000) / 1000;
        long millis = durationMillis % 1000;
@ -361,7 +377,7 @@ public class RagHttpService {
     * @param id    知识库ID
     * @throws IOException 如果发生I/O错误
     */
-    public void knowledgeEmbed(KnowledgeRagEmbedReqVO reqVO, Long id) throws IOException {
+    public void knowledgeEmbed (KnowledgeRagEmbedReqVO reqVO, Long id) throws IOException {
        log.info("开始知识库向量嵌入流程，知识库ID: {}", id);

        // 获取向量嵌入接口的URL
@ -389,10 +405,34 @@ public class RagHttpService {
        log.info("更新文件状态为上传中，文件ID: {}", fileId);
        updateFileState(documents, KnowledgeStatusEnum.UPLOADING);

-        // 获取文件字节数组
-        log.info("开始获取文件字节数组，文件URL: {}", fileUrl);
-        byte[] fileBytes = Objects.requireNonNull(getFileByte(fileUrl));
-        log.info("成功获取文件字节数组，文件大小: {} 字节", fileBytes.length);
+        //        // 获取文件字节数组
+        //        log.info("开始获取文件字节数组，文件URL: {}", fileUrl);
+        //        byte[] fileBytes = Objects.requireNonNull(getFileByte(fileUrl));
+        //        log.info("成功获取文件字节数组，文件大小: {} 字节", fileBytes.length);
+
+        // 获取文件并存储到临时目录
+        log.info("开始下载文件，文件URL: {}", fileUrl);
+        Path tempFilePath = downloadFileToTemp(fileUrl, fileName);
+        log.info("文件已下载到临时目录: {}", tempFilePath);
+
+        String fileSuffix = getFileSuffix(fileName);
+        if ("doc".equals(fileSuffix)) {
+            log.info("正在处理 doc 文件");
+            try {
+                tempFilePath=   converterDocToDocx(tempFilePath.toString(), tempFilePath.toString().replace(".doc", ".docx"));
+            } catch (Exception e) {
+                throw new RuntimeException(e);
+            }
+        }
+
+        if ("md".equals(fileSuffix)) {
+            log.info("正在处理 md 文件");
+            try {
+                tempFilePath=  converterMdToTxt(tempFilePath.toString(), tempFilePath.toString().replace(".md", ".docx"));
+            } catch (Exception e) {
+                throw new RuntimeException(e);
+            }
+        }

        // 创建 OkHttpClient 实例
        log.info("创建 OkHttpClient 实例，设置超时时间为 3 分钟");
@ -408,7 +448,7 @@ public class RagHttpService {
                .setType(MultipartBody.FORM)
                .addFormDataPart("file_id", fileId)
                .addFormDataPart("file", fileName,
-                        RequestBody.create(fileBytes, MediaType.parse(mediaType))
+                        RequestBody.create(tempFilePath.toFile(), MediaType.parse(mediaType))
                )
                .build();

@ -462,49 +502,72 @@ public class RagHttpService {
        } catch (IOException e) {
            log.error("请求发生IO异常: {}", e.getMessage(), e);
            handleFailure(documents, FILE_UPLOAD_FAILED_MSG, e);
+        } finally {
+            // 删除临时文件
+            try {
+                Files.deleteIfExists(tempFilePath);
+                log.info("临时文件已删除: {}", tempFilePath);
+            } catch (IOException e) {
+                log.error("删除临时文件失败: {}", e.getMessage(), e);
+            }
        }

        log.info("知识库向量嵌入流程结束，知识库ID: {}", id);
    }

    /**
-     * 获取文件字节数组
+     * Downloads a remote file into the application's temporary directory.
+     * <p>
+     * The file is stored under the name {@code fileName}; an existing file with the
+     * same name is replaced.
+     * NOTE(review): {@code fileName} is resolved as-is against the temp directory, so
+     * names containing path separators or ".." and concurrent downloads of equally
+     * named files are not guarded against here - confirm callers pass plain, unique names.
     *
-     * @param fileUrl 文件地址
-     * @return 文件字节数组
+     * @param fileUrl  URL the file content is read from
+     * @param fileName name given to the file inside the temporary directory
+     * @return path of the downloaded temporary file
+     * @throws RuntimeException if the download or the write to disk fails; the
+     *                          underlying {@link IOException} is attached as the cause
     */
-    public static byte[] getFileByte(String fileUrl) {
-        log.info("开始读取远程文件，文件URL: {}", fileUrl);
-        try (InputStream inputStream = new URL(fileUrl).openStream();
-             ByteArrayOutputStream outputStream = new ByteArrayOutputStream()) {
+    public static Path downloadFileToTemp (String fileUrl, String fileName) {
+        try {
+            // Locate the application's temp directory
+            Path tempDir = getSystemTempDir();
+            log.info("系统临时目录: {}", tempDir.toAbsolutePath());

-            byte[] buffer = new byte[1024];
-            int bytesRead;
-            int totalBytesRead = 0;
+            // Create the directory if it does not exist yet
+            Files.createDirectories(tempDir);
+            log.info("临时目录已创建: {}", tempDir.toAbsolutePath());

-            while ((bytesRead = inputStream.read(buffer)) != -1) {
-                outputStream.write(buffer, 0, bytesRead);
-                totalBytesRead += bytesRead;
+            // Target path of the temporary file
+            Path tempFilePath = tempDir.resolve(fileName);
+
+            // Stream the remote content straight to disk instead of buffering it in memory
+            try (InputStream inputStream = new URL(fileUrl).openStream()) {
+                Files.copy(inputStream, tempFilePath, StandardCopyOption.REPLACE_EXISTING);
            }

-            log.info("成功读取远程文件，文件大小: {} 字节", totalBytesRead);
-            return outputStream.toByteArray();
-
+            return tempFilePath;
        } catch (IOException e) {
-            log.error("读取远程文件失败: {}", e.getMessage(), e);
-            throw exception(new ErrorCode(10001_001, "文件读取错误"));
+            log.error("下载文件到临时目录失败: {}", e.getMessage(), e);
+            // Keep the IOException as the cause so callers can see why the download failed
+            throw new RuntimeException("文件下载错误", e);
        }
    }

+    /**
+     * Returns the application's scratch directory, creating it when it is missing.
+     * <p>
+     * The directory is named {@code myapp_temp} and is located beneath the path given
+     * by the {@code java.io.tmpdir} system property.
+     *
+     * @return the directory path as returned by {@link Files#createDirectories}
+     * @throws IOException if the directory cannot be created
+     */
+    public static Path getSystemTempDir () throws IOException {
+        return Files.createDirectories(
+                Paths.get(System.getProperty("java.io.tmpdir"), "myapp_temp"));
+    }
+
    /**
     * 获取文件类型
     *
     * @param fileName 文件名
     * @return 文件类型
     */
-    private static String getMediaType(String fileName) {
+    private static String getMediaType (String fileName) {
        log.info("获取文件类型，文件名: {}", fileName);
-        String fileSuffix = fileName.substring(fileName.lastIndexOf(".") + 1);
+        String fileSuffix = getFileSuffix(fileName);
        String mediaType;
        switch (fileSuffix) {
            case "pdf":
@ -526,6 +589,67 @@ public class RagHttpService {
        log.info("文件类型: {}", mediaType);
        return mediaType;
    }
+
+    /**
+     * Returns the lower-cased extension of a file name.
+     * <p>
+     * The extension is the text after the last {@code '.'}. A name without any dot,
+     * or one that ends with a dot, has no extension and yields an empty string.
+     *
+     * @param fileName file name to inspect; must not be {@code null}
+     * @return the extension in lower case, or {@code ""} when there is none
+     */
+    @NotNull
+    private static String getFileSuffix (String fileName) {
+        int dotIndex = fileName.lastIndexOf('.');
+        if (dotIndex < 0) {
+            // Without a dot the whole name would otherwise be reported as the suffix
+            return "";
+        }
+        // Locale.ROOT keeps the result independent of the JVM's default locale
+        return fileName.substring(dotIndex + 1).toLowerCase(java.util.Locale.ROOT);
+    }
+
+    /**
+     * Converts a Markdown file to plain text and writes the text to {@code outputPath}.
+     * <p>
+     * The input is read as UTF-8, rendered to HTML with Flexmark, reduced to text with
+     * Jsoup's {@code text()}, and written back as UTF-8.
+     * NOTE(review): the visible caller passes an output path ending in ".docx" although
+     * the content written here is plain text - confirm the intended extension.
+     *
+     * @param inputPath  path of the Markdown file to read
+     * @param outputPath path of the file to write
+     * @return {@code outputPath} as a {@link Path}
+     * @throws Exception if the input cannot be read or the output cannot be written
+     */
+    public static Path converterMdToTxt (String inputPath, String outputPath) throws Exception {
+        // Load the Markdown source
+        byte[] rawMarkdown = Files.readAllBytes(Paths.get(inputPath));
+        String markdown = new String(rawMarkdown, StandardCharsets.UTF_8);
+
+        // Markdown -> HTML via Flexmark, using default options
+        MutableDataSet flexmarkOptions = new MutableDataSet();
+        Parser markdownParser = Parser.builder(flexmarkOptions).build();
+        HtmlRenderer htmlRenderer = HtmlRenderer.builder(flexmarkOptions).build();
+        String renderedHtml = htmlRenderer.render(markdownParser.parse(markdown));
+
+        // HTML -> text via Jsoup
+        String textOnly = Jsoup.parse(renderedHtml).text();
+
+        // Write the extracted text as UTF-8
+        Path target = Paths.get(outputPath);
+        Files.write(target, textOnly.getBytes(StandardCharsets.UTF_8));
+        return target;
+    }
+
+    /**
+     * Copies the paragraph text of a legacy {@code .doc} file into a new {@code .docx} file.
+     * <p>
+     * Only the text returned by each source paragraph is carried over: every source
+     * paragraph becomes one target paragraph holding a single run with that text.
+     *
+     * @param inputPath  path of the {@code .doc} file to read
+     * @param outputPath path of the {@code .docx} file to write
+     * @return {@code outputPath} as a {@link Path}
+     * @throws Exception if the source cannot be read or the target cannot be written
+     */
+    public static Path converterDocToDocx(String inputPath, String outputPath) throws Exception {
+        // The input stream and both documents are closed automatically, also when the
+        // conversion fails part-way through.
+        try (InputStream in = Files.newInputStream(Paths.get(inputPath));
+             HWPFDocument doc = new HWPFDocument(in);
+             XWPFDocument docx = new XWPFDocument()) {
+
+            // Copy the text paragraph by paragraph
+            Range range = doc.getRange();
+            for (int i = 0; i < range.numParagraphs(); i++) {
+                // NOTE(review): the HWPF paragraph text is copied unchanged; it may still
+                // end with the paragraph mark / control characters - confirm whether
+                // these should be stripped before writing.
+                String text = range.getParagraph(i).text();
+
+                XWPFParagraph paragraph = docx.createParagraph();
+                XWPFRun run = paragraph.createRun();
+                run.setText(text);
+            }
+
+            // Write the DOCX file
+            try (FileOutputStream out = new FileOutputStream(outputPath)) {
+                docx.write(out);
+            }
+
+            return Paths.get(outputPath);
+        }
+    }
    /**
     * 处理响应结果
     */
@ -567,8 +691,6 @@ public class RagHttpService {
        throw new RuntimeException(errorMsg);
    }

-
-
    /**
     * 修改知识库文档状态
     *
@ -602,48 +724,4 @@ public class RagHttpService {
    private KnowledgeDocumentsDO getKnowledgeDocuments (String fileId) {
        return knowledgeDocumentsMapper.selectById(fileId);
    }
-
-
-    public static void main (String[] args) {
-        // 创建 OkHttpClient 实例
-        OkHttpClient client = new OkHttpClient();
-        String ragEmbed = "http://36.103.199.248:8123/embed";
-        String fileId = "778899";
-        String fileName = "docx23_副本.docx";
-        String fileUrl = "http://xhllm.xinnuojinzhi.com/admin-api/infra/file/29/get/5533434c4ed6b58415c33db46a73be3abe121b0ab66f25fb1a9050a2a978fda2.docx";
-        String mediaType = getMediaType(fileName);
-        byte[] fileBytes = Objects.requireNonNull(getFileByte(fileUrl));
-        log.info("URL: {}, fileId: {} ,fileName: {}, fileUrl: {}, mediaType: {} ", ragEmbed, fileId, fileName, fileUrl, mediaType);
-
-        // 创建文件对象
-        //        File file = new File("/Users/yangliu/Documents/测试上传/测试上传 - new/docx1_副本.docx");
-
-        // 创建 MultipartBody
-        RequestBody requestBody = new MultipartBody.Builder()
-                .setType(MultipartBody.FORM)
-
-                .addFormDataPart("file_id", fileId)
-                .addFormDataPart("file", fileName,
-                        RequestBody.create(fileBytes, MediaType.parse(mediaType)))
-                .build();
-
-        // 创建请求
-        Request request = new Request.Builder()
-                .url(ragEmbed)
-                .post(requestBody)
-                .addHeader("accept", "application/json")
-                .build();
-
-        // 发送请求
-        try (Response response = client.newCall(request).execute()) {
-            if (response.isSuccessful()) {
-                System.out.println("Request successful: " + response.body().string());
-            } else {
-                System.out.println("Request failed: " + response.code() + " " + response.message());
-            }
-        } catch (IOException e) {
-            e.printStackTrace();
-        }
-    }
-
 }