diff --git a/yudao-module-llm/yudao-module-llm-biz/pom.xml b/yudao-module-llm/yudao-module-llm-biz/pom.xml index d5f728ca3..f8a8a4c5d 100644 --- a/yudao-module-llm/yudao-module-llm-biz/pom.xml +++ b/yudao-module-llm/yudao-module-llm-biz/pom.xml @@ -107,6 +107,24 @@ ok2curl 0.4.5 + + + + com.vladsch.flexmark + flexmark-all + 0.62.2 + + + org.jsoup + jsoup + 1.15.3 + + + + org.apache.poi + poi-scratchpad + 5.2.3 + diff --git a/yudao-module-llm/yudao-module-llm-biz/src/main/java/cn/iocoder/yudao/module/llm/service/async/AsyncKnowledgeBase.java b/yudao-module-llm/yudao-module-llm-biz/src/main/java/cn/iocoder/yudao/module/llm/service/async/AsyncKnowledgeBase.java index 3124fa3a0..9dceee18d 100644 --- a/yudao-module-llm/yudao-module-llm-biz/src/main/java/cn/iocoder/yudao/module/llm/service/async/AsyncKnowledgeBase.java +++ b/yudao-module-llm/yudao-module-llm-biz/src/main/java/cn/iocoder/yudao/module/llm/service/async/AsyncKnowledgeBase.java @@ -76,13 +76,6 @@ public class AsyncKnowledgeBase { String extension = knowledge.getDocumentName().substring(lastIndex + 1).toLowerCase(); log.info("文档扩展名: {}", extension); knowledgeEmbed(knowledge, knowledge.getKnowledgeBaseId()); -// if ("txt".equals(extension)) { -// log.info("文档为 txt 文件,直接上传嵌入,文档 ID: {}", knowledge.getId()); -// ragHttpService.embedUploadFile(regUploadReqVO); -// } else { -// log.info("文档为非 txt 文件,调用知识嵌入方法,文档 ID: {}", knowledge.getId()); -// knowledgeEmbed(knowledge, knowledge.getKnowledgeBaseId()); -// } } else { log.warn("文档无扩展名,跳过处理,文档 ID: {}", knowledge.getId()); } diff --git a/yudao-module-llm/yudao-module-llm-biz/src/main/java/cn/iocoder/yudao/module/llm/service/http/RagHttpService.java b/yudao-module-llm/yudao-module-llm-biz/src/main/java/cn/iocoder/yudao/module/llm/service/http/RagHttpService.java index 59d340c59..09ee4d04b 100644 --- a/yudao-module-llm/yudao-module-llm-biz/src/main/java/cn/iocoder/yudao/module/llm/service/http/RagHttpService.java +++ b/yudao-module-llm/yudao-module-llm-biz/src/main/java/cn/iocoder/yudao/module/llm/service/http/RagHttpService.java @@ -17,6 +17,9 @@ import com.alibaba.fastjson.JSONException; import com.alibaba.fastjson.JSONObject; import com.baomidou.mybatisplus.core.toolkit.BeanUtils; import com.google.gson.JsonArray; +import com.vladsch.flexmark.html.HtmlRenderer; +import com.vladsch.flexmark.parser.Parser; +import com.vladsch.flexmark.util.data.MutableDataSet; import kong.unirest.HttpResponse; import kong.unirest.Unirest; import kong.unirest.UnirestException; @@ -28,6 +31,13 @@ import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.client.methods.HttpGet; import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.HttpClients; +import org.apache.poi.hwpf.HWPFDocument; +import org.apache.poi.hwpf.usermodel.Range; +import org.apache.poi.xwpf.usermodel.XWPFDocument; +import org.apache.poi.xwpf.usermodel.XWPFParagraph; +import org.apache.poi.xwpf.usermodel.XWPFRun; +import org.jetbrains.annotations.NotNull; +import org.jsoup.Jsoup; import org.mozilla.universalchardet.UniversalDetector; import org.springframework.stereotype.Service; @@ -35,9 +45,12 @@ import javax.annotation.Resource; import java.io.*; import java.net.URL; import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.nio.file.StandardCopyOption; import java.util.List; import java.util.Map; -import java.util.Objects; import java.util.concurrent.TimeUnit; import static cn.iocoder.yudao.framework.common.exception.util.ServiceExceptionUtil.exception; @@ -58,6 +71,9 @@ public class RagHttpService { @Resource private KnowledgeDocumentsMapper knowledgeDocumentsMapper; + // @Resource + // private String TEMP_BASE_PATH= System.getProperty("user.dir") + "/temp"; + /** * 最大重试次数 */ @@ -99,7 +115,7 @@ public class RagHttpService { * @throws UnirestException 如果 Unirest 请求失败 * @throws IOException 如果发生 I/O 错误 */ - public void embedUploadFile(RegUploadReqVO ragUploadReqVO) throws UnirestException, IOException { + public void embedUploadFile (RegUploadReqVO ragUploadReqVO) throws UnirestException, IOException { log.info("开始向量知识库文档上传流程"); // 根据 fileId 查询知识库文档 @@ -242,13 +258,13 @@ public class RagHttpService { log.info("向量知识库文档上传流程结束"); } - public void printLogs(){ - for (int i = 0; i < 5; i++){ + public void printLogs () { + for (int i = 0; i < 5; i++) { log.info("===============================响应成功==============================="); } } - public static String formatDuration(long durationMillis) { + public static String formatDuration (long durationMillis) { long minutes = durationMillis / 60000; long seconds = (durationMillis % 60000) / 1000; long millis = durationMillis % 1000; @@ -361,7 +377,7 @@ public class RagHttpService { * @param id 知识库ID * @throws IOException 如果发生I/O错误 */ - public void knowledgeEmbed(KnowledgeRagEmbedReqVO reqVO, Long id) throws IOException { + public void knowledgeEmbed (KnowledgeRagEmbedReqVO reqVO, Long id) throws IOException { log.info("开始知识库向量嵌入流程,知识库ID: {}", id); // 获取向量嵌入接口的URL @@ -389,10 +405,34 @@ public class RagHttpService { log.info("更新文件状态为上传中,文件ID: {}", fileId); updateFileState(documents, KnowledgeStatusEnum.UPLOADING); - // 获取文件字节数组 - log.info("开始获取文件字节数组,文件URL: {}", fileUrl); - byte[] fileBytes = Objects.requireNonNull(getFileByte(fileUrl)); - log.info("成功获取文件字节数组,文件大小: {} 字节", fileBytes.length); + // // 获取文件字节数组 + // log.info("开始获取文件字节数组,文件URL: {}", fileUrl); + // byte[] fileBytes = Objects.requireNonNull(getFileByte(fileUrl)); + // log.info("成功获取文件字节数组,文件大小: {} 字节", fileBytes.length); + + // 获取文件并存储到临时目录 + log.info("开始下载文件,文件URL: {}", fileUrl); + Path tempFilePath = downloadFileToTemp(fileUrl, fileName); + log.info("文件已下载到临时目录: {}", tempFilePath); + + String fileSuffix = getFileSuffix(fileName); + if ("doc".equals(fileSuffix)) { + log.info("正在处理 doc 文件"); + try { + tempFilePath= converterDocToDocx(tempFilePath.toString(), tempFilePath.toString().replace(".doc", ".docx")); + } catch (Exception e) { + throw new RuntimeException(e); + } + } + + if ("md".equals(fileSuffix)) { + log.info("正在处理 md 文件"); + try { + tempFilePath= converterMdToTxt(tempFilePath.toString(), tempFilePath.toString().replace(".md", ".docx")); + } catch (Exception e) { + throw new RuntimeException(e); + } + } // 创建 OkHttpClient 实例 log.info("创建 OkHttpClient 实例,设置超时时间为 3 分钟"); @@ -408,7 +448,7 @@ public class RagHttpService { .setType(MultipartBody.FORM) .addFormDataPart("file_id", fileId) .addFormDataPart("file", fileName, - RequestBody.create(fileBytes, MediaType.parse(mediaType)) + RequestBody.create(tempFilePath.toFile(), MediaType.parse(mediaType)) ) .build(); @@ -462,49 +502,72 @@ public class RagHttpService { } catch (IOException e) { log.error("请求发生IO异常: {}", e.getMessage(), e); handleFailure(documents, FILE_UPLOAD_FAILED_MSG, e); + } finally { + // 删除临时文件 + try { + Files.deleteIfExists(tempFilePath); + log.info("临时文件已删除: {}", tempFilePath); + } catch (IOException e) { + log.error("删除临时文件失败: {}", e.getMessage(), e); + } } log.info("知识库向量嵌入流程结束,知识库ID: {}", id); } /** - * 获取文件字节数组 + * 下载文件到临时目录 * - * @param fileUrl 文件地址 - * @return 文件字节数组 + * @param fileUrl 文件地址 + * @param fileName 文件名 + * @return 临时文件路径 */ - public static byte[] getFileByte(String fileUrl) { - log.info("开始读取远程文件,文件URL: {}", fileUrl); - try (InputStream inputStream = new URL(fileUrl).openStream(); - ByteArrayOutputStream outputStream = new ByteArrayOutputStream()) { + public static Path downloadFileToTemp (String fileUrl, String fileName) { + try { + // 获取临时目录 + Path tempDir = getSystemTempDir(); + log.info("系统临时目录: {}", tempDir.toAbsolutePath()); - byte[] buffer = new byte[1024]; - int bytesRead; - int totalBytesRead = 0; + // 创建目录(如果不存在) + Files.createDirectories(tempDir); + log.info("临时目录已创建: {}", tempDir.toAbsolutePath()); - while ((bytesRead = inputStream.read(buffer)) != -1) { - outputStream.write(buffer, 0, bytesRead); - totalBytesRead += bytesRead; + // 创建临时文件路径 + Path tempFilePath = tempDir.resolve(fileName); + + // 下载文件到临时目录 + try (InputStream inputStream = new URL(fileUrl).openStream()) { + Files.copy(inputStream, tempFilePath, StandardCopyOption.REPLACE_EXISTING); } - log.info("成功读取远程文件,文件大小: {} 字节", totalBytesRead); - return outputStream.toByteArray(); - + return tempFilePath; } catch (IOException e) { - log.error("读取远程文件失败: {}", e.getMessage(), e); - throw exception(new ErrorCode(10001_001, "文件读取错误")); + log.error("下载文件到临时目录失败: {}", e.getMessage(), e); + throw new RuntimeException("文件下载错误"); } } + /** + * 获取系统级临时目录路径 + * (符合操作系统规范,更安全可靠) + */ + public static Path getSystemTempDir () throws IOException { + String sysTempDir = System.getProperty("java.io.tmpdir"); + + Path tempDir = Paths.get(sysTempDir, "myapp_temp"); + + return Files.createDirectories(tempDir); + } + /** * 获取文件类型 * * @param fileName 文件名 * @return 文件类型 */ - private static String getMediaType(String fileName) { + private static String getMediaType (String fileName) { log.info("获取文件类型,文件名: {}", fileName); - String fileSuffix = fileName.substring(fileName.lastIndexOf(".") + 1); + String fileSuffix = getFileSuffix(fileName); String mediaType; switch (fileSuffix) { case "pdf": @@ -526,6 +589,67 @@ public class RagHttpService { log.info("文件类型: {}", mediaType); return mediaType; } + + /** + * 获取文件后缀 + * + * @param fileName 文件名 + * @return 文件后缀 + */ + @NotNull + private static String getFileSuffix (String fileName) { + return fileName.substring(fileName.lastIndexOf(".") + 1).toLowerCase(); + } + + public static Path converterMdToTxt (String inputPath, String outputPath) throws Exception { + // 读取MD文件内容 + // String mdContent = Files.write(Paths.get(outputPath), inputPath.getBytes(StandardCharsets.UTF_8)); + String mdContent = new String(Files.readAllBytes(Paths.get(inputPath)), StandardCharsets.UTF_8); + + // 使用Flexmark转换为HTML + MutableDataSet options = new MutableDataSet(); + Parser parser = Parser.builder(options).build(); + HtmlRenderer renderer = HtmlRenderer.builder(options).build(); + String html = renderer.render(parser.parse(mdContent)); + + // 使用Jsoup提取纯文本 + String plainText = Jsoup.parse(html).text(); + + // // 写入TXT文件 + // Files.writeString(Paths.get(outputPath), plainText); + Path path = Paths.get(outputPath); + try (Writer writer = new BufferedWriter( + new OutputStreamWriter( + Files.newOutputStream(path), StandardCharsets.UTF_8))) { + writer.write(plainText); + } + return path; + } + + public static Path converterDocToDocx(String inputPath, String outputPath) throws Exception { + // 读取DOC文档 + try (HWPFDocument doc = new HWPFDocument(Files.newInputStream(Paths.get(inputPath)))) { + XWPFDocument docx = new XWPFDocument(); + + // 提取文本内容 + Range range = doc.getRange(); + for (int i = 0; i < range.numParagraphs(); i++) { + String text = range.getParagraph(i).text(); + + // 创建DOCX段落 + XWPFParagraph paragraph = docx.createParagraph(); + XWPFRun run = paragraph.createRun(); + run.setText(text); + } + + // 写入DOCX文件 + try (FileOutputStream out = new FileOutputStream(outputPath)) { + docx.write(out); + } + + return Paths.get(outputPath); + } + } /** * 处理响应结果 */ @@ -567,8 +691,6 @@ public class RagHttpService { throw new RuntimeException(errorMsg); } - - /** * 修改知识库文档状态 * @@ -602,48 +724,4 @@ public class RagHttpService { private KnowledgeDocumentsDO getKnowledgeDocuments (String fileId) { return knowledgeDocumentsMapper.selectById(fileId); } - - - public static void main (String[] args) { - // 创建 OkHttpClient 实例 - OkHttpClient client = new OkHttpClient(); - String ragEmbed = "http://36.103.199.248:8123/embed"; - String fileId = "778899"; - String fileName = "docx23_副本.docx"; - String fileUrl = "http://xhllm.xinnuojinzhi.com/admin-api/infra/file/29/get/5533434c4ed6b58415c33db46a73be3abe121b0ab66f25fb1a9050a2a978fda2.docx"; - String mediaType = getMediaType(fileName); - byte[] fileBytes = Objects.requireNonNull(getFileByte(fileUrl)); - log.info("URL: {}, fileId: {} ,fileName: {}, fileUrl: {}, mediaType: {} ", ragEmbed, fileId, fileName, fileUrl, mediaType); - - // 创建文件对象 - // File file = new File("/Users/yangliu/Documents/测试上传/测试上传 - new/docx1_副本.docx"); - - // 创建 MultipartBody - RequestBody requestBody = new MultipartBody.Builder() - .setType(MultipartBody.FORM) - - .addFormDataPart("file_id", fileId) - .addFormDataPart("file", fileName, - RequestBody.create(fileBytes, MediaType.parse(mediaType))) - .build(); - - // 创建请求 - Request request = new Request.Builder() - .url(ragEmbed) - .post(requestBody) - .addHeader("accept", "application/json") - .build(); - - // 发送请求 - try (Response response = client.newCall(request).execute()) { - if (response.isSuccessful()) { - System.out.println("Request successful: " + response.body().string()); - } else { - System.out.println("Request failed: " + response.code() + " " + response.message()); - } - } catch (IOException e) { - e.printStackTrace(); - } - } - } diff --git a/yudao-module-llm/yudao-module-llm-biz/src/main/java/cn/iocoder/yudao/module/llm/utils/DataSetReadFileUtils.java b/yudao-module-llm/yudao-module-llm-biz/src/main/java/cn/iocoder/yudao/module/llm/utils/DataSetReadFileUtils.java index b996ce14a..5557c0cc6 100644 --- a/yudao-module-llm/yudao-module-llm-biz/src/main/java/cn/iocoder/yudao/module/llm/utils/DataSetReadFileUtils.java +++ b/yudao-module-llm/yudao-module-llm-biz/src/main/java/cn/iocoder/yudao/module/llm/utils/DataSetReadFileUtils.java @@ -3,16 +3,9 @@ package cn.iocoder.yudao.module.llm.utils; import cn.hutool.core.text.csv.CsvReader; import cn.hutool.core.text.csv.CsvUtil; import cn.hutool.core.util.URLUtil; -import cn.iocoder.yudao.module.llm.dal.dataobject.dataset.DatasetQuestionDO; import cn.iocoder.yudao.module.llm.utils.vo.CsvDataSetVO; -import com.opencsv.CSVParser; -import com.opencsv.CSVParserBuilder; -import com.opencsv.CSVReader; -import com.opencsv.CSVReaderBuilder; import com.opencsv.exceptions.CsvValidationException; import lombok.extern.slf4j.Slf4j; -import org.apache.poi.ss.usermodel.Cell; -import org.apache.poi.ss.usermodel.Sheet; import org.apache.poi.ss.usermodel.Workbook; import org.apache.poi.xssf.usermodel.XSSFWorkbook; import org.springframework.stereotype.Component; @@ -23,7 +16,6 @@ import java.io.IOException; import java.io.InputStreamReader; import java.net.HttpURLConnection; import java.net.URL; -import java.util.ArrayList; import java.util.List; @Slf4j diff --git a/yudao-server/pom.xml b/yudao-server/pom.xml index b69297840..a533cab2d 100644 --- a/yudao-server/pom.xml +++ b/yudao-server/pom.xml @@ -159,12 +159,23 @@ hanlp portable-1.3.4 + + + com.vladsch.flexmark + flexmark-all + 0.62.2 + org.jsoup jsoup - 1.10.3 + 1.15.3 + + org.apache.poi + poi-scratchpad + 5.2.3 +