feat(llm): 支持 doc 和 md 文件转换
- 新增 doc 和 md 文件转换功能 - 实现了将 doc 文件转换为 docx 格式 - 实现了将 md 文件转换为 txt 格式 - 优化了文件下载和处理逻辑
This commit is contained in:
parent
1afe99cf0c
commit
962c31e540
@ -76,13 +76,6 @@ public class AsyncKnowledgeBase {
|
||||
String extension = knowledge.getDocumentName().substring(lastIndex + 1).toLowerCase();
|
||||
log.info("文档扩展名: {}", extension);
|
||||
knowledgeEmbed(knowledge, knowledge.getKnowledgeBaseId());
|
||||
// if ("txt".equals(extension)) {
|
||||
// log.info("文档为 txt 文件,直接上传嵌入,文档 ID: {}", knowledge.getId());
|
||||
// ragHttpService.embedUploadFile(regUploadReqVO);
|
||||
// } else {
|
||||
// log.info("文档为非 txt 文件,调用知识嵌入方法,文档 ID: {}", knowledge.getId());
|
||||
// knowledgeEmbed(knowledge, knowledge.getKnowledgeBaseId());
|
||||
// }
|
||||
} else {
|
||||
log.warn("文档无扩展名,跳过处理,文档 ID: {}", knowledge.getId());
|
||||
}
|
||||
|
@ -17,6 +17,9 @@ import com.alibaba.fastjson.JSONException;
|
||||
import com.alibaba.fastjson.JSONObject;
|
||||
import com.baomidou.mybatisplus.core.toolkit.BeanUtils;
|
||||
import com.google.gson.JsonArray;
|
||||
import com.vladsch.flexmark.html.HtmlRenderer;
|
||||
import com.vladsch.flexmark.parser.Parser;
|
||||
import com.vladsch.flexmark.util.data.MutableDataSet;
|
||||
import kong.unirest.HttpResponse;
|
||||
import kong.unirest.Unirest;
|
||||
import kong.unirest.UnirestException;
|
||||
@ -28,6 +31,13 @@ import org.apache.http.client.methods.CloseableHttpResponse;
|
||||
import org.apache.http.client.methods.HttpGet;
|
||||
import org.apache.http.impl.client.CloseableHttpClient;
|
||||
import org.apache.http.impl.client.HttpClients;
|
||||
import org.apache.poi.hwpf.HWPFDocument;
|
||||
import org.apache.poi.hwpf.usermodel.Range;
|
||||
import org.apache.poi.xwpf.usermodel.XWPFDocument;
|
||||
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
|
||||
import org.apache.poi.xwpf.usermodel.XWPFRun;
|
||||
import org.jetbrains.annotations.NotNull;
|
||||
import org.jsoup.Jsoup;
|
||||
import org.mozilla.universalchardet.UniversalDetector;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
@ -35,9 +45,12 @@ import javax.annotation.Resource;
|
||||
import java.io.*;
|
||||
import java.net.URL;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.Paths;
|
||||
import java.nio.file.StandardCopyOption;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Objects;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
import static cn.iocoder.yudao.framework.common.exception.util.ServiceExceptionUtil.exception;
|
||||
@ -58,6 +71,9 @@ public class RagHttpService {
|
||||
@Resource
|
||||
private KnowledgeDocumentsMapper knowledgeDocumentsMapper;
|
||||
|
||||
// @Resource
|
||||
// private String TEMP_BASE_PATH= System.getProperty("user.dir") + "/temp";
|
||||
|
||||
/**
|
||||
* 最大重试次数
|
||||
*/
|
||||
@ -99,7 +115,7 @@ public class RagHttpService {
|
||||
* @throws UnirestException 如果 Unirest 请求失败
|
||||
* @throws IOException 如果发生 I/O 错误
|
||||
*/
|
||||
public void embedUploadFile(RegUploadReqVO ragUploadReqVO) throws UnirestException, IOException {
|
||||
public void embedUploadFile (RegUploadReqVO ragUploadReqVO) throws UnirestException, IOException {
|
||||
log.info("开始向量知识库文档上传流程");
|
||||
|
||||
// 根据 fileId 查询知识库文档
|
||||
@ -242,13 +258,13 @@ public class RagHttpService {
|
||||
log.info("向量知识库文档上传流程结束");
|
||||
}
|
||||
|
||||
public void printLogs(){
|
||||
for (int i = 0; i < 5; i++){
|
||||
public void printLogs () {
|
||||
for (int i = 0; i < 5; i++) {
|
||||
log.info("===============================响应成功===============================");
|
||||
}
|
||||
}
|
||||
|
||||
public static String formatDuration(long durationMillis) {
|
||||
public static String formatDuration (long durationMillis) {
|
||||
long minutes = durationMillis / 60000;
|
||||
long seconds = (durationMillis % 60000) / 1000;
|
||||
long millis = durationMillis % 1000;
|
||||
@ -361,7 +377,7 @@ public class RagHttpService {
|
||||
* @param id 知识库ID
|
||||
* @throws IOException 如果发生I/O错误
|
||||
*/
|
||||
public void knowledgeEmbed(KnowledgeRagEmbedReqVO reqVO, Long id) throws IOException {
|
||||
public void knowledgeEmbed (KnowledgeRagEmbedReqVO reqVO, Long id) throws IOException {
|
||||
log.info("开始知识库向量嵌入流程,知识库ID: {}", id);
|
||||
|
||||
// 获取向量嵌入接口的URL
|
||||
@ -389,10 +405,34 @@ public class RagHttpService {
|
||||
log.info("更新文件状态为上传中,文件ID: {}", fileId);
|
||||
updateFileState(documents, KnowledgeStatusEnum.UPLOADING);
|
||||
|
||||
// 获取文件字节数组
|
||||
log.info("开始获取文件字节数组,文件URL: {}", fileUrl);
|
||||
byte[] fileBytes = Objects.requireNonNull(getFileByte(fileUrl));
|
||||
log.info("成功获取文件字节数组,文件大小: {} 字节", fileBytes.length);
|
||||
// // 获取文件字节数组
|
||||
// log.info("开始获取文件字节数组,文件URL: {}", fileUrl);
|
||||
// byte[] fileBytes = Objects.requireNonNull(getFileByte(fileUrl));
|
||||
// log.info("成功获取文件字节数组,文件大小: {} 字节", fileBytes.length);
|
||||
|
||||
// 获取文件并存储到临时目录
|
||||
log.info("开始下载文件,文件URL: {}", fileUrl);
|
||||
Path tempFilePath = downloadFileToTemp(fileUrl, fileName);
|
||||
log.info("文件已下载到临时目录: {}", tempFilePath);
|
||||
|
||||
String fileSuffix = getFileSuffix(fileName);
|
||||
if ("doc".equals(fileSuffix)) {
|
||||
log.info("正在处理 doc 文件");
|
||||
try {
|
||||
tempFilePath= converterDocToDocx(tempFilePath.toString(), tempFilePath.toString().replace(".doc", ".docx"));
|
||||
} catch (Exception e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
if ("md".equals(fileSuffix)) {
|
||||
log.info("正在处理 md 文件");
|
||||
try {
|
||||
tempFilePath= converterMdToTxt(tempFilePath.toString(), tempFilePath.toString().replace(".md", ".docx"));
|
||||
} catch (Exception e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
// 创建 OkHttpClient 实例
|
||||
log.info("创建 OkHttpClient 实例,设置超时时间为 3 分钟");
|
||||
@ -408,7 +448,7 @@ public class RagHttpService {
|
||||
.setType(MultipartBody.FORM)
|
||||
.addFormDataPart("file_id", fileId)
|
||||
.addFormDataPart("file", fileName,
|
||||
RequestBody.create(fileBytes, MediaType.parse(mediaType))
|
||||
RequestBody.create(tempFilePath.toFile(), MediaType.parse(mediaType))
|
||||
)
|
||||
.build();
|
||||
|
||||
@ -462,49 +502,72 @@ public class RagHttpService {
|
||||
} catch (IOException e) {
|
||||
log.error("请求发生IO异常: {}", e.getMessage(), e);
|
||||
handleFailure(documents, FILE_UPLOAD_FAILED_MSG, e);
|
||||
} finally {
|
||||
// 删除临时文件
|
||||
try {
|
||||
Files.deleteIfExists(tempFilePath);
|
||||
log.info("临时文件已删除: {}", tempFilePath);
|
||||
} catch (IOException e) {
|
||||
log.error("删除临时文件失败: {}", e.getMessage(), e);
|
||||
}
|
||||
}
|
||||
|
||||
log.info("知识库向量嵌入流程结束,知识库ID: {}", id);
|
||||
}
|
||||
|
||||
/**
|
||||
* 获取文件字节数组
|
||||
* 下载文件到临时目录
|
||||
*
|
||||
* @param fileUrl 文件地址
|
||||
* @return 文件字节数组
|
||||
* @param fileUrl 文件地址
|
||||
* @param fileName 文件名
|
||||
* @return 临时文件路径
|
||||
*/
|
||||
public static byte[] getFileByte(String fileUrl) {
|
||||
log.info("开始读取远程文件,文件URL: {}", fileUrl);
|
||||
try (InputStream inputStream = new URL(fileUrl).openStream();
|
||||
ByteArrayOutputStream outputStream = new ByteArrayOutputStream()) {
|
||||
public static Path downloadFileToTemp (String fileUrl, String fileName) {
|
||||
try {
|
||||
// 获取临时目录
|
||||
Path tempDir = getSystemTempDir();
|
||||
log.info("系统临时目录: {}", tempDir.toAbsolutePath());
|
||||
|
||||
byte[] buffer = new byte[1024];
|
||||
int bytesRead;
|
||||
int totalBytesRead = 0;
|
||||
// 创建目录(如果不存在)
|
||||
Files.createDirectories(tempDir);
|
||||
log.info("临时目录已创建: {}", tempDir.toAbsolutePath());
|
||||
|
||||
while ((bytesRead = inputStream.read(buffer)) != -1) {
|
||||
outputStream.write(buffer, 0, bytesRead);
|
||||
totalBytesRead += bytesRead;
|
||||
// 创建临时文件路径
|
||||
Path tempFilePath = tempDir.resolve(fileName);
|
||||
|
||||
// 下载文件到临时目录
|
||||
try (InputStream inputStream = new URL(fileUrl).openStream()) {
|
||||
Files.copy(inputStream, tempFilePath, StandardCopyOption.REPLACE_EXISTING);
|
||||
}
|
||||
|
||||
log.info("成功读取远程文件,文件大小: {} 字节", totalBytesRead);
|
||||
return outputStream.toByteArray();
|
||||
|
||||
return tempFilePath;
|
||||
} catch (IOException e) {
|
||||
log.error("读取远程文件失败: {}", e.getMessage(), e);
|
||||
throw exception(new ErrorCode(10001_001, "文件读取错误"));
|
||||
log.error("下载文件到临时目录失败: {}", e.getMessage(), e);
|
||||
throw new RuntimeException("文件下载错误");
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 获取系统级临时目录路径
|
||||
* (符合操作系统规范,更安全可靠)
|
||||
*/
|
||||
public static Path getSystemTempDir () throws IOException {
|
||||
String sysTempDir = System.getProperty("java.io.tmpdir");
|
||||
|
||||
Path tempDir = Paths.get(sysTempDir, "myapp_temp");
|
||||
|
||||
return Files.createDirectories(tempDir);
|
||||
}
|
||||
|
||||
/**
|
||||
* 获取文件类型
|
||||
*
|
||||
* @param fileName 文件名
|
||||
* @return 文件类型
|
||||
*/
|
||||
private static String getMediaType(String fileName) {
|
||||
private static String getMediaType (String fileName) {
|
||||
log.info("获取文件类型,文件名: {}", fileName);
|
||||
String fileSuffix = fileName.substring(fileName.lastIndexOf(".") + 1);
|
||||
String fileSuffix = getFileSuffix(fileName);
|
||||
String mediaType;
|
||||
switch (fileSuffix) {
|
||||
case "pdf":
|
||||
@ -526,6 +589,67 @@ public class RagHttpService {
|
||||
log.info("文件类型: {}", mediaType);
|
||||
return mediaType;
|
||||
}
|
||||
|
||||
/**
|
||||
* 获取文件后缀
|
||||
*
|
||||
* @param fileName 文件名
|
||||
* @return 文件后缀
|
||||
*/
|
||||
@NotNull
|
||||
private static String getFileSuffix (String fileName) {
|
||||
return fileName.substring(fileName.lastIndexOf(".") + 1).toLowerCase();
|
||||
}
|
||||
|
||||
public static Path converterMdToTxt (String inputPath, String outputPath) throws Exception {
|
||||
// 读取MD文件内容
|
||||
// String mdContent = Files.write(Paths.get(outputPath), inputPath.getBytes(StandardCharsets.UTF_8));
|
||||
String mdContent = new String(Files.readAllBytes(Paths.get(inputPath)), StandardCharsets.UTF_8);
|
||||
|
||||
// 使用Flexmark转换为HTML
|
||||
MutableDataSet options = new MutableDataSet();
|
||||
Parser parser = Parser.builder(options).build();
|
||||
HtmlRenderer renderer = HtmlRenderer.builder(options).build();
|
||||
String html = renderer.render(parser.parse(mdContent));
|
||||
|
||||
// 使用Jsoup提取纯文本
|
||||
String plainText = Jsoup.parse(html).text();
|
||||
|
||||
// // 写入TXT文件
|
||||
// Files.writeString(Paths.get(outputPath), plainText);
|
||||
Path path = Paths.get(outputPath);
|
||||
try (Writer writer = new BufferedWriter(
|
||||
new OutputStreamWriter(
|
||||
Files.newOutputStream(path), StandardCharsets.UTF_8))) {
|
||||
writer.write(plainText);
|
||||
}
|
||||
return path;
|
||||
}
|
||||
|
||||
public static Path converterDocToDocx(String inputPath, String outputPath) throws Exception {
|
||||
// 读取DOC文档
|
||||
try (HWPFDocument doc = new HWPFDocument(Files.newInputStream(Paths.get(inputPath)))) {
|
||||
XWPFDocument docx = new XWPFDocument();
|
||||
|
||||
// 提取文本内容
|
||||
Range range = doc.getRange();
|
||||
for (int i = 0; i < range.numParagraphs(); i++) {
|
||||
String text = range.getParagraph(i).text();
|
||||
|
||||
// 创建DOCX段落
|
||||
XWPFParagraph paragraph = docx.createParagraph();
|
||||
XWPFRun run = paragraph.createRun();
|
||||
run.setText(text);
|
||||
}
|
||||
|
||||
// 写入DOCX文件
|
||||
try (FileOutputStream out = new FileOutputStream(outputPath)) {
|
||||
docx.write(out);
|
||||
}
|
||||
|
||||
return Paths.get(outputPath);
|
||||
}
|
||||
}
|
||||
/**
|
||||
* 处理响应结果
|
||||
*/
|
||||
@ -567,8 +691,6 @@ public class RagHttpService {
|
||||
throw new RuntimeException(errorMsg);
|
||||
}
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* 修改知识库文档状态
|
||||
*
|
||||
@ -602,48 +724,4 @@ public class RagHttpService {
|
||||
private KnowledgeDocumentsDO getKnowledgeDocuments (String fileId) {
|
||||
return knowledgeDocumentsMapper.selectById(fileId);
|
||||
}
|
||||
|
||||
|
||||
public static void main (String[] args) {
|
||||
// 创建 OkHttpClient 实例
|
||||
OkHttpClient client = new OkHttpClient();
|
||||
String ragEmbed = "http://36.103.199.248:8123/embed";
|
||||
String fileId = "778899";
|
||||
String fileName = "docx23_副本.docx";
|
||||
String fileUrl = "http://xhllm.xinnuojinzhi.com/admin-api/infra/file/29/get/5533434c4ed6b58415c33db46a73be3abe121b0ab66f25fb1a9050a2a978fda2.docx";
|
||||
String mediaType = getMediaType(fileName);
|
||||
byte[] fileBytes = Objects.requireNonNull(getFileByte(fileUrl));
|
||||
log.info("URL: {}, fileId: {} ,fileName: {}, fileUrl: {}, mediaType: {} ", ragEmbed, fileId, fileName, fileUrl, mediaType);
|
||||
|
||||
// 创建文件对象
|
||||
// File file = new File("/Users/yangliu/Documents/测试上传/测试上传 - new/docx1_副本.docx");
|
||||
|
||||
// 创建 MultipartBody
|
||||
RequestBody requestBody = new MultipartBody.Builder()
|
||||
.setType(MultipartBody.FORM)
|
||||
|
||||
.addFormDataPart("file_id", fileId)
|
||||
.addFormDataPart("file", fileName,
|
||||
RequestBody.create(fileBytes, MediaType.parse(mediaType)))
|
||||
.build();
|
||||
|
||||
// 创建请求
|
||||
Request request = new Request.Builder()
|
||||
.url(ragEmbed)
|
||||
.post(requestBody)
|
||||
.addHeader("accept", "application/json")
|
||||
.build();
|
||||
|
||||
// 发送请求
|
||||
try (Response response = client.newCall(request).execute()) {
|
||||
if (response.isSuccessful()) {
|
||||
System.out.println("Request successful: " + response.body().string());
|
||||
} else {
|
||||
System.out.println("Request failed: " + response.code() + " " + response.message());
|
||||
}
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user