From f1806477c4972c2d09f32df848216bcb69f4d680 Mon Sep 17 00:00:00 2001
From: Administrator <15274802129@163.com>
Date: Mon, 29 Sep 2025 14:00:09 +0800
Subject: [PATCH] feat(ai): 新增阿里云百炼知识库工具类

- 添加 KnowledgeBaseUtil 工具类,封装阿里云百炼知识库操作
- 实现文件上传、知识库创建与更新等核心功能
- 支持文件MD5计算、分类管理、索引任务提交与状态查询
- 集成阿里云百炼SDK依赖,版本为2.5.0
- 提供完整的知识库初始化与文件更新流程示例
- 添加文件解析状态轮询与错误处理机制
- 支持知识库中文档的追加导入与旧文档删除功能
---
 src/main/java/cc/mrbird/febs/ai/utils/KnowledgeBaseUtil.java |  534 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 pom.xml                                                      |    6 
 2 files changed, 540 insertions(+), 0 deletions(-)

diff --git a/pom.xml b/pom.xml
index a22f296..75102b1 100644
--- a/pom.xml
+++ b/pom.xml
@@ -30,6 +30,12 @@
 
     <dependencies>
         <dependency>
+            <groupId>com.aliyun</groupId>
+            <artifactId>bailian20231229</artifactId>
+            <version>2.5.0</version>
+        </dependency>
+
+        <dependency>
             <groupId>com.google.code.gson</groupId>
             <artifactId>gson</artifactId>
             <version>2.8.9</version>
diff --git a/src/main/java/cc/mrbird/febs/ai/utils/KnowledgeBaseUtil.java b/src/main/java/cc/mrbird/febs/ai/utils/KnowledgeBaseUtil.java
new file mode 100644
index 0000000..5d7c8c7
--- /dev/null
+++ b/src/main/java/cc/mrbird/febs/ai/utils/KnowledgeBaseUtil.java
@@ -0,0 +1,534 @@
+package cc.mrbird.febs.ai.utils;
+
+import com.aliyun.bailian20231229.Client;
+import com.aliyun.bailian20231229.models.*;
+import com.aliyun.teaopenapi.models.Config;
+import com.aliyun.teautil.models.RuntimeOptions;
+import com.fasterxml.jackson.databind.ObjectMapper;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.OutputStream;
+import java.net.HttpURLConnection;
+import java.net.URL;
+import java.nio.file.Paths;
+import java.security.MessageDigest;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.concurrent.TimeUnit;
+
+/**
+ * Static helpers around the Alibaba Cloud Bailian knowledge base API: uploading a local
+ * file, creating a knowledge base from it and replacing a document in an existing one.
+ *
+ * <p>The AccessKey pair is read from the {@code ALIBABA_CLOUD_ACCESS_KEY_ID} and
+ * {@code ALIBABA_CLOUD_ACCESS_KEY_SECRET} environment variables so that no credential
+ * is stored in source control.</p>
+ */
+public class KnowledgeBaseUtil {
+    /** Name of the environment variable holding the AccessKey ID. */
+    private static final String ENV_ACCESS_KEY_ID = "ALIBABA_CLOUD_ACCESS_KEY_ID";
+    /** Name of the environment variable holding the AccessKey secret. */
+    private static final String ENV_ACCESS_KEY_SECRET = "ALIBABA_CLOUD_ACCESS_KEY_SECRET";
+    /** Workspace used by {@link #AddCategory} and {@link #main}. */
+    private static final String WORKSPACE_ID = "llm-4bcr09yfxlgz0b0t";
+    /** Bailian API endpoint. */
+    private static final String ENDPOINT = "bailian.cn-beijing.aliyuncs.com";
+    /** Seconds to sleep between two status polls. */
+    private static final long POLL_INTERVAL_SECONDS = 5;
+    /** Shared JSON mapper used to convert the upload headers returned with a lease. */
+    private static final ObjectMapper MAPPER = new ObjectMapper();
+
+    /**
+     * Creates a Bailian client authenticated with the AccessKey pair from the environment.
+     *
+     * @return a new client bound to {@link #ENDPOINT}
+     * @throws IllegalStateException if one of the credential environment variables is missing
+     * @throws Exception if the SDK fails to build the client
+     */
+    public static Client createClient() throws Exception {
+        Config config = new Config()
+                .setAccessKeyId(requireEnv(ENV_ACCESS_KEY_ID))
+                .setAccessKeySecret(requireEnv(ENV_ACCESS_KEY_SECRET))
+                .setEndpoint(ENDPOINT);
+        return new Client(config);
+    }
+
+    /**
+     * Returns the value of a mandatory environment variable.
+     *
+     * @param name environment variable name
+     * @return the non-blank value
+     * @throws IllegalStateException if the variable is unset or blank
+     */
+    private static String requireEnv(String name) {
+        String value = System.getenv(name);
+        if (value == null || value.trim().isEmpty()) {
+            throw new IllegalStateException("Missing required environment variable: " + name);
+        }
+        return value;
+    }
+
+    /**
+     * Adds an unstructured-data category to the workspace {@link #WORKSPACE_ID}.
+     *
+     * @param categoryName     name of the new category
+     * @param parentCategoryId ID of the parent category
+     * @return ID of the created category
+     * @throws Exception if the API call fails
+     */
+    public static String AddCategory(String categoryName, String parentCategoryId) throws Exception {
+        Client client = createClient();
+        AddCategoryRequest request = new AddCategoryRequest()
+                .setCategoryName(categoryName)
+                .setCategoryType("UNSTRUCTURED")
+                .setParentCategoryId(parentCategoryId);
+        AddCategoryResponse response =
+                client.addCategoryWithOptions(WORKSPACE_ID, request, new HashMap<String, String>(), new RuntimeOptions());
+        return response.getBody().getData().getCategoryId();
+    }
+
+    /**
+     * Computes the MD5 digest of a file.
+     *
+     * @param filePath local path of the file
+     * @return the digest as 32 lowercase hexadecimal characters
+     * @throws Exception if the file cannot be read
+     */
+    public static String calculateMD5(String filePath) throws Exception {
+        MessageDigest md = MessageDigest.getInstance("MD5");
+        try (FileInputStream fis = new FileInputStream(filePath)) {
+            byte[] buffer = new byte[8192];
+            int bytesRead;
+            while ((bytesRead = fis.read(buffer)) != -1) {
+                md.update(buffer, 0, bytesRead);
+            }
+        }
+        StringBuilder sb = new StringBuilder(32);
+        for (byte b : md.digest()) {
+            sb.append(String.format("%02x", b & 0xff));
+        }
+        return sb.toString();
+    }
+
+    /**
+     * Returns the size of a file in bytes, as a decimal string.
+     *
+     * @param filePath local path of the file
+     * @return the size in bytes; {@code "0"} if the file does not exist
+     */
+    public static String getFileSize(String filePath) {
+        return String.valueOf(new File(filePath).length());
+    }
+
+    /**
+     * Requests a file upload lease.
+     *
+     * @param client      Bailian client
+     * @param categoryId  category ID
+     * @param fileName    file name
+     * @param fileMd5     MD5 digest of the file
+     * @param fileSize    file size in bytes
+     * @param workspaceId workspace ID
+     * @return the service response
+     * @throws Exception if the API call fails
+     */
+    public static ApplyFileUploadLeaseResponse applyLease(Client client, String categoryId, String fileName, String fileMd5, String fileSize, String workspaceId) throws Exception {
+        ApplyFileUploadLeaseRequest request = new ApplyFileUploadLeaseRequest();
+        request.setFileName(fileName);
+        request.setMd5(fileMd5);
+        request.setSizeInBytes(fileSize);
+        return client.applyFileUploadLeaseWithOptions(categoryId, workspaceId, request, new HashMap<String, String>(), new RuntimeOptions());
+    }
+
+    /**
+     * Uploads a local file to the pre-signed URL obtained with an upload lease.
+     *
+     * @param preSignedUrl URL from the upload lease
+     * @param headers      request headers from the upload lease
+     * @param filePath     local path of the file
+     * @throws IllegalArgumentException if {@code filePath} is not an existing regular file
+     * @throws RuntimeException         if the server does not answer with HTTP 200
+     * @throws Exception                if the transfer fails
+     */
+    public static void uploadFile(String preSignedUrl, Map<String, String> headers, String filePath) throws Exception {
+        File file = new File(filePath);
+        if (!file.exists() || !file.isFile()) {
+            throw new IllegalArgumentException("文件不存在或不是普通文件: " + filePath);
+        }
+
+        HttpURLConnection conn = (HttpURLConnection) new URL(preSignedUrl).openConnection();
+        try {
+            conn.setRequestMethod("PUT");
+            conn.setDoOutput(true);
+            // Stream the body with a known length instead of buffering the whole file in memory.
+            conn.setFixedLengthStreamingMode(file.length());
+            conn.setRequestProperty("X-bailian-extra", headers.get("X-bailian-extra"));
+            conn.setRequestProperty("Content-Type", headers.get("Content-Type"));
+
+            // Closing the output stream completes the request body before the response is read.
+            try (FileInputStream fis = new FileInputStream(file);
+                 OutputStream out = conn.getOutputStream()) {
+                byte[] buffer = new byte[8192];
+                int bytesRead;
+                while ((bytesRead = fis.read(buffer)) != -1) {
+                    out.write(buffer, 0, bytesRead);
+                }
+            }
+
+            int responseCode = conn.getResponseCode();
+            if (responseCode != 200) {
+                throw new RuntimeException("上传失败: " + responseCode);
+            }
+        } finally {
+            conn.disconnect();
+        }
+    }
+
+    /**
+     * Adds an uploaded file to a category.
+     *
+     * @param client      Bailian client
+     * @param leaseId     upload lease ID
+     * @param parser      parser used for the file
+     * @param categoryId  category ID
+     * @param workspaceId workspace ID
+     * @return the service response
+     * @throws Exception if the API call fails
+     */
+    public static AddFileResponse addFile(Client client, String leaseId, String parser, String categoryId, String workspaceId) throws Exception {
+        AddFileRequest request = new AddFileRequest();
+        request.setLeaseId(leaseId);
+        request.setParser(parser);
+        request.setCategoryId(categoryId);
+        return client.addFileWithOptions(workspaceId, request, new HashMap<String, String>(), new RuntimeOptions());
+    }
+
+    /**
+     * Queries the basic information of a file.
+     *
+     * @param client      Bailian client
+     * @param workspaceId workspace ID
+     * @param fileId      file ID
+     * @return the service response
+     * @throws Exception if the API call fails
+     */
+    public static DescribeFileResponse describeFile(Client client, String workspaceId, String fileId) throws Exception {
+        return client.describeFileWithOptions(workspaceId, fileId, new HashMap<String, String>(), new RuntimeOptions());
+    }
+
+    /**
+     * Creates (initializes) a knowledge base containing a single file.
+     *
+     * @param client        Bailian client
+     * @param workspaceId   workspace ID
+     * @param fileId        file ID
+     * @param name          knowledge base name
+     * @param structureType data type of the knowledge base
+     * @param sourceType    data type of the application data (category or file)
+     * @param sinkType      vector storage type of the knowledge base
+     * @return the service response
+     * @throws Exception if the API call fails
+     */
+    public static CreateIndexResponse createIndex(Client client, String workspaceId, String fileId, String name, String structureType, String sourceType, String sinkType) throws Exception {
+        CreateIndexRequest request = new CreateIndexRequest();
+        request.setStructureType(structureType);
+        request.setName(name);
+        request.setSourceType(sourceType);
+        request.setSinkType(sinkType);
+        request.setDocumentIds(Collections.singletonList(fileId));
+        return client.createIndexWithOptions(workspaceId, request, new HashMap<String, String>(), new RuntimeOptions());
+    }
+
+    /**
+     * Submits the index job of a knowledge base.
+     *
+     * @param client      Bailian client
+     * @param workspaceId workspace ID
+     * @param indexId     knowledge base ID
+     * @return the service response
+     * @throws Exception if the API call fails
+     */
+    public static SubmitIndexJobResponse submitIndex(Client client, String workspaceId, String indexId) throws Exception {
+        SubmitIndexJobRequest request = new SubmitIndexJobRequest();
+        request.setIndexId(indexId);
+        return client.submitIndexJobWithOptions(workspaceId, request, new HashMap<String, String>(), new RuntimeOptions());
+    }
+
+    /**
+     * Queries the status of an index job.
+     *
+     * @param client      Bailian client
+     * @param workspaceId workspace ID
+     * @param jobId       job ID
+     * @param indexId     knowledge base ID
+     * @return the service response
+     * @throws Exception if the API call fails
+     */
+    public static GetIndexJobStatusResponse getIndexJobStatus(Client client, String workspaceId, String jobId, String indexId) throws Exception {
+        GetIndexJobStatusRequest request = new GetIndexJobStatusRequest();
+        request.setIndexId(indexId);
+        request.setJobId(jobId);
+        return client.getIndexJobStatusWithOptions(workspaceId, request, new HashMap<String, String>(), new RuntimeOptions());
+    }
+
+    /**
+     * Permanently deletes a file from a document knowledge base.
+     *
+     * @param client      Bailian client
+     * @param workspaceId workspace ID
+     * @param indexId     knowledge base ID
+     * @param fileId      file ID
+     * @return the service response
+     * @throws Exception if the API call fails
+     */
+    public static DeleteIndexDocumentResponse deleteIndexDocument(Client client, String workspaceId, String indexId, String fileId) throws Exception {
+        DeleteIndexDocumentRequest request = new DeleteIndexDocumentRequest();
+        request.setIndexId(indexId);
+        request.setDocumentIds(Collections.singletonList(fileId));
+        return client.deleteIndexDocumentWithOptions(workspaceId, request, new HashMap<String, String>(), new RuntimeOptions());
+    }
+
+    /**
+     * Submits a job appending an already parsed file to a document knowledge base.
+     *
+     * @param client      Bailian client
+     * @param workspaceId workspace ID
+     * @param indexId     knowledge base ID
+     * @param fileId      file ID
+     * @param sourceType  data type
+     * @return the service response
+     * @throws Exception if the API call fails
+     */
+    public static SubmitIndexAddDocumentsJobResponse submitIndexAddDocumentsJob(Client client, String workspaceId, String indexId, String fileId, String sourceType) throws Exception {
+        SubmitIndexAddDocumentsJobRequest request = new SubmitIndexAddDocumentsJobRequest();
+        request.setIndexId(indexId);
+        request.setDocumentIds(Collections.singletonList(fileId));
+        request.setSourceType(sourceType);
+        return client.submitIndexAddDocumentsJobWithOptions(workspaceId, request, new HashMap<String, String>(), new RuntimeOptions());
+    }
+
+    /**
+     * Converts the untyped upload headers of a lease into a string map through a JSON round trip.
+     *
+     * @param uploadHeaders headers object returned with the upload lease
+     * @return the headers as a key/value map
+     * @throws Exception if the JSON conversion fails
+     */
+    @SuppressWarnings("unchecked")
+    private static Map<String, String> toHeaderMap(Object uploadHeaders) throws Exception {
+        return (Map<String, String>) MAPPER.readValue(MAPPER.writeValueAsString(uploadHeaders), Map.class);
+    }
+
+    /**
+     * Polls a file every {@link #POLL_INTERVAL_SECONDS} seconds until parsing ends.
+     *
+     * @param client      Bailian client
+     * @param workspaceId workspace ID
+     * @param fileId      file ID
+     * @return {@code true} once the status is {@code PARSE_SUCCESS}; {@code false} for any
+     *         status other than {@code INIT} and {@code PARSING}
+     * @throws Exception if the API call fails or the thread is interrupted
+     */
+    private static boolean waitForFileParsed(Client client, String workspaceId, String fileId) throws Exception {
+        while (true) {
+            String status = describeFile(client, workspaceId, fileId).getBody().getData().getStatus();
+            System.out.println("当前文件状态:" + status);
+            if ("INIT".equals(status)) {
+                System.out.println("文件待解析,请稍候...");
+            } else if ("PARSING".equals(status)) {
+                System.out.println("文件解析中,请稍候...");
+            } else if ("PARSE_SUCCESS".equals(status)) {
+                System.out.println("文件解析完成!");
+                return true;
+            } else {
+                System.out.println("未知的文件状态:" + status + ",请联系技术支持。");
+                return false;
+            }
+            TimeUnit.SECONDS.sleep(POLL_INTERVAL_SECONDS);
+        }
+    }
+
+    /**
+     * Polls an index job every {@link #POLL_INTERVAL_SECONDS} seconds until it ends.
+     *
+     * @param client      Bailian client
+     * @param workspaceId workspace ID
+     * @param jobId       job ID
+     * @param indexId     knowledge base ID
+     * @return {@code true} once the status is {@code COMPLETED}; {@code false} if it is {@code FAILED}
+     * @throws Exception if the API call fails or the thread is interrupted
+     */
+    private static boolean waitForIndexJob(Client client, String workspaceId, String jobId, String indexId) throws Exception {
+        while (true) {
+            String status = getIndexJobStatus(client, workspaceId, jobId, indexId).getBody().getData().getStatus();
+            System.out.println("当前索引任务状态:" + status);
+            if ("COMPLETED".equals(status)) {
+                return true;
+            }
+            if ("FAILED".equals(status)) {
+                // A failed job never reaches COMPLETED; stop instead of polling forever.
+                System.out.println("索引任务失败。");
+                return false;
+            }
+            TimeUnit.SECONDS.sleep(POLL_INTERVAL_SECONDS);
+        }
+    }
+
+    /**
+     * Creates a knowledge base from a local file: uploads the file, waits for it to be
+     * parsed, creates the index and waits for the index job to complete.
+     *
+     * @param filePath    local path of the file
+     * @param workspaceId workspace ID
+     * @param name        knowledge base name
+     * @return the knowledge base ID on success; {@code null} on any failure
+     */
+    public static String createKnowledgeBase(String filePath, String workspaceId, String name) {
+        String categoryId = "default";
+        String parser = "DASHSCOPE_DOCMIND";
+        String sourceType = "DATA_CENTER_FILE";
+        String structureType = "unstructured";
+        String sinkType = "DEFAULT";
+        try {
+            System.out.println("步骤1:初始化Client");
+            Client client = createClient();
+
+            System.out.println("步骤2:准备文件信息");
+            String fileName = new File(filePath).getName();
+            String fileMd5 = calculateMD5(filePath);
+            String fileSize = getFileSize(filePath);
+
+            System.out.println("步骤3:向阿里云百炼申请上传租约");
+            ApplyFileUploadLeaseResponse leaseResponse = applyLease(client, categoryId, fileName, fileMd5, fileSize, workspaceId);
+            String leaseId = leaseResponse.getBody().getData().getFileUploadLeaseId();
+            String uploadUrl = leaseResponse.getBody().getData().getParam().getUrl();
+            Object uploadHeaders = leaseResponse.getBody().getData().getParam().getHeaders();
+
+            System.out.println("步骤4:上传文件到阿里云百炼");
+            uploadFile(uploadUrl, toHeaderMap(uploadHeaders), filePath);
+
+            System.out.println("步骤5:将文件添加到阿里云百炼服务器");
+            String fileId = addFile(client, leaseId, parser, categoryId, workspaceId).getBody().getData().getFileId();
+
+            System.out.println("步骤6:检查阿里云百炼中的文件状态");
+            if (!waitForFileParsed(client, workspaceId, fileId)) {
+                return null;
+            }
+
+            System.out.println("步骤7:在阿里云百炼中创建知识库");
+            String indexId = createIndex(client, workspaceId, fileId, name, structureType, sourceType, sinkType).getBody().getData().getId();
+
+            System.out.println("步骤8:向阿里云百炼提交索引任务");
+            String jobId = submitIndex(client, workspaceId, indexId).getBody().getData().getId();
+
+            System.out.println("步骤9:获取阿里云百炼索引任务状态");
+            if (!waitForIndexJob(client, workspaceId, jobId, indexId)) {
+                return null;
+            }
+
+            System.out.println("阿里云百炼知识库创建成功!");
+            return indexId;
+        } catch (InterruptedException e) {
+            // Restore the interrupt flag so callers can observe the cancellation.
+            Thread.currentThread().interrupt();
+            System.out.println("发生错误:" + e.getMessage());
+            return null;
+        } catch (Exception e) {
+            System.out.println("发生错误:" + e.getMessage());
+            e.printStackTrace();
+            return null;
+        }
+    }
+
+    /**
+     * Replaces a document of an existing knowledge base: uploads the new file, waits for it
+     * to be parsed, appends it to the knowledge base and, once the append job has
+     * completed, deletes the old document.
+     *
+     * @param filePath    local path of the updated file
+     * @param workspaceId workspace ID
+     * @param indexId     ID of the knowledge base to update
+     * @param oldFileId   file ID of the document being replaced
+     * @return the knowledge base ID on success; {@code null} on any failure
+     */
+    public static String updateKnowledgeBase(String filePath, String workspaceId, String indexId, String oldFileId) {
+        String categoryId = "default";
+        String parser = "DASHSCOPE_DOCMIND";
+        String sourceType = "DATA_CENTER_FILE";
+        try {
+            System.out.println("步骤1:创建Client");
+            Client client = createClient();
+
+            System.out.println("步骤2:准备文件信息");
+            String fileName = Paths.get(filePath).getFileName().toString();
+            String fileMd5 = calculateMD5(filePath);
+            String fileSize = getFileSize(filePath);
+
+            System.out.println("步骤3:向阿里云百炼申请上传租约");
+            ApplyFileUploadLeaseResponse leaseResponse = applyLease(client, categoryId, fileName, fileMd5, fileSize, workspaceId);
+            String leaseId = leaseResponse.getBody().getData().getFileUploadLeaseId();
+            String uploadUrl = leaseResponse.getBody().getData().getParam().getUrl();
+            Object uploadHeaders = leaseResponse.getBody().getData().getParam().getHeaders();
+
+            System.out.println("步骤4:上传文件到临时存储");
+            uploadFile(uploadUrl, toHeaderMap(uploadHeaders), filePath);
+
+            System.out.println("步骤5:添加文件到类目中");
+            String fileId = addFile(client, leaseId, parser, categoryId, workspaceId).getBody().getData().getFileId();
+
+            System.out.println("步骤6:检查阿里云百炼中的文件状态");
+            if (!waitForFileParsed(client, workspaceId, fileId)) {
+                return null;
+            }
+
+            System.out.println("步骤7:提交追加文件任务");
+            String jobId = submitIndexAddDocumentsJob(client, workspaceId, indexId, fileId, sourceType).getBody().getData().getId();
+
+            System.out.println("步骤8:等待追加任务完成");
+            if (!waitForIndexJob(client, workspaceId, jobId, indexId)) {
+                // Keep the old document when the new one could not be indexed.
+                return null;
+            }
+
+            System.out.println("步骤9:删除旧文件");
+            deleteIndexDocument(client, workspaceId, indexId, oldFileId);
+
+            System.out.println("阿里云百炼知识库更新成功!");
+            return indexId;
+        } catch (InterruptedException e) {
+            // Restore the interrupt flag so callers can observe the cancellation.
+            Thread.currentThread().interrupt();
+            System.out.println("发生错误:" + e.getMessage());
+            return null;
+        } catch (Exception e) {
+            System.out.println("发生错误:" + e.getMessage());
+            e.printStackTrace();
+            return null;
+        }
+    }
+
+    /**
+     * Command-line entry point that updates a knowledge base.
+     *
+     * <p>Optional arguments override the built-in sample values:
+     * {@code [filePath] [indexId] [oldFileId]}.</p>
+     *
+     * @param args optional file path, knowledge base ID and ID of the file to replace
+     */
+    public static void main(String[] args) {
+        String filePath = args.length > 0 ? args[0] : "D:\\项目\\大模型\\阿里云百炼\\知识库\\薪资谈判常见100问与答.md";
+        // Knowledge base ID.
+        String indexId = args.length > 1 ? args[1] : "xlmj6e7ix1";
+        // FileId returned by the AddFile API for the document being replaced.
+        String oldFileId = args.length > 2 ? args[2] : "file_5c9f7e4e0f3e4b4ea2bd208a1b4f5e6f_12629554";
+
+        String result = updateKnowledgeBase(filePath, WORKSPACE_ID, indexId, oldFileId);
+        if (result != null) {
+            System.out.println("知识库更新成功,返回知识库ID: " + result);
+        } else {
+            System.out.println("知识库更新失败。");
+        }
+    }
+}
-- 
Gitblit v1.9.1