diff --git a/package.json b/package.json index a4af1052fe095..67bd7e3190b24 100644 --- a/package.json +++ b/package.json @@ -162,10 +162,12 @@ "diff": "^7.0.0", "drizzle-orm": "^0.40.0", "drizzle-zod": "^0.5.1", + "epub2": "^3.0.2", "fast-deep-equal": "^3.1.3", "file-type": "^20.0.0", "framer-motion": "^11.16.0", "gpt-tokenizer": "^2.8.1", + "html-to-text": "^9.0.5", "i18next": "^24.2.1", "i18next-browser-languagedetector": "^8.0.2", "i18next-resources-to-backend": "^1.2.1", diff --git a/src/database/client/migrations.json b/src/database/client/migrations.json index 543549ad41233..171b636570c1f 100644 --- a/src/database/client/migrations.json +++ b/src/database/client/migrations.json @@ -223,7 +223,10 @@ "hash": "9646161fa041354714f823d726af27247bcd6e60fa3be5698c0d69f337a5700b" }, { - "sql": ["DROP TABLE \"user_budgets\";", "\nDROP TABLE \"user_subscriptions\";"], + "sql": [ + "DROP TABLE \"user_budgets\";", + "\nDROP TABLE \"user_subscriptions\";" + ], "bps": true, "folderMillis": 1729699958471, "hash": "7dad43a2a25d1aec82124a4e53f8d82f8505c3073f23606c1dc5d2a4598eacf9" @@ -295,7 +298,9 @@ "hash": "845a692ceabbfc3caf252a97d3e19a213bc0c433df2689900135f9cfded2cf49" }, { - "sql": ["ALTER TABLE \"messages\" ADD COLUMN \"reasoning\" jsonb;"], + "sql": [ + "ALTER TABLE \"messages\" ADD COLUMN \"reasoning\" jsonb;" + ], "bps": true, "folderMillis": 1737609172353, "hash": "2cb36ae4fcdd7b7064767e04bfbb36ae34518ff4bb1b39006f2dd394d1893868" @@ -309,4 +314,4 @@ "folderMillis": 1739901891891, "hash": "78d8fefd8c58938d7bc3da2295a73b35ce2e8d7cb2820f8e817acdb8dd5bebb2" } -] +] \ No newline at end of file diff --git a/src/libs/langchain/loaders/epub/__tests__/__snapshots__/index.test.ts.snap b/src/libs/langchain/loaders/epub/__tests__/__snapshots__/index.test.ts.snap new file mode 100644 index 0000000000000..ffcb40644c23d --- /dev/null +++ b/src/libs/langchain/loaders/epub/__tests__/__snapshots__/index.test.ts.snap @@ -0,0 +1,238 @@ +// Vitest Snapshot v1, 
https://vitest.dev/guide/snapshot.html + +exports[`EPubLoader > should run 1`] = ` +[ + Document { + "id": undefined, + "metadata": { + "loc": { + "lines": { + "from": 1, + "to": 13, + }, + }, + "source": "", + }, + "pageContent": "HEFTY WATER + +This document serves to test Reading System support for the epub:switch +[http://idpf.org/epub/30/spec/epub30-contentdocs.html#sec-xhtml-content-switch] +element. There is also a little bit of ruby markup +[http://www.w3.org/TR/html5/the-ruby-element.html#the-ruby-element] available. + + +THE SWITCH + +Below is an instance of the epub:switch element, containing Chemical Markup +Language [http://en.wikipedia.org/wiki/Chemical_Markup_Language] (CML). The +fallback content is a chunk of plain XHTML5.", + }, + Document { + "id": undefined, + "metadata": { + "loc": { + "lines": { + "from": 9, + "to": 22, + }, + }, + "source": "", + }, + "pageContent": "THE SWITCH + +Below is an instance of the epub:switch element, containing Chemical Markup +Language [http://en.wikipedia.org/wiki/Chemical_Markup_Language] (CML). The +fallback content is a chunk of plain XHTML5. + + * If your Reading System supports epub:switch and CML, it will render the CML + formula natively, and ignore (a.k.a not display) the XHTML fallback. + * If your Reading System supports epub:switch but not CML, it will ignore (not + display) the CML formula, and render the the XHTML fallback instead. + * If your Reading System does not support epub:switch at all, then the + rendering results are somewhat unpredictable, but the most likely result is + that it will display both a failed attempt to render the CML and the XHTML + fallback.", + }, + Document { + "id": undefined, + "metadata": { + "loc": { + "lines": { + "from": 24, + "to": 43, + }, + }, + "source": "", + }, + "pageContent": "Note: the XHTML fallback is bold and enclosed in a gray dotted box with a +slightly gray background. 
A failed CML rendering will most likely appear above +the gray fallback box and read: +"H hydrogen O oxygen hefty H O water". + +Here the switch begins... + + +H hydrogen O oxygen hefty H O water + +2H2 + O2 ⟶ 2H2O + +... and here the switch ends. + + +THE SOURCE + +Below is a rendition of the source code of the switch element. Your Reading +System should display this correctly regardless of whether it supports the +switch element.", + }, + Document { + "id": undefined, + "metadata": { + "loc": { + "lines": { + "from": 46, + "to": 66, + }, + }, + "source": "", + }, + "pageContent": " + + + + + H + hydrogen + + + + O + oxygen + + + hefty + + + H + O + water + ", + }, + Document { + "id": undefined, + "metadata": { + "loc": { + "lines": { + "from": 57, + "to": 79, + }, + }, + "source": "", + }, + "pageContent": " oxygen + + + hefty + + + H + O + water + + + + + +

+ 2H2 + + + O2 + + 2H2O +

+
+
", + }, + Document { + "id": undefined, + "metadata": { + "loc": { + "lines": { + "from": 84, + "to": 94, + }, + }, + "source": "", + }, + "pageContent": "HEFTY RUBY WATER + +While the ruby element is mostly used in east-asian languages, it can also be +useful in other contexts. As an example, and as you can see in the source of the +CML element above, the code includes a caption element which is intended to be +displayed below the formula segments. Following this paragraph is a reworked +version of the XHTML fallback used above, using the ruby element. If your +Reading System does not support ruby markup, then the captions will appear in +parentheses on the same line as the formula segments. + +2H2(hydrogen) + O2(oxygen) ⟶(hefty) 2H2O(water)", + }, + Document { + "id": undefined, + "metadata": { + "loc": { + "lines": { + "from": 94, + "to": 111, + }, + }, + "source": "", + }, + "pageContent": "2H2(hydrogen) + O2(oxygen) ⟶(hefty) 2H2O(water) + +If your Reading System in addition to supporting ruby markup also supports the +-epub-ruby-position +[http://idpf.org/epub/30/spec/epub30-contentdocs.html#sec-css-ruby-position] +property, then the captions will appear under the formula segments instead of +over them. + +The source code for the ruby version of the XHTML fallback looks as follows: + + +

+ 2H2(hydrogen) + + + O2(oxygen) + (hefty) + 2H2O(water) +

", + }, + Document { + "id": undefined, + "metadata": { + "loc": { + "lines": { + "from": 105, + "to": 120, + }, + }, + "source": "", + }, + "pageContent": "

+ 2H2(hydrogen) + + + O2(oxygen) + (hefty) + 2H2O(water) +

/**
 * Loads an EPUB document from raw bytes and splits it into chunks.
 *
 * The bytes are first written to a temporary file through `TempFileManager`,
 * because the underlying loader is constructed from a file path. The loaded
 * documents are then split with a `RecursiveCharacterTextSplitter` configured
 * by `loaderConfig`.
 *
 * @param content - Raw bytes of the EPUB file.
 * @returns The documents produced by the splitter.
 * @throws Error prefixed with `EPubLoader error:` when writing, loading or
 *   splitting fails.
 */
export const EPubLoader = async (content: Uint8Array) => {
  const tempManager = new TempFileManager();
  try {
    const tempPath = await tempManager.writeTempFile(content);
    const loader = new Loader(tempPath);
    const documents = await loader.load();

    const splitter = new RecursiveCharacterTextSplitter(loaderConfig);
    return await splitter.splitDocuments(documents);
  } catch (e) {
    // A thrown value is not guaranteed to be an Error; narrow it instead of
    // asserting, so the message never degrades to "undefined".
    const reason = e instanceof Error ? e.message : String(e);
    throw new Error(`EPubLoader error: ${reason}`);
  } finally {
    // Always remove the temporary file, whether loading succeeded or failed.
    tempManager.cleanup();
  }
};
/**
 * Utility class that stores temporary files inside a private, uniquely named
 * directory and removes that directory again on cleanup.
 *
 * Process-level cleanup hooks (`exit`, `uncaughtException`, `SIGINT`,
 * `SIGTERM`) are installed only once per process, no matter how many managers
 * are created; they clean up every manager that is still active.
 */
export class TempFileManager {
  /** Managers whose temporary directory has not been removed yet. */
  private static readonly activeManagers = new Set<TempFileManager>();
  /** Guards the one-time installation of the process-level hooks. */
  private static hooksRegistered = false;

  private readonly tempDir: string;
  private filePaths: Set<string> = new Set();

  constructor() {
    // Create a unique temporary directory under the OS temp dir.
    this.tempDir = mkdtempSync(join(tmpdir(), 'epub-'));
    TempFileManager.activeManagers.add(this);
    // Make sure the exit/crash/signal hooks are in place.
    this.registerCleanupHook();
  }

  /**
   * Writes a `Uint8Array` to a new file inside the temporary directory.
   *
   * @param data - File contents.
   * @param ext - File extension, including the dot (defaults to `.epub`).
   * @returns Absolute path of the written file.
   * @throws Error prefixed with `Failed to write temp file:` when the write
   *   fails; the temporary directory is removed before the error is thrown.
   */
  async writeTempFile(data: Uint8Array, ext = '.epub'): Promise<string> {
    const filePath = join(this.tempDir, `${uuidv4()}${ext}`);

    try {
      writeFileSync(filePath, data);
      this.filePaths.add(filePath);
      return filePath;
    } catch (error) {
      this.cleanup(); // clean up immediately when the write fails
      const reason = error instanceof Error ? error.message : String(error);
      throw new Error(`Failed to write temp file: ${reason}`);
    }
  }

  /**
   * Removes the temporary directory and everything in it.
   * Safe to call more than once.
   */
  cleanup(): void {
    if (existsSync(this.tempDir)) {
      // Recursively delete the directory and its contents.
      rmSync(this.tempDir, { force: true, recursive: true });
    }
    this.filePaths.clear();
    // Deregister only after removal succeeded, so a failed removal is retried
    // by the process-level hooks.
    TempFileManager.activeManagers.delete(this);
  }

  /**
   * Installs the process exit / crash / signal hooks, once per process.
   *
   * Registering per instance would add four listeners to `process` for every
   * manager ever created and keep each instance reachable forever.
   */
  private registerCleanupHook(): void {
    if (TempFileManager.hooksRegistered) return;
    TempFileManager.hooksRegistered = true;

    const cleanupAll = (): void => {
      // Iterate over a copy: cleanup() mutates the set.
      for (const manager of [...TempFileManager.activeManagers]) {
        try {
          manager.cleanup();
        } catch (error) {
          // One failing directory must not prevent cleaning the others.
          console.error('Failed to clean temp files:', error);
        }
      }
    };

    // Normal exit
    process.on('exit', cleanupAll);
    // Abnormal exit
    process.on('uncaughtException', (err) => {
      console.error('Uncaught exception, cleaning temp files:', err);
      cleanupAll();
      process.exit(1);
    });
    // Termination by signal
    for (const signal of ['SIGINT', 'SIGTERM'] as const) {
      process.on(signal, () => {
        cleanupAll();
        process.exit(0);
      });
    }
  }
}