diff --git a/package.json b/package.json
index a4af1052fe095..67bd7e3190b24 100644
--- a/package.json
+++ b/package.json
@@ -162,10 +162,12 @@
"diff": "^7.0.0",
"drizzle-orm": "^0.40.0",
"drizzle-zod": "^0.5.1",
+ "epub2": "^3.0.2",
"fast-deep-equal": "^3.1.3",
"file-type": "^20.0.0",
"framer-motion": "^11.16.0",
"gpt-tokenizer": "^2.8.1",
+ "html-to-text": "^9.0.5",
"i18next": "^24.2.1",
"i18next-browser-languagedetector": "^8.0.2",
"i18next-resources-to-backend": "^1.2.1",
diff --git a/src/database/client/migrations.json b/src/database/client/migrations.json
index 543549ad41233..171b636570c1f 100644
--- a/src/database/client/migrations.json
+++ b/src/database/client/migrations.json
@@ -223,7 +223,10 @@
"hash": "9646161fa041354714f823d726af27247bcd6e60fa3be5698c0d69f337a5700b"
},
{
- "sql": ["DROP TABLE \"user_budgets\";", "\nDROP TABLE \"user_subscriptions\";"],
+ "sql": [
+ "DROP TABLE \"user_budgets\";",
+ "\nDROP TABLE \"user_subscriptions\";"
+ ],
"bps": true,
"folderMillis": 1729699958471,
"hash": "7dad43a2a25d1aec82124a4e53f8d82f8505c3073f23606c1dc5d2a4598eacf9"
@@ -295,7 +298,9 @@
"hash": "845a692ceabbfc3caf252a97d3e19a213bc0c433df2689900135f9cfded2cf49"
},
{
- "sql": ["ALTER TABLE \"messages\" ADD COLUMN \"reasoning\" jsonb;"],
+ "sql": [
+ "ALTER TABLE \"messages\" ADD COLUMN \"reasoning\" jsonb;"
+ ],
"bps": true,
"folderMillis": 1737609172353,
"hash": "2cb36ae4fcdd7b7064767e04bfbb36ae34518ff4bb1b39006f2dd394d1893868"
@@ -309,4 +314,4 @@
"folderMillis": 1739901891891,
"hash": "78d8fefd8c58938d7bc3da2295a73b35ce2e8d7cb2820f8e817acdb8dd5bebb2"
}
-]
+]
\ No newline at end of file
diff --git a/src/libs/langchain/loaders/epub/__tests__/__snapshots__/index.test.ts.snap b/src/libs/langchain/loaders/epub/__tests__/__snapshots__/index.test.ts.snap
new file mode 100644
index 0000000000000..ffcb40644c23d
--- /dev/null
+++ b/src/libs/langchain/loaders/epub/__tests__/__snapshots__/index.test.ts.snap
@@ -0,0 +1,238 @@
+// Vitest Snapshot v1, https://vitest.dev/guide/snapshot.html
+
+exports[`EPubLoader > should run 1`] = `
+[
+ Document {
+ "id": undefined,
+ "metadata": {
+ "loc": {
+ "lines": {
+ "from": 1,
+ "to": 13,
+ },
+ },
+ "source": "",
+ },
+ "pageContent": "HEFTY WATER
+
+This document serves to test Reading System support for the epub:switch
+[http://idpf.org/epub/30/spec/epub30-contentdocs.html#sec-xhtml-content-switch]
+element. There is also a little bit of ruby markup
+[http://www.w3.org/TR/html5/the-ruby-element.html#the-ruby-element] available.
+
+
+THE SWITCH
+
+Below is an instance of the epub:switch element, containing Chemical Markup
+Language [http://en.wikipedia.org/wiki/Chemical_Markup_Language] (CML). The
+fallback content is a chunk of plain XHTML5.",
+ },
+ Document {
+ "id": undefined,
+ "metadata": {
+ "loc": {
+ "lines": {
+ "from": 9,
+ "to": 22,
+ },
+ },
+ "source": "",
+ },
+ "pageContent": "THE SWITCH
+
+Below is an instance of the epub:switch element, containing Chemical Markup
+Language [http://en.wikipedia.org/wiki/Chemical_Markup_Language] (CML). The
+fallback content is a chunk of plain XHTML5.
+
+ * If your Reading System supports epub:switch and CML, it will render the CML
+ formula natively, and ignore (a.k.a not display) the XHTML fallback.
+ * If your Reading System supports epub:switch but not CML, it will ignore (not
+ display) the CML formula, and render the the XHTML fallback instead.
+ * If your Reading System does not support epub:switch at all, then the
+ rendering results are somewhat unpredictable, but the most likely result is
+ that it will display both a failed attempt to render the CML and the XHTML
+ fallback.",
+ },
+ Document {
+ "id": undefined,
+ "metadata": {
+ "loc": {
+ "lines": {
+ "from": 24,
+ "to": 43,
+ },
+ },
+ "source": "",
+ },
+ "pageContent": "Note: the XHTML fallback is bold and enclosed in a gray dotted box with a
+slightly gray background. A failed CML rendering will most likely appear above
+the gray fallback box and read:
+"H hydrogen O oxygen hefty H O water".
+
+Here the switch begins...
+
+
+H hydrogen O oxygen hefty H O water
+
+2H2 + O2 ⟶ 2H2O
+
+... and here the switch ends.
+
+
+THE SOURCE
+
+Below is a rendition of the source code of the switch element. Your Reading
+System should display this correctly regardless of whether it supports the
+switch element.",
+ },
+ Document {
+ "id": undefined,
+ "metadata": {
+ "loc": {
+ "lines": {
+ "from": 46,
+ "to": 66,
+ },
+ },
+ "source": "",
+ },
+ "pageContent": "
+ 2H2
+ +
+ O2
+ ⟶
+ 2H2O
+
+ 2H2 + + + O2 + ⟶ + 2H2O +
", + }, + Document { + "id": undefined, + "metadata": { + "loc": { + "lines": { + "from": 105, + "to": 120, + }, + }, + "source": "", + }, + "pageContent": "+ 2H2 + + + O2 + ⟶ + 2H2O +
+ + +... and the css declaration using the -epub-ruby-position property looks like +this: + + +p#rubyp { + -epub-ruby-position : under; +}", + }, +] +`; diff --git a/src/libs/langchain/loaders/epub/__tests__/demo.epub b/src/libs/langchain/loaders/epub/__tests__/demo.epub new file mode 100644 index 0000000000000..11cc0a74cf7cc Binary files /dev/null and b/src/libs/langchain/loaders/epub/__tests__/demo.epub differ diff --git a/src/libs/langchain/loaders/epub/__tests__/index.test.ts b/src/libs/langchain/loaders/epub/__tests__/index.test.ts new file mode 100644 index 0000000000000..4ee843b86379d --- /dev/null +++ b/src/libs/langchain/loaders/epub/__tests__/index.test.ts @@ -0,0 +1,24 @@ +// @vitest-environment node +import * as fs from 'node:fs'; +import { join } from 'node:path'; +import { expect } from 'vitest'; + +import { EPubLoader } from '../index'; + +function sanitizeDynamicFields(document: any[]) { + for (const doc of document) { + doc.metadata.source && (doc.metadata.source = ''); + } + return document; +} + +describe('EPubLoader', () => { + it('should run', async () => { + const content = fs.readFileSync(join(__dirname, `./demo.epub`)); + + const fileContent: Uint8Array = new Uint8Array(content); + + const data = await EPubLoader(fileContent); + expect(sanitizeDynamicFields(data)).toMatchSnapshot(); + }); +}); diff --git a/src/libs/langchain/loaders/epub/index.ts b/src/libs/langchain/loaders/epub/index.ts new file mode 100644 index 0000000000000..1481e49bba235 --- /dev/null +++ b/src/libs/langchain/loaders/epub/index.ts @@ -0,0 +1,21 @@ +import { EPubLoader as Loader } from '@langchain/community/document_loaders/fs/epub'; +import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter'; +import { loaderConfig } from '../config'; +import { TempFileManager } from '@/server/utils/tempFileManager'; + +export const EPubLoader = async (content: Uint8Array) => { + const tempManager = new TempFileManager(); + try { + const tempPath = await tempManager.writeTempFile(content); + const loader = new Loader(tempPath); + const documents = await loader.load(); + + const splitter = new RecursiveCharacterTextSplitter(loaderConfig); + return await splitter.splitDocuments(documents); + } catch (e) { + throw new Error(`EPubLoader error: ${(e as Error).message}`); + } finally { + tempManager.cleanup(); // 确保清理 + } + +}; diff --git a/src/libs/langchain/loaders/index.ts b/src/libs/langchain/loaders/index.ts index 50c91c8fc94cd..b4763320b99a2 100644 --- a/src/libs/langchain/loaders/index.ts +++ b/src/libs/langchain/loaders/index.ts @@ -14,6 +14,7 @@ import { MarkdownLoader } from './markdown'; import { PdfLoader } from './pdf'; import { PPTXLoader } from './pptx'; import { TextLoader } from './txt'; +import { EPubLoader } from './epub'; class LangChainError extends Error { constructor(message: string) { @@ -64,6 +65,10 @@ export class ChunkingLoader { return await CsVLoader(fileBlob); } + case 'epub': { + return await EPubLoader(content); + } + default: { throw new Error( `Unsupported file type [${type}], please check your file is supported, or create report issue here: https://github.com/lobehub/lobe-chat/discussions/3550`, @@ -100,6 +105,10 @@ export class ChunkingLoader { return 'csv'; } + if (filename.endsWith('epub')) { + return 'epub'; + } + const ext = filename.split('.').pop(); if (ext && SupportedTextSplitterLanguages.includes(ext as SupportedTextSplitterLanguage)) { diff --git a/src/libs/langchain/types.ts b/src/libs/langchain/types.ts index 0bcc746bcd9ce..5512fb9524774 100644 --- a/src/libs/langchain/types.ts +++ b/src/libs/langchain/types.ts @@ -6,4 +6,5 @@ export type LangChainLoaderType = | 'doc' | 'text' | 'latex' - | 'csv'; + | 'csv' + | 'epub'; diff --git a/src/server/utils/tempFileManager.ts b/src/server/utils/tempFileManager.ts new file mode 100644 index 0000000000000..908b457717df5 --- /dev/null +++ b/src/server/utils/tempFileManager.ts @@ -0,0 +1,70 @@ +import { mkdtempSync, rmSync , writeFileSync, existsSync } from 'node:fs'; +import { tmpdir } from 'node:os'; +import { join } from 'node:path'; +import { v4 as uuidv4 } from 'uuid'; + +/** + * 安全存储临时文件工具类 + */ +export class TempFileManager { + private readonly tempDir: string; + private filePaths: Set