-
-
Notifications
You must be signed in to change notification settings - Fork 12.2k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
💄 style: add epub file chunk split support (#6317)
* ✨feat: add epub file chunk split support * ✅test: add unit test for epub chunk splitter --------- Co-authored-by: stevendong <[email protected]>
- Loading branch information
1 parent
847cd33
commit a79ab7a
Showing
9 changed files
with
374 additions
and
4 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
238 changes: 238 additions & 0 deletions
238
src/libs/langchain/loaders/epub/__tests__/__snapshots__/index.test.ts.snap
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,238 @@ | ||
// Vitest Snapshot v1, https://vitest.dev/guide/snapshot.html | ||
|
||
exports[`EPubLoader > should run 1`] = ` | ||
[ | ||
Document { | ||
"id": undefined, | ||
"metadata": { | ||
"loc": { | ||
"lines": { | ||
"from": 1, | ||
"to": 13, | ||
}, | ||
}, | ||
"source": "", | ||
}, | ||
"pageContent": "HEFTY WATER | ||
This document serves to test Reading System support for the epub:switch | ||
[http://idpf.org/epub/30/spec/epub30-contentdocs.html#sec-xhtml-content-switch] | ||
element. There is also a little bit of ruby markup | ||
[http://www.w3.org/TR/html5/the-ruby-element.html#the-ruby-element] available. | ||
THE SWITCH | ||
Below is an instance of the epub:switch element, containing Chemical Markup | ||
Language [http://en.wikipedia.org/wiki/Chemical_Markup_Language] (CML). The | ||
fallback content is a chunk of plain XHTML5.", | ||
}, | ||
Document { | ||
"id": undefined, | ||
"metadata": { | ||
"loc": { | ||
"lines": { | ||
"from": 9, | ||
"to": 22, | ||
}, | ||
}, | ||
"source": "", | ||
}, | ||
"pageContent": "THE SWITCH | ||
Below is an instance of the epub:switch element, containing Chemical Markup | ||
Language [http://en.wikipedia.org/wiki/Chemical_Markup_Language] (CML). The | ||
fallback content is a chunk of plain XHTML5. | ||
* If your Reading System supports epub:switch and CML, it will render the CML | ||
formula natively, and ignore (a.k.a not display) the XHTML fallback. | ||
* If your Reading System supports epub:switch but not CML, it will ignore (not | ||
display) the CML formula, and render the the XHTML fallback instead. | ||
* If your Reading System does not support epub:switch at all, then the | ||
rendering results are somewhat unpredictable, but the most likely result is | ||
that it will display both a failed attempt to render the CML and the XHTML | ||
fallback.", | ||
}, | ||
Document { | ||
"id": undefined, | ||
"metadata": { | ||
"loc": { | ||
"lines": { | ||
"from": 24, | ||
"to": 43, | ||
}, | ||
}, | ||
"source": "", | ||
}, | ||
"pageContent": "Note: the XHTML fallback is bold and enclosed in a gray dotted box with a | ||
slightly gray background. A failed CML rendering will most likely appear above | ||
the gray fallback box and read: | ||
"H hydrogen O oxygen hefty H O water". | ||
Here the switch begins... | ||
H hydrogen O oxygen hefty H O water | ||
2H2 + O2 ⟶ 2H2O | ||
... and here the switch ends. | ||
THE SOURCE | ||
Below is a rendition of the source code of the switch element. Your Reading | ||
System should display this correctly regardless of whether it supports the | ||
switch element.", | ||
}, | ||
Document { | ||
"id": undefined, | ||
"metadata": { | ||
"loc": { | ||
"lines": { | ||
"from": 46, | ||
"to": 66, | ||
}, | ||
}, | ||
"source": "", | ||
}, | ||
"pageContent": "<switch xmlns="http://www.idpf.org/2007/ops"> | ||
<case required-namespace="http://www.xml-cml.org/schema"> | ||
<chem xmlns="http://www.xml-cml.org/schema"> | ||
<reaction> | ||
<molecule n="2"> | ||
<atom n="2"> H </atom> | ||
<caption> hydrogen </caption> | ||
</molecule> | ||
<plus></plus> | ||
<molecule> | ||
<atom n="2"> O </atom> | ||
<caption> oxygen </caption> | ||
</molecule> | ||
<gives> | ||
<caption> hefty </caption> | ||
</gives> | ||
<molecule n="2"> | ||
<atom n="2"> H </atom> | ||
<atom> O </atom> | ||
<caption> water </caption> | ||
</molecule>", | ||
}, | ||
Document { | ||
"id": undefined, | ||
"metadata": { | ||
"loc": { | ||
"lines": { | ||
"from": 57, | ||
"to": 79, | ||
}, | ||
}, | ||
"source": "", | ||
}, | ||
"pageContent": "<caption> oxygen </caption> | ||
</molecule> | ||
<gives> | ||
<caption> hefty </caption> | ||
</gives> | ||
<molecule n="2"> | ||
<atom n="2"> H </atom> | ||
<atom> O </atom> | ||
<caption> water </caption> | ||
</molecule> | ||
</reaction> | ||
</chem> | ||
</case> | ||
<default> | ||
<p xmlns="http://www.w3.org/1999/xhtml" id="fallback"> | ||
<span>2H<sub>2</sub></span> | ||
<span>+</span> | ||
<span>O<sub>2</sub></span> | ||
<span>⟶</span> | ||
<span>2H<sub>2</sub>O</span> | ||
</p> | ||
</default> | ||
</switch>", | ||
}, | ||
Document { | ||
"id": undefined, | ||
"metadata": { | ||
"loc": { | ||
"lines": { | ||
"from": 84, | ||
"to": 94, | ||
}, | ||
}, | ||
"source": "", | ||
}, | ||
"pageContent": "HEFTY RUBY WATER | ||
While the ruby element is mostly used in east-asian languages, it can also be | ||
useful in other contexts. As an example, and as you can see in the source of the | ||
CML element above, the code includes a caption element which is intended to be | ||
displayed below the formula segments. Following this paragraph is a reworked | ||
version of the XHTML fallback used above, using the ruby element. If your | ||
Reading System does not support ruby markup, then the captions will appear in | ||
parentheses on the same line as the formula segments. | ||
2H2(hydrogen) + O2(oxygen) ⟶(hefty) 2H2O(water)", | ||
}, | ||
Document { | ||
"id": undefined, | ||
"metadata": { | ||
"loc": { | ||
"lines": { | ||
"from": 94, | ||
"to": 111, | ||
}, | ||
}, | ||
"source": "", | ||
}, | ||
"pageContent": "2H2(hydrogen) + O2(oxygen) ⟶(hefty) 2H2O(water) | ||
If your Reading System in addition to supporting ruby markup also supports the | ||
-epub-ruby-position | ||
[http://idpf.org/epub/30/spec/epub30-contentdocs.html#sec-css-ruby-position] | ||
property, then the captions will appear under the formula segments instead of | ||
over them. | ||
The source code for the ruby version of the XHTML fallback looks as follows: | ||
<p id="rubyp"> | ||
<ruby>2H<sub>2</sub><rp>(</rp><rt>hydrogen</rt><rp>)</rp></ruby> | ||
<span>+</span> | ||
<ruby>O<sub>2</sub><rp>(</rp><rt>oxygen</rt><rp>)</rp></ruby> | ||
<ruby>⟶<rp>(</rp><rt>hefty</rt><rp>)</rp></ruby> | ||
<ruby>2H<sub>2</sub>O<rp>(</rp><rt>water</rt><rp>)</rp></ruby> | ||
</p>", | ||
}, | ||
Document { | ||
"id": undefined, | ||
"metadata": { | ||
"loc": { | ||
"lines": { | ||
"from": 105, | ||
"to": 120, | ||
}, | ||
}, | ||
"source": "", | ||
}, | ||
"pageContent": "<p id="rubyp"> | ||
<ruby>2H<sub>2</sub><rp>(</rp><rt>hydrogen</rt><rp>)</rp></ruby> | ||
<span>+</span> | ||
<ruby>O<sub>2</sub><rp>(</rp><rt>oxygen</rt><rp>)</rp></ruby> | ||
<ruby>⟶<rp>(</rp><rt>hefty</rt><rp>)</rp></ruby> | ||
<ruby>2H<sub>2</sub>O<rp>(</rp><rt>water</rt><rp>)</rp></ruby> | ||
</p> | ||
... and the css declaration using the -epub-ruby-position property looks like | ||
this: | ||
p#rubyp { | ||
-epub-ruby-position : under; | ||
}", | ||
}, | ||
] | ||
`; |
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
// @vitest-environment node | ||
import * as fs from 'node:fs'; | ||
import { join } from 'node:path'; | ||
import { expect } from 'vitest'; | ||
|
||
import { EPubLoader } from '../index'; | ||
|
||
function sanitizeDynamicFields(document: any[]) { | ||
for (const doc of document) { | ||
doc.metadata.source && (doc.metadata.source = ''); | ||
} | ||
return document; | ||
} | ||
|
||
describe('EPubLoader', () => { | ||
it('should run', async () => { | ||
const content = fs.readFileSync(join(__dirname, `./demo.epub`)); | ||
|
||
const fileContent: Uint8Array = new Uint8Array(content); | ||
|
||
const data = await EPubLoader(fileContent); | ||
expect(sanitizeDynamicFields(data)).toMatchSnapshot(); | ||
}); | ||
}); |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
import { EPubLoader as Loader } from '@langchain/community/document_loaders/fs/epub'; | ||
import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter'; | ||
import { loaderConfig } from '../config'; | ||
import { TempFileManager } from '@/server/utils/tempFileManager'; | ||
|
||
export const EPubLoader = async (content: Uint8Array) => { | ||
const tempManager = new TempFileManager(); | ||
try { | ||
const tempPath = await tempManager.writeTempFile(content); | ||
const loader = new Loader(tempPath); | ||
const documents = await loader.load(); | ||
|
||
const splitter = new RecursiveCharacterTextSplitter(loaderConfig); | ||
return await splitter.splitDocuments(documents); | ||
} catch (e) { | ||
throw new Error(`EPubLoader error: ${(e as Error).message}`); | ||
} finally { | ||
tempManager.cleanup(); // 确保清理 | ||
} | ||
|
||
}; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -6,4 +6,5 @@ export type LangChainLoaderType = | |
| 'doc' | ||
| 'text' | ||
| 'latex' | ||
| 'csv'; | ||
| 'csv' | ||
| 'epub'; |
Oops, something went wrong.