/*
 * HtmlToMarkdown Text Splitter node.
 *
 * Final state of
 *   packages/components/nodes/textsplitters/HtmlToMarkdownTextSplitter/HtmlToMarkdownTextSplitter.ts
 * as produced by the two-commit series "HtmlToMarkdownTextSplitter" (1/2) and
 * "clean code" (2/2). The same series also:
 *   - adds "node-html-markdown": "^1.3.0" to packages/components/package.json
 *   - adds the icon htmlToMarkdownTextSplitter.svg next to this file
 */
import { INode, INodeData, INodeParams } from '../../../src/Interface'
import { getBaseClasses } from '../../../src/utils'
import { MarkdownTextSplitter, MarkdownTextSplitterParams } from 'langchain/text_splitter'
import { NodeHtmlMarkdown } from 'node-html-markdown'

/**
 * Parses an optional numeric node input.
 *
 * @param raw - Value read from `nodeData.inputs`; empty or missing means "not set".
 * @returns The base-10 integer, or `undefined` when the input is unset or not
 * numeric, so that the splitter's own default is used instead of `NaN`.
 */
function parseOptionalInt(raw: string | undefined): number | undefined {
    if (!raw) return undefined
    const parsed = parseInt(raw, 10)
    return Number.isNaN(parsed) ? undefined : parsed
}

/**
 * Node definition exposing {@link HtmlToMarkdownTextSplitter} with two optional
 * numeric inputs: `chunkSize` (default 1000) and `chunkOverlap`.
 */
class HtmlToMarkdownTextSplitter_TextSplitters implements INode {
    label: string
    name: string
    description: string
    type: string
    icon: string
    category: string
    baseClasses: string[]
    inputs: INodeParams[]

    constructor() {
        this.label = 'HtmlToMarkdown Text Splitter'
        this.name = 'htmlToMarkdownTextSplitter'
        this.type = 'HtmlToMarkdownTextSplitter'
        this.icon = 'htmlToMarkdownTextSplitter.svg'
        this.category = 'Text Splitters'
        this.description = `Converts Html to Markdown and then split your content into documents based on the Markdown headers`
        this.baseClasses = [this.type, ...getBaseClasses(HtmlToMarkdownTextSplitter)]
        this.inputs = [
            {
                label: 'Chunk Size',
                name: 'chunkSize',
                type: 'number',
                default: 1000,
                optional: true
            },
            {
                label: 'Chunk Overlap',
                name: 'chunkOverlap',
                type: 'number',
                optional: true
            }
        ]
    }

    /**
     * Builds the splitter from the node's inputs.
     *
     * @param nodeData - Node data whose `inputs` may carry `chunkSize` and `chunkOverlap`.
     * @returns A configured {@link HtmlToMarkdownTextSplitter}. Inputs that are
     * unset or not numeric are left out so the parent class defaults apply.
     */
    async init(nodeData: INodeData): Promise<HtmlToMarkdownTextSplitter> {
        const chunkSize = parseOptionalInt(nodeData.inputs?.chunkSize as string | undefined)
        const chunkOverlap = parseOptionalInt(nodeData.inputs?.chunkOverlap as string | undefined)

        const params: Partial<MarkdownTextSplitterParams> = {}
        if (chunkSize !== undefined) params.chunkSize = chunkSize
        if (chunkOverlap !== undefined) params.chunkOverlap = chunkOverlap

        return new HtmlToMarkdownTextSplitter(params)
    }
}

/**
 * Markdown splitter that accepts HTML: the input is first translated to
 * Markdown with `NodeHtmlMarkdown.translate`, then handed to
 * `MarkdownTextSplitter.splitText`.
 */
class HtmlToMarkdownTextSplitter extends MarkdownTextSplitter implements MarkdownTextSplitterParams {
    constructor(fields?: Partial<MarkdownTextSplitterParams>) {
        super(fields)
    }

    /**
     * Converts `text` from HTML to Markdown and splits the result.
     *
     * @param text - HTML source to split.
     * @returns The chunks produced by the parent Markdown splitter.
     * @throws Rejects with whatever the HTML translation or the parent splitter
     * throws; failures are propagated to the caller rather than left pending.
     */
    async splitText(text: string): Promise<string[]> {
        const markdown = NodeHtmlMarkdown.translate(text)
        // Return the parent's promise directly so a rejection reaches the caller.
        return super.splitText(markdown)
    }
}

module.exports = { nodeClass: HtmlToMarkdownTextSplitter_TextSplitters }