Feature/add ability to upload file from chat (#3059)

add ability to upload file from chat
2026-06-28 17:01:00 +03:00 · 2024-08-25 13:22:48 +01:00
parent e8f5f07735
commit 66acd0c000
37 changed files with 1111 additions and 259 deletions
@@ -108,6 +108,7 @@ class Csv_DocumentLoaders implements INode {
            const chatflowid = options.chatflowid

            for (const file of files) {
+                if (!file) continue
                const fileData = await getFileFromStorage(file, chatflowid)
                const blob = new Blob([fileData])
                const loader = new CSVLoader(blob, columnName.trim().length === 0 ? undefined : columnName.trim())
@@ -127,6 +128,7 @@ class Csv_DocumentLoaders implements INode {
            }

            for (const file of files) {
+                if (!file) continue
                const splitDataURI = file.split(',')
                splitDataURI.pop()
                const bf = Buffer.from(splitDataURI.pop() || '', 'base64')
@@ -83,6 +83,7 @@ class Docx_DocumentLoaders implements INode {
            const chatflowid = options.chatflowid

            for (const file of files) {
+                if (!file) continue
                const fileData = await getFileFromStorage(file, chatflowid)
                const blob = new Blob([fileData])
                const loader = new DocxLoader(blob)
@@ -103,6 +104,7 @@ class Docx_DocumentLoaders implements INode {
            }

            for (const file of files) {
+                if (!file) continue
                const splitDataURI = file.split(',')
                splitDataURI.pop()
                const bf = Buffer.from(splitDataURI.pop() || '', 'base64')
@@ -0,0 +1,299 @@
+import { omit } from 'lodash'
+import { ICommonObject, INode, INodeData, INodeParams } from '../../../src/Interface'
+import { TextSplitter } from 'langchain/text_splitter'
+import { TextLoader } from 'langchain/document_loaders/fs/text'
+import { JSONLinesLoader, JSONLoader } from 'langchain/document_loaders/fs/json'
+import { CSVLoader } from '@langchain/community/document_loaders/fs/csv'
+import { PDFLoader } from '@langchain/community/document_loaders/fs/pdf'
+import { DocxLoader } from '@langchain/community/document_loaders/fs/docx'
+import { BaseDocumentLoader } from 'langchain/document_loaders/base'
+import { Document } from '@langchain/core/documents'
+import { getFileFromStorage } from '../../../src/storageUtils'
+import { mapMimeTypeToExt } from '../../../src/utils'
+
+/**
+ * Generic "File Loader" node: turns one or more uploaded files into documents,
+ * choosing a concrete loader per file from its extension (stored files) or from
+ * the MIME type in its data URI (inline base64 uploads).
+ */
+class File_DocumentLoaders implements INode {
+    label: string
+    name: string
+    version: number
+    description: string
+    type: string
+    icon: string
+    category: string
+    baseClasses: string[]
+    inputs: INodeParams[]
+
+    constructor() {
+        this.label = 'File Loader'
+        this.name = 'fileLoader'
+        this.version = 1.0
+        this.type = 'Document'
+        this.icon = 'file.svg'
+        this.category = 'Document Loaders'
+        this.description = `A generic file loader that can load txt, json, csv, docx, pdf, and other files`
+        this.baseClasses = [this.type]
+        this.inputs = [
+            {
+                label: 'File',
+                name: 'file',
+                type: 'file',
+                fileType: '*'
+            },
+            {
+                label: 'Text Splitter',
+                name: 'textSplitter',
+                type: 'TextSplitter',
+                optional: true
+            },
+            {
+                label: 'Pdf Usage',
+                name: 'pdfUsage',
+                type: 'options',
+                description: 'Only when loading PDF files',
+                options: [
+                    {
+                        label: 'One document per page',
+                        name: 'perPage'
+                    },
+                    {
+                        label: 'One document per file',
+                        name: 'perFile'
+                    }
+                ],
+                default: 'perPage',
+                optional: true,
+                additionalParams: true
+            },
+            {
+                label: 'JSONL Pointer Extraction',
+                name: 'pointerName',
+                type: 'string',
+                description: 'Only when loading JSONL files',
+                placeholder: '<pointerName>',
+                optional: true,
+                additionalParams: true
+            },
+            {
+                label: 'Additional Metadata',
+                name: 'metadata',
+                type: 'json',
+                description: 'Additional metadata to be added to the extracted documents',
+                optional: true,
+                additionalParams: true
+            },
+            {
+                label: 'Omit Metadata Keys',
+                name: 'omitMetadataKeys',
+                type: 'string',
+                rows: 4,
+                description:
+                    'Each document loader comes with a default set of metadata keys that are extracted from the document. You can use this field to omit some of the default metadata keys. The value should be a list of keys, seperated by comma. Use * to omit all metadata keys execept the ones you specify in the Additional Metadata field',
+                placeholder: 'key1, key2, key3.nestedKey1',
+                optional: true,
+                additionalParams: true
+            }
+        ]
+    }
+
+    /**
+     * Loads every supplied file and returns the resulting documents.
+     *
+     * Files come either from storage (`FILE-STORAGE::name` or
+     * `FILE-STORAGE::["a","b"]`) or inline as base64 data URIs (a single URI or
+     * a JSON array of them). Type-specific override inputs collected by
+     * `getOverrideFileInputs` take precedence over the generic `file` input.
+     *
+     * @param nodeData - node configuration; reads `inputs.file`, `textSplitter`,
+     *   `pdfUsage`, `pointerName`, `metadata` and `omitMetadataKeys`
+     * @param _ - unused
+     * @param options - reads `chatflowid`, passed to `getFileFromStorage`
+     * @returns the loaded (and optionally split) documents with adjusted
+     *   metadata; an empty array when no file was supplied
+     */
+    async init(nodeData: INodeData, _: string, options: ICommonObject): Promise<any> {
+        const textSplitter = nodeData.inputs?.textSplitter as TextSplitter
+        const fileBase64 = nodeData.inputs?.file as string
+        const metadata = nodeData.inputs?.metadata
+        const pdfUsage = nodeData.inputs?.pdfUsage
+        const pointerName = nodeData.inputs?.pointerName as string
+        const _omitMetadataKeys = nodeData.inputs?.omitMetadataKeys as string
+
+        let omitMetadataKeys: string[] = []
+        if (_omitMetadataKeys) {
+            omitMetadataKeys = _omitMetadataKeys.split(',').map((key) => key.trim())
+        }
+
+        // Extension -> loader factory. The '' entry is the plain-text fallback used
+        // for any file whose type cannot be mapped to one of the other entries.
+        const loaders: LoadersMapping = {
+            json: (blob) => new JSONLoader(blob),
+            // pointerName is an optional input: default to '' so an unset value
+            // does not raise a TypeError on .trim()
+            jsonl: (blob) => new JSONLinesLoader(blob, '/' + (pointerName ?? '').trim()),
+            txt: (blob) => new TextLoader(blob),
+            csv: (blob) => new CSVLoader(blob),
+            // NOTE(review): xls/xlsx are binary spreadsheet formats and doc is not
+            // docx; confirm CSVLoader/DocxLoader really handle them
+            xls: (blob) => new CSVLoader(blob),
+            xlsx: (blob) => new CSVLoader(blob),
+            docx: (blob) => new DocxLoader(blob),
+            doc: (blob) => new DocxLoader(blob),
+            pdf: (blob) =>
+                pdfUsage === 'perFile'
+                    ? // @ts-ignore
+                      new PDFLoader(blob, { splitPages: false, pdfjs: () => import('pdf-parse/lib/pdf.js/v1.10.100/build/pdf.js') })
+                    : // @ts-ignore
+                      new PDFLoader(blob, { pdfjs: () => import('pdf-parse/lib/pdf.js/v1.10.100/build/pdf.js') }),
+            '': (blob) => new TextLoader(blob)
+        }
+
+        // Normalises an extension to a key of `loaders`: case-insensitive, and
+        // anything without a dedicated loader becomes '' (plain text) so stored
+        // files such as README.md behave like base64 uploads of unknown type.
+        const resolveExt = (ext: string): string => {
+            const normalized = ext.toLowerCase()
+            return Object.prototype.hasOwnProperty.call(loaders, normalized) ? normalized : ''
+        }
+
+        let files: string[] = []
+        const fileBlobs: { blob: Blob; ext: string }[] = []
+
+        //FILE-STORAGE::["CONTRIBUTING.md","LICENSE.md","README.md"]
+        // Fall back to '' so a missing file input yields no documents instead of
+        // a TypeError on .startsWith
+        const totalFiles = getOverrideFileInputs(nodeData) || fileBase64 || ''
+        if (totalFiles.startsWith('FILE-STORAGE::')) {
+            const fileName = totalFiles.replace('FILE-STORAGE::', '')
+            if (fileName.startsWith('[') && fileName.endsWith(']')) {
+                files = JSON.parse(fileName)
+            } else {
+                files = [fileName]
+            }
+            const chatflowid = options.chatflowid
+
+            for (const file of files) {
+                if (!file) continue
+                const fileData = await getFileFromStorage(file, chatflowid)
+                const blob = new Blob([fileData])
+                // A name without a dot has no extension (previously the whole
+                // file name was used as the extension)
+                const dotIndex = file.lastIndexOf('.')
+                const rawExt = dotIndex === -1 ? '' : file.slice(dotIndex + 1)
+                fileBlobs.push({ blob, ext: resolveExt(rawExt) })
+            }
+        } else {
+            if (totalFiles.startsWith('[') && totalFiles.endsWith(']')) {
+                files = JSON.parse(totalFiles)
+            } else {
+                files = [totalFiles]
+            }
+
+            for (const file of files) {
+                if (!file) continue
+                // The last comma-separated segment is dropped and the one before
+                // it is decoded as the base64 payload
+                const splitDataURI = file.split(',')
+                splitDataURI.pop()
+                const bf = Buffer.from(splitDataURI.pop() || '', 'base64')
+                const blob = new Blob([bf])
+
+                // MIME type names may contain digits, dots and underscores
+                // (e.g. application/vnd.ms-excel), so those must be matched too
+                // eslint-disable-next-line no-useless-escape
+                const match = file.match(/^data:([A-Za-z0-9._+\/-]+);base64,/)
+                fileBlobs.push({
+                    blob,
+                    ext: match ? resolveExt(mapMimeTypeToExt(match[1])) : ''
+                })
+            }
+        }
+
+        const loader = new MultiFileLoader(fileBlobs, loaders)
+
+        let docs: Document[] = await loader.load()
+        if (textSplitter) {
+            docs = await textSplitter.splitDocuments(docs)
+        }
+
+        // Additional metadata may arrive as an object or as a JSON string
+        const parsedMetadata = metadata ? (typeof metadata === 'object' ? metadata : JSON.parse(metadata)) : {}
+
+        // '*' drops every loader-provided key and keeps only the additional
+        // metadata; otherwise the listed keys are omitted from the merged metadata
+        return docs.map((doc) => ({
+            ...doc,
+            metadata:
+                _omitMetadataKeys === '*'
+                    ? {
+                          ...parsedMetadata
+                      }
+                    : omit(
+                          {
+                              ...doc.metadata,
+                              ...parsedMetadata
+                          },
+                          omitMetadataKeys
+                      )
+        }))
+    }
+}
+
+/**
+ * Collects the type-specific file inputs (txtFile, pdfFile, jsonFile, csvFile,
+ * jsonlinesFile, docxFile, yamlFile) into one storage reference.
+ *
+ * @param nodeData - node configuration whose `inputs` are inspected
+ * @returns `FILE-STORAGE::` followed by the JSON array of all collected file
+ *   names, or an empty string when none of those inputs is set
+ */
+const getOverrideFileInputs = (nodeData: INodeData) => {
+    // Drops the storage prefix and expands a JSON array value into its entries
+    const stripStoragePrefix = (value: string): string[] => {
+        const stripped = value.replace('FILE-STORAGE::', '')
+        const looksLikeArray = stripped.startsWith('[') && stripped.endsWith(']')
+        return looksLikeArray ? JSON.parse(stripped) : [stripped]
+    }
+
+    // Order matters: it determines the order of the combined file list
+    const overrideKeys = ['txtFile', 'pdfFile', 'jsonFile', 'csvFile', 'jsonlinesFile', 'docxFile', 'yamlFile']
+
+    const collected: string[] = []
+    for (const key of overrideKeys) {
+        const value = nodeData.inputs?.[key] as string
+        if (value) {
+            collected.push(...stripStoragePrefix(value))
+        }
+    }
+
+    return collected.length === 0 ? '' : `FILE-STORAGE::${JSON.stringify(collected)}`
+}
+
+/**
+ * Lookup table from a file extension (no leading dot) to a factory that builds
+ * the document loader for a blob of that type.
+ */
+interface LoadersMapping {
+    [ext: string]: (data: Blob) => BaseDocumentLoader
+}
+
+/**
+ * Document loader that loads a list of in-memory blobs, delegating each one to
+ * the loader factory registered for its extension.
+ */
+class MultiFileLoader extends BaseDocumentLoader {
+    /**
+     * @param fileBlobs - blobs to load, each tagged with the extension used to pick a loader
+     * @param loaders - extension -> loader factory table; must not be empty
+     * @throws Error when `loaders` has no entries
+     */
+    constructor(public fileBlobs: { blob: Blob; ext: string }[], public loaders: LoadersMapping) {
+        super()
+
+        if (Object.keys(loaders).length === 0) {
+            throw new Error('Must provide at least one loader')
+        }
+    }
+
+    /**
+     * Loads all blobs in order and concatenates their documents.
+     *
+     * @returns the documents of every blob, in input order
+     * @throws Error when a blob's extension has no registered loader
+     */
+    public async load(): Promise<Document[]> {
+        const documents: Document[] = []
+
+        for (const { blob, ext } of this.fileBlobs) {
+            // Own-property check: a plain index lookup would also resolve inherited
+            // Object.prototype members (e.g. ext "constructor") and then fail with
+            // an unrelated TypeError further down
+            const loaderFactory = Object.prototype.hasOwnProperty.call(this.loaders, ext) ? this.loaders[ext] : undefined
+            if (!loaderFactory) {
+                throw new Error(`Error loading file: no loader registered for extension "${ext}"`)
+            }
+            documents.push(...(await loaderFactory(blob).load()))
+        }
+
+        return documents
+    }
+}
+
+module.exports = { nodeClass: File_DocumentLoaders }
@@ -0,0 +1 @@
+<svg  xmlns="http://www.w3.org/2000/svg"  width="24"  height="24"  viewBox="0 0 24 24"  fill="none"  stroke="currentColor"  stroke-width="2"  stroke-linecap="round"  stroke-linejoin="round"  class="icon icon-tabler icons-tabler-outline icon-tabler-files"><path stroke="none" d="M0 0h24v24H0z" fill="none"/><path d="M15 3v4a1 1 0 0 0 1 1h4" /><path d="M18 17h-7a2 2 0 0 1 -2 -2v-10a2 2 0 0 1 2 -2h4l5 5v7a2 2 0 0 1 -2 2z" /><path d="M16 17v2a2 2 0 0 1 -2 2h-7a2 2 0 0 1 -2 -2v-10a2 2 0 0 1 2 -2h2" /></svg>
@@ -3,7 +3,7 @@ import { INode, INodeData, INodeParams } from '../../../src/Interface'
 import { TextSplitter } from 'langchain/text_splitter'
 import { TextLoader } from 'langchain/document_loaders/fs/text'
 import { DirectoryLoader } from 'langchain/document_loaders/fs/directory'
-import { JSONLoader } from 'langchain/document_loaders/fs/json'
+import { JSONLinesLoader, JSONLoader } from 'langchain/document_loaders/fs/json'
 import { CSVLoader } from '@langchain/community/document_loaders/fs/csv'
 import { PDFLoader } from '@langchain/community/document_loaders/fs/pdf'
 import { DocxLoader } from '@langchain/community/document_loaders/fs/docx'
@@ -22,7 +22,7 @@ class Folder_DocumentLoaders implements INode {
    constructor() {
        this.label = 'Folder with Files'
        this.name = 'folderFiles'
-        this.version = 2.0
+        this.version = 3.0
        this.type = 'Document'
        this.icon = 'folder.svg'
        this.category = 'Document Loaders'
@@ -51,6 +51,7 @@ class Folder_DocumentLoaders implements INode {
                label: 'Pdf Usage',
                name: 'pdfUsage',
                type: 'options',
+                description: 'Only when loading PDF files',
                options: [
                    {
                        label: 'One document per page',
@@ -65,6 +66,15 @@ class Folder_DocumentLoaders implements INode {
                optional: true,
                additionalParams: true
            },
+            {
+                label: 'JSONL Pointer Extraction',
+                name: 'pointerName',
+                type: 'string',
+                description: 'Only when loading JSONL files',
+                placeholder: '<pointerName>',
+                optional: true,
+                additionalParams: true
+            },
            {
                label: 'Additional Metadata',
                name: 'metadata',
@@ -93,6 +103,7 @@ class Folder_DocumentLoaders implements INode {
        const metadata = nodeData.inputs?.metadata
        const recursive = nodeData.inputs?.recursive as boolean
        const pdfUsage = nodeData.inputs?.pdfUsage
+        const pointerName = nodeData.inputs?.pointerName as string
        const _omitMetadataKeys = nodeData.inputs?.omitMetadataKeys as string

        let omitMetadataKeys: string[] = []
@@ -104,8 +115,12 @@ class Folder_DocumentLoaders implements INode {
            folderPath,
            {
                '.json': (path) => new JSONLoader(path),
+                '.jsonl': (blob) => new JSONLinesLoader(blob, '/' + pointerName.trim()),
                '.txt': (path) => new TextLoader(path),
                '.csv': (path) => new CSVLoader(path),
+                '.xls': (path) => new CSVLoader(path),
+                '.xlsx': (path) => new CSVLoader(path),
+                '.doc': (path) => new DocxLoader(path),
                '.docx': (path) => new DocxLoader(path),
                '.pdf': (path) =>
                    pdfUsage === 'perFile'
@@ -99,6 +99,7 @@ class Json_DocumentLoaders implements INode {
            const chatflowid = options.chatflowid

            for (const file of files) {
+                if (!file) continue
                const fileData = await getFileFromStorage(file, chatflowid)
                const blob = new Blob([fileData])
                const loader = new JSONLoader(blob, pointers.length != 0 ? pointers : undefined)
@@ -119,6 +120,7 @@ class Json_DocumentLoaders implements INode {
            }

            for (const file of files) {
+                if (!file) continue
                const splitDataURI = file.split(',')
                splitDataURI.pop()
                const bf = Buffer.from(splitDataURI.pop() || '', 'base64')
@@ -93,6 +93,7 @@ class Jsonlines_DocumentLoaders implements INode {
            const chatflowid = options.chatflowid

            for (const file of files) {
+                if (!file) continue
                const fileData = await getFileFromStorage(file, chatflowid)
                const blob = new Blob([fileData])
                const loader = new JSONLinesLoader(blob, pointer)
@@ -113,6 +114,7 @@ class Jsonlines_DocumentLoaders implements INode {
            }

            for (const file of files) {
+                if (!file) continue
                const splitDataURI = file.split(',')
                splitDataURI.pop()
                const bf = Buffer.from(splitDataURI.pop() || '', 'base64')
@@ -109,6 +109,7 @@ class Pdf_DocumentLoaders implements INode {
            const chatflowid = options.chatflowid

            for (const file of files) {
+                if (!file) continue
                const fileData = await getFileFromStorage(file, chatflowid)
                const bf = Buffer.from(fileData)
                await this.extractDocs(usage, bf, legacyBuild, textSplitter, docs)
@@ -121,6 +122,7 @@ class Pdf_DocumentLoaders implements INode {
            }

            for (const file of files) {
+                if (!file) continue
                const splitDataURI = file.split(',')
                splitDataURI.pop()
                const bf = Buffer.from(splitDataURI.pop() || '', 'base64')
@@ -101,6 +101,7 @@ class Text_DocumentLoaders implements INode {
            const chatflowid = options.chatflowid

            for (const file of files) {
+                if (!file) continue
                const fileData = await getFileFromStorage(file, chatflowid)
                const blob = new Blob([fileData])
                const loader = new TextLoader(blob)
@@ -121,6 +122,7 @@ class Text_DocumentLoaders implements INode {
            }

            for (const file of files) {
+                if (!file) continue
                const splitDataURI = file.split(',')
                splitDataURI.pop()
                const bf = Buffer.from(splitDataURI.pop() || '', 'base64')
@@ -496,6 +496,7 @@ class UnstructuredFile_DocumentLoaders implements INode {
                const chatflowid = options.chatflowid

                for (const file of files) {
+                    if (!file) continue
                    const fileData = await getFileFromStorage(file, chatflowid)
                    const loaderDocs = await loader.loadAndSplitBuffer(fileData, file)
                    docs.push(...loaderDocs)
@@ -508,6 +509,7 @@ class UnstructuredFile_DocumentLoaders implements INode {
                }

                for (const file of files) {
+                    if (!file) continue
                    const splitDataURI = file.split(',')
                    const filename = splitDataURI.pop()?.split(':')[1] ?? ''
                    const bf = Buffer.from(splitDataURI.pop() || '', 'base64')
				`@@ -0,0 +1 @@`
				<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="icon icon-tabler icons-tabler-outline icon-tabler-files"><path stroke="none" d="M0 0h24v24H0z" fill="none"/><path d="M15 3v4a1 1 0 0 0 1 1h4" /><path d="M18 17h-7a2 2 0 0 1 -2 -2v-10a2 2 0 0 1 2 -2h4l5 5v7a2 2 0 0 1 -2 2z" /><path d="M16 17v2a2 2 0 0 1 -2 2h-7a2 2 0 0 1 -2 -2v-10a2 2 0 0 1 2 -2h2" /></svg>