mirror of
https://github.com/farcasclaudiu/Flowise.git
synced 2026-06-28 19:00:59 +03:00
[Feature] improve CsvLoader & clean code (#3830)
* Improve CSV Loader
* Improve S3 Loaders

---------

Co-authored-by: Henry <hzj94@hotmail.com>
This commit is contained in:
@@ -1,6 +1,11 @@
|
||||
import { omit } from 'lodash'
|
||||
import { ICommonObject, INode, INodeData, INodeOptionsValue, INodeOutputsValue, INodeParams } from '../../../src/Interface'
|
||||
import { getCredentialData, getCredentialParam, handleEscapeCharacters } from '../../../src/utils'
|
||||
import {
|
||||
getCredentialData,
|
||||
getCredentialParam,
|
||||
handleDocumentLoaderDocuments,
|
||||
handleDocumentLoaderMetadata,
|
||||
handleDocumentLoaderOutput
|
||||
} from '../../../src/utils'
|
||||
import { S3Client, GetObjectCommand, S3ClientConfig, ListObjectsV2Command, ListObjectsV2Output } from '@aws-sdk/client-s3'
|
||||
import { getRegions, MODEL_TYPE } from '../../../src/modelLoader'
|
||||
import { Readable } from 'node:stream'
|
||||
@@ -10,12 +15,13 @@ import * as os from 'node:os'
|
||||
|
||||
import { DirectoryLoader } from 'langchain/document_loaders/fs/directory'
|
||||
import { JSONLoader } from 'langchain/document_loaders/fs/json'
|
||||
import { CSVLoader } from '@langchain/community/document_loaders/fs/csv'
|
||||
import { PDFLoader } from '@langchain/community/document_loaders/fs/pdf'
|
||||
import { DocxLoader } from '@langchain/community/document_loaders/fs/docx'
|
||||
import { TextLoader } from 'langchain/document_loaders/fs/text'
|
||||
import { TextSplitter } from 'langchain/text_splitter'
|
||||
|
||||
import { CSVLoader } from '../Csv/CsvLoader'
|
||||
|
||||
class S3_DocumentLoaders implements INode {
|
||||
label: string
|
||||
name: string
|
||||
@@ -151,11 +157,6 @@ class S3_DocumentLoaders implements INode {
|
||||
const _omitMetadataKeys = nodeData.inputs?.omitMetadataKeys as string
|
||||
const output = nodeData.outputs?.output as string
|
||||
|
||||
let omitMetadataKeys: string[] = []
|
||||
if (_omitMetadataKeys) {
|
||||
omitMetadataKeys = _omitMetadataKeys.split(',').map((key) => key.trim())
|
||||
}
|
||||
|
||||
let credentials: S3ClientConfig['credentials'] | undefined
|
||||
|
||||
if (nodeData.credential) {
|
||||
@@ -241,11 +242,11 @@ class S3_DocumentLoaders implements INode {
|
||||
'.csv': (path) => new CSVLoader(path),
|
||||
'.docx': (path) => new DocxLoader(path),
|
||||
'.pdf': (path) =>
|
||||
pdfUsage === 'perFile'
|
||||
? // @ts-ignore
|
||||
new PDFLoader(path, { splitPages: false, pdfjs: () => import('pdf-parse/lib/pdf.js/v1.10.100/build/pdf.js') })
|
||||
: // @ts-ignore
|
||||
new PDFLoader(path, { pdfjs: () => import('pdf-parse/lib/pdf.js/v1.10.100/build/pdf.js') }),
|
||||
new PDFLoader(path, {
|
||||
splitPages: pdfUsage !== 'perFile',
|
||||
// @ts-ignore
|
||||
pdfjs: () => import('pdf-parse/lib/pdf.js/v1.10.100/build/pdf.js')
|
||||
}),
|
||||
'.aspx': (path) => new TextLoader(path),
|
||||
'.asp': (path) => new TextLoader(path),
|
||||
'.cpp': (path) => new TextLoader(path), // C++
|
||||
@@ -284,63 +285,16 @@ class S3_DocumentLoaders implements INode {
|
||||
true
|
||||
)
|
||||
|
||||
let docs = []
|
||||
let docs = await handleDocumentLoaderDocuments(loader, textSplitter)
|
||||
|
||||
if (textSplitter) {
|
||||
let splittedDocs = await loader.load()
|
||||
splittedDocs = await textSplitter.splitDocuments(splittedDocs)
|
||||
docs.push(...splittedDocs)
|
||||
} else {
|
||||
docs = await loader.load()
|
||||
}
|
||||
|
||||
if (metadata) {
|
||||
const parsedMetadata = typeof metadata === 'object' ? metadata : JSON.parse(metadata)
|
||||
docs = docs.map((doc) => ({
|
||||
...doc,
|
||||
metadata:
|
||||
_omitMetadataKeys === '*'
|
||||
? {
|
||||
...parsedMetadata
|
||||
}
|
||||
: omit(
|
||||
{
|
||||
...doc.metadata,
|
||||
...parsedMetadata
|
||||
},
|
||||
omitMetadataKeys
|
||||
)
|
||||
}))
|
||||
} else {
|
||||
docs = docs.map((doc) => ({
|
||||
...doc,
|
||||
metadata:
|
||||
_omitMetadataKeys === '*'
|
||||
? {}
|
||||
: omit(
|
||||
{
|
||||
...doc.metadata
|
||||
},
|
||||
omitMetadataKeys
|
||||
)
|
||||
}))
|
||||
}
|
||||
docs = handleDocumentLoaderMetadata(docs, _omitMetadataKeys, metadata)
|
||||
|
||||
return handleDocumentLoaderOutput(docs, output)
|
||||
} catch (e: any) {
|
||||
throw new Error(`Failed to load data from bucket ${bucketName}: ${e.message}`)
|
||||
} finally {
|
||||
// remove the temp directory before returning docs
|
||||
fsDefault.rmSync(tempDir, { recursive: true })
|
||||
|
||||
if (output === 'document') {
|
||||
return docs
|
||||
} else {
|
||||
let finaltext = ''
|
||||
for (const doc of docs) {
|
||||
finaltext += `${doc.pageContent}\n`
|
||||
}
|
||||
return handleEscapeCharacters(finaltext, false)
|
||||
}
|
||||
} catch (e: any) {
|
||||
fsDefault.rmSync(tempDir, { recursive: true })
|
||||
throw new Error(`Failed to load data from bucket ${bucketName}: ${e.message}`)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user