[Feature] improve CsvLoader & clean code (#3830)

* Improve CSV Loader

* Improve S3 Loaders

---------

Co-authored-by: Henry <hzj94@hotmail.com>
This commit is contained in:
Jérémy JOURDIN
2025-01-14 17:47:04 +01:00
committed by GitHub
parent cc87d85675
commit 24eb437bad
7 changed files with 230 additions and 207 deletions
+69
View File
@@ -7,10 +7,14 @@ import { z } from 'zod'
import { DataSource } from 'typeorm'
import { ICommonObject, IDatabaseEntity, IDocument, IMessage, INodeData, IVariable, MessageContentImageUrl } from './Interface'
import { AES, enc } from 'crypto-js'
import { omit } from 'lodash'
import { AIMessage, HumanMessage, BaseMessage } from '@langchain/core/messages'
import { Document } from '@langchain/core/documents'
import { getFileFromStorage } from './storageUtils'
import { GetSecretValueCommand, SecretsManagerClient, SecretsManagerClientConfig } from '@aws-sdk/client-secrets-manager'
import { customGet } from '../nodes/sequentialagents/commonUtils'
import { TextSplitter } from 'langchain/text_splitter'
import { DocumentLoader } from 'langchain/document_loaders/base'
/** Regex source (a string, not a RegExp) matching a whole value that is either a number with an optional decimal part or a `{{...}}` expression. */
export const numberOrExpressionRegex = '^(\\d+\\.?\\d*|{{.*}})$' // matches when the string consists only of a number OR a {{}} expression
/** Regex source (a string, not a RegExp) matching any value that contains at least one non-whitespace character. */
export const notEmptyRegex = '(.|\\s)*\\S(.|\\s)*' // matches when the string is not empty or blank
@@ -1077,3 +1081,68 @@ export const resolveFlowObjValue = (obj: any, sourceObj: any): any => {
return obj
}
}
/**
 * Shapes the result of a document loader according to the requested output mode.
 *
 * @param docs - Documents produced by the loader.
 * @param output - Output mode; `'document'` returns the documents untouched, any other value returns text.
 * @returns The original `docs` array, or the page contents concatenated (each one followed by a
 * newline) and passed through `handleEscapeCharacters(text, false)`.
 */
export const handleDocumentLoaderOutput = (docs: Document[], output: string) => {
    if (output === 'document') return docs

    // Every page content is terminated by a newline, including the last one.
    const joinedText = docs.map((doc) => `${doc.pageContent}\n`).join('')
    return handleEscapeCharacters(joinedText, false)
}
/**
 * Normalizes loader metadata supplied either as an object or as a JSON string.
 *
 * @param metadata - Metadata object, or a string holding its JSON representation.
 * @returns The given object unchanged, or the parsed JSON value. Falsy or whitespace-only input,
 * and JSON that does not describe an object (`null`, numbers, strings, booleans), yield `{}`.
 * @throws SyntaxError when a non-blank string is not valid JSON.
 */
export const parseDocumentLoaderMetadata = (metadata: object | string): object => {
    if (!metadata) return {}
    if (typeof metadata === 'object') return metadata

    // A blank string carries no metadata; JSON.parse would reject it with a SyntaxError.
    if (typeof metadata === 'string' && metadata.trim() === '') return {}

    const parsed: unknown = JSON.parse(metadata)
    // Honor the declared return type: JSON such as `null` or `42` is not usable as metadata.
    return typeof parsed === 'object' && parsed !== null ? parsed : {}
}
/**
 * Rebuilds the metadata of every document by merging in additional metadata and removing unwanted keys.
 *
 * @param docs - Documents whose metadata should be rewritten; new objects are returned and the
 * input documents are not mutated.
 * @param _omitMetadataKeys - Comma-separated list of metadata keys to drop (each key is trimmed).
 * The exact value `'*'` discards the documents' own metadata so only the additional `metadata` is kept.
 * @param metadata - Additional metadata as an object or a JSON string (parsed with `parseDocumentLoaderMetadata`).
 * @param sourceIdKey - Optional key that is always set before omission: the document's own truthy
 * value for that key is kept, otherwise the key name itself is used as the value.
 * @returns New document objects carrying the rewritten metadata.
 */
export const handleDocumentLoaderMetadata = (
    docs: Document[],
    _omitMetadataKeys: string,
    metadata: object | string = {},
    sourceIdKey?: string
) => {
    const omitMetadataKeys: string[] = _omitMetadataKeys ? _omitMetadataKeys.split(',').map((key) => key.trim()) : []
    const additionalMetadata = parseDocumentLoaderMetadata(metadata)

    return docs.map((doc) => ({
        ...doc,
        metadata:
            _omitMetadataKeys === '*'
                ? // Copy per document so that documents never share one mutable metadata object.
                  { ...additionalMetadata }
                : omit(
                      {
                          ...additionalMetadata,
                          // The document's own metadata wins over the additional metadata on key clashes.
                          ...doc.metadata,
                          ...(sourceIdKey ? { [sourceIdKey]: doc.metadata[sourceIdKey] || sourceIdKey } : undefined)
                      },
                      omitMetadataKeys
                  )
    }))
}
/**
 * Loads documents from a loader and optionally splits them.
 *
 * @param loader - Document loader whose `load()` result is used.
 * @param textSplitter - Optional splitter; when provided, the loaded documents are passed to `splitDocuments`.
 * @returns The loaded documents, or the split documents when a splitter is supplied.
 */
export const handleDocumentLoaderDocuments = async (loader: DocumentLoader, textSplitter?: TextSplitter) => {
    const loadedDocs: Document[] = await loader.load()
    if (!textSplitter) return loadedDocs

    const chunks: Document[] = await textSplitter.splitDocuments(loadedDocs)
    return chunks
}