[Feature] improve CsvLoader & clean code (#3830)

* Improve CSV Loader

* Improve S3 Loaders

---------

Co-authored-by: Henry <hzj94@hotmail.com>
This commit is contained in:
Jérémy JOURDIN
2025-01-14 17:47:04 +01:00
committed by GitHub
parent cc87d85675
commit 24eb437bad
7 changed files with 230 additions and 207 deletions
@@ -1,6 +1,11 @@
import { omit } from 'lodash'
import { ICommonObject, INode, INodeData, INodeOptionsValue, INodeOutputsValue, INodeParams } from '../../../src/Interface'
import { getCredentialData, getCredentialParam, handleEscapeCharacters } from '../../../src/utils'
import {
getCredentialData,
getCredentialParam,
handleDocumentLoaderDocuments,
handleDocumentLoaderMetadata,
handleDocumentLoaderOutput
} from '../../../src/utils'
import { S3Client, GetObjectCommand, S3ClientConfig, ListObjectsV2Command, ListObjectsV2Output } from '@aws-sdk/client-s3'
import { getRegions, MODEL_TYPE } from '../../../src/modelLoader'
import { Readable } from 'node:stream'
@@ -10,12 +15,13 @@ import * as os from 'node:os'
import { DirectoryLoader } from 'langchain/document_loaders/fs/directory'
import { JSONLoader } from 'langchain/document_loaders/fs/json'
import { CSVLoader } from '@langchain/community/document_loaders/fs/csv'
import { PDFLoader } from '@langchain/community/document_loaders/fs/pdf'
import { DocxLoader } from '@langchain/community/document_loaders/fs/docx'
import { TextLoader } from 'langchain/document_loaders/fs/text'
import { TextSplitter } from 'langchain/text_splitter'
import { CSVLoader } from '../Csv/CsvLoader'
class S3_DocumentLoaders implements INode {
label: string
name: string
@@ -151,11 +157,6 @@ class S3_DocumentLoaders implements INode {
const _omitMetadataKeys = nodeData.inputs?.omitMetadataKeys as string
const output = nodeData.outputs?.output as string
let omitMetadataKeys: string[] = []
if (_omitMetadataKeys) {
omitMetadataKeys = _omitMetadataKeys.split(',').map((key) => key.trim())
}
let credentials: S3ClientConfig['credentials'] | undefined
if (nodeData.credential) {
@@ -241,11 +242,11 @@ class S3_DocumentLoaders implements INode {
'.csv': (path) => new CSVLoader(path),
'.docx': (path) => new DocxLoader(path),
'.pdf': (path) =>
pdfUsage === 'perFile'
? // @ts-ignore
new PDFLoader(path, { splitPages: false, pdfjs: () => import('pdf-parse/lib/pdf.js/v1.10.100/build/pdf.js') })
: // @ts-ignore
new PDFLoader(path, { pdfjs: () => import('pdf-parse/lib/pdf.js/v1.10.100/build/pdf.js') }),
new PDFLoader(path, {
splitPages: pdfUsage !== 'perFile',
// @ts-ignore
pdfjs: () => import('pdf-parse/lib/pdf.js/v1.10.100/build/pdf.js')
}),
'.aspx': (path) => new TextLoader(path),
'.asp': (path) => new TextLoader(path),
'.cpp': (path) => new TextLoader(path), // C++
@@ -284,63 +285,16 @@ class S3_DocumentLoaders implements INode {
true
)
let docs = []
let docs = await handleDocumentLoaderDocuments(loader, textSplitter)
if (textSplitter) {
let splittedDocs = await loader.load()
splittedDocs = await textSplitter.splitDocuments(splittedDocs)
docs.push(...splittedDocs)
} else {
docs = await loader.load()
}
if (metadata) {
const parsedMetadata = typeof metadata === 'object' ? metadata : JSON.parse(metadata)
docs = docs.map((doc) => ({
...doc,
metadata:
_omitMetadataKeys === '*'
? {
...parsedMetadata
}
: omit(
{
...doc.metadata,
...parsedMetadata
},
omitMetadataKeys
)
}))
} else {
docs = docs.map((doc) => ({
...doc,
metadata:
_omitMetadataKeys === '*'
? {}
: omit(
{
...doc.metadata
},
omitMetadataKeys
)
}))
}
docs = handleDocumentLoaderMetadata(docs, _omitMetadataKeys, metadata)
return handleDocumentLoaderOutput(docs, output)
} catch (e: any) {
throw new Error(`Failed to load data from bucket ${bucketName}: ${e.message}`)
} finally {
// remove the temp directory before returning docs
fsDefault.rmSync(tempDir, { recursive: true })
if (output === 'document') {
return docs
} else {
let finaltext = ''
for (const doc of docs) {
finaltext += `${doc.pageContent}\n`
}
return handleEscapeCharacters(finaltext, false)
}
} catch (e: any) {
fsDefault.rmSync(tempDir, { recursive: true })
throw new Error(`Failed to load data from bucket ${bucketName}: ${e.message}`)
}
}
}