Feature/Add teams, gmail, outlook tools (#4577)

* add teams, gmail, outlook tools

* update docs link

* update credentials for oauth2

* add jira tool

* add google drive, google calendar, google sheets tools, powerpoint, excel, word doc loader

* update jira logo

* Refactor Gmail and Outlook tools to remove maxOutputLength parameter and enhance request handling. Update response formatting to include parameters in the output. Adjust Google Drive tools to simplify success messages by removing unnecessary parameter details.
This commit is contained in:
Henry Heng
2025-06-06 19:52:04 +01:00
committed by GitHub
parent 6dcb65cedb
commit 30c4180d97
62 changed files with 16832 additions and 144 deletions
@@ -7,6 +7,8 @@ import { CSVLoader } from '@langchain/community/document_loaders/fs/csv'
import { PDFLoader } from '@langchain/community/document_loaders/fs/pdf'
import { DocxLoader } from '@langchain/community/document_loaders/fs/docx'
import { BaseDocumentLoader } from 'langchain/document_loaders/base'
import { LoadOfSheet } from '../MicrosoftExcel/ExcelLoader'
import { PowerpointLoader } from '../MicrosoftPowerpoint/PowerpointLoader'
import { Document } from '@langchain/core/documents'
import { getFileFromStorage } from '../../../src/storageUtils'
import { handleEscapeCharacters, mapMimeTypeToExt } from '../../../src/utils'
@@ -213,10 +215,14 @@ class File_DocumentLoaders implements INode {
jsonl: (blob) => new JSONLinesLoader(blob, '/' + pointerName.trim()),
txt: (blob) => new TextLoader(blob),
csv: (blob) => new CSVLoader(blob),
xls: (blob) => new CSVLoader(blob),
xlsx: (blob) => new CSVLoader(blob),
xls: (blob) => new LoadOfSheet(blob),
xlsx: (blob) => new LoadOfSheet(blob),
xlsm: (blob) => new LoadOfSheet(blob),
xlsb: (blob) => new LoadOfSheet(blob),
docx: (blob) => new DocxLoader(blob),
doc: (blob) => new DocxLoader(blob),
ppt: (blob) => new PowerpointLoader(blob),
pptx: (blob) => new PowerpointLoader(blob),
pdf: (blob) =>
pdfUsage === 'perFile'
? // @ts-ignore
@@ -7,6 +7,8 @@ import { JSONLinesLoader, JSONLoader } from 'langchain/document_loaders/fs/json'
import { CSVLoader } from '@langchain/community/document_loaders/fs/csv'
import { PDFLoader } from '@langchain/community/document_loaders/fs/pdf'
import { DocxLoader } from '@langchain/community/document_loaders/fs/docx'
import { LoadOfSheet } from '../MicrosoftExcel/ExcelLoader'
import { PowerpointLoader } from '../MicrosoftPowerpoint/PowerpointLoader'
import { handleEscapeCharacters } from '../../../src/utils'
class Folder_DocumentLoaders implements INode {
@@ -135,10 +137,14 @@ class Folder_DocumentLoaders implements INode {
'.jsonl': (blob) => new JSONLinesLoader(blob, '/' + pointerName.trim()),
'.txt': (path) => new TextLoader(path),
'.csv': (path) => new CSVLoader(path),
'.xls': (path) => new CSVLoader(path),
'.xlsx': (path) => new CSVLoader(path),
'.xls': (path) => new LoadOfSheet(path),
'.xlsx': (path) => new LoadOfSheet(path),
'.xlsm': (path) => new LoadOfSheet(path),
'.xlsb': (path) => new LoadOfSheet(path),
'.doc': (path) => new DocxLoader(path),
'.docx': (path) => new DocxLoader(path),
'.ppt': (path) => new PowerpointLoader(path),
'.pptx': (path) => new PowerpointLoader(path),
'.pdf': (path) =>
pdfUsage === 'perFile'
? // @ts-ignore
@@ -0,0 +1,828 @@
import { omit } from 'lodash'
import { ICommonObject, IDocument, INode, INodeData, INodeParams, INodeOptionsValue } from '../../../src/Interface'
import { TextSplitter } from 'langchain/text_splitter'
import {
convertMultiOptionsToStringArray,
getCredentialData,
getCredentialParam,
handleEscapeCharacters,
INodeOutputsValue,
refreshOAuth2Token
} from '../../../src'
import { PDFLoader } from '@langchain/community/document_loaders/fs/pdf'
import { DocxLoader } from '@langchain/community/document_loaders/fs/docx'
import { CSVLoader } from '@langchain/community/document_loaders/fs/csv'
import * as fs from 'fs'
import * as path from 'path'
import * as os from 'os'
import { LoadOfSheet } from '../MicrosoftExcel/ExcelLoader'
import { PowerpointLoader } from '../MicrosoftPowerpoint/PowerpointLoader'
/** Human-readable labels for the Drive MIME types that can be shown in the file picker. */
const MIME_TYPE_LABELS = new Map<string, string>([
    ['application/vnd.google-apps.document', 'Google Doc'],
    ['application/vnd.google-apps.spreadsheet', 'Google Sheet'],
    ['application/vnd.google-apps.presentation', 'Google Slides'],
    ['application/pdf', 'PDF'],
    ['text/plain', 'Text File'],
    ['application/vnd.openxmlformats-officedocument.wordprocessingml.document', 'Word Doc'],
    ['application/vnd.openxmlformats-officedocument.presentationml.presentation', 'PowerPoint'],
    ['application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', 'Excel File']
])

/**
 * Looks up the human-readable label for a MIME type.
 *
 * A Map is used instead of a plain object so that strings such as `toString` or
 * `constructor` cannot resolve to members inherited from `Object.prototype`.
 *
 * @param mimeType MIME type string to translate.
 * @returns The label, or `undefined` when the MIME type is not in the table.
 */
const getMimeTypeLabel = (mimeType: string): string | undefined => {
    return MIME_TYPE_LABELS.get(mimeType)
}
class GoogleDrive_DocumentLoaders implements INode {
label: string
name: string
version: number
description: string
type: string
icon: string
category: string
baseClasses: string[]
credential: INodeParams
inputs: INodeParams[]
outputs: INodeOutputsValue[]
/**
 * Declares the node's identity, its OAuth2 credential, the configurable inputs and the two outputs.
 *
 * "Select Files" is marked optional: `init` accepts either selected files or a Folder ID and
 * throws only when both are missing, so neither input may be mandatory on its own.
 */
constructor() {
    this.label = 'Google Drive'
    this.name = 'googleDrive'
    this.version = 1.0
    this.type = 'Document'
    this.icon = 'google-drive.svg'
    this.category = 'Document Loaders'
    this.description = `Load documents from Google Drive files`
    this.baseClasses = [this.type]
    this.credential = {
        label: 'Connect Credential',
        name: 'credential',
        type: 'credential',
        description: 'Google Drive OAuth2 Credential',
        credentialNames: ['googleDriveOAuth2']
    }
    this.inputs = [
        {
            label: 'Select Files',
            name: 'selectedFiles',
            type: 'asyncMultiOptions',
            loadMethod: 'listFiles',
            description: 'Select files from your Google Drive',
            refresh: true,
            // Folder ID is a documented alternative, so this field must not be mandatory.
            optional: true
        },
        {
            label: 'Folder ID',
            name: 'folderId',
            type: 'string',
            description: 'Google Drive folder ID to load all files from (alternative to selecting specific files)',
            placeholder: '1BxiMVs0XRA5nFMdKvBdBZjgmUUqptlbs74OgvE2upms',
            optional: true
        },
        {
            label: 'File Types',
            name: 'fileTypes',
            type: 'multiOptions',
            description: 'Types of files to load',
            options: [
                {
                    label: 'Google Docs',
                    name: 'application/vnd.google-apps.document'
                },
                {
                    label: 'Google Sheets',
                    name: 'application/vnd.google-apps.spreadsheet'
                },
                {
                    label: 'Google Slides',
                    name: 'application/vnd.google-apps.presentation'
                },
                {
                    label: 'PDF Files',
                    name: 'application/pdf'
                },
                {
                    label: 'Text Files',
                    name: 'text/plain'
                },
                {
                    label: 'Word Documents',
                    name: 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
                },
                {
                    label: 'PowerPoint',
                    name: 'application/vnd.openxmlformats-officedocument.presentationml.presentation'
                },
                {
                    label: 'Excel Files',
                    name: 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
                }
            ],
            // Every selectable type is enabled by default.
            default: [
                'application/vnd.google-apps.document',
                'application/vnd.google-apps.spreadsheet',
                'application/vnd.google-apps.presentation',
                'text/plain',
                'application/pdf',
                'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
                'application/vnd.openxmlformats-officedocument.presentationml.presentation',
                'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
            ],
            optional: true
        },
        {
            label: 'Include Subfolders',
            name: 'includeSubfolders',
            type: 'boolean',
            description: 'Whether to include files from subfolders when loading from a folder',
            default: false,
            optional: true
        },
        {
            label: 'Include Shared Drives',
            name: 'includeSharedDrives',
            type: 'boolean',
            description: 'Whether to include files from shared drives (Team Drives) that you have access to',
            default: false,
            optional: true
        },
        {
            label: 'Max Files',
            name: 'maxFiles',
            type: 'number',
            description: 'Maximum number of files to load (default: 50)',
            default: 50,
            optional: true
        },
        {
            label: 'Text Splitter',
            name: 'textSplitter',
            type: 'TextSplitter',
            optional: true
        },
        {
            label: 'Additional Metadata',
            name: 'metadata',
            type: 'json',
            description: 'Additional metadata to be added to the extracted documents',
            optional: true,
            additionalParams: true
        },
        {
            label: 'Omit Metadata Keys',
            name: 'omitMetadataKeys',
            type: 'string',
            rows: 4,
            description:
                'Each document loader comes with a default set of metadata keys that are extracted from the document. You can use this field to omit some of the default metadata keys. The value should be a list of keys, seperated by comma. Use * to omit all metadata keys execept the ones you specify in the Additional Metadata field',
            placeholder: 'key1, key2, key3.nestedKey1',
            optional: true,
            additionalParams: true
        }
    ]
    this.outputs = [
        {
            label: 'Document',
            name: 'document',
            description: 'Array of document objects containing metadata and pageContent',
            baseClasses: [...this.baseClasses, 'json']
        },
        {
            label: 'Text',
            name: 'text',
            description: 'Concatenated string from pageContent of documents',
            baseClasses: ['string', 'json']
        }
    ]
}
//@ts-ignore
loadMethods = {
    /**
     * Lists Drive files for the "Select Files" picker, most recently modified first.
     *
     * Failures are logged and yield an empty (or partial) list rather than throwing.
     *
     * @param nodeData Node configuration; reads `credential` plus the `fileTypes`,
     *                 `includeSharedDrives` and `maxFiles` inputs.
     * @param options  Passed through to the credential helpers.
     * @returns One option per file whose MIME type has a label in `getMimeTypeLabel`.
     */
    async listFiles(nodeData: INodeData, options: ICommonObject): Promise<INodeOptionsValue[]> {
        const returnData: INodeOptionsValue[] = []

        try {
            let credentialData = await getCredentialData(nodeData.credential ?? '', options)
            credentialData = await refreshOAuth2Token(nodeData.credential ?? '', credentialData, options)
            const accessToken = getCredentialParam('access_token', credentialData, nodeData)

            if (!accessToken) {
                return returnData
            }

            // Get file types from input to filter
            const fileTypes = convertMultiOptionsToStringArray(nodeData.inputs?.fileTypes)
            const includeSharedDrives = nodeData.inputs?.includeSharedDrives as boolean

            // The input may arrive as a string, be empty, or be out of range. Drive only accepts a
            // pageSize between 1 and 1000, so anything unusable falls back to the default of 100.
            const requestedMax = Number(nodeData.inputs?.maxFiles)
            const maxFiles = Number.isFinite(requestedMax) && requestedMax >= 1 ? Math.floor(requestedMax) : 100

            // Backslashes and single quotes must be escaped inside a Drive query string literal.
            const escapeQueryValue = (value: string): string => String(value).replace(/\\/g, '\\\\').replace(/'/g, "\\'")

            let query = 'trashed = false'

            // Add file type filter if specified
            if (Array.isArray(fileTypes) && fileTypes.length > 0) {
                const mimeTypeQuery = fileTypes.map((type) => `mimeType='${escapeQueryValue(type)}'`).join(' or ')
                query += ` and (${mimeTypeQuery})`
            }

            const url = new URL('https://www.googleapis.com/drive/v3/files')
            url.searchParams.append('q', query)
            url.searchParams.append('pageSize', Math.min(maxFiles, 1000).toString())
            url.searchParams.append('fields', 'files(id, name, mimeType, size, createdTime, modifiedTime, webViewLink, driveId)')
            url.searchParams.append('orderBy', 'modifiedTime desc')

            // Add shared drives support if requested
            if (includeSharedDrives) {
                url.searchParams.append('supportsAllDrives', 'true')
                url.searchParams.append('includeItemsFromAllDrives', 'true')
            }

            const response = await fetch(url.toString(), {
                headers: {
                    Authorization: `Bearer ${accessToken}`,
                    'Content-Type': 'application/json'
                }
            })

            if (!response.ok) {
                console.error(`Failed to list files: ${response.statusText}`)
                return returnData
            }

            const data = await response.json()
            // A response without a `files` array is treated as "no files" instead of an error.
            const files: any[] = Array.isArray(data?.files) ? data.files : []

            for (const file of files) {
                // Files without a known label cannot be offered in the picker.
                const mimeTypeLabel = getMimeTypeLabel(file.mimeType)
                if (!mimeTypeLabel) {
                    continue
                }

                // Add drive context to description
                const driveContext = file.driveId ? ' (Shared Drive)' : ' (My Drive)'

                const obj: INodeOptionsValue = {
                    name: file.id,
                    label: file.name,
                    description: `Type: ${mimeTypeLabel}${driveContext} | Modified: ${new Date(file.modifiedTime).toLocaleDateString()}`
                }
                returnData.push(obj)
            }
        } catch (error) {
            console.error('Error listing Google Drive files:', error)
        }

        return returnData
    }
}
/**
 * Loads the selected files, or the files of a folder, from Google Drive and returns them as
 * documents or as one concatenated text string.
 *
 * @param nodeData Node configuration (inputs, outputs, credential).
 * @param _        Unused.
 * @param options  Passed through to the credential helpers.
 * @returns The documents when the `document` output is selected, otherwise the page contents
 *          joined by newlines and passed through `handleEscapeCharacters`.
 * @throws Error when neither files nor a folder ID are given, when the credential has no
 *         access token, or when listing/loading fails.
 */
async init(nodeData: INodeData, _: string, options: ICommonObject): Promise<any> {
    const selectedFiles = nodeData.inputs?.selectedFiles as string | string[] | undefined
    const folderId = nodeData.inputs?.folderId as string
    const rawFileTypes = nodeData.inputs?.fileTypes as string | string[] | undefined
    const includeSubfolders = nodeData.inputs?.includeSubfolders as boolean
    const includeSharedDrives = nodeData.inputs?.includeSharedDrives as boolean
    const textSplitter = nodeData.inputs?.textSplitter as TextSplitter
    const metadata = nodeData.inputs?.metadata
    const _omitMetadataKeys = nodeData.inputs?.omitMetadataKeys as string
    const output = nodeData.outputs?.output as string

    // The value may arrive as a string; anything that is not a positive number uses the default.
    const requestedMax = Number(nodeData.inputs?.maxFiles)
    const maxFiles = Number.isFinite(requestedMax) && requestedMax > 0 ? Math.floor(requestedMax) : 50

    // `listFiles` decodes this input with convertMultiOptionsToStringArray, i.e. it can be a
    // JSON-encoded array string. Normalize it here as well, so the folder query can call array
    // methods on it and the type check is an exact match rather than a substring match.
    let fileTypes: string[] = []
    if (Array.isArray(rawFileTypes)) {
        fileTypes = rawFileTypes
    } else if (typeof rawFileTypes === 'string' && rawFileTypes.startsWith('[') && rawFileTypes.endsWith(']')) {
        fileTypes = convertMultiOptionsToStringArray(rawFileTypes)
    } else if (typeof rawFileTypes === 'string' && rawFileTypes) {
        fileTypes = [rawFileTypes]
    }

    let omitMetadataKeys: string[] = []
    if (_omitMetadataKeys) {
        omitMetadataKeys = _omitMetadataKeys.split(',').map((key) => key.trim())
    }

    if (!selectedFiles && !folderId) {
        throw new Error('Either selected files or Folder ID is required')
    }

    // selectedFiles can be a real array, a JSON-encoded array string, or a single file ID.
    let selectedIds: string[] = []
    if (Array.isArray(selectedFiles)) {
        selectedIds = selectedFiles
    } else if (typeof selectedFiles === 'string' && selectedFiles.startsWith('[') && selectedFiles.endsWith(']')) {
        selectedIds = convertMultiOptionsToStringArray(selectedFiles)
    } else if (typeof selectedFiles === 'string' && selectedFiles) {
        selectedIds = [selectedFiles]
    }

    let credentialData = await getCredentialData(nodeData.credential ?? '', options)
    credentialData = await refreshOAuth2Token(nodeData.credential ?? '', credentialData, options)
    const accessToken = getCredentialParam('access_token', credentialData, nodeData)

    if (!accessToken) {
        throw new Error('No access token found in credential')
    }

    let docs: IDocument[] = []

    try {
        let filesToProcess: any[] = []

        if (selectedIds.length > 0) {
            // Load the explicitly selected files
            for (const id of selectedIds) {
                const fileInfo = await this.getFileInfo(id, accessToken, includeSharedDrives)
                if (fileInfo && this.shouldProcessFile(fileInfo, fileTypes)) {
                    filesToProcess.push(fileInfo)
                }
            }
        } else if (folderId) {
            // Load files from folder. This is also reached when the selection is an empty
            // list ('[]'), which previously shadowed the folder ID.
            filesToProcess = await this.getFilesFromFolder(
                folderId,
                accessToken,
                fileTypes,
                includeSubfolders,
                includeSharedDrives,
                maxFiles
            )
        }

        // Process each file; one failing file must not abort the others
        for (const fileInfo of filesToProcess) {
            try {
                const doc = await this.processFile(fileInfo, accessToken)
                if (doc.length > 0) {
                    docs.push(...doc)
                }
            } catch (error) {
                const reason = error instanceof Error ? error.message : String(error)
                console.warn(`Failed to process file ${fileInfo.name}: ${reason}`)
            }
        }

        // Apply text splitter if provided
        if (textSplitter && docs.length > 0) {
            docs = await textSplitter.splitDocuments(docs)
        }

        // Apply metadata transformations: '*' keeps only the additional metadata, otherwise the
        // listed keys are removed from the merged metadata.
        const additionalMetadata = metadata ? (typeof metadata === 'object' ? metadata : JSON.parse(metadata)) : {}
        docs = docs.map((doc) => ({
            ...doc,
            metadata:
                _omitMetadataKeys === '*'
                    ? { ...additionalMetadata }
                    : omit(
                          {
                              ...doc.metadata,
                              ...additionalMetadata
                          },
                          omitMetadataKeys
                      )
        }))
    } catch (error) {
        const reason = error instanceof Error ? error.message : String(error)
        throw new Error(`Failed to load Google Drive documents: ${reason}`)
    }

    if (output === 'document') {
        return docs
    } else {
        let finaltext = ''
        for (const doc of docs) {
            finaltext += `${doc.pageContent}\n`
        }
        return handleEscapeCharacters(finaltext, false)
    }
}
/**
 * Fetches the metadata of one Drive file and tags it with a drive-context suffix.
 *
 * @param fileId              ID of the file to look up.
 * @param accessToken         OAuth2 bearer token.
 * @param includeSharedDrives When true, `supportsAllDrives=true` is sent with the request.
 * @returns The response JSON plus a `driveContext` property.
 * @throws Error when the response status is not OK.
 */
private async getFileInfo(fileId: string, accessToken: string, includeSharedDrives: boolean): Promise<any> {
    const endpoint = new URL(`https://www.googleapis.com/drive/v3/files/${encodeURIComponent(fileId)}`)
    const params = endpoint.searchParams
    params.append('fields', 'id, name, mimeType, size, createdTime, modifiedTime, parents, webViewLink, driveId')
    if (includeSharedDrives) {
        // Opt in to shared drive items
        params.append('supportsAllDrives', 'true')
    }

    const requestHeaders = {
        Authorization: `Bearer ${accessToken}`,
        'Content-Type': 'application/json'
    }
    const response = await fetch(endpoint.toString(), { headers: requestHeaders })
    if (!response.ok) {
        throw new Error(`Failed to get file info: ${response.statusText}`)
    }

    const details = await response.json()
    // A driveId on the file marks it as a shared-drive item
    const driveContext = details.driveId ? ' (Shared Drive)' : ' (My Drive)'
    return { ...details, driveContext }
}
/**
 * Lists the files directly inside a Drive folder and, optionally, inside its subfolders.
 *
 * Folder entries are never returned as files; they are only used for recursion. Within each
 * result page the direct files are collected first and subfolders are visited afterwards, as
 * long as the `maxFiles` budget is not exhausted.
 *
 * @param folderId            ID of the folder to list.
 * @param accessToken         OAuth2 bearer token.
 * @param fileTypes           MIME types to keep; empty or undefined means no filter.
 * @param includeSubfolders   Recurse into subfolders when true.
 * @param includeSharedDrives Send the shared-drive flags with the listing request when true.
 * @param maxFiles            Upper bound on the number of files returned.
 * @returns At most `maxFiles` file objects, each with an added `driveContext` property.
 * @throws Error when a listing request does not return an OK status.
 */
private async getFilesFromFolder(
    folderId: string,
    accessToken: string,
    fileTypes: string[] | undefined,
    includeSubfolders: boolean,
    includeSharedDrives: boolean,
    maxFiles: number
): Promise<any[]> {
    const FOLDER_MIME_TYPE = 'application/vnd.google-apps.folder'
    const files: any[] = []

    // Nothing left in the budget (0, negative or NaN): do not send a request with pageSize <= 0,
    // which the API rejects.
    const limit = Number(maxFiles)
    if (!(limit > 0)) {
        return files
    }

    // Backslashes and single quotes must be escaped inside a Drive query string literal.
    const escapeQueryValue = (value: string): string => String(value).replace(/\\/g, '\\\\').replace(/'/g, "\\'")

    // NOTE(review): callers are typed to pass an array; any other runtime value is treated as
    // "no filter" here rather than crashing on `.map`.
    const typeFilter: string[] = Array.isArray(fileTypes) ? fileTypes : []

    let query = `'${escapeQueryValue(folderId)}' in parents and trashed = false`

    // Add file type filter if specified. When recursing, folders must pass the filter too,
    // otherwise subfolders never appear in the listing and cannot be visited.
    if (typeFilter.length > 0) {
        const queryTypes = includeSubfolders ? [...typeFilter, FOLDER_MIME_TYPE] : typeFilter
        const mimeTypeQuery = queryTypes.map((type) => `mimeType='${escapeQueryValue(type)}'`).join(' or ')
        query += ` and (${mimeTypeQuery})`
    }

    let nextPageToken: string | undefined

    do {
        const url = new URL('https://www.googleapis.com/drive/v3/files')
        url.searchParams.append('q', query)
        // files.length < limit holds here, so pageSize is always at least 1
        url.searchParams.append('pageSize', Math.min(limit - files.length, 1000).toString())
        url.searchParams.append(
            'fields',
            'nextPageToken, files(id, name, mimeType, size, createdTime, modifiedTime, parents, webViewLink, driveId)'
        )

        // Add shared drives support if requested
        if (includeSharedDrives) {
            url.searchParams.append('supportsAllDrives', 'true')
            url.searchParams.append('includeItemsFromAllDrives', 'true')
        }

        if (nextPageToken) {
            url.searchParams.append('pageToken', nextPageToken)
        }

        const response = await fetch(url.toString(), {
            headers: {
                Authorization: `Bearer ${accessToken}`,
                'Content-Type': 'application/json'
            }
        })

        if (!response.ok) {
            throw new Error(`Failed to list files: ${response.statusText}`)
        }

        const data = await response.json()
        const pageItems: any[] = Array.isArray(data?.files) ? data.files : []
        nextPageToken = data?.nextPageToken

        // Collect direct files first; remember folders for the recursion step.
        const subfolders: any[] = []
        for (const item of pageItems) {
            if (item.mimeType === FOLDER_MIME_TYPE) {
                subfolders.push(item)
                continue
            }
            if (files.length < limit) {
                files.push({
                    ...item,
                    driveContext: item.driveId ? ' (Shared Drive)' : ' (My Drive)'
                })
            }
        }

        // If includeSubfolders is true, also get files from subfolders while budget remains
        if (includeSubfolders) {
            for (const folder of subfolders) {
                if (files.length >= limit) {
                    break
                }
                const subfolderFiles = await this.getFilesFromFolder(
                    folder.id,
                    accessToken,
                    typeFilter,
                    includeSubfolders,
                    includeSharedDrives,
                    limit - files.length
                )
                files.push(...subfolderFiles)
            }
        }
    } while (nextPageToken && files.length < limit)

    return files.slice(0, limit)
}
/**
 * Decides whether a file passes the MIME type filter.
 *
 * @param fileInfo  File metadata; only `mimeType` is read.
 * @param fileTypes Accepted MIME types; a missing or empty filter accepts everything.
 * @returns True when no filter is set or the file's MIME type is contained in it.
 */
private shouldProcessFile(fileInfo: any, fileTypes: string[] | undefined): boolean {
    const hasFilter = Boolean(fileTypes) && (fileTypes as string[]).length !== 0
    return hasFilter ? (fileTypes as string[]).includes(fileInfo.mimeType) : true
}
/**
 * Converts one Drive file into documents.
 *
 * Text-based files are downloaded and wrapped in a single document. Supported binary files
 * and Google Workspace files are handed to `processBinaryFile`. Anything else is skipped
 * with a warning. Errors are logged and result in an empty array.
 *
 * @param fileInfo    File metadata as returned by the Drive API (plus `driveContext`).
 * @param accessToken OAuth2 bearer token.
 * @returns The extracted documents, or an empty array when the file is unsupported or fails.
 */
private async processFile(fileInfo: any, accessToken: string): Promise<IDocument[]> {
    try {
        if (this.isTextBasedFile(fileInfo.mimeType)) {
            // Plain text formats are used verbatim as the page content
            const text = await this.downloadFile(fileInfo.id, accessToken)
            const textDocument = {
                pageContent: text,
                metadata: {
                    source: fileInfo.webViewLink || `https://drive.google.com/file/d/${fileInfo.id}/view`,
                    fileId: fileInfo.id,
                    fileName: fileInfo.name,
                    mimeType: fileInfo.mimeType,
                    size: fileInfo.size ? parseInt(fileInfo.size) : undefined,
                    createdTime: fileInfo.createdTime,
                    modifiedTime: fileInfo.modifiedTime,
                    parents: fileInfo.parents,
                    driveId: fileInfo.driveId,
                    driveContext: fileInfo.driveContext || (fileInfo.driveId ? ' (Shared Drive)' : ' (My Drive)')
                }
            }
            return [textDocument]
        }

        const handledByLoader = this.isSupportedBinaryFile(fileInfo.mimeType) || this.isGoogleWorkspaceFile(fileInfo.mimeType)
        if (!handledByLoader) {
            console.warn(`Unsupported file type ${fileInfo.mimeType} for file ${fileInfo.name}`)
            return []
        }

        // Binary and Google Workspace files go through the dedicated loaders
        return await this.processBinaryFile(fileInfo, accessToken)
    } catch (error) {
        console.warn(`Failed to process file ${fileInfo.name}: ${error.message}`)
        return []
    }
}
/**
 * Tells whether a non-Workspace MIME type can be handled by `processBinaryFile`.
 *
 * The list mirrors the loader switch in `processBinaryFile`. It includes the PowerPoint
 * types, which that switch handles and which the "File Types" input offers.
 *
 * @param mimeType MIME type reported by Drive.
 * @returns True when a loader exists for the type.
 */
private isSupportedBinaryFile(mimeType: string): boolean {
    const supportedBinaryTypes = [
        'application/pdf',
        'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
        'application/msword',
        'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
        'application/vnd.ms-excel',
        'application/vnd.openxmlformats-officedocument.presentationml.presentation',
        'application/vnd.ms-powerpoint',
        'text/csv'
    ]
    return supportedBinaryTypes.includes(mimeType)
}
/**
 * Loads a binary or Google Workspace file through the matching document loader.
 *
 * Workspace files are exported first; other files are downloaded as-is. The bytes are staged
 * in a temporary file, parsed by the loader selected from the (lower-cased) MIME type, and
 * each resulting document is enriched with Drive metadata. The temporary file is removed in
 * all cases.
 *
 * @param fileInfo    File metadata as returned by the Drive API (plus `driveContext`).
 * @param accessToken OAuth2 bearer token.
 * @returns The loader's documents with Drive metadata merged in; empty when the loader
 *          produced nothing.
 * @throws Error prefixed with "Failed to process binary file:" on any failure.
 */
private async processBinaryFile(fileInfo: any, accessToken: string): Promise<IDocument[]> {
    let stagedPath: string | null = null

    try {
        let payload: Buffer
        let effectiveMimeType: string
        let effectiveFileName: string

        if (!this.isGoogleWorkspaceFile(fileInfo.mimeType)) {
            // Regular binary file: take the bytes as stored
            payload = await this.downloadBinaryFile(fileInfo.id, accessToken)
            effectiveMimeType = fileInfo.mimeType
            effectiveFileName = fileInfo.name
        } else {
            // Workspace file: export it to a loadable format
            const exported = await this.exportGoogleWorkspaceFileAsBuffer(fileInfo.id, fileInfo.mimeType, accessToken)
            payload = exported.buffer
            effectiveMimeType = exported.mimeType
            effectiveFileName = exported.fileName
        }

        // The loaders read from disk, so stage the bytes in a temp file
        stagedPath = await this.createTempFile(payload, effectiveFileName, effectiveMimeType)

        const mimeType = effectiveMimeType.toLowerCase()
        let loaded: IDocument[] = []

        if (mimeType === 'application/pdf') {
            const pdfLoader = new PDFLoader(stagedPath, {
                // @ts-ignore
                pdfjs: () => import('pdf-parse/lib/pdf.js/v1.10.100/build/pdf.js')
            })
            loaded = await pdfLoader.load()
        } else if (
            mimeType === 'application/vnd.openxmlformats-officedocument.wordprocessingml.document' ||
            mimeType === 'application/msword'
        ) {
            const wordLoader = new DocxLoader(stagedPath)
            loaded = await wordLoader.load()
        } else if (
            mimeType === 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet' ||
            mimeType === 'application/vnd.ms-excel'
        ) {
            const sheetLoader = new LoadOfSheet(stagedPath)
            loaded = await sheetLoader.load()
        } else if (
            mimeType === 'application/vnd.openxmlformats-officedocument.presentationml.presentation' ||
            mimeType === 'application/vnd.ms-powerpoint'
        ) {
            const slidesLoader = new PowerpointLoader(stagedPath)
            loaded = await slidesLoader.load()
        } else if (mimeType === 'text/csv') {
            const csvLoader = new CSVLoader(stagedPath)
            loaded = await csvLoader.load()
        } else {
            throw new Error(`Unsupported binary file type: ${mimeType}`)
        }

        if (loaded.length === 0) {
            return []
        }

        // Drive-level fields shared by every document of this file
        const driveFields = {
            source: fileInfo.webViewLink || `https://drive.google.com/file/d/${fileInfo.id}/view`,
            fileId: fileInfo.id,
            fileName: fileInfo.name,
            mimeType: fileInfo.mimeType,
            size: fileInfo.size ? parseInt(fileInfo.size) : undefined,
            createdTime: fileInfo.createdTime,
            modifiedTime: fileInfo.modifiedTime,
            parents: fileInfo.parents,
            totalPages: loaded.length // number of documents the loader produced for this file
        }

        return loaded.map((entry, position) => ({
            ...entry,
            metadata: {
                ...entry.metadata, // loader metadata first, so the Drive fields below win on conflict
                ...driveFields,
                pageIndex: position,
                driveId: fileInfo.driveId,
                driveContext: fileInfo.driveContext || (fileInfo.driveId ? ' (Shared Drive)' : ' (My Drive)')
            }
        }))
    } catch (error) {
        throw new Error(`Failed to process binary file: ${error.message}`)
    } finally {
        // Best-effort removal of the staged file
        if (stagedPath && fs.existsSync(stagedPath)) {
            try {
                fs.unlinkSync(stagedPath)
            } catch (e) {
                console.warn(`Failed to delete temporary file: ${stagedPath}`)
            }
        }
    }
}
/**
 * Writes a buffer to a uniquely named file in the OS temp directory.
 *
 * The extension is taken from `fileName` when it is a short alphanumeric extension; otherwise
 * it is derived from `mimeType`, with `.tmp` as the last resort. The file is created
 * exclusively (`wx`) and readable/writable by the owner only, because it holds private Drive
 * content inside a shared directory.
 *
 * @param buffer   File content.
 * @param fileName Original file name; only its extension is used.
 * @param mimeType MIME type used to pick an extension when the name has no usable one.
 * @returns Absolute path of the created file; the caller is responsible for deleting it.
 * @throws Error when the file cannot be created or written.
 */
private async createTempFile(buffer: Buffer, fileName: string, mimeType: string): Promise<string> {
    // File names come from Drive and may contain anything after the last dot (spaces,
    // punctuation, very long tails); only accept a conventional extension.
    let extension = path.extname(fileName)
    if (!/^\.[A-Za-z0-9]{1,10}$/.test(extension)) {
        const extensionMap: { [key: string]: string } = {
            'application/pdf': '.pdf',
            'application/vnd.openxmlformats-officedocument.wordprocessingml.document': '.docx',
            'application/msword': '.doc',
            'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': '.xlsx',
            'application/vnd.ms-excel': '.xls',
            'application/vnd.openxmlformats-officedocument.presentationml.presentation': '.pptx',
            'application/vnd.ms-powerpoint': '.ppt',
            'text/csv': '.csv'
        }
        extension = Object.prototype.hasOwnProperty.call(extensionMap, mimeType) ? extensionMap[mimeType] : '.tmp'
    }

    // Create temporary file
    const tempDir = os.tmpdir()
    const tempFileName = `gdrive_${Date.now()}_${Math.random().toString(36).substring(7)}${extension}`
    const tempFilePath = path.join(tempDir, tempFileName)

    // 'wx' refuses to overwrite an existing path; 0o600 keeps other local users out.
    fs.writeFileSync(tempFilePath, buffer, { mode: 0o600, flag: 'wx' })

    return tempFilePath
}
/**
 * Downloads the raw content of a Drive file.
 *
 * `supportsAllDrives=true` is always sent so that files living in shared drives can be
 * fetched as well; the flag only declares shared-drive support.
 *
 * @param fileId      ID of the file to download.
 * @param accessToken OAuth2 bearer token.
 * @returns The file content as a Buffer.
 * @throws Error when the response status is not OK.
 */
private async downloadBinaryFile(fileId: string, accessToken: string): Promise<Buffer> {
    const url = `https://www.googleapis.com/drive/v3/files/${encodeURIComponent(fileId)}?alt=media&supportsAllDrives=true`

    const response = await fetch(url, {
        headers: {
            Authorization: `Bearer ${accessToken}`
        }
    })

    if (!response.ok) {
        throw new Error(`Failed to download file: ${response.statusText}`)
    }

    const arrayBuffer = await response.arrayBuffer()
    return Buffer.from(arrayBuffer)
}
/**
 * Downloads a text-based Drive file and returns its content as a string.
 *
 * `supportsAllDrives=true` is always sent so that files living in shared drives can be
 * fetched as well; the flag only declares shared-drive support.
 *
 * @param fileId      ID of the file to download.
 * @param accessToken OAuth2 bearer token.
 * @returns The response body decoded as text.
 * @throws Error when the response status is not OK, or when the response content type is
 *         neither `text/*` nor contains `json` or `xml`.
 */
private async downloadFile(fileId: string, accessToken: string): Promise<string> {
    const url = `https://www.googleapis.com/drive/v3/files/${encodeURIComponent(fileId)}?alt=media&supportsAllDrives=true`

    const response = await fetch(url, {
        headers: {
            Authorization: `Bearer ${accessToken}`
        }
    })

    if (!response.ok) {
        throw new Error(`Failed to download file: ${response.statusText}`)
    }

    // Only call response.text() for text-based files
    const contentType = response.headers.get('content-type') || ''
    if (!contentType.startsWith('text/') && !contentType.includes('json') && !contentType.includes('xml')) {
        throw new Error(`Cannot process binary file with content-type: ${contentType}`)
    }

    return await response.text()
}
/**
 * Tells whether a MIME type is one of the Google Workspace types (Docs, Sheets, Slides,
 * Drawings) that go through the export path.
 *
 * @param mimeType MIME type reported by Drive.
 * @returns True for the four Workspace types listed below.
 */
private isGoogleWorkspaceFile(mimeType: string): boolean {
    switch (mimeType) {
        case 'application/vnd.google-apps.document':
        case 'application/vnd.google-apps.spreadsheet':
        case 'application/vnd.google-apps.presentation':
        case 'application/vnd.google-apps.drawing':
            return true
        default:
            return false
    }
}
/**
 * Tells whether a MIME type is one of the text formats whose content is used directly as
 * page content.
 *
 * @param mimeType MIME type reported by Drive.
 * @returns True when the type is in the text-based list.
 */
private isTextBasedFile(mimeType: string): boolean {
    const plainTextTypes = new Set<string>([
        'text/plain',
        'text/html',
        'text/css',
        'text/javascript',
        'text/csv',
        'text/xml',
        'application/json',
        'application/xml',
        'text/markdown',
        'text/x-markdown'
    ])
    return plainTextTypes.has(mimeType)
}
/**
 * Exports a Google Workspace file into an Office/PDF format and returns the bytes.
 *
 * Docs export to DOCX, Sheets to XLSX, Slides to PPTX and Drawings to PDF; any other type
 * falls back to DOCX.
 *
 * @param fileId      ID of the Workspace file.
 * @param mimeType    Workspace MIME type of the file.
 * @param accessToken OAuth2 bearer token.
 * @returns The exported bytes, the export MIME type and a generic `exported_file<ext>` name.
 * @throws Error when the response status is not OK.
 */
private async exportGoogleWorkspaceFileAsBuffer(
    fileId: string,
    mimeType: string,
    accessToken: string
): Promise<{ buffer: Buffer; mimeType: string; fileName: string }> {
    type ExportTarget = { exportMimeType: string; fileExtension: string }

    const docxTarget: ExportTarget = {
        exportMimeType: 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
        fileExtension: '.docx'
    }
    // Workspace MIME type -> export format
    const exportTargets = new Map<string, ExportTarget>([
        ['application/vnd.google-apps.document', docxTarget],
        [
            'application/vnd.google-apps.spreadsheet',
            { exportMimeType: 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', fileExtension: '.xlsx' }
        ],
        [
            'application/vnd.google-apps.presentation',
            { exportMimeType: 'application/vnd.openxmlformats-officedocument.presentationml.presentation', fileExtension: '.pptx' }
        ],
        ['application/vnd.google-apps.drawing', { exportMimeType: 'application/pdf', fileExtension: '.pdf' }]
    ])

    // Unlisted types fall back to DOCX
    const { exportMimeType, fileExtension } = exportTargets.get(mimeType) ?? docxTarget

    const encodedId = encodeURIComponent(fileId)
    const encodedTarget = encodeURIComponent(exportMimeType)
    const url = `https://www.googleapis.com/drive/v3/files/${encodedId}/export?mimeType=${encodedTarget}`

    const response = await fetch(url, {
        headers: {
            Authorization: `Bearer ${accessToken}`
        }
    })
    if (!response.ok) {
        throw new Error(`Failed to export file: ${response.statusText}`)
    }

    const buffer = Buffer.from(await response.arrayBuffer())
    return {
        buffer,
        mimeType: exportMimeType,
        fileName: `exported_file${fileExtension}`
    }
}
}
module.exports = { nodeClass: GoogleDrive_DocumentLoaders }
@@ -0,0 +1 @@
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 48 48" width="96px" height="96px"><path fill="#1e88e5" d="M38.59,39c-0.535,0.93-0.298,1.68-1.195,2.197C36.498,41.715,35.465,42,34.39,42H13.61 c-1.074,0-2.106-0.285-3.004-0.802C9.708,40.681,9.945,39.93,9.41,39l7.67-9h13.84L38.59,39z"/><path fill="#fbc02d" d="M27.463,6.999c1.073-0.002,2.104-0.716,3.001-0.198c0.897,0.519,1.66,1.27,2.197,2.201l10.39,17.996 c0.537,0.93,0.807,1.967,0.808,3.002c0.001,1.037-1.267,2.073-1.806,3.001l-11.127-3.005l-6.924-11.993L27.463,6.999z"/><path fill="#e53935" d="M43.86,30c0,1.04-0.27,2.07-0.81,3l-3.67,6.35c-0.53,0.78-1.21,1.4-1.99,1.85L30.92,30H43.86z"/><path fill="#4caf50" d="M5.947,33.001c-0.538-0.928-1.806-1.964-1.806-3c0.001-1.036,0.27-2.073,0.808-3.004l10.39-17.996 c0.537-0.93,1.3-1.682,2.196-2.2c0.897-0.519,1.929,0.195,3.002,0.197l3.459,11.009l-6.922,11.989L5.947,33.001z"/><path fill="#1565c0" d="M17.08,30l-6.47,11.2c-0.78-0.45-1.46-1.07-1.99-1.85L4.95,33c-0.54-0.93-0.81-1.96-0.81-3H17.08z"/><path fill="#2e7d32" d="M30.46,6.8L24,18L17.53,6.8c0.78-0.45,1.66-0.73,2.6-0.79L27.46,6C28.54,6,29.57,6.28,30.46,6.8z"/></svg>

After

Width:  |  Height:  |  Size: 1.1 KiB

@@ -0,0 +1,429 @@
import { omit } from 'lodash'
import { ICommonObject, IDocument, INode, INodeData, INodeParams, INodeOptionsValue } from '../../../src/Interface'
import { TextSplitter } from 'langchain/text_splitter'
import {
convertMultiOptionsToStringArray,
getCredentialData,
getCredentialParam,
handleEscapeCharacters,
INodeOutputsValue,
refreshOAuth2Token
} from '../../../src'
class GoogleSheets_DocumentLoaders implements INode {
label: string
name: string
version: number
description: string
type: string
icon: string
category: string
baseClasses: string[]
credential: INodeParams
inputs: INodeParams[]
outputs: INodeOutputsValue[]
/**
 * Declares the Google Sheets loader node: identity, OAuth2 credential, inputs and outputs.
 */
constructor() {
    // Choices for how cell values are rendered
    const valueRenderChoices = [
        { label: 'Formatted Value', name: 'FORMATTED_VALUE' },
        { label: 'Unformatted Value', name: 'UNFORMATTED_VALUE' },
        { label: 'Formula', name: 'FORMULA' }
    ]
    const omitMetadataKeysHelp =
        'Each document loader comes with a default set of metadata keys that are extracted from the document. You can use this field to omit some of the default metadata keys. The value should be a list of keys, seperated by comma. Use * to omit all metadata keys execept the ones you specify in the Additional Metadata field'

    this.label = 'Google Sheets'
    this.name = 'googleSheets'
    this.version = 1.0
    this.type = 'Document'
    this.icon = 'google-sheets.svg'
    this.category = 'Document Loaders'
    this.description = 'Load data from Google Sheets as documents'
    this.baseClasses = [this.type]

    this.credential = {
        label: 'Connect Credential',
        name: 'credential',
        type: 'credential',
        description: 'Google Sheets OAuth2 Credential',
        credentialNames: ['googleSheetsOAuth2']
    }

    this.inputs = [
        // Which spreadsheets to read
        {
            label: 'Select Spreadsheet',
            name: 'spreadsheetIds',
            type: 'asyncMultiOptions',
            loadMethod: 'listSpreadsheets',
            description: 'Select spreadsheet from your Google Drive',
            refresh: true
        },
        // Optional narrowing to sheets and a cell range
        {
            label: 'Sheet Names',
            name: 'sheetNames',
            type: 'string',
            description: 'Comma-separated list of sheet names to load. If empty, loads all sheets.',
            placeholder: 'Sheet1, Sheet2',
            optional: true
        },
        {
            label: 'Range',
            name: 'range',
            type: 'string',
            description: 'Range to load (e.g., A1:E10). If empty, loads entire sheet.',
            placeholder: 'A1:E10',
            optional: true
        },
        // Value handling
        {
            label: 'Include Headers',
            name: 'includeHeaders',
            type: 'boolean',
            description: 'Whether to include the first row as headers',
            default: true
        },
        {
            label: 'Value Render Option',
            name: 'valueRenderOption',
            type: 'options',
            description: 'How values should be represented in the output',
            options: valueRenderChoices,
            default: 'FORMATTED_VALUE',
            optional: true
        },
        // Post-processing shared with the other document loaders
        {
            label: 'Text Splitter',
            name: 'textSplitter',
            type: 'TextSplitter',
            optional: true
        },
        {
            label: 'Additional Metadata',
            name: 'metadata',
            type: 'json',
            description: 'Additional metadata to be added to the extracted documents',
            optional: true,
            additionalParams: true
        },
        {
            label: 'Omit Metadata Keys',
            name: 'omitMetadataKeys',
            type: 'string',
            rows: 4,
            description: omitMetadataKeysHelp,
            placeholder: 'key1, key2, key3.nestedKey1',
            optional: true,
            additionalParams: true
        }
    ]

    this.outputs = [
        {
            label: 'Document',
            name: 'document',
            description: 'Array of document objects containing metadata and pageContent',
            baseClasses: [...this.baseClasses, 'json']
        },
        {
            label: 'Text',
            name: 'text',
            description: 'Concatenated string from pageContent of documents',
            baseClasses: ['string', 'json']
        }
    ]
}
//@ts-ignore
loadMethods = {
    /**
     * Lists up to 100 non-trashed Google Sheets files, most recently modified first, for
     * the "Select Spreadsheet" picker.
     *
     * Failures are logged and yield an empty (or partial) list rather than throwing.
     *
     * @param nodeData Node configuration; only `credential` is read.
     * @param options  Passed through to the credential helpers.
     * @returns One option per spreadsheet (id as name, file name as label).
     */
    async listSpreadsheets(nodeData: INodeData, options: ICommonObject): Promise<INodeOptionsValue[]> {
        const choices: INodeOptionsValue[] = []

        try {
            const credentialId = nodeData.credential ?? ''
            let credentialData = await getCredentialData(credentialId, options)
            credentialData = await refreshOAuth2Token(credentialId, credentialData, options)

            const accessToken = getCredentialParam('access_token', credentialData, nodeData)
            if (!accessToken) {
                return choices
            }

            // Restrict the Drive listing to spreadsheets that are not in the trash
            const listParams: [string, string][] = [
                ['q', "mimeType='application/vnd.google-apps.spreadsheet' and trashed = false"],
                ['pageSize', '100'],
                ['fields', 'files(id, name, modifiedTime, webViewLink)'],
                ['orderBy', 'modifiedTime desc']
            ]
            const endpoint = new URL('https://www.googleapis.com/drive/v3/files')
            for (const [key, value] of listParams) {
                endpoint.searchParams.append(key, value)
            }

            const requestHeaders = {
                Authorization: `Bearer ${accessToken}`,
                'Content-Type': 'application/json'
            }
            const response = await fetch(endpoint.toString(), { headers: requestHeaders })
            if (!response.ok) {
                console.error(`Failed to list spreadsheets: ${response.statusText}`)
                return choices
            }

            const listing = await response.json()
            for (const sheetFile of listing.files) {
                const modifiedOn = new Date(sheetFile.modifiedTime).toLocaleDateString()
                choices.push({
                    name: sheetFile.id,
                    label: sheetFile.name,
                    description: `Modified: ${modifiedOn}`
                })
            }
        } catch (error) {
            console.error('Error listing Google Sheets:', error)
        }

        return choices
    }
}
/**
 * Loads the selected spreadsheets and returns them as documents or as one text string.
 *
 * For every spreadsheet id the spreadsheet metadata is fetched, then each requested sheet
 * (or every sheet listed in the metadata when no names are given) is fetched and converted
 * to one document. A failure while processing one spreadsheet is logged with `console.warn`
 * and the remaining spreadsheets are still processed.
 *
 * @param nodeData node data; reads the inputs `spreadsheetIds`, `sheetNames`, `range`,
 *                 `includeHeaders`, `valueRenderOption`, `textSplitter`, `metadata`,
 *                 `omitMetadataKeys` and the selected `output`
 * @param _        unused
 * @param options  common options forwarded to the credential helpers
 * @returns the documents when the output is `document`, otherwise the concatenated page contents
 * @throws Error when no spreadsheet is selected, when the credential has no access token,
 *         or when splitting / metadata handling fails
 */
async init(nodeData: INodeData, _: string, options: ICommonObject): Promise<any> {
    const _spreadsheetIds = nodeData.inputs?.spreadsheetIds as string
    const sheetNames = nodeData.inputs?.sheetNames as string
    const range = nodeData.inputs?.range as string
    const includeHeaders = nodeData.inputs?.includeHeaders as boolean
    const valueRenderOption = (nodeData.inputs?.valueRenderOption as string) || 'FORMATTED_VALUE'
    const textSplitter = nodeData.inputs?.textSplitter as TextSplitter
    const metadata = nodeData.inputs?.metadata
    const _omitMetadataKeys = nodeData.inputs?.omitMetadataKeys as string
    const output = nodeData.outputs?.output as string

    const omitMetadataKeys: string[] = _omitMetadataKeys ? _omitMetadataKeys.split(',').map((key) => key.trim()) : []

    // Caught values are not guaranteed to be Error instances
    const describeError = (error: unknown): string => (error instanceof Error ? error.message : String(error))

    if (!_spreadsheetIds) {
        throw new Error('At least one spreadsheet is required')
    }
    const spreadsheetIds = convertMultiOptionsToStringArray(_spreadsheetIds)

    let credentialData = await getCredentialData(nodeData.credential ?? '', options)
    credentialData = await refreshOAuth2Token(nodeData.credential ?? '', credentialData, options)
    const accessToken = getCredentialParam('access_token', credentialData, nodeData)
    if (!accessToken) {
        throw new Error('No access token found in credential')
    }

    let docs: IDocument[] = []
    try {
        // Blank entries (e.g. from a trailing comma) are dropped: an empty sheet name would
        // produce an invalid range and abort the remaining sheets of that spreadsheet.
        const requestedSheets: string[] = sheetNames
            ? sheetNames
                  .split(',')
                  .map((name) => name.trim())
                  .filter((name) => name.length > 0)
            : []

        for (const spreadsheetId of spreadsheetIds) {
            try {
                const spreadsheetMetadata = await this.getSpreadsheetMetadata(spreadsheetId, accessToken)

                // Explicit names win; otherwise load every sheet listed in the metadata
                const sheetsToLoad: string[] =
                    requestedSheets.length > 0
                        ? requestedSheets
                        : spreadsheetMetadata.sheets?.map((sheet: any) => sheet.properties.title) || []

                for (const sheetName of sheetsToLoad) {
                    const sheetRange = range ? `${sheetName}!${range}` : sheetName
                    const sheetData = await this.getSheetData(spreadsheetId, sheetRange, valueRenderOption, accessToken)
                    // Sheets without any values produce no document
                    if (sheetData.values && sheetData.values.length > 0) {
                        docs.push(this.convertSheetToDocument(sheetData, sheetName, spreadsheetId, spreadsheetMetadata, includeHeaders))
                    }
                }
            } catch (error) {
                // Continue processing other spreadsheets even if one fails
                console.warn(`Failed to process spreadsheet ${spreadsheetId}: ${describeError(error)}`)
            }
        }

        if (textSplitter && docs.length > 0) {
            docs = await textSplitter.splitDocuments(docs)
        }

        // Additional metadata may arrive as an object or as a JSON string
        const parsedMetadata = metadata ? (typeof metadata === 'object' ? metadata : JSON.parse(metadata)) : {}
        docs = docs.map((doc) => ({
            ...doc,
            metadata:
                // '*' discards every loader-provided key and keeps only the additional metadata
                _omitMetadataKeys === '*'
                    ? { ...parsedMetadata }
                    : omit(
                          {
                              ...doc.metadata,
                              ...parsedMetadata
                          },
                          omitMetadataKeys
                      )
        }))
    } catch (error) {
        throw new Error(`Failed to load Google Sheets data: ${describeError(error)}`)
    }

    if (output === 'document') {
        return docs
    }

    let finaltext = ''
    for (const doc of docs) {
        finaltext += `${doc.pageContent}\n`
    }
    return handleEscapeCharacters(finaltext, false)
}
/**
 * Fetches the spreadsheet resource from the Sheets v4 API.
 *
 * @param spreadsheetId id of the spreadsheet to look up
 * @param accessToken   OAuth2 bearer token
 * @returns the parsed JSON response body
 * @throws Error carrying status, status text and response body when the response is not OK
 */
private async getSpreadsheetMetadata(spreadsheetId: string, accessToken: string): Promise<any> {
    const endpoint = `https://sheets.googleapis.com/v4/spreadsheets/${spreadsheetId}`
    const requestHeaders = {
        Authorization: `Bearer ${accessToken}`,
        'Content-Type': 'application/json'
    }

    const response = await fetch(endpoint, { headers: requestHeaders })
    if (response.ok) {
        return response.json()
    }

    const errorText = await response.text()
    throw new Error(`Failed to get spreadsheet metadata: ${response.status} ${response.statusText} - ${errorText}`)
}
/**
 * Fetches the values of one range from the Sheets v4 `values` endpoint.
 *
 * The request always asks for row-major data with date/time cells rendered as formatted strings.
 *
 * @param spreadsheetId     id of the spreadsheet
 * @param range             range to read; it is URI-encoded before being placed in the path
 * @param valueRenderOption forwarded as the `valueRenderOption` query parameter
 * @param accessToken       OAuth2 bearer token
 * @returns the parsed JSON response body
 * @throws Error carrying status, status text and response body when the response is not OK
 */
private async getSheetData(spreadsheetId: string, range: string, valueRenderOption: string, accessToken: string): Promise<any> {
    const query = new URLSearchParams({
        valueRenderOption,
        dateTimeRenderOption: 'FORMATTED_STRING',
        majorDimension: 'ROWS'
    })
    const endpoint = `https://sheets.googleapis.com/v4/spreadsheets/${spreadsheetId}/values/${encodeURIComponent(range)}`
    const requestHeaders = {
        Authorization: `Bearer ${accessToken}`,
        'Content-Type': 'application/json'
    }

    const response = await fetch(`${endpoint}?${query}`, { headers: requestHeaders })
    if (response.ok) {
        return response.json()
    }

    const errorText = await response.text()
    throw new Error(`Failed to get sheet data: ${response.status} ${response.statusText} - ${errorText}`)
}
/**
 * Converts the values of one sheet into a single document whose page content is a
 * markdown table.
 *
 * When `includeHeaders` is true the first row supplies the column headers and the
 * remaining rows are data. Otherwise spreadsheet-style column labels are generated
 * (A, B, ..., Z, AA, AB, ...) and every row is treated as data. Rows shorter than the
 * header are padded with empty cells.
 *
 * @param sheetData           values response; `values` (rows) and `range` are read
 * @param sheetName           title of the sheet the values came from
 * @param spreadsheetId       id of the owning spreadsheet
 * @param spreadsheetMetadata spreadsheet resource; `properties.title` is read
 * @param includeHeaders      whether the first row holds the column headers
 * @returns a document with the table as page content and sheet details as metadata
 */
private convertSheetToDocument(
    sheetData: any,
    sheetName: string,
    spreadsheetId: string,
    spreadsheetMetadata: any,
    includeHeaders: boolean
): IDocument {
    const values: any[][] = sheetData.values || []
    const spreadsheetTitle = spreadsheetMetadata.properties?.title
    const source = `Google Sheets: ${spreadsheetTitle || 'Unknown'} - ${sheetName}`

    if (values.length === 0) {
        return {
            pageContent: '',
            metadata: {
                source,
                spreadsheetId,
                sheetName,
                spreadsheetTitle,
                range: sheetData.range,
                rowCount: 0,
                columnCount: 0
            }
        }
    }

    // Bijective base-26 label for a zero-based column index: 0 -> A, 25 -> Z, 26 -> AA.
    // A plain `String.fromCharCode(65 + i)` runs past 'Z' into punctuation from column 27 on.
    const columnLabel = (index: number): string => {
        let label = ''
        for (let n = index + 1; n > 0; n = Math.floor((n - 1) / 26)) {
            label = String.fromCharCode(65 + ((n - 1) % 26)) + label
        }
        return label
    }

    let headers: string[] = []
    let dataRows: string[][] = []
    if (includeHeaders) {
        headers = values[0] || []
        dataRows = values.slice(1)
    } else {
        // reduce instead of Math.max(...rows): spreading one argument per row can exceed
        // the engine's argument limit on very large sheets
        const maxColumns = values.reduce((widest: number, row: any[]) => Math.max(widest, row.length), 0)
        headers = Array.from({ length: maxColumns }, (_, i) => columnLabel(i))
        dataRows = values
    }

    // Convert to markdown table format
    let content = ''
    if (headers.length > 0) {
        content += '| ' + headers.join(' | ') + ' |\n'
        content += '| ' + headers.map(() => '---').join(' | ') + ' |\n'
        for (const row of dataRows) {
            const paddedRow = [...row]
            // Pad row to match header length
            while (paddedRow.length < headers.length) {
                paddedRow.push('')
            }
            content += '| ' + paddedRow.join(' | ') + ' |\n'
        }
    }

    return {
        pageContent: content,
        metadata: {
            source,
            spreadsheetId,
            sheetName,
            spreadsheetTitle,
            spreadsheetUrl: `https://docs.google.com/spreadsheets/d/${spreadsheetId}`,
            range: sheetData.range,
            rowCount: values.length,
            columnCount: headers.length,
            headers: includeHeaders ? headers : undefined,
            totalDataRows: dataRows.length
        }
    }
}
}
// CommonJS export: the node class is published under the `nodeClass` key.
module.exports = { nodeClass: GoogleSheets_DocumentLoaders }
@@ -0,0 +1 @@
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 48 48" width="96px" height="96px"><path fill="#43a047" d="M37,45H11c-1.657,0-3-1.343-3-3V6c0-1.657,1.343-3,3-3h19l10,10v29C40,43.657,38.657,45,37,45z"/><path fill="#c8e6c9" d="M40 13L30 13 30 3z"/><path fill="#2e7d32" d="M30 13L40 23 40 13z"/><path fill="#e8f5e9" d="M31,23H17h-2v2v2v2v2v2v2v2h18v-2v-2v-2v-2v-2v-2v-2H31z M17,25h4v2h-4V25z M17,29h4v2h-4V29z M17,33h4v2h-4V33z M31,35h-8v-2h8V35z M31,31h-8v-2h8V31z M31,27h-8v-2h8V27z"/></svg>

After

Width:  |  Height:  |  Size: 495 B

@@ -1,2 +1 @@
<?xml version="1.0" encoding="utf-8"?><!-- Uploaded to: SVG Repo, www.svgrepo.com, Generator: SVG Repo Mixer Tools -->
<svg width="800px" height="800px" viewBox="0 0 48 48" xmlns="http://www.w3.org/2000/svg"><defs><style>.a{fill:none;stroke:#000000;stroke-linecap:round;stroke-linejoin:round;}</style></defs><path class="a" d="M5.5,22.9722h0a8.7361,8.7361,0,0,0,8.7361,8.7361h2.0556v2.0556A8.7361,8.7361,0,0,0,25.0278,42.5h0V22.9722Z"/><path class="a" d="M14.2361,14.2361h0a8.7361,8.7361,0,0,0,8.7361,8.7361h2.0556v2.0556a8.7361,8.7361,0,0,0,8.7361,8.7361h0V14.2361Z"/><path class="a" d="M22.9722,5.5h0a8.7361,8.7361,0,0,0,8.7361,8.7361h2.0556v2.0556A8.7361,8.7361,0,0,0,42.5,25.0278h0V5.5Z"/></svg>
<svg height="2500" preserveAspectRatio="xMidYMid" width="2500" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" viewBox="0 -30.632388516510233 255.324 285.95638851651023"><linearGradient id="a"><stop offset=".18" stop-color="#0052cc"/><stop offset="1" stop-color="#2684ff"/></linearGradient><linearGradient id="b" x1="98.031%" x2="58.888%" xlink:href="#a" y1=".161%" y2="40.766%"/><linearGradient id="c" x1="100.665%" x2="55.402%" xlink:href="#a" y1=".455%" y2="44.727%"/><path d="M244.658 0H121.707a55.502 55.502 0 0 0 55.502 55.502h22.649V77.37c.02 30.625 24.841 55.447 55.466 55.467V10.666C255.324 4.777 250.55 0 244.658 0z" fill="#2684ff"/><path d="M183.822 61.262H60.872c.019 30.625 24.84 55.447 55.466 55.467h22.649v21.938c.039 30.625 24.877 55.43 55.502 55.43V71.93c0-5.891-4.776-10.667-10.667-10.667z" fill="url(#b)"/><path d="M122.951 122.489H0c0 30.653 24.85 55.502 55.502 55.502h22.72v21.867c.02 30.597 24.798 55.408 55.396 55.466V133.156c0-5.891-4.776-10.667-10.667-10.667z" fill="url(#c)"/></svg>

Before

Width:  |  Height:  |  Size: 699 B

After

Width:  |  Height:  |  Size: 1.0 KiB

@@ -0,0 +1,72 @@
import { Document } from '@langchain/core/documents'
import { BufferLoader } from 'langchain/document_loaders/fs/buffer'
import { read, utils } from 'xlsx'
/**
 * Spreadsheet document loader built on SheetJS.
 *
 * Every worksheet is converted with SheetJS `sheet_to_json` into row objects, and each
 * row becomes one `Document`. The row's metadata carries the worksheet name, the row
 * index reported by SheetJS, the caller-supplied metadata and the row's own cells.
 */
export class LoadOfSheet extends BufferLoader {
    /** Field descriptions collected during the last `parse` call. */
    attributes: { name: string; description: string; type: string }[] = []

    constructor(filePathOrBlob: string | Blob) {
        super(filePathOrBlob)
        this.attributes = []
    }

    /**
     * Parses a workbook buffer into one document per row.
     *
     * As a side effect `attributes` is rebuilt: two fixed entries (`worksheet`, `rowNum`)
     * followed by one entry per column of each sheet, typed by the value kinds observed.
     *
     * NOTE: column labels in multiple sheets are not disambiguated!
     *
     * @param raw Raw data Buffer
     * @param metadata Document metadata merged into every row's metadata
     * @returns Array of Documents
     */
    async parse(raw: Buffer, metadata: Document['metadata']): Promise<Document[]> {
        const documents: Document[] = []
        this.attributes = [
            { name: 'worksheet', description: 'Sheet or Worksheet Name', type: 'string' },
            { name: 'rowNum', description: 'Row index', type: 'number' }
        ]

        const workbook = read(raw, { type: 'buffer' })
        for (const sheetName of workbook.SheetNames) {
            const worksheet = workbook.Sheets[sheetName]
            if (!worksheet) continue

            // column label -> set of value kinds seen in that column (for this sheet only)
            const observedKinds: Record<string, Record<string, boolean>> = {}
            const rows = utils.sheet_to_json(worksheet) as Record<string, unknown>[]

            for (const row of rows) {
                const cells = Object.entries(row)
                const lines = cells.map(([column, value]) => `- ${column}: ${value}`)
                documents.push({
                    pageContent: lines.join('\n') + '\n',
                    metadata: {
                        worksheet: sheetName,
                        rowNum: row['__rowNum__'],
                        ...metadata,
                        ...row
                    }
                })

                for (const [column, value] of cells) {
                    if (value == null) continue
                    const kind = value instanceof Date ? 'date' : typeof value
                    const kinds = observedKinds[column] || (observedKinds[column] = {})
                    kinds[kind] = true
                }
            }

            for (const [column, kinds] of Object.entries(observedKinds)) {
                this.attributes.push({
                    name: column,
                    description: column,
                    type: Object.keys(kinds).join(' or ')
                })
            }
        }

        return documents
    }
}
@@ -0,0 +1,142 @@
import { TextSplitter } from 'langchain/text_splitter'
import { LoadOfSheet } from './ExcelLoader'
import { getFileFromStorage, handleDocumentLoaderDocuments, handleDocumentLoaderMetadata, handleDocumentLoaderOutput } from '../../../src'
import { ICommonObject, IDocument, INode, INodeData, INodeOutputsValue, INodeParams } from '../../../src/Interface'
/**
 * Document-loader node that turns uploaded Microsoft Excel workbooks into documents
 * using `LoadOfSheet`.
 */
class MicrosoftExcel_DocumentLoaders implements INode {
    label: string
    name: string
    version: number
    description: string
    type: string
    icon: string
    category: string
    baseClasses: string[]
    inputs: INodeParams[]
    outputs: INodeOutputsValue[]

    constructor() {
        this.label = 'Microsoft Excel'
        this.name = 'microsoftExcel'
        this.version = 1.0
        this.type = 'Document'
        this.icon = 'excel.svg'
        this.category = 'Document Loaders'
        this.description = `Load data from Microsoft Excel files`
        this.baseClasses = [this.type]
        this.inputs = [
            {
                label: 'Excel File',
                name: 'excelFile',
                type: 'file',
                fileType: '.xlsx, .xls, .xlsm, .xlsb'
            },
            {
                label: 'Text Splitter',
                name: 'textSplitter',
                type: 'TextSplitter',
                optional: true
            },
            {
                label: 'Additional Metadata',
                name: 'metadata',
                type: 'json',
                description: 'Additional metadata to be added to the extracted documents',
                optional: true,
                additionalParams: true
            },
            {
                label: 'Omit Metadata Keys',
                name: 'omitMetadataKeys',
                type: 'string',
                rows: 4,
                description:
                    'Each document loader comes with a default set of metadata keys that are extracted from the document. You can use this field to omit some of the default metadata keys. The value should be a list of keys, seperated by comma. Use * to omit all metadata keys execept the ones you specify in the Additional Metadata field',
                placeholder: 'key1, key2, key3.nestedKey1',
                optional: true,
                additionalParams: true
            }
        ]
        this.outputs = [
            {
                label: 'Document',
                name: 'document',
                description: 'Array of document objects containing metadata and pageContent',
                baseClasses: [...this.baseClasses, 'json']
            },
            {
                label: 'Text',
                name: 'text',
                description: 'Concatenated string from pageContent of documents',
                baseClasses: ['string', 'json']
            }
        ]
    }

    /**
     * Resolves the `excelFile` input into a list of file references.
     *
     * A value prefixed with `FILE-STORAGE::` refers to stored files; any other value is
     * treated as inline content. In both cases a value wrapped in `[...]` is parsed as a
     * JSON array, anything else is taken as a single entry.
     *
     * @param nodeData node data holding `inputs.excelFile`
     * @returns the file references and whether they must be read from storage
     * @throws Error when `excelFile` is not a string (e.g. no file was provided)
     */
    getFiles(nodeData: INodeData) {
        const excelFileBase64 = nodeData.inputs?.excelFile as string
        if (typeof excelFileBase64 !== 'string') {
            throw new Error('Excel file is required')
        }

        const storagePrefix = 'FILE-STORAGE::'
        const fromStorage = excelFileBase64.startsWith(storagePrefix)
        const reference = fromStorage ? excelFileBase64.replace(storagePrefix, '') : excelFileBase64
        const files: string[] = reference.startsWith('[') && reference.endsWith(']') ? JSON.parse(reference) : [reference]

        return { files, fromStorage }
    }

    /**
     * Reads the bytes of one file reference.
     *
     * @param file        stored file name, or an inline data URI when `fromStorage` is falsy
     * @param fromStorage when true the file is fetched with `getFileFromStorage`
     * @returns the file content
     */
    async getFileData(file: string, { orgId, chatflowid }: { orgId: string; chatflowid: string }, fromStorage?: boolean) {
        if (fromStorage) {
            return getFileFromStorage(file, orgId, chatflowid)
        }
        // Inline uploads presumably look like `data:<mime>;base64,<payload>,filename:<name>` (confirm
        // against the uploader). The payload is always the second comma-separated segment; counting
        // from the end picks the wrong segment when the file name contains a comma or is absent.
        const [, base64Payload = ''] = file.split(',')
        return Buffer.from(base64Payload, 'base64')
    }

    /**
     * Loads every provided workbook, optionally splits the documents, applies the metadata
     * options and returns the result in the selected output format.
     *
     * @param nodeData node data with the inputs and the selected output
     * @param _        unused
     * @param options  common options; `orgId` and `chatflowid` are used for storage lookups
     */
    async init(nodeData: INodeData, _: string, options: ICommonObject): Promise<any> {
        const textSplitter = nodeData.inputs?.textSplitter as TextSplitter
        const metadata = nodeData.inputs?.metadata
        const output = nodeData.outputs?.output as string
        const _omitMetadataKeys = nodeData.inputs?.omitMetadataKeys as string

        let docs: IDocument[] = []

        const orgId = options.orgId
        const chatflowid = options.chatflowid

        const { files, fromStorage } = this.getFiles(nodeData)

        for (const file of files) {
            if (!file) continue

            const fileData = await this.getFileData(file, { orgId, chatflowid }, fromStorage)
            const blob = new Blob([fileData])
            const loader = new LoadOfSheet(blob)

            // use spread instead of push, because it raises RangeError: Maximum call stack size exceeded when too many docs
            docs = [...docs, ...(await handleDocumentLoaderDocuments(loader, textSplitter))]
        }

        docs = handleDocumentLoaderMetadata(docs, _omitMetadataKeys, metadata)

        return handleDocumentLoaderOutput(docs, output)
    }
}
// CommonJS export: the node class is published under the `nodeClass` key.
module.exports = { nodeClass: MicrosoftExcel_DocumentLoaders }
@@ -0,0 +1 @@
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 48 48" width="96px" height="96px"><path fill="#169154" d="M29,6H15.744C14.781,6,14,6.781,14,7.744v7.259h15V6z"/><path fill="#18482a" d="M14,33.054v7.202C14,41.219,14.781,42,15.743,42H29v-8.946H14z"/><path fill="#0c8045" d="M14 15.003H29V24.005000000000003H14z"/><path fill="#17472a" d="M14 24.005H29V33.055H14z"/><g><path fill="#29c27f" d="M42.256,6H29v9.003h15V7.744C44,6.781,43.219,6,42.256,6z"/><path fill="#27663f" d="M29,33.054V42h13.257C43.219,42,44,41.219,44,40.257v-7.202H29z"/><path fill="#19ac65" d="M29 15.003H44V24.005000000000003H29z"/><path fill="#129652" d="M29 24.005H44V33.055H29z"/></g><path fill="#0c7238" d="M22.319,34H5.681C4.753,34,4,33.247,4,32.319V15.681C4,14.753,4.753,14,5.681,14h16.638 C23.247,14,24,14.753,24,15.681v16.638C24,33.247,23.247,34,22.319,34z"/><path fill="#fff" d="M9.807 19L12.193 19 14.129 22.754 16.175 19 18.404 19 15.333 24 18.474 29 16.123 29 14.013 25.07 11.912 29 9.526 29 12.719 23.982z"/></svg>

After

Width:  |  Height:  |  Size: 998 B

@@ -0,0 +1,142 @@
import { TextSplitter } from 'langchain/text_splitter'
import { PowerpointLoader } from './PowerpointLoader'
import { getFileFromStorage, handleDocumentLoaderDocuments, handleDocumentLoaderMetadata, handleDocumentLoaderOutput } from '../../../src'
import { ICommonObject, IDocument, INode, INodeData, INodeOutputsValue, INodeParams } from '../../../src/Interface'
/**
 * Document-loader node that turns uploaded Microsoft PowerPoint files into documents
 * using `PowerpointLoader`.
 */
class MicrosoftPowerpoint_DocumentLoaders implements INode {
    label: string
    name: string
    version: number
    description: string
    type: string
    icon: string
    category: string
    baseClasses: string[]
    inputs: INodeParams[]
    outputs: INodeOutputsValue[]

    constructor() {
        this.label = 'Microsoft PowerPoint'
        this.name = 'microsoftPowerpoint'
        this.version = 1.0
        this.type = 'Document'
        this.icon = 'powerpoint.svg'
        this.category = 'Document Loaders'
        this.description = `Load data from Microsoft PowerPoint files`
        this.baseClasses = [this.type]
        this.inputs = [
            {
                label: 'PowerPoint File',
                name: 'powerpointFile',
                type: 'file',
                fileType: '.pptx, .ppt'
            },
            {
                label: 'Text Splitter',
                name: 'textSplitter',
                type: 'TextSplitter',
                optional: true
            },
            {
                label: 'Additional Metadata',
                name: 'metadata',
                type: 'json',
                description: 'Additional metadata to be added to the extracted documents',
                optional: true,
                additionalParams: true
            },
            {
                label: 'Omit Metadata Keys',
                name: 'omitMetadataKeys',
                type: 'string',
                rows: 4,
                description:
                    'Each document loader comes with a default set of metadata keys that are extracted from the document. You can use this field to omit some of the default metadata keys. The value should be a list of keys, seperated by comma. Use * to omit all metadata keys execept the ones you specify in the Additional Metadata field',
                placeholder: 'key1, key2, key3.nestedKey1',
                optional: true,
                additionalParams: true
            }
        ]
        this.outputs = [
            {
                label: 'Document',
                name: 'document',
                description: 'Array of document objects containing metadata and pageContent',
                baseClasses: [...this.baseClasses, 'json']
            },
            {
                label: 'Text',
                name: 'text',
                description: 'Concatenated string from pageContent of documents',
                baseClasses: ['string', 'json']
            }
        ]
    }

    /**
     * Resolves the `powerpointFile` input into a list of file references.
     *
     * A value prefixed with `FILE-STORAGE::` refers to stored files; any other value is
     * treated as inline content. In both cases a value wrapped in `[...]` is parsed as a
     * JSON array, anything else is taken as a single entry.
     *
     * @param nodeData node data holding `inputs.powerpointFile`
     * @returns the file references and whether they must be read from storage
     * @throws Error when `powerpointFile` is not a string (e.g. no file was provided)
     */
    getFiles(nodeData: INodeData) {
        const powerpointFileBase64 = nodeData.inputs?.powerpointFile as string
        if (typeof powerpointFileBase64 !== 'string') {
            throw new Error('PowerPoint file is required')
        }

        const storagePrefix = 'FILE-STORAGE::'
        const fromStorage = powerpointFileBase64.startsWith(storagePrefix)
        const reference = fromStorage ? powerpointFileBase64.replace(storagePrefix, '') : powerpointFileBase64
        const files: string[] = reference.startsWith('[') && reference.endsWith(']') ? JSON.parse(reference) : [reference]

        return { files, fromStorage }
    }

    /**
     * Reads the bytes of one file reference.
     *
     * @param file        stored file name, or an inline data URI when `fromStorage` is falsy
     * @param fromStorage when true the file is fetched with `getFileFromStorage`
     * @returns the file content
     */
    async getFileData(file: string, { orgId, chatflowid }: { orgId: string; chatflowid: string }, fromStorage?: boolean) {
        if (fromStorage) {
            return getFileFromStorage(file, orgId, chatflowid)
        }
        // Inline uploads presumably look like `data:<mime>;base64,<payload>,filename:<name>` (confirm
        // against the uploader). The payload is always the second comma-separated segment; counting
        // from the end picks the wrong segment when the file name contains a comma or is absent.
        const [, base64Payload = ''] = file.split(',')
        return Buffer.from(base64Payload, 'base64')
    }

    /**
     * Loads every provided presentation, optionally splits the documents, applies the
     * metadata options and returns the result in the selected output format.
     *
     * @param nodeData node data with the inputs and the selected output
     * @param _        unused
     * @param options  common options; `orgId` and `chatflowid` are used for storage lookups
     */
    async init(nodeData: INodeData, _: string, options: ICommonObject): Promise<any> {
        const textSplitter = nodeData.inputs?.textSplitter as TextSplitter
        const metadata = nodeData.inputs?.metadata
        const output = nodeData.outputs?.output as string
        const _omitMetadataKeys = nodeData.inputs?.omitMetadataKeys as string

        let docs: IDocument[] = []

        const orgId = options.orgId
        const chatflowid = options.chatflowid

        const { files, fromStorage } = this.getFiles(nodeData)

        for (const file of files) {
            if (!file) continue

            const fileData = await this.getFileData(file, { orgId, chatflowid }, fromStorage)
            const blob = new Blob([fileData])
            const loader = new PowerpointLoader(blob)

            // use spread instead of push, because it raises RangeError: Maximum call stack size exceeded when too many docs
            docs = [...docs, ...(await handleDocumentLoaderDocuments(loader, textSplitter))]
        }

        docs = handleDocumentLoaderMetadata(docs, _omitMetadataKeys, metadata)

        return handleDocumentLoaderOutput(docs, output)
    }
}
// CommonJS export: the node class is published under the `nodeClass` key.
module.exports = { nodeClass: MicrosoftPowerpoint_DocumentLoaders }
@@ -0,0 +1,101 @@
import { Document } from '@langchain/core/documents'
import { BufferLoader } from 'langchain/document_loaders/fs/buffer'
import { parseOfficeAsync } from 'officeparser'
/**
 * PowerPoint document loader backed by officeparser.
 *
 * officeparser yields the presentation as plain text; that text is cut into
 * slide-like chunks heuristically and every non-blank chunk becomes one Document
 * tagged with a running slide number.
 */
export class PowerpointLoader extends BufferLoader {
    /** Descriptions of the metadata fields set by the last `parse` call. */
    attributes: { name: string; description: string; type: string }[] = []

    constructor(filePathOrBlob: string | Blob) {
        super(filePathOrBlob)
        this.attributes = []
    }

    /**
     * Parse PowerPoint document
     *
     * @param raw Raw data Buffer
     * @param metadata Document metadata, merged over the generated fields
     * @returns Array of Documents (empty when no text could be extracted)
     * @throws Error wrapping any failure raised while extracting or splitting the text
     */
    async parse(raw: Buffer, metadata: Document['metadata']): Promise<Document[]> {
        const documents: Document[] = []
        this.attributes = [
            { name: 'slideNumber', description: 'Slide number', type: 'number' },
            { name: 'documentType', description: 'Type of document', type: 'string' }
        ]

        try {
            const extracted = await parseOfficeAsync(raw)
            const hasText = typeof extracted === 'string' && extracted.trim()
            if (hasText) {
                const chunks = this.splitIntoSlides(extracted)
                for (let position = 0; position < chunks.length; position++) {
                    const text = chunks[position].trim()
                    if (!text) continue
                    documents.push({
                        pageContent: text,
                        metadata: {
                            slideNumber: position + 1,
                            documentType: 'powerpoint',
                            ...metadata
                        }
                    })
                }
            }
        } catch (error) {
            console.error('Error parsing PowerPoint file:', error)
            throw new Error(`Failed to parse PowerPoint file: ${error instanceof Error ? error.message : 'Unknown error'}`)
        }

        return documents
    }

    /**
     * Heuristically cuts the extracted plain text into slide-sized chunks.
     *
     * The first separator pattern that yields between 2 and 99 pieces wins. Without such
     * a pattern the text is split on runs of blank lines, and when every resulting piece
     * is shorter than 10 characters the whole text is kept as a single chunk.
     */
    private splitIntoSlides(content: string): string[] {
        const separators = [
            /\n\s*Slide\s+\d+/gi,
            /\n\s*Page\s+\d+/gi,
            /\n\s*\d+\s*\/\s*\d+/gi,
            /\n\s*_{3,}/g, // Underscores as separators
            /\n\s*-{3,}/g // Dashes as separators
        ]

        // First separator producing a reasonable number of pieces
        const chosen = separators.find((separator) => {
            const pieceCount = content.split(separator).length
            return pieceCount > 1 && pieceCount < 100
        })

        let chunks = chosen ? content.split(chosen) : content.split(/\n\s*\n\s*\n/)

        // Nothing substantial came out of the split: keep the text in one piece
        if (chunks.every((chunk) => chunk.trim().length < 10)) {
            chunks = [content]
        }

        return chunks.filter((chunk) => chunk.trim().length > 0)
    }
}
@@ -0,0 +1 @@
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 48 48" width="96px" height="96px"><path fill="#d35230" d="M8,24c0,9.941,8.059,18,18,18s18-8.059,18-18H26H8z"/><path fill="#ff8f6b" d="M26,6v18h18C44,14.059,35.941,6,26,6z"/><path fill="#ed6c47" d="M26,6C16.059,6,8,14.059,8,24h18V6z"/><path d="M26,16.681C26,14.648,24.352,13,22.319,13H11.774C9.417,16.044,8,19.852,8,24 c0,5.116,2.145,9.723,5.571,13h8.747C24.352,37,26,35.352,26,33.319V16.681z" opacity=".05"/><path d="M22.213,13.333H11.525C9.32,16.321,8,20.002,8,24c0,4.617,1.753,8.814,4.611,12h9.602 c1.724,0,3.121-1.397,3.121-3.121V16.454C25.333,14.731,23.936,13.333,22.213,13.333z" opacity=".07"/><path d="M22.106,13.667H11.276C9.218,16.593,8,20.151,8,24c0,4.148,1.417,7.956,3.774,11h10.332 c1.414,0,2.56-1.146,2.56-2.56V16.227C24.667,14.813,23.52,13.667,22.106,13.667z" opacity=".09"/><linearGradient id="N~uyq1CljjkKMh72IFt0Fa" x1="4.586" x2="22.77" y1="14.586" y2="32.77" gradientUnits="userSpaceOnUse"><stop offset="0" stop-color="#ca4e2a"/><stop offset="1" stop-color="#b63016"/></linearGradient><path fill="url(#N~uyq1CljjkKMh72IFt0Fa)" d="M22,34H6c-1.105,0-2-0.895-2-2V16c0-1.105,0.895-2,2-2h16c1.105,0,2,0.895,2,2v16 C24,33.105,23.105,34,22,34z"/><path fill="#fff" d="M14.673,19.012H10v10h2.024v-3.521H14.3c1.876,0,3.397-1.521,3.397-3.397v-0.058 C17.697,20.366,16.343,19.012,14.673,19.012z M15.57,22.358c0,0.859-0.697,1.556-1.556,1.556h-1.99v-3.325h1.99 c0.859,0,1.556,0.697,1.556,1.556V22.358z"/></svg>

After

Width:  |  Height:  |  Size: 1.4 KiB

@@ -0,0 +1,142 @@
import { TextSplitter } from 'langchain/text_splitter'
import { WordLoader } from './WordLoader'
import { getFileFromStorage, handleDocumentLoaderDocuments, handleDocumentLoaderMetadata, handleDocumentLoaderOutput } from '../../../src'
import { ICommonObject, IDocument, INode, INodeData, INodeOutputsValue, INodeParams } from '../../../src/Interface'
/**
 * Document-loader node that turns uploaded Microsoft Word files into documents
 * using `WordLoader`.
 */
class MicrosoftWord_DocumentLoaders implements INode {
    label: string
    name: string
    version: number
    description: string
    type: string
    icon: string
    category: string
    baseClasses: string[]
    inputs: INodeParams[]
    outputs: INodeOutputsValue[]

    constructor() {
        this.label = 'Microsoft Word'
        this.name = 'microsoftWord'
        this.version = 1.0
        this.type = 'Document'
        this.icon = 'word.svg'
        this.category = 'Document Loaders'
        this.description = `Load data from Microsoft Word files`
        this.baseClasses = [this.type]
        this.inputs = [
            {
                label: 'Word File',
                name: 'docxFile',
                type: 'file',
                fileType: '.docx, .doc'
            },
            {
                label: 'Text Splitter',
                name: 'textSplitter',
                type: 'TextSplitter',
                optional: true
            },
            {
                label: 'Additional Metadata',
                name: 'metadata',
                type: 'json',
                description: 'Additional metadata to be added to the extracted documents',
                optional: true,
                additionalParams: true
            },
            {
                label: 'Omit Metadata Keys',
                name: 'omitMetadataKeys',
                type: 'string',
                rows: 4,
                description:
                    'Each document loader comes with a default set of metadata keys that are extracted from the document. You can use this field to omit some of the default metadata keys. The value should be a list of keys, seperated by comma. Use * to omit all metadata keys execept the ones you specify in the Additional Metadata field',
                placeholder: 'key1, key2, key3.nestedKey1',
                optional: true,
                additionalParams: true
            }
        ]
        this.outputs = [
            {
                label: 'Document',
                name: 'document',
                description: 'Array of document objects containing metadata and pageContent',
                baseClasses: [...this.baseClasses, 'json']
            },
            {
                label: 'Text',
                name: 'text',
                description: 'Concatenated string from pageContent of documents',
                baseClasses: ['string', 'json']
            }
        ]
    }

    /**
     * Resolves the `docxFile` input into a list of file references.
     *
     * A value prefixed with `FILE-STORAGE::` refers to stored files; any other value is
     * treated as inline content. In both cases a value wrapped in `[...]` is parsed as a
     * JSON array, anything else is taken as a single entry.
     *
     * @param nodeData node data holding `inputs.docxFile`
     * @returns the file references and whether they must be read from storage
     * @throws Error when `docxFile` is not a string (e.g. no file was provided)
     */
    getFiles(nodeData: INodeData) {
        const docxFileBase64 = nodeData.inputs?.docxFile as string
        if (typeof docxFileBase64 !== 'string') {
            throw new Error('Word file is required')
        }

        const storagePrefix = 'FILE-STORAGE::'
        const fromStorage = docxFileBase64.startsWith(storagePrefix)
        const reference = fromStorage ? docxFileBase64.replace(storagePrefix, '') : docxFileBase64
        const files: string[] = reference.startsWith('[') && reference.endsWith(']') ? JSON.parse(reference) : [reference]

        return { files, fromStorage }
    }

    /**
     * Reads the bytes of one file reference.
     *
     * @param file        stored file name, or an inline data URI when `fromStorage` is falsy
     * @param fromStorage when true the file is fetched with `getFileFromStorage`
     * @returns the file content
     */
    async getFileData(file: string, { orgId, chatflowid }: { orgId: string; chatflowid: string }, fromStorage?: boolean) {
        if (fromStorage) {
            return getFileFromStorage(file, orgId, chatflowid)
        }
        // Inline uploads presumably look like `data:<mime>;base64,<payload>,filename:<name>` (confirm
        // against the uploader). The payload is always the second comma-separated segment; counting
        // from the end picks the wrong segment when the file name contains a comma or is absent.
        const [, base64Payload = ''] = file.split(',')
        return Buffer.from(base64Payload, 'base64')
    }

    /**
     * Loads every provided Word file, optionally splits the documents, applies the
     * metadata options and returns the result in the selected output format.
     *
     * @param nodeData node data with the inputs and the selected output
     * @param _        unused
     * @param options  common options; `orgId` and `chatflowid` are used for storage lookups
     */
    async init(nodeData: INodeData, _: string, options: ICommonObject): Promise<any> {
        const textSplitter = nodeData.inputs?.textSplitter as TextSplitter
        const metadata = nodeData.inputs?.metadata
        const output = nodeData.outputs?.output as string
        const _omitMetadataKeys = nodeData.inputs?.omitMetadataKeys as string

        let docs: IDocument[] = []

        const orgId = options.orgId
        const chatflowid = options.chatflowid

        const { files, fromStorage } = this.getFiles(nodeData)

        for (const file of files) {
            if (!file) continue

            const fileData = await this.getFileData(file, { orgId, chatflowid }, fromStorage)
            const blob = new Blob([fileData])
            const loader = new WordLoader(blob)

            // use spread instead of push, because it raises RangeError: Maximum call stack size exceeded when too many docs
            docs = [...docs, ...(await handleDocumentLoaderDocuments(loader, textSplitter))]
        }

        docs = handleDocumentLoaderMetadata(docs, _omitMetadataKeys, metadata)

        return handleDocumentLoaderOutput(docs, output)
    }
}
// CommonJS export: the node class is published under the `nodeClass` key.
module.exports = { nodeClass: MicrosoftWord_DocumentLoaders }
@@ -0,0 +1,108 @@
import { Document } from '@langchain/core/documents'
import { BufferLoader } from 'langchain/document_loaders/fs/buffer'
import { parseOfficeAsync } from 'officeparser'
/**
 * Document loader that uses officeparser to load Word documents.
 *
 * officeparser returns the document as plain text; that text is split heuristically
 * into sections and every non-blank section becomes one Document whose metadata
 * carries the document type and a running page/section number.
 */
export class WordLoader extends BufferLoader {
    /** Descriptions of the metadata fields set by the last `parse` call. */
    attributes: { name: string; description: string; type: string }[] = []

    constructor(filePathOrBlob: string | Blob) {
        super(filePathOrBlob)
        this.attributes = []
    }

    /**
     * Parse Word document
     *
     * @param raw Raw data Buffer
     * @param metadata Document metadata, merged over the generated fields
     * @returns Array of Documents (empty when no text could be extracted)
     * @throws Error wrapping any failure raised while extracting or splitting the text
     */
    async parse(raw: Buffer, metadata: Document['metadata']): Promise<Document[]> {
        const result: Document[] = []

        // Must name the metadata keys actually emitted below (`documentType`, `pageNumber`)
        this.attributes = [
            { name: 'documentType', description: 'Type of document', type: 'string' },
            { name: 'pageNumber', description: 'Page/section number (1-based)', type: 'number' }
        ]

        try {
            // Use officeparser to extract text from Word document
            const data = await parseOfficeAsync(raw)

            if (typeof data === 'string' && data.trim()) {
                // Split content by common page/section separators
                const sections = this.splitIntoSections(data)

                sections.forEach((sectionContent, index) => {
                    if (sectionContent.trim()) {
                        result.push({
                            pageContent: sectionContent.trim(),
                            metadata: {
                                documentType: 'word',
                                pageNumber: index + 1,
                                ...metadata
                            }
                        })
                    }
                })
            }
        } catch (error) {
            console.error('Error parsing Word file:', error)
            throw new Error(`Failed to parse Word file: ${error instanceof Error ? error.message : 'Unknown error'}`)
        }

        return result
    }

    /**
     * Split content into sections based on common patterns
     * This is a heuristic approach since officeparser returns plain text
     *
     * The first pattern yielding between 2 and 49 pieces wins; otherwise the text is split
     * on runs of blank lines. If every piece is shorter than 20 characters a looser
     * blank-line split is tried, and if every piece is still shorter than 10 characters
     * the whole text is kept as one section.
     */
    private splitIntoSections(content: string): string[] {
        // Try to split by common section patterns
        const sectionPatterns = [
            /\n\s*Page\s+\d+/gi,
            /\n\s*Section\s+\d+/gi,
            /\n\s*Chapter\s+\d+/gi,
            /\n\s*\d+\.\s+/gi, // Numbered sections like "1. ", "2. "
            /\n\s*[A-Z][A-Z\s]{2,}\n/g, // ALL CAPS headings
            /\n\s*_{5,}/g, // Long underscores as separators
            /\n\s*-{5,}/g // Long dashes as separators
        ]

        let sections: string[] = []

        // Use the first pattern that creates a reasonable number of sections
        for (const pattern of sectionPatterns) {
            const potentialSections = content.split(pattern)
            if (potentialSections.length > 1 && potentialSections.length < 50) {
                sections = potentialSections
                break
            }
        }

        // If no good pattern found, split by multiple newlines as a fallback
        if (sections.length === 0) {
            sections = content.split(/\n\s*\n\s*\n\s*\n/)
        }

        // If still no good split, split by double newlines
        if (sections.length === 0 || sections.every((section) => section.trim().length < 20)) {
            sections = content.split(/\n\s*\n\s*\n/)
        }

        // If still no good split, treat entire content as one section
        if (sections.length === 0 || sections.every((section) => section.trim().length < 10)) {
            sections = [content]
        }

        return sections.filter((section) => section.trim().length > 0)
    }
}
@@ -0,0 +1 @@
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 48 48" width="96px" height="96px"><linearGradient id="Q7XamDf1hnh~bz~vAO7C6a" x1="28" x2="28" y1="14.966" y2="6.45" gradientUnits="userSpaceOnUse"><stop offset="0" stop-color="#42a3f2"/><stop offset="1" stop-color="#42a4eb"/></linearGradient><path fill="url(#Q7XamDf1hnh~bz~vAO7C6a)" d="M42,6H14c-1.105,0-2,0.895-2,2v7.003h32V8C44,6.895,43.105,6,42,6z"/><linearGradient id="Q7XamDf1hnh~bz~vAO7C6b" x1="28" x2="28" y1="42" y2="33.054" gradientUnits="userSpaceOnUse"><stop offset="0" stop-color="#11408a"/><stop offset="1" stop-color="#103f8f"/></linearGradient><path fill="url(#Q7XamDf1hnh~bz~vAO7C6b)" d="M12,33.054V40c0,1.105,0.895,2,2,2h28c1.105,0,2-0.895,2-2v-6.946H12z"/><linearGradient id="Q7XamDf1hnh~bz~vAO7C6c" x1="28" x2="28" y1="-15.46" y2="-15.521" gradientUnits="userSpaceOnUse"><stop offset="0" stop-color="#3079d6"/><stop offset="1" stop-color="#297cd2"/></linearGradient><path fill="url(#Q7XamDf1hnh~bz~vAO7C6c)" d="M12,15.003h32v9.002H12V15.003z"/><linearGradient id="Q7XamDf1hnh~bz~vAO7C6d" x1="12" x2="44" y1="28.53" y2="28.53" gradientUnits="userSpaceOnUse"><stop offset="0" stop-color="#1d59b3"/><stop offset="1" stop-color="#195bbc"/></linearGradient><path fill="url(#Q7XamDf1hnh~bz~vAO7C6d)" d="M12,24.005h32v9.05H12V24.005z"/><path d="M22.319,13H12v24h10.319C24.352,37,26,35.352,26,33.319V16.681C26,14.648,24.352,13,22.319,13z" opacity=".05"/><path d="M22.213,36H12V13.333h10.213c1.724,0,3.121,1.397,3.121,3.121v16.425 C25.333,34.603,23.936,36,22.213,36z" opacity=".07"/><path d="M22.106,35H12V13.667h10.106c1.414,0,2.56,1.146,2.56,2.56V32.44C24.667,33.854,23.52,35,22.106,35z" opacity=".09"/><linearGradient id="Q7XamDf1hnh~bz~vAO7C6e" x1="4.744" x2="23.494" y1="14.744" y2="33.493" gradientUnits="userSpaceOnUse"><stop offset="0" stop-color="#256ac2"/><stop offset="1" stop-color="#1247ad"/></linearGradient><path fill="url(#Q7XamDf1hnh~bz~vAO7C6e)" 
d="M22,34H6c-1.105,0-2-0.895-2-2V16c0-1.105,0.895-2,2-2h16c1.105,0,2,0.895,2,2v16 C24,33.105,23.105,34,22,34z"/><path fill="#fff" d="M18.403,19l-1.546,7.264L15.144,19h-2.187l-1.767,7.489L9.597,19H7.641l2.344,10h2.352l1.713-7.689 L15.764,29h2.251l2.344-10H18.403z"/></svg>

After

Width:  |  Height:  |  Size: 2.1 KiB

@@ -19,9 +19,9 @@ import { PDFLoader } from '@langchain/community/document_loaders/fs/pdf'
import { DocxLoader } from '@langchain/community/document_loaders/fs/docx'
import { TextLoader } from 'langchain/document_loaders/fs/text'
import { TextSplitter } from 'langchain/text_splitter'
import { CSVLoader } from '../Csv/CsvLoader'
import { LoadOfSheet } from '../MicrosoftExcel/ExcelLoader'
import { PowerpointLoader } from '../MicrosoftPowerpoint/PowerpointLoader'
class S3_DocumentLoaders implements INode {
label: string
name: string
@@ -240,7 +240,13 @@ class S3_DocumentLoaders implements INode {
'.json': (path) => new JSONLoader(path),
'.txt': (path) => new TextLoader(path),
'.csv': (path) => new CSVLoader(path),
'.xls': (path) => new LoadOfSheet(path),
'.xlsx': (path) => new LoadOfSheet(path),
'.xlsm': (path) => new LoadOfSheet(path),
'.xlsb': (path) => new LoadOfSheet(path),
'.docx': (path) => new DocxLoader(path),
'.ppt': (path) => new PowerpointLoader(path),
'.pptx': (path) => new PowerpointLoader(path),
'.pdf': (path) =>
new PDFLoader(path, {
splitPages: pdfUsage !== 'perFile',
@@ -14,12 +14,21 @@ import {
handleDocumentLoaderMetadata,
handleDocumentLoaderOutput
} from '../../../src/utils'
import { S3Client, GetObjectCommand, S3ClientConfig } from '@aws-sdk/client-s3'
import { S3Client, GetObjectCommand, HeadObjectCommand, S3ClientConfig } from '@aws-sdk/client-s3'
import { getRegions, MODEL_TYPE } from '../../../src/modelLoader'
import { Readable } from 'node:stream'
import * as fsDefault from 'node:fs'
import * as path from 'node:path'
import * as os from 'node:os'
import { PDFLoader } from '@langchain/community/document_loaders/fs/pdf'
import { DocxLoader } from '@langchain/community/document_loaders/fs/docx'
import { CSVLoader } from '@langchain/community/document_loaders/fs/csv'
import { LoadOfSheet } from '../MicrosoftExcel/ExcelLoader'
import { PowerpointLoader } from '../MicrosoftPowerpoint/PowerpointLoader'
import { TextSplitter } from 'langchain/text_splitter'
import { IDocument } from '../../../src/Interface'
import { omit } from 'lodash'
import { handleEscapeCharacters } from '../../../src'
class S3_DocumentLoaders implements INode {
label: string
@@ -37,7 +46,7 @@ class S3_DocumentLoaders implements INode {
constructor() {
this.label = 'S3'
this.name = 'S3'
this.version = 4.0
this.version = 5.0
this.type = 'Document'
this.icon = 's3.svg'
this.category = 'Document Loaders'
@@ -70,6 +79,52 @@ class S3_DocumentLoaders implements INode {
loadMethod: 'listRegions',
default: 'us-east-1'
},
{
label: 'File Processing Method',
name: 'fileProcessingMethod',
type: 'options',
options: [
{
label: 'Built In Loaders',
name: 'builtIn',
description: 'Use the built in loaders to process the file.'
},
{
label: 'Unstructured',
name: 'unstructured',
description: 'Use the Unstructured API to process the file.'
}
],
default: 'builtIn'
},
{
label: 'Text Splitter',
name: 'textSplitter',
type: 'TextSplitter',
optional: true,
show: {
fileProcessingMethod: 'builtIn'
}
},
{
label: 'Additional Metadata',
name: 'metadata',
type: 'json',
description: 'Additional metadata to be added to the extracted documents',
optional: true,
additionalParams: true
},
{
label: 'Omit Metadata Keys',
name: 'omitMetadataKeys',
type: 'string',
rows: 4,
description:
'Each document loader comes with a default set of metadata keys that are extracted from the document. You can use this field to omit some of the default metadata keys. The value should be a list of keys, seperated by comma. Use * to omit all metadata keys execept the ones you specify in the Additional Metadata field',
placeholder: 'key1, key2, key3.nestedKey1',
optional: true,
additionalParams: true
},
{
label: 'Unstructured API URL',
name: 'unstructuredAPIUrl',
@@ -77,13 +132,21 @@ class S3_DocumentLoaders implements INode {
'Your Unstructured.io URL. Read <a target="_blank" href="https://unstructured-io.github.io/unstructured/introduction.html#getting-started">more</a> on how to get started',
type: 'string',
placeholder: process.env.UNSTRUCTURED_API_URL || 'http://localhost:8000/general/v0/general',
optional: !!process.env.UNSTRUCTURED_API_URL
optional: !!process.env.UNSTRUCTURED_API_URL,
additionalParams: true,
show: {
fileProcessingMethod: 'unstructured'
}
},
{
label: 'Unstructured API KEY',
name: 'unstructuredAPIKey',
type: 'password',
optional: true
optional: true,
additionalParams: true,
show: {
fileProcessingMethod: 'unstructured'
}
},
{
label: 'Strategy',
@@ -110,7 +173,10 @@ class S3_DocumentLoaders implements INode {
],
optional: true,
additionalParams: true,
default: 'auto'
default: 'auto',
show: {
fileProcessingMethod: 'unstructured'
}
},
{
label: 'Encoding',
@@ -119,7 +185,10 @@ class S3_DocumentLoaders implements INode {
type: 'string',
optional: true,
additionalParams: true,
default: 'utf-8'
default: 'utf-8',
show: {
fileProcessingMethod: 'unstructured'
}
},
{
label: 'Skip Infer Table Types',
@@ -214,7 +283,10 @@ class S3_DocumentLoaders implements INode {
],
optional: true,
additionalParams: true,
default: '["pdf", "jpg", "png"]'
default: '["pdf", "jpg", "png"]',
show: {
fileProcessingMethod: 'unstructured'
}
},
{
label: 'Hi-Res Model Name',
@@ -247,7 +319,10 @@ class S3_DocumentLoaders implements INode {
],
optional: true,
additionalParams: true,
default: 'detectron2_onnx'
default: 'detectron2_onnx',
show: {
fileProcessingMethod: 'unstructured'
}
},
{
label: 'Chunking Strategy',
@@ -267,7 +342,10 @@ class S3_DocumentLoaders implements INode {
],
optional: true,
additionalParams: true,
default: 'by_title'
default: 'by_title',
show: {
fileProcessingMethod: 'unstructured'
}
},
{
label: 'OCR Languages',
@@ -337,7 +415,10 @@ class S3_DocumentLoaders implements INode {
}
],
optional: true,
additionalParams: true
additionalParams: true,
show: {
fileProcessingMethod: 'unstructured'
}
},
{
label: 'Source ID Key',
@@ -348,7 +429,10 @@ class S3_DocumentLoaders implements INode {
default: 'source',
placeholder: 'source',
optional: true,
additionalParams: true
additionalParams: true,
show: {
fileProcessingMethod: 'unstructured'
}
},
{
label: 'Coordinates',
@@ -357,7 +441,10 @@ class S3_DocumentLoaders implements INode {
description: 'If true, return coordinates for each element. Default: false.',
optional: true,
additionalParams: true,
default: false
default: false,
show: {
fileProcessingMethod: 'unstructured'
}
},
{
label: 'XML Keep Tags',
@@ -366,7 +453,10 @@ class S3_DocumentLoaders implements INode {
'If True, will retain the XML tags in the output. Otherwise it will simply extract the text from within the tags. Only applies to partition_xml.',
type: 'boolean',
optional: true,
additionalParams: true
additionalParams: true,
show: {
fileProcessingMethod: 'unstructured'
}
},
{
label: 'Include Page Breaks',
@@ -374,15 +464,10 @@ class S3_DocumentLoaders implements INode {
description: 'When true, the output will include page break elements when the filetype supports it.',
type: 'boolean',
optional: true,
additionalParams: true
},
{
label: 'XML Keep Tags',
name: 'xmlKeepTags',
description: 'Whether to keep XML tags in the output.',
type: 'boolean',
optional: true,
additionalParams: true
additionalParams: true,
show: {
fileProcessingMethod: 'unstructured'
}
},
{
label: 'Multi-Page Sections',
@@ -390,7 +475,10 @@ class S3_DocumentLoaders implements INode {
description: 'Whether to treat multi-page documents as separate sections.',
type: 'boolean',
optional: true,
additionalParams: true
additionalParams: true,
show: {
fileProcessingMethod: 'unstructured'
}
},
{
label: 'Combine Under N Chars',
@@ -399,7 +487,10 @@ class S3_DocumentLoaders implements INode {
"If chunking strategy is set, combine elements until a section reaches a length of n chars. Default: value of max_characters. Can't exceed value of max_characters.",
type: 'number',
optional: true,
additionalParams: true
additionalParams: true,
show: {
fileProcessingMethod: 'unstructured'
}
},
{
label: 'New After N Chars',
@@ -408,7 +499,10 @@ class S3_DocumentLoaders implements INode {
"If chunking strategy is set, cut off new sections after reaching a length of n chars (soft max). value of max_characters. Can't exceed value of max_characters.",
type: 'number',
optional: true,
additionalParams: true
additionalParams: true,
show: {
fileProcessingMethod: 'unstructured'
}
},
{
label: 'Max Characters',
@@ -418,7 +512,10 @@ class S3_DocumentLoaders implements INode {
type: 'number',
optional: true,
additionalParams: true,
default: '500'
default: '500',
show: {
fileProcessingMethod: 'unstructured'
}
},
{
label: 'Additional Metadata',
@@ -426,7 +523,10 @@ class S3_DocumentLoaders implements INode {
type: 'json',
description: 'Additional metadata to be added to the extracted documents',
optional: true,
additionalParams: true
additionalParams: true,
show: {
fileProcessingMethod: 'unstructured'
}
},
{
label: 'Omit Metadata Keys',
@@ -437,7 +537,10 @@ class S3_DocumentLoaders implements INode {
'Each document loader comes with a default set of metadata keys that are extracted from the document. You can use this field to omit some of the default metadata keys. The value should be a list of keys, seperated by comma. Use * to omit all metadata keys execept the ones you specify in the Additional Metadata field',
placeholder: 'key1, key2, key3.nestedKey1',
optional: true,
additionalParams: true
additionalParams: true,
show: {
fileProcessingMethod: 'unstructured'
}
}
]
this.outputs = [
@@ -466,6 +569,171 @@ class S3_DocumentLoaders implements INode {
const bucketName = nodeData.inputs?.bucketName as string
const keyName = nodeData.inputs?.keyName as string
const region = nodeData.inputs?.region as string
const fileProcessingMethod = nodeData.inputs?.fileProcessingMethod as string
const textSplitter = nodeData.inputs?.textSplitter as TextSplitter
const metadata = nodeData.inputs?.metadata
const _omitMetadataKeys = nodeData.inputs?.omitMetadataKeys as string
const output = nodeData.outputs?.output as string
let omitMetadataKeys: string[] = []
if (_omitMetadataKeys) {
omitMetadataKeys = _omitMetadataKeys.split(',').map((key) => key.trim())
}
let credentials: S3ClientConfig['credentials'] | undefined
if (nodeData.credential) {
const credentialData = await getCredentialData(nodeData.credential, options)
const accessKeyId = getCredentialParam('awsKey', credentialData, nodeData)
const secretAccessKey = getCredentialParam('awsSecret', credentialData, nodeData)
if (accessKeyId && secretAccessKey) {
credentials = {
accessKeyId,
secretAccessKey
}
}
}
const s3Config: S3ClientConfig = {
region,
credentials
}
if (fileProcessingMethod === 'builtIn') {
return await this.processWithBuiltInLoaders(
bucketName,
keyName,
s3Config,
textSplitter,
metadata,
omitMetadataKeys,
_omitMetadataKeys,
output
)
} else {
return await this.processWithUnstructured(nodeData, options, bucketName, keyName, s3Config)
}
}
/**
 * Download a single S3 object and convert it to documents using the built-in loaders.
 *
 * Steps: HEAD the object to read its content type / last-modified / etag, GET the body into
 * memory, hand it to `processFile`, optionally run the text splitter, then merge the additional
 * metadata and drop omitted keys.
 *
 * @param bucketName - Bucket that holds the object
 * @param keyName - Object key
 * @param s3Config - Client configuration (region and optional credentials)
 * @param textSplitter - Optional splitter applied to the loaded documents
 * @param metadata - Additional metadata, either an object or a JSON string
 * @param omitMetadataKeys - Parsed list of metadata keys to remove
 * @param _omitMetadataKeys - Raw omit string; the value `*` drops all loader metadata
 * @param output - `document` returns the documents, anything else returns the concatenated text
 * @returns The documents, or their page contents joined with newlines and escaped
 * @throws Error prefixed with "Failed to load S3 document:" when download or post-processing fails
 */
private async processWithBuiltInLoaders(
    bucketName: string,
    keyName: string,
    s3Config: S3ClientConfig,
    textSplitter: TextSplitter,
    metadata: any,
    omitMetadataKeys: string[],
    _omitMetadataKeys: string,
    output: string
): Promise<any> {
    let docs: IDocument[] = []

    try {
        const s3Client = new S3Client(s3Config)

        // Get file metadata to determine content type
        const headResponse = await s3Client.send(
            new HeadObjectCommand({
                Bucket: bucketName,
                Key: keyName
            })
        )

        // Reduce the declared type to its bare lower-case "type/subtype" so that values such as
        // "Text/Plain; charset=utf-8" match the exact-string checks used by the file processors.
        const [declaredEssence = ''] = (headResponse.ContentType ?? '').split(';')
        const declaredType = declaredEssence.trim().toLowerCase()

        // Objects uploaded without an explicit type come back with a generic octet-stream type,
        // which says nothing about the format. Fall back to the key's extension in that case.
        const isGenericType =
            declaredType === '' || declaredType === 'application/octet-stream' || declaredType === 'binary/octet-stream'
        const contentType = isGenericType ? this.getMimeTypeFromExtension(keyName) : declaredType

        // Download the file
        const response = await s3Client.send(
            new GetObjectCommand({
                Bucket: bucketName,
                Key: keyName
            })
        )

        const objectData = await new Promise<Buffer>((resolve, reject) => {
            const body = response.Body
            if (!(body instanceof Readable)) {
                reject(new Error('Response body is not a readable stream.'))
                return
            }
            const chunks: Buffer[] = []
            body.on('data', (chunk: Buffer) => chunks.push(chunk))
            body.on('end', () => resolve(Buffer.concat(chunks)))
            body.on('error', reject)
        })

        // Process the file based on content type
        const fileInfo = {
            id: keyName,
            name: path.basename(keyName),
            mimeType: contentType,
            size: objectData.length,
            webViewLink: `s3://${bucketName}/${keyName}`,
            bucketName: bucketName,
            key: keyName,
            lastModified: headResponse.LastModified,
            etag: headResponse.ETag
        }

        docs = await this.processFile(fileInfo, objectData)

        // Apply text splitter if provided
        if (textSplitter && docs.length > 0) {
            docs = await textSplitter.splitDocuments(docs)
        }

        // Apply metadata transformations. With no additional metadata this merges an empty
        // object, which gives the same result as skipping the merge.
        let additionalMetadata: Record<string, any> = {}
        if (metadata) {
            additionalMetadata = typeof metadata === 'object' ? metadata : JSON.parse(metadata)
        }

        docs = docs.map((doc) => ({
            ...doc,
            metadata:
                _omitMetadataKeys === '*'
                    ? { ...additionalMetadata }
                    : omit(
                          {
                              ...doc.metadata,
                              ...additionalMetadata
                          },
                          omitMetadataKeys
                      )
        }))
    } catch (error) {
        // The caught value is not guaranteed to be an Error instance
        const reason = error instanceof Error ? error.message : String(error)
        throw new Error(`Failed to load S3 document: ${reason}`)
    }

    if (output === 'document') {
        return docs
    }

    const finaltext = docs.map((doc) => `${doc.pageContent}\n`).join('')
    return handleEscapeCharacters(finaltext, false)
}
private async processWithUnstructured(
nodeData: INodeData,
options: ICommonObject,
bucketName: string,
keyName: string,
s3Config: S3ClientConfig
): Promise<any> {
const unstructuredAPIUrl = nodeData.inputs?.unstructuredAPIUrl as string
const unstructuredAPIKey = nodeData.inputs?.unstructuredAPIKey as string
const strategy = nodeData.inputs?.strategy as UnstructuredLoaderStrategy
@@ -488,26 +756,6 @@ class S3_DocumentLoaders implements INode {
const _omitMetadataKeys = nodeData.inputs?.omitMetadataKeys as string
const output = nodeData.outputs?.output as string
let credentials: S3ClientConfig['credentials'] | undefined
if (nodeData.credential) {
const credentialData = await getCredentialData(nodeData.credential, options)
const accessKeyId = getCredentialParam('awsKey', credentialData, nodeData)
const secretAccessKey = getCredentialParam('awsSecret', credentialData, nodeData)
if (accessKeyId && secretAccessKey) {
credentials = {
accessKeyId,
secretAccessKey
}
}
}
const s3Config: S3ClientConfig = {
region,
credentials
}
const loader = new S3Loader({
bucket: bucketName,
key: keyName,
@@ -586,5 +834,202 @@ class S3_DocumentLoaders implements INode {
return loader.load()
}
/**
 * Guess a MIME type from a file name's extension.
 *
 * The extension is compared case-insensitively.
 *
 * @param fileName - File name or object key
 * @returns The mapped MIME type, or `application/octet-stream` when the extension is unknown
 */
private getMimeTypeFromExtension(fileName: string): string {
    const fallbackType = 'application/octet-stream'

    const typeByExtension = new Map<string, string>([
        ['.pdf', 'application/pdf'],
        ['.docx', 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'],
        ['.doc', 'application/msword'],
        ['.xlsx', 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'],
        ['.xls', 'application/vnd.ms-excel'],
        ['.pptx', 'application/vnd.openxmlformats-officedocument.presentationml.presentation'],
        ['.ppt', 'application/vnd.ms-powerpoint'],
        ['.txt', 'text/plain'],
        ['.csv', 'text/csv'],
        ['.html', 'text/html'],
        ['.htm', 'text/html'],
        ['.json', 'application/json'],
        ['.xml', 'application/xml'],
        ['.md', 'text/markdown']
    ])

    const lowerCaseExtension = path.extname(fileName).toLowerCase()
    return typeByExtension.get(lowerCaseExtension) || fallbackType
}
/**
 * Convert a downloaded file into documents.
 *
 * Text-based MIME types are decoded as UTF-8 into one document; supported binary types are
 * delegated to `processBinaryFile`. Unsupported types and any processing error are logged
 * with `console.warn` and produce an empty array instead of throwing.
 *
 * @param fileInfo - Descriptor of the file (name, mimeType, key, bucketName, size, ...)
 * @param buffer - Raw file content
 * @returns The extracted documents, or an empty array
 */
private async processFile(fileInfo: any, buffer: Buffer): Promise<IDocument[]> {
    try {
        const { mimeType } = fileInfo

        if (!this.isTextBasedFile(mimeType)) {
            if (this.isSupportedBinaryFile(mimeType)) {
                // Binary formats need a dedicated loader; `await` keeps failures inside this try
                return await this.processBinaryFile(fileInfo, buffer)
            }

            console.warn(`Unsupported file type ${fileInfo.mimeType} for file ${fileInfo.name}`)
            return []
        }

        // Text formats: the decoded buffer is the page content
        const pageContent = buffer.toString('utf-8')
        const textMetadata = {
            source: fileInfo.webViewLink,
            fileId: fileInfo.key,
            fileName: fileInfo.name,
            mimeType: fileInfo.mimeType,
            size: fileInfo.size,
            lastModified: fileInfo.lastModified,
            etag: fileInfo.etag,
            bucketName: fileInfo.bucketName
        }

        return [{ pageContent, metadata: textMetadata }]
    } catch (error) {
        console.warn(`Failed to process file ${fileInfo.name}: ${error.message}`)
        return []
    }
}
/**
 * Tell whether a MIME type denotes a text format that can be decoded directly from the buffer.
 *
 * Media types are case-insensitive and may carry parameters (for example
 * `text/plain; charset=utf-8`), so only the lower-cased `type/subtype` part is compared.
 *
 * @param mimeType - MIME type, optionally with parameters
 * @returns `true` when the bare type is one of the known text-based types
 */
private isTextBasedFile(mimeType: string): boolean {
    const textBasedMimeTypes = [
        'text/plain',
        'text/html',
        'text/css',
        'text/javascript',
        'text/csv',
        'text/xml',
        'application/json',
        'application/xml',
        'text/markdown',
        'text/x-markdown'
    ]

    // Drop any ";param=value" suffix and normalize the case; a missing value matches nothing
    const [essence = ''] = (mimeType ?? '').split(';')
    return textBasedMimeTypes.includes(essence.trim().toLowerCase())
}
/**
 * Tell whether a MIME type denotes a binary format that one of the file loaders can read
 * (PDF, Word, Excel, PowerPoint).
 *
 * The comparison ignores case and any media-type parameters, matching the lower-casing that
 * `processBinaryFile` applies before choosing a loader.
 *
 * @param mimeType - MIME type, optionally with parameters
 * @returns `true` when the bare type is one of the supported binary types
 */
private isSupportedBinaryFile(mimeType: string): boolean {
    const supportedBinaryTypes = [
        'application/pdf',
        'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
        'application/msword',
        'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
        'application/vnd.ms-excel',
        'application/vnd.openxmlformats-officedocument.presentationml.presentation',
        'application/vnd.ms-powerpoint'
    ]

    // Drop any ";param=value" suffix and normalize the case; a missing value matches nothing
    const [essence = ''] = (mimeType ?? '').split(';')
    return supportedBinaryTypes.includes(essence.trim().toLowerCase())
}
/**
 * Load a binary file (PDF, Word, Excel, PowerPoint, CSV) through the matching file loader.
 *
 * The buffer is written to a temporary file because the loaders are given a file path; the
 * temporary file is removed in `finally` whether loading succeeds or not. Each returned
 * document keeps the loader's own metadata and additionally receives the S3 metadata,
 * `totalPages` (number of documents produced) and a zero-based `pageIndex`.
 *
 * @param fileInfo - Descriptor of the file (name, mimeType, key, bucketName, size, ...)
 * @param buffer - Raw file content
 * @returns The loaded documents, or an empty array when the loader produced none
 * @throws Error prefixed with "Failed to process binary file:" for unsupported types or loader failures
 */
private async processBinaryFile(fileInfo: any, buffer: Buffer): Promise<IDocument[]> {
    let tempFilePath: string | null = null

    try {
        // Media types are case-insensitive and may carry parameters ("; charset=..."); reduce to
        // the bare lower-case type so both the loader switch and the extension lookup match.
        const [essence = ''] = String(fileInfo.mimeType ?? '').split(';')
        const mimeType = essence.trim().toLowerCase()

        // Create temporary file
        tempFilePath = await this.createTempFile(buffer, fileInfo.name, mimeType)

        let docs: IDocument[] = []

        switch (mimeType) {
            case 'application/pdf': {
                const pdfLoader = new PDFLoader(tempFilePath, {
                    // @ts-ignore
                    pdfjs: () => import('pdf-parse/lib/pdf.js/v1.10.100/build/pdf.js')
                })
                docs = await pdfLoader.load()
                break
            }
            case 'application/vnd.openxmlformats-officedocument.wordprocessingml.document':
            case 'application/msword': {
                const docxLoader = new DocxLoader(tempFilePath)
                docs = await docxLoader.load()
                break
            }
            case 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet':
            case 'application/vnd.ms-excel': {
                const excelLoader = new LoadOfSheet(tempFilePath)
                docs = await excelLoader.load()
                break
            }
            case 'application/vnd.openxmlformats-officedocument.presentationml.presentation':
            case 'application/vnd.ms-powerpoint': {
                const pptxLoader = new PowerpointLoader(tempFilePath)
                docs = await pptxLoader.load()
                break
            }
            case 'text/csv': {
                const csvLoader = new CSVLoader(tempFilePath)
                docs = await csvLoader.load()
                break
            }
            default:
                throw new Error(`Unsupported binary file type: ${mimeType}`)
        }

        if (docs.length === 0) {
            return []
        }

        // Add S3 metadata to each document
        const s3Metadata = {
            source: fileInfo.webViewLink,
            fileId: fileInfo.key,
            fileName: fileInfo.name,
            mimeType: fileInfo.mimeType,
            size: fileInfo.size,
            lastModified: fileInfo.lastModified,
            etag: fileInfo.etag,
            bucketName: fileInfo.bucketName,
            totalPages: docs.length // Total number of pages/sheets in the file
        }

        return docs.map((doc, index) => ({
            ...doc,
            metadata: {
                ...doc.metadata, // Keep original loader metadata (page numbers, etc.)
                ...s3Metadata, // S3 values win on key clashes such as `source`
                pageIndex: index // Add page/sheet index
            }
        }))
    } catch (error) {
        // The caught value is not guaranteed to be an Error instance
        const reason = error instanceof Error ? error.message : String(error)
        throw new Error(`Failed to process binary file: ${reason}`)
    } finally {
        // Clean up temporary file
        if (tempFilePath && fsDefault.existsSync(tempFilePath)) {
            try {
                fsDefault.unlinkSync(tempFilePath)
            } catch (e) {
                console.warn(`Failed to delete temporary file: ${tempFilePath}`)
            }
        }
    }
}
/**
 * Write a buffer to a uniquely named file in the OS temp directory and return its path.
 *
 * The extension is taken from `fileName`; when the name has none it is derived from
 * `mimeType`, falling back to `.tmp`. The caller is responsible for deleting the file.
 *
 * @param buffer - Content to write
 * @param fileName - Original file name, used only for its extension
 * @param mimeType - MIME type used to pick an extension when `fileName` has none
 * @returns Absolute path of the created temporary file
 * @throws If the file cannot be created, including when the generated path already exists
 */
private async createTempFile(buffer: Buffer, fileName: string, mimeType: string): Promise<string> {
    // Get appropriate file extension
    let extension = path.extname(fileName)
    if (!extension) {
        const extensionMap: { [key: string]: string } = {
            'application/pdf': '.pdf',
            'application/vnd.openxmlformats-officedocument.wordprocessingml.document': '.docx',
            'application/msword': '.doc',
            'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': '.xlsx',
            'application/vnd.ms-excel': '.xls',
            'application/vnd.openxmlformats-officedocument.presentationml.presentation': '.pptx',
            'application/vnd.ms-powerpoint': '.ppt',
            'text/csv': '.csv'
        }
        extension = extensionMap[mimeType] || '.tmp'
    }

    // slice(2) drops only the leading "0." so the whole random fraction is used in the name
    const randomSuffix = Math.random().toString(36).slice(2)
    const tempFileName = `s3_${Date.now()}_${randomSuffix}${extension}`
    const tempFilePath = path.join(os.tmpdir(), tempFileName)

    // 'wx' refuses to write through a path that already exists (file or symlink) in the shared
    // temp directory, and 0o600 keeps the downloaded content readable by the current user only.
    // The promise-based write avoids blocking the event loop while a large object is flushed.
    await fsDefault.promises.writeFile(tempFilePath, buffer, { flag: 'wx', mode: 0o600 })

    return tempFilePath
}
}
// CommonJS export: expose the S3 document loader class under the `nodeClass` key
module.exports = { nodeClass: S3_DocumentLoaders }