Feature/Add teams, gmail, outlook tools (#4577)

* add teams, gmail, outlook tools

* update docs link

* update credentials for oauth2

* add jira tool

* add google drive, google calendar, google sheets tools, powerpoint, excel, word doc loader

* update jira logo

* Refactor Gmail and Outlook tools to remove maxOutputLength parameter and enhance request handling. Update response formatting to include parameters in the output. Adjust Google Drive tools to simplify success messages by removing unnecessary parameter details.
This commit is contained in:
Henry Heng
2025-06-06 19:52:04 +01:00
committed by GitHub
parent 6dcb65cedb
commit 30c4180d97
62 changed files with 16832 additions and 144 deletions
@@ -14,12 +14,21 @@ import {
handleDocumentLoaderMetadata,
handleDocumentLoaderOutput
} from '../../../src/utils'
import { S3Client, GetObjectCommand, S3ClientConfig } from '@aws-sdk/client-s3'
import { S3Client, GetObjectCommand, HeadObjectCommand, S3ClientConfig } from '@aws-sdk/client-s3'
import { getRegions, MODEL_TYPE } from '../../../src/modelLoader'
import { Readable } from 'node:stream'
import * as fsDefault from 'node:fs'
import * as path from 'node:path'
import * as os from 'node:os'
import { PDFLoader } from '@langchain/community/document_loaders/fs/pdf'
import { DocxLoader } from '@langchain/community/document_loaders/fs/docx'
import { CSVLoader } from '@langchain/community/document_loaders/fs/csv'
import { LoadOfSheet } from '../MicrosoftExcel/ExcelLoader'
import { PowerpointLoader } from '../MicrosoftPowerpoint/PowerpointLoader'
import { TextSplitter } from 'langchain/text_splitter'
import { IDocument } from '../../../src/Interface'
import { omit } from 'lodash'
import { handleEscapeCharacters } from '../../../src'
class S3_DocumentLoaders implements INode {
label: string
@@ -37,7 +46,7 @@ class S3_DocumentLoaders implements INode {
constructor() {
this.label = 'S3'
this.name = 'S3'
this.version = 4.0
this.version = 5.0
this.type = 'Document'
this.icon = 's3.svg'
this.category = 'Document Loaders'
@@ -70,6 +79,52 @@ class S3_DocumentLoaders implements INode {
loadMethod: 'listRegions',
default: 'us-east-1'
},
{
label: 'File Processing Method',
name: 'fileProcessingMethod',
type: 'options',
options: [
{
label: 'Built In Loaders',
name: 'builtIn',
description: 'Use the built in loaders to process the file.'
},
{
label: 'Unstructured',
name: 'unstructured',
description: 'Use the Unstructured API to process the file.'
}
],
default: 'builtIn'
},
{
label: 'Text Splitter',
name: 'textSplitter',
type: 'TextSplitter',
optional: true,
show: {
fileProcessingMethod: 'builtIn'
}
},
{
label: 'Additional Metadata',
name: 'metadata',
type: 'json',
description: 'Additional metadata to be added to the extracted documents',
optional: true,
additionalParams: true
},
{
label: 'Omit Metadata Keys',
name: 'omitMetadataKeys',
type: 'string',
rows: 4,
description:
'Each document loader comes with a default set of metadata keys that are extracted from the document. You can use this field to omit some of the default metadata keys. The value should be a list of keys, seperated by comma. Use * to omit all metadata keys execept the ones you specify in the Additional Metadata field',
placeholder: 'key1, key2, key3.nestedKey1',
optional: true,
additionalParams: true
},
{
label: 'Unstructured API URL',
name: 'unstructuredAPIUrl',
@@ -77,13 +132,21 @@ class S3_DocumentLoaders implements INode {
'Your Unstructured.io URL. Read <a target="_blank" href="https://unstructured-io.github.io/unstructured/introduction.html#getting-started">more</a> on how to get started',
type: 'string',
placeholder: process.env.UNSTRUCTURED_API_URL || 'http://localhost:8000/general/v0/general',
optional: !!process.env.UNSTRUCTURED_API_URL
optional: !!process.env.UNSTRUCTURED_API_URL,
additionalParams: true,
show: {
fileProcessingMethod: 'unstructured'
}
},
{
label: 'Unstructured API KEY',
name: 'unstructuredAPIKey',
type: 'password',
optional: true
optional: true,
additionalParams: true,
show: {
fileProcessingMethod: 'unstructured'
}
},
{
label: 'Strategy',
@@ -110,7 +173,10 @@ class S3_DocumentLoaders implements INode {
],
optional: true,
additionalParams: true,
default: 'auto'
default: 'auto',
show: {
fileProcessingMethod: 'unstructured'
}
},
{
label: 'Encoding',
@@ -119,7 +185,10 @@ class S3_DocumentLoaders implements INode {
type: 'string',
optional: true,
additionalParams: true,
default: 'utf-8'
default: 'utf-8',
show: {
fileProcessingMethod: 'unstructured'
}
},
{
label: 'Skip Infer Table Types',
@@ -214,7 +283,10 @@ class S3_DocumentLoaders implements INode {
],
optional: true,
additionalParams: true,
default: '["pdf", "jpg", "png"]'
default: '["pdf", "jpg", "png"]',
show: {
fileProcessingMethod: 'unstructured'
}
},
{
label: 'Hi-Res Model Name',
@@ -247,7 +319,10 @@ class S3_DocumentLoaders implements INode {
],
optional: true,
additionalParams: true,
default: 'detectron2_onnx'
default: 'detectron2_onnx',
show: {
fileProcessingMethod: 'unstructured'
}
},
{
label: 'Chunking Strategy',
@@ -267,7 +342,10 @@ class S3_DocumentLoaders implements INode {
],
optional: true,
additionalParams: true,
default: 'by_title'
default: 'by_title',
show: {
fileProcessingMethod: 'unstructured'
}
},
{
label: 'OCR Languages',
@@ -337,7 +415,10 @@ class S3_DocumentLoaders implements INode {
}
],
optional: true,
additionalParams: true
additionalParams: true,
show: {
fileProcessingMethod: 'unstructured'
}
},
{
label: 'Source ID Key',
@@ -348,7 +429,10 @@ class S3_DocumentLoaders implements INode {
default: 'source',
placeholder: 'source',
optional: true,
additionalParams: true
additionalParams: true,
show: {
fileProcessingMethod: 'unstructured'
}
},
{
label: 'Coordinates',
@@ -357,7 +441,10 @@ class S3_DocumentLoaders implements INode {
description: 'If true, return coordinates for each element. Default: false.',
optional: true,
additionalParams: true,
default: false
default: false,
show: {
fileProcessingMethod: 'unstructured'
}
},
{
label: 'XML Keep Tags',
@@ -366,7 +453,10 @@ class S3_DocumentLoaders implements INode {
'If True, will retain the XML tags in the output. Otherwise it will simply extract the text from within the tags. Only applies to partition_xml.',
type: 'boolean',
optional: true,
additionalParams: true
additionalParams: true,
show: {
fileProcessingMethod: 'unstructured'
}
},
{
label: 'Include Page Breaks',
@@ -374,15 +464,10 @@ class S3_DocumentLoaders implements INode {
description: 'When true, the output will include page break elements when the filetype supports it.',
type: 'boolean',
optional: true,
additionalParams: true
},
{
label: 'XML Keep Tags',
name: 'xmlKeepTags',
description: 'Whether to keep XML tags in the output.',
type: 'boolean',
optional: true,
additionalParams: true
additionalParams: true,
show: {
fileProcessingMethod: 'unstructured'
}
},
{
label: 'Multi-Page Sections',
@@ -390,7 +475,10 @@ class S3_DocumentLoaders implements INode {
description: 'Whether to treat multi-page documents as separate sections.',
type: 'boolean',
optional: true,
additionalParams: true
additionalParams: true,
show: {
fileProcessingMethod: 'unstructured'
}
},
{
label: 'Combine Under N Chars',
@@ -399,7 +487,10 @@ class S3_DocumentLoaders implements INode {
"If chunking strategy is set, combine elements until a section reaches a length of n chars. Default: value of max_characters. Can't exceed value of max_characters.",
type: 'number',
optional: true,
additionalParams: true
additionalParams: true,
show: {
fileProcessingMethod: 'unstructured'
}
},
{
label: 'New After N Chars',
@@ -408,7 +499,10 @@ class S3_DocumentLoaders implements INode {
"If chunking strategy is set, cut off new sections after reaching a length of n chars (soft max). value of max_characters. Can't exceed value of max_characters.",
type: 'number',
optional: true,
additionalParams: true
additionalParams: true,
show: {
fileProcessingMethod: 'unstructured'
}
},
{
label: 'Max Characters',
@@ -418,7 +512,10 @@ class S3_DocumentLoaders implements INode {
type: 'number',
optional: true,
additionalParams: true,
default: '500'
default: '500',
show: {
fileProcessingMethod: 'unstructured'
}
},
{
label: 'Additional Metadata',
@@ -426,7 +523,10 @@ class S3_DocumentLoaders implements INode {
type: 'json',
description: 'Additional metadata to be added to the extracted documents',
optional: true,
additionalParams: true
additionalParams: true,
show: {
fileProcessingMethod: 'unstructured'
}
},
{
label: 'Omit Metadata Keys',
@@ -437,7 +537,10 @@ class S3_DocumentLoaders implements INode {
'Each document loader comes with a default set of metadata keys that are extracted from the document. You can use this field to omit some of the default metadata keys. The value should be a list of keys, seperated by comma. Use * to omit all metadata keys execept the ones you specify in the Additional Metadata field',
placeholder: 'key1, key2, key3.nestedKey1',
optional: true,
additionalParams: true
additionalParams: true,
show: {
fileProcessingMethod: 'unstructured'
}
}
]
this.outputs = [
@@ -466,6 +569,171 @@ class S3_DocumentLoaders implements INode {
const bucketName = nodeData.inputs?.bucketName as string
const keyName = nodeData.inputs?.keyName as string
const region = nodeData.inputs?.region as string
const fileProcessingMethod = nodeData.inputs?.fileProcessingMethod as string
const textSplitter = nodeData.inputs?.textSplitter as TextSplitter
const metadata = nodeData.inputs?.metadata
const _omitMetadataKeys = nodeData.inputs?.omitMetadataKeys as string
const output = nodeData.outputs?.output as string
let omitMetadataKeys: string[] = []
if (_omitMetadataKeys) {
omitMetadataKeys = _omitMetadataKeys.split(',').map((key) => key.trim())
}
let credentials: S3ClientConfig['credentials'] | undefined
if (nodeData.credential) {
const credentialData = await getCredentialData(nodeData.credential, options)
const accessKeyId = getCredentialParam('awsKey', credentialData, nodeData)
const secretAccessKey = getCredentialParam('awsSecret', credentialData, nodeData)
if (accessKeyId && secretAccessKey) {
credentials = {
accessKeyId,
secretAccessKey
}
}
}
const s3Config: S3ClientConfig = {
region,
credentials
}
if (fileProcessingMethod === 'builtIn') {
return await this.processWithBuiltInLoaders(
bucketName,
keyName,
s3Config,
textSplitter,
metadata,
omitMetadataKeys,
_omitMetadataKeys,
output
)
} else {
return await this.processWithUnstructured(nodeData, options, bucketName, keyName, s3Config)
}
}
private async processWithBuiltInLoaders(
bucketName: string,
keyName: string,
s3Config: S3ClientConfig,
textSplitter: TextSplitter,
metadata: any,
omitMetadataKeys: string[],
_omitMetadataKeys: string,
output: string
): Promise<any> {
let docs: IDocument[] = []
try {
const s3Client = new S3Client(s3Config)
// Get file metadata to determine content type
const headCommand = new HeadObjectCommand({
Bucket: bucketName,
Key: keyName
})
const headResponse = await s3Client.send(headCommand)
const contentType = headResponse.ContentType || this.getMimeTypeFromExtension(keyName)
// Download the file
const getObjectCommand = new GetObjectCommand({
Bucket: bucketName,
Key: keyName
})
const response = await s3Client.send(getObjectCommand)
const objectData = await new Promise<Buffer>((resolve, reject) => {
const chunks: Buffer[] = []
if (response.Body instanceof Readable) {
response.Body.on('data', (chunk: Buffer) => chunks.push(chunk))
response.Body.on('end', () => resolve(Buffer.concat(chunks)))
response.Body.on('error', reject)
} else {
reject(new Error('Response body is not a readable stream.'))
}
})
// Process the file based on content type
const fileInfo = {
id: keyName,
name: path.basename(keyName),
mimeType: contentType,
size: objectData.length,
webViewLink: `s3://${bucketName}/${keyName}`,
bucketName: bucketName,
key: keyName,
lastModified: headResponse.LastModified,
etag: headResponse.ETag
}
docs = await this.processFile(fileInfo, objectData)
// Apply text splitter if provided
if (textSplitter && docs.length > 0) {
docs = await textSplitter.splitDocuments(docs)
}
// Apply metadata transformations
if (metadata) {
const parsedMetadata = typeof metadata === 'object' ? metadata : JSON.parse(metadata)
docs = docs.map((doc) => ({
...doc,
metadata:
_omitMetadataKeys === '*'
? {
...parsedMetadata
}
: omit(
{
...doc.metadata,
...parsedMetadata
},
omitMetadataKeys
)
}))
} else {
docs = docs.map((doc) => ({
...doc,
metadata:
_omitMetadataKeys === '*'
? {}
: omit(
{
...doc.metadata
},
omitMetadataKeys
)
}))
}
} catch (error) {
throw new Error(`Failed to load S3 document: ${error.message}`)
}
if (output === 'document') {
return docs
} else {
let finaltext = ''
for (const doc of docs) {
finaltext += `${doc.pageContent}\n`
}
return handleEscapeCharacters(finaltext, false)
}
}
private async processWithUnstructured(
nodeData: INodeData,
options: ICommonObject,
bucketName: string,
keyName: string,
s3Config: S3ClientConfig
): Promise<any> {
const unstructuredAPIUrl = nodeData.inputs?.unstructuredAPIUrl as string
const unstructuredAPIKey = nodeData.inputs?.unstructuredAPIKey as string
const strategy = nodeData.inputs?.strategy as UnstructuredLoaderStrategy
@@ -488,26 +756,6 @@ class S3_DocumentLoaders implements INode {
const _omitMetadataKeys = nodeData.inputs?.omitMetadataKeys as string
const output = nodeData.outputs?.output as string
let credentials: S3ClientConfig['credentials'] | undefined
if (nodeData.credential) {
const credentialData = await getCredentialData(nodeData.credential, options)
const accessKeyId = getCredentialParam('awsKey', credentialData, nodeData)
const secretAccessKey = getCredentialParam('awsSecret', credentialData, nodeData)
if (accessKeyId && secretAccessKey) {
credentials = {
accessKeyId,
secretAccessKey
}
}
}
const s3Config: S3ClientConfig = {
region,
credentials
}
const loader = new S3Loader({
bucket: bucketName,
key: keyName,
@@ -586,5 +834,202 @@ class S3_DocumentLoaders implements INode {
return loader.load()
}
private getMimeTypeFromExtension(fileName: string): string {
const extension = path.extname(fileName).toLowerCase()
const mimeTypeMap: { [key: string]: string } = {
'.pdf': 'application/pdf',
'.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
'.doc': 'application/msword',
'.xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
'.xls': 'application/vnd.ms-excel',
'.pptx': 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
'.ppt': 'application/vnd.ms-powerpoint',
'.txt': 'text/plain',
'.csv': 'text/csv',
'.html': 'text/html',
'.htm': 'text/html',
'.json': 'application/json',
'.xml': 'application/xml',
'.md': 'text/markdown'
}
return mimeTypeMap[extension] || 'application/octet-stream'
}
private async processFile(fileInfo: any, buffer: Buffer): Promise<IDocument[]> {
try {
// Handle different file types
if (this.isTextBasedFile(fileInfo.mimeType)) {
// Process text files directly from buffer
const content = buffer.toString('utf-8')
// Create document with metadata
return [
{
pageContent: content,
metadata: {
source: fileInfo.webViewLink,
fileId: fileInfo.key,
fileName: fileInfo.name,
mimeType: fileInfo.mimeType,
size: fileInfo.size,
lastModified: fileInfo.lastModified,
etag: fileInfo.etag,
bucketName: fileInfo.bucketName
}
}
]
} else if (this.isSupportedBinaryFile(fileInfo.mimeType)) {
// Process binary files using loaders
return await this.processBinaryFile(fileInfo, buffer)
} else {
console.warn(`Unsupported file type ${fileInfo.mimeType} for file ${fileInfo.name}`)
return []
}
} catch (error) {
console.warn(`Failed to process file ${fileInfo.name}: ${error.message}`)
return []
}
}
private isTextBasedFile(mimeType: string): boolean {
const textBasedMimeTypes = [
'text/plain',
'text/html',
'text/css',
'text/javascript',
'text/csv',
'text/xml',
'application/json',
'application/xml',
'text/markdown',
'text/x-markdown'
]
return textBasedMimeTypes.includes(mimeType)
}
private isSupportedBinaryFile(mimeType: string): boolean {
const supportedBinaryTypes = [
'application/pdf',
'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
'application/msword',
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
'application/vnd.ms-excel',
'application/vnd.openxmlformats-officedocument.presentationml.presentation',
'application/vnd.ms-powerpoint'
]
return supportedBinaryTypes.includes(mimeType)
}
private async processBinaryFile(fileInfo: any, buffer: Buffer): Promise<IDocument[]> {
let tempFilePath: string | null = null
try {
// Create temporary file
tempFilePath = await this.createTempFile(buffer, fileInfo.name, fileInfo.mimeType)
let docs: IDocument[] = []
const mimeType = fileInfo.mimeType.toLowerCase()
switch (mimeType) {
case 'application/pdf': {
const pdfLoader = new PDFLoader(tempFilePath, {
// @ts-ignore
pdfjs: () => import('pdf-parse/lib/pdf.js/v1.10.100/build/pdf.js')
})
docs = await pdfLoader.load()
break
}
case 'application/vnd.openxmlformats-officedocument.wordprocessingml.document':
case 'application/msword': {
const docxLoader = new DocxLoader(tempFilePath)
docs = await docxLoader.load()
break
}
case 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet':
case 'application/vnd.ms-excel': {
const excelLoader = new LoadOfSheet(tempFilePath)
docs = await excelLoader.load()
break
}
case 'application/vnd.openxmlformats-officedocument.presentationml.presentation':
case 'application/vnd.ms-powerpoint': {
const pptxLoader = new PowerpointLoader(tempFilePath)
docs = await pptxLoader.load()
break
}
case 'text/csv': {
const csvLoader = new CSVLoader(tempFilePath)
docs = await csvLoader.load()
break
}
default:
throw new Error(`Unsupported binary file type: ${mimeType}`)
}
// Add S3 metadata to each document
if (docs.length > 0) {
const s3Metadata = {
source: fileInfo.webViewLink,
fileId: fileInfo.key,
fileName: fileInfo.name,
mimeType: fileInfo.mimeType,
size: fileInfo.size,
lastModified: fileInfo.lastModified,
etag: fileInfo.etag,
bucketName: fileInfo.bucketName,
totalPages: docs.length // Total number of pages/sheets in the file
}
return docs.map((doc, index) => ({
...doc,
metadata: {
...doc.metadata, // Keep original loader metadata (page numbers, etc.)
...s3Metadata, // Add S3 metadata
pageIndex: index // Add page/sheet index
}
}))
}
return []
} catch (error) {
throw new Error(`Failed to process binary file: ${error.message}`)
} finally {
// Clean up temporary file
if (tempFilePath && fsDefault.existsSync(tempFilePath)) {
try {
fsDefault.unlinkSync(tempFilePath)
} catch (e) {
console.warn(`Failed to delete temporary file: ${tempFilePath}`)
}
}
}
}
private async createTempFile(buffer: Buffer, fileName: string, mimeType: string): Promise<string> {
// Get appropriate file extension
let extension = path.extname(fileName)
if (!extension) {
const extensionMap: { [key: string]: string } = {
'application/pdf': '.pdf',
'application/vnd.openxmlformats-officedocument.wordprocessingml.document': '.docx',
'application/msword': '.doc',
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': '.xlsx',
'application/vnd.ms-excel': '.xls',
'application/vnd.openxmlformats-officedocument.presentationml.presentation': '.pptx',
'application/vnd.ms-powerpoint': '.ppt',
'text/csv': '.csv'
}
extension = extensionMap[mimeType] || '.tmp'
}
// Create temporary file
const tempDir = os.tmpdir()
const tempFileName = `s3_${Date.now()}_${Math.random().toString(36).substring(7)}${extension}`
const tempFilePath = path.join(tempDir, tempFileName)
fsDefault.writeFileSync(tempFilePath, buffer)
return tempFilePath
}
}
module.exports = { nodeClass: S3_DocumentLoaders }