feat: additional metadata added to Spider tool (#2923)

* feat: add additional metadata to the Spider tool
* log errors with console.error instead of throwing, to keep the flow running
* add an option to omit metadata keys
* pnpm lint
2026-06-22 11:01:22 +03:00 · 2024-08-02 23:48:48 +02:00
parent 64d9de205f
commit 99ef9214c7
1 changed files with 65 additions and 3 deletions
@@ -1,3 +1,4 @@
+import { omit } from 'lodash'
 import { TextSplitter } from 'langchain/text_splitter'
 import { Document, DocumentInterface } from '@langchain/core/documents'
 import { BaseDocumentLoader } from 'langchain/document_loaders/base'
@@ -10,6 +11,7 @@ interface SpiderLoaderParameters {
    apiKey?: string
    mode?: 'crawl' | 'scrape'
    limit?: number
+    additionalMetadata?: Record<string, unknown>
    params?: Record<string, unknown>
 }

@@ -18,11 +20,12 @@ class SpiderLoader extends BaseDocumentLoader {
    private url: string
    private mode: 'crawl' | 'scrape'
    private limit?: number
+    private additionalMetadata?: Record<string, unknown>
    private params?: Record<string, unknown>

    constructor(loaderParams: SpiderLoaderParameters) {
        super()
-        const { apiKey, url, mode = 'crawl', limit, params } = loaderParams
+        const { apiKey, url, mode = 'crawl', limit, additionalMetadata, params } = loaderParams
        if (!apiKey) {
            throw new Error('Spider API key not set. You can set it as SPIDER_API_KEY in your .env file, or pass it to Spider.')
        }
@@ -31,6 +34,7 @@ class SpiderLoader extends BaseDocumentLoader {
        this.url = url
        this.mode = mode
        this.limit = Number(limit)
+        this.additionalMetadata = additionalMetadata
        this.params = params
    }

@@ -61,7 +65,10 @@ class SpiderLoader extends BaseDocumentLoader {
            (doc) =>
                new Document({
                    pageContent: doc.content || '',
-                    metadata: { source: doc.url }
+                    metadata: {
+                        ...(this.additionalMetadata || {}),
+                        source: doc.url
+                    }
                })
        )
    }
@@ -125,6 +132,14 @@ class Spider_DocumentLoaders implements INode {
                type: 'number',
                default: 25
            },
+            {
+                label: 'Additional Metadata',
+                name: 'additional_metadata',
+                type: 'json',
+                description: 'Additional metadata to be added to the extracted documents',
+                optional: true,
+                additionalParams: true
+            },
            {
                label: 'Additional Parameters',
                name: 'params',
@@ -134,6 +149,17 @@ class Spider_DocumentLoaders implements INode {
                placeholder: '{ "anti_bot": true }',
                type: 'json',
                optional: true
+            },
+            {
+                label: 'Omit Metadata Keys',
+                name: 'omitMetadataKeys',
+                type: 'string',
+                rows: 4,
+                description:
+                    'Each document loader comes with a default set of metadata keys that are extracted from the document. You can use this field to omit some of the default metadata keys. The value should be a list of keys, seperated by comma. Use * to omit all metadata keys execept the ones you specify in the Additional Metadata field',
+                placeholder: 'key1, key2, key3.nestedKey1',
+                optional: true,
+                additionalParams: true
            }
        ]
        this.credential = {
@@ -149,18 +175,39 @@ class Spider_DocumentLoaders implements INode {
        const url = nodeData.inputs?.url as string
        const mode = nodeData.inputs?.mode as 'crawl' | 'scrape'
        const limit = nodeData.inputs?.limit as number
+        let additionalMetadata = nodeData.inputs?.additional_metadata
        let params = nodeData.inputs?.params || {}
        const credentialData = await getCredentialData(nodeData.credential ?? '', options)
        const spiderApiKey = getCredentialParam('spiderApiKey', credentialData, nodeData)
+        const _omitMetadataKeys = nodeData.inputs?.omitMetadataKeys as string
+
+        let omitMetadataKeys: string[] = []
+        if (_omitMetadataKeys) {
+            omitMetadataKeys = _omitMetadataKeys.split(',').map((key) => key.trim())
+        }

        if (typeof params === 'string') {
            try {
                params = JSON.parse(params)
            } catch (e) {
-                throw new Error('Invalid JSON string provided for params')
+                console.error('Invalid JSON string provided for params')
            }
        }

+        if (additionalMetadata) {
+            if (typeof additionalMetadata === 'string') {
+                try {
+                    additionalMetadata = JSON.parse(additionalMetadata)
+                } catch (e) {
+                    console.error('Invalid JSON string provided for additional metadata')
+                }
+            } else if (typeof additionalMetadata !== 'object') {
+                console.error('Additional metadata must be a valid JSON object')
+            }
+        } else {
+            additionalMetadata = {}
+        }
+
        // Ensure return_format is set to markdown
        params.return_format = 'markdown'

@@ -169,6 +216,7 @@ class Spider_DocumentLoaders implements INode {
            mode: mode as 'crawl' | 'scrape',
            apiKey: spiderApiKey,
            limit: limit as number,
+            additionalMetadata: additionalMetadata as Record<string, unknown>,
            params: params as Record<string, unknown>
        }

@@ -182,6 +230,20 @@ class Spider_DocumentLoaders implements INode {
            docs = await loader.load()
        }

+        docs = docs.map((doc: DocumentInterface) => ({
+            ...doc,
+            metadata:
+                _omitMetadataKeys === '*'
+                    ? additionalMetadata
+                    : omit(
+                          {
+                              ...doc.metadata,
+                              ...additionalMetadata
+                          },
+                          omitMetadataKeys
+                      )
+        }))
+
        return docs
    }
 }