From 99ef9214c735b267a7f2e9dca0dee8dd14feabaa Mon Sep 17 00:00:00 2001 From: William Espegren <131612909+WilliamEspegren@users.noreply.github.com> Date: Fri, 2 Aug 2024 23:48:48 +0200 Subject: [PATCH] feat: additional metadata added to Spider tool (#2923) * feat: additional metadata added to Spider tool * console.log error instead of throwing to keep flow * add omit * pnpm lint --- .../nodes/documentloaders/Spider/Spider.ts | 68 ++++++++++++++++++- 1 file changed, 65 insertions(+), 3 deletions(-) diff --git a/packages/components/nodes/documentloaders/Spider/Spider.ts b/packages/components/nodes/documentloaders/Spider/Spider.ts index e4817ac9..3dbb4baf 100644 --- a/packages/components/nodes/documentloaders/Spider/Spider.ts +++ b/packages/components/nodes/documentloaders/Spider/Spider.ts @@ -1,3 +1,4 @@ +import { omit } from 'lodash' import { TextSplitter } from 'langchain/text_splitter' import { Document, DocumentInterface } from '@langchain/core/documents' import { BaseDocumentLoader } from 'langchain/document_loaders/base' @@ -10,6 +11,7 @@ interface SpiderLoaderParameters { apiKey?: string mode?: 'crawl' | 'scrape' limit?: number + additionalMetadata?: Record params?: Record } @@ -18,11 +20,12 @@ class SpiderLoader extends BaseDocumentLoader { private url: string private mode: 'crawl' | 'scrape' private limit?: number + private additionalMetadata?: Record private params?: Record constructor(loaderParams: SpiderLoaderParameters) { super() - const { apiKey, url, mode = 'crawl', limit, params } = loaderParams + const { apiKey, url, mode = 'crawl', limit, additionalMetadata, params } = loaderParams if (!apiKey) { throw new Error('Spider API key not set. You can set it as SPIDER_API_KEY in your .env file, or pass it to Spider.') } @@ -31,6 +34,7 @@ class SpiderLoader extends BaseDocumentLoader { this.url = url this.mode = mode this.limit = Number(limit) + this.additionalMetadata = additionalMetadata this.params = params } @@ -61,7 +65,10 @@ class SpiderLoader extends BaseDocumentLoader { (doc) => new Document({ pageContent: doc.content || '', - metadata: { source: doc.url } + metadata: { + ...(this.additionalMetadata || {}), + source: doc.url + } }) ) } @@ -125,6 +132,14 @@ class Spider_DocumentLoaders implements INode { type: 'number', default: 25 }, + { + label: 'Additional Metadata', + name: 'additional_metadata', + type: 'json', + description: 'Additional metadata to be added to the extracted documents', + optional: true, + additionalParams: true + }, { label: 'Additional Parameters', name: 'params', @@ -134,6 +149,17 @@ class Spider_DocumentLoaders implements INode { placeholder: '{ "anti_bot": true }', type: 'json', optional: true + }, + { + label: 'Omit Metadata Keys', + name: 'omitMetadataKeys', + type: 'string', + rows: 4, + description: + 'Each document loader comes with a default set of metadata keys that are extracted from the document. You can use this field to omit some of the default metadata keys. The value should be a list of keys, seperated by comma. Use * to omit all metadata keys execept the ones you specify in the Additional Metadata field', + placeholder: 'key1, key2, key3.nestedKey1', + optional: true, + additionalParams: true } ] this.credential = { @@ -149,18 +175,39 @@ class Spider_DocumentLoaders implements INode { const url = nodeData.inputs?.url as string const mode = nodeData.inputs?.mode as 'crawl' | 'scrape' const limit = nodeData.inputs?.limit as number + let additionalMetadata = nodeData.inputs?.additional_metadata let params = nodeData.inputs?.params || {} const credentialData = await getCredentialData(nodeData.credential ?? '', options) const spiderApiKey = getCredentialParam('spiderApiKey', credentialData, nodeData) + const _omitMetadataKeys = nodeData.inputs?.omitMetadataKeys as string + + let omitMetadataKeys: string[] = [] + if (_omitMetadataKeys) { + omitMetadataKeys = _omitMetadataKeys.split(',').map((key) => key.trim()) + } if (typeof params === 'string') { try { params = JSON.parse(params) } catch (e) { - throw new Error('Invalid JSON string provided for params') + console.error('Invalid JSON string provided for params') } } + if (additionalMetadata) { + if (typeof additionalMetadata === 'string') { + try { + additionalMetadata = JSON.parse(additionalMetadata) + } catch (e) { + console.error('Invalid JSON string provided for additional metadata') + } + } else if (typeof additionalMetadata !== 'object') { + console.error('Additional metadata must be a valid JSON object') + } + } else { + additionalMetadata = {} + } + // Ensure return_format is set to markdown params.return_format = 'markdown' @@ -169,6 +216,7 @@ class Spider_DocumentLoaders implements INode { mode: mode as 'crawl' | 'scrape', apiKey: spiderApiKey, limit: limit as number, + additionalMetadata: additionalMetadata as Record, params: params as Record } @@ -182,6 +230,20 @@ class Spider_DocumentLoaders implements INode { docs = await loader.load() } + docs = docs.map((doc: DocumentInterface) => ({ + ...doc, + metadata: + _omitMetadataKeys === '*' + ? additionalMetadata + : omit( + { + ...doc.metadata, + ...additionalMetadata + }, + omitMetadataKeys + ) + })) + return docs } }