From 2c0a8723f941fba6d956f5d5c4a5e89fd11db80b Mon Sep 17 00:00:00 2001
From: drobnikj
Date: Fri, 23 Jun 2023 13:41:59 +0200
Subject: [PATCH] feat: add document loader for Apify Website Content Crawler

---
 .../ApifyWebsiteContentCrawler.ts             | 106 ++++++++++++++++++
 .../apify-symbol-transparent.svg              |   1 +
 packages/components/package.json              |   1 +
 3 files changed, 108 insertions(+)
 create mode 100644 packages/components/nodes/documentloaders/ApifyWebsiteContentCrawler/ApifyWebsiteContentCrawler.ts
 create mode 100644 packages/components/nodes/documentloaders/ApifyWebsiteContentCrawler/apify-symbol-transparent.svg

diff --git a/packages/components/nodes/documentloaders/ApifyWebsiteContentCrawler/ApifyWebsiteContentCrawler.ts b/packages/components/nodes/documentloaders/ApifyWebsiteContentCrawler/ApifyWebsiteContentCrawler.ts
new file mode 100644
index 00000000..f292ada3
--- /dev/null
+++ b/packages/components/nodes/documentloaders/ApifyWebsiteContentCrawler/ApifyWebsiteContentCrawler.ts
@@ -0,0 +1,106 @@
+import { INode, INodeData, INodeParams } from '../../../src/Interface'
+import { TextSplitter } from 'langchain/text_splitter'
+import { ApifyDatasetLoader } from 'langchain/document_loaders/web/apify_dataset'
+import { Document } from 'langchain/document'
+
+/**
+ * Normalises the value of the `input` node parameter into the object handed to the Actor.
+ * The value may already be an object or may still be a JSON string.
+ *
+ * @param rawInput - Value read from `nodeData.inputs.input`.
+ * @returns The run input as a plain object.
+ * @throws Error when the value is not valid JSON or does not describe a JSON object.
+ */
+const parseActorInput = (rawInput: unknown): Record<string, unknown> => {
+    let parsed: unknown = rawInput
+    if (typeof rawInput === 'string') {
+        try {
+            parsed = JSON.parse(rawInput)
+        } catch (error) {
+            const reason = error instanceof Error ? error.message : String(error)
+            throw new Error(`Apify Website Content Crawler: "Input" is not valid JSON (${reason})`)
+        }
+    }
+    // `typeof null` is 'object', so null needs its own check; undefined already fails the first test.
+    if (typeof parsed !== 'object' || parsed === null || Array.isArray(parsed)) {
+        throw new Error('Apify Website Content Crawler: "Input" must be a JSON object')
+    }
+    return parsed as Record<string, unknown>
+}
+
+/**
+ * Document loader node that calls the `apify/website-content-crawler` Actor through
+ * `ApifyDatasetLoader.fromActorCall` and maps each dataset item to a `Document`.
+ */
+class ApifyWebsiteContentCrawler_DocumentLoaders implements INode {
+    label: string
+    name: string
+    description: string
+    type: string
+    icon: string
+    category: string
+    baseClasses: string[]
+    inputs: INodeParams[]
+
+    constructor() {
+        this.label = 'Apify Website Content Crawler'
+        this.name = 'apifyWebsiteContentCrawler'
+        this.type = 'Document'
+        this.icon = 'apify-symbol-transparent.svg'
+        this.category = 'Document Loaders'
+        this.description = 'Load data from Apify Website Content Crawler'
+        this.baseClasses = [this.type]
+        this.inputs = [
+            {
+                label: 'Apify API Token',
+                name: 'apifyApiToken',
+                type: 'password'
+            },
+            {
+                label: 'Input',
+                name: 'input',
+                type: 'json',
+                // Stored as a JSON string; init() accepts both the string and an already parsed object.
+                default: JSON.stringify({
+                    startUrls: [{ url: 'https://js.langchain.com/docs/' }],
+                    maxCrawlPages: 1
+                })
+            },
+            {
+                label: 'Text Splitter',
+                name: 'textSplitter',
+                type: 'TextSplitter',
+                optional: true
+            }
+        ]
+    }
+
+    /**
+     * Runs the crawler with the configured input and loads the produced dataset.
+     *
+     * @param nodeData - Node configuration; reads `inputs.apifyApiToken`, `inputs.input` and the optional `inputs.textSplitter`.
+     * @returns One document per dataset item, or their chunks when a text splitter is connected.
+     * @throws Error when `inputs.input` is not valid JSON or is not a JSON object.
+     */
+    async init(nodeData: INodeData): Promise<Document[]> {
+        const textSplitter = nodeData.inputs?.textSplitter as TextSplitter | undefined
+        const apifyApiToken = nodeData.inputs?.apifyApiToken as string
+        const input = parseActorInput(nodeData.inputs?.input)
+
+        const loader = await ApifyDatasetLoader.fromActorCall('apify/website-content-crawler', input, {
+            datasetMappingFunction: (item) =>
+                new Document({
+                    // Items whose `text` is missing or not a string become documents with empty content.
+                    pageContent: typeof item.text === 'string' ? item.text : '',
+                    metadata: { source: item.url }
+                }),
+            clientOptions: {
+                token: apifyApiToken
+            }
+        })
+
+        return textSplitter ? loader.loadAndSplit(textSplitter) : loader.load()
+    }
+}
+
+module.exports = { nodeClass: ApifyWebsiteContentCrawler_DocumentLoaders }
diff --git a/packages/components/nodes/documentloaders/ApifyWebsiteContentCrawler/apify-symbol-transparent.svg b/packages/components/nodes/documentloaders/ApifyWebsiteContentCrawler/apify-symbol-transparent.svg
new file mode 100644
index 00000000..423a3328
--- /dev/null
+++ b/packages/components/nodes/documentloaders/ApifyWebsiteContentCrawler/apify-symbol-transparent.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/packages/components/package.json b/packages/components/package.json
index e5e0ba00..bc55fb70 100644
--- a/packages/components/package.json
+++ b/packages/components/package.json
@@ -22,6 +22,7 @@
         "@pinecone-database/pinecone": "^0.0.12",
         "@supabase/supabase-js": "^2.21.0",
         "@types/js-yaml": "^4.0.5",
+        "apify-client": "^2.7.1",
         "axios": "^0.27.2",
         "cheerio": "^1.0.0-rc.12",
         "chromadb": "^1.4.2",