mirror of
https://github.com/farcasclaudiu/Flowise.git
synced 2026-06-29 11:01:18 +03:00
feat: add document loader for Apify Website Content Crawler
This commit is contained in:
+68
@@ -0,0 +1,68 @@
|
|||||||
|
import { INode, INodeData, INodeParams } from '../../../src/Interface'
|
||||||
|
import { TextSplitter } from 'langchain/text_splitter'
|
||||||
|
import { ApifyDatasetLoader } from 'langchain/document_loaders/web/apify_dataset'
|
||||||
|
import { Document } from 'langchain/document'
|
||||||
|
|
||||||
|
/**
 * Document-loader node that calls the Apify `apify/website-content-crawler`
 * actor through LangChain's `ApifyDatasetLoader` and maps every dataset item
 * to a LangChain `Document`.
 */
class ApifyWebsiteContentCrawler_DocumentLoaders implements INode {
    label: string
    name: string
    description: string
    type: string
    icon: string
    category: string
    baseClasses: string[]
    inputs: INodeParams[]

    /** Populates the node's descriptive fields and its three input parameters. */
    constructor() {
        this.label = 'Apify Website Content Crawler'
        this.name = 'apifyWebsiteContentCrawler'
        this.type = 'Document'
        this.icon = 'apify-symbol-transparent.svg'
        this.category = 'Document Loaders'
        this.description = 'Load data from Apify Website Content Crawler'
        this.baseClasses = [this.type]
        this.inputs = [
            {
                label: 'Apify API Token',
                name: 'apifyApiToken',
                type: 'password'
            },
            {
                label: 'Input',
                name: 'input',
                type: 'json',
                // The default is stored as a JSON string; init() accepts both a string and an object.
                default: JSON.stringify({
                    startUrls: [{ url: 'https://js.langchain.com/docs/' }],
                    maxCrawlPages: 1
                })
            },
            {
                label: 'Text Splitter',
                name: 'textSplitter',
                type: 'TextSplitter',
                optional: true
            }
        ]
    }

    /**
     * Calls the actor with the configured input and returns the loaded documents.
     *
     * @param nodeData - Node data; reads `inputs.textSplitter`, `inputs.apifyApiToken` and `inputs.input`.
     * @returns The result of `loader.loadAndSplit(textSplitter)` when a text splitter is supplied,
     *          otherwise the result of `loader.load()`.
     * @throws Error when `inputs.input` is missing, is not valid JSON, or is not a JSON object.
     */
    async init(nodeData: INodeData): Promise<any> {
        const textSplitter = nodeData.inputs?.textSplitter as TextSplitter
        // The token is intentionally not validated here — NOTE(review): the loader may fall back
        // to an environment variable when no token is passed; confirm before making it mandatory.
        const apifyApiToken = nodeData.inputs?.apifyApiToken as string
        // Validate up front so a bad 'Input' fails with a clear message before any actor call is made.
        const input = ApifyWebsiteContentCrawler_DocumentLoaders.parseActorInput(nodeData.inputs?.input)

        const loader = await ApifyDatasetLoader.fromActorCall('apify/website-content-crawler', input, {
            datasetMappingFunction: (item) =>
                new Document({
                    // Items without text still yield a document, with empty content.
                    pageContent: (item.text || '') as string,
                    metadata: { source: item.url }
                }),
            clientOptions: {
                token: apifyApiToken
            }
        })

        return textSplitter ? loader.loadAndSplit(textSplitter) : loader.load()
    }

    /**
     * Normalises the raw `input` value into the object passed to the actor.
     *
     * Accepts either an already-parsed object or a JSON string. Anything that does not
     * end up as a non-null, non-array object is rejected with a descriptive error instead
     * of being forwarded to the actor call.
     */
    private static parseActorInput(rawInput: unknown): Record<string, unknown> {
        const prefix = "Apify Website Content Crawler: 'Input'"
        let parsed: unknown = rawInput

        if (typeof rawInput === 'string') {
            if (rawInput.trim() === '') {
                throw new Error(`${prefix} is required but was empty`)
            }
            try {
                parsed = JSON.parse(rawInput)
            } catch (error) {
                const reason = error instanceof Error ? error.message : String(error)
                throw new Error(`${prefix} is not valid JSON: ${reason}`)
            }
        }

        // typeof null === 'object', so null must be excluded explicitly.
        if (typeof parsed !== 'object' || parsed === null || Array.isArray(parsed)) {
            throw new Error(`${prefix} must be a JSON object, e.g. {"startUrls":[{"url":"https://example.com"}]}`)
        }

        return parsed as Record<string, unknown>
    }
}
|
||||||
|
|
||||||
|
// Export the node class under the `nodeClass` key (CommonJS export).
// NOTE(review): presumably this is the key the host application's node loader looks up — confirm.
module.exports = { nodeClass: ApifyWebsiteContentCrawler_DocumentLoaders }
|
||||||
+1
@@ -0,0 +1 @@
|
|||||||
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512"><defs><style>.cls-1{fill:none;}.cls-2{fill:#97d700;}.cls-3{fill:#71c5e8;}.cls-4{fill:#ff9013;}</style></defs><g id="Trmplate"><rect class="cls-1" width="512" height="512"/><path class="cls-2" d="M163.14,152.65a36.06,36.06,0,0,0-30.77,40.67v0l21.34,152.33,89.74-204.23Z"/><path class="cls-3" d="M379.69,279.56l-8.38-117.1a36.12,36.12,0,0,0-38.53-33.36,17.61,17.61,0,0,0-2.4.26l-34.63,4.79,76.08,170.57A35.94,35.94,0,0,0,379.69,279.56Z"/><path class="cls-4" d="M186.43,382.69a35.88,35.88,0,0,0,18-2.63l130.65-55.13L273,185.65Z"/></g></svg>
|
||||||
|
After Width: | Height: | Size: 599 B |
@@ -22,6 +22,7 @@
|
|||||||
"@pinecone-database/pinecone": "^0.0.12",
|
"@pinecone-database/pinecone": "^0.0.12",
|
||||||
"@supabase/supabase-js": "^2.21.0",
|
"@supabase/supabase-js": "^2.21.0",
|
||||||
"@types/js-yaml": "^4.0.5",
|
"@types/js-yaml": "^4.0.5",
|
||||||
|
"apify-client": "^2.7.1",
|
||||||
"axios": "^0.27.2",
|
"axios": "^0.27.2",
|
||||||
"cheerio": "^1.0.0-rc.12",
|
"cheerio": "^1.0.0-rc.12",
|
||||||
"chromadb": "^1.4.2",
|
"chromadb": "^1.4.2",
|
||||||
|
|||||||
Reference in New Issue
Block a user