From c83d0ab3205739dc5fa9e289ed609c2241041a36 Mon Sep 17 00:00:00 2001 From: Atish Amte Date: Thu, 17 Aug 2023 00:33:01 +0530 Subject: [PATCH 1/4] added puppeteer options --- .../documentloaders/Puppeteer/Puppeteer.ts | 64 +++++++++++++++++-- 1 file changed, 59 insertions(+), 5 deletions(-) diff --git a/packages/components/nodes/documentloaders/Puppeteer/Puppeteer.ts b/packages/components/nodes/documentloaders/Puppeteer/Puppeteer.ts index ea6280db..036e4053 100644 --- a/packages/components/nodes/documentloaders/Puppeteer/Puppeteer.ts +++ b/packages/components/nodes/documentloaders/Puppeteer/Puppeteer.ts @@ -1,8 +1,9 @@ import { INode, INodeData, INodeParams } from '../../../src/Interface' import { TextSplitter } from 'langchain/text_splitter' -import { PuppeteerWebBaseLoader } from 'langchain/document_loaders/web/puppeteer' +import { Browser, Page, PuppeteerWebBaseLoader, PuppeteerWebBaseLoaderOptions } from 'langchain/document_loaders/web/puppeteer' import { test } from 'linkifyjs' import { webCrawl, xmlScrape } from '../../../src' +import { PuppeteerLifeCycleEvent } from 'puppeteer' class Puppeteer_DocumentLoaders implements INode { label: string @@ -62,10 +63,47 @@ class Puppeteer_DocumentLoaders implements INode { type: 'number', optional: true, additionalParams: true, - description: - 'Only used when "Get Relative Links Method" is selected. Set 0 to retrieve all relative links, default limit is 10.', + description: 'Only used when "Get Relative Links Method" is selected. Set 0 to retrieve all relative links, default limit is 10.', warning: `Retreiving all links might take long time, and all links will be upserted again if the flow's state changed (eg: different URL, chunk size, etc)` }, + { + label: 'Wait Until', + name: 'waitUntilGoToOption', + type: 'options', + description: 'Select a go to wait until option', + options: [ + { + label: 'Load', + name: 'load', + description: `When the initial HTML document\'s DOM has been loaded and parsed` + }, + { + label: 'DOM Content Loaded', + name: 'domcontentloaded', + description: `When the complete HTML document\'s DOM has been loaded and parsed` + }, + { + label: 'Network Idle 0', + name: 'networkidle0', + description: 'Navigation is finished when there are no more than 0 network connections for at least 500 ms' + }, + { + label: 'Network Idle 2', + name: 'networkidle2', + description: 'Navigation is finished when there are no more than 2 network connections for at least 500 ms' + } + ], + optional: true, + additionalParams: true + }, + { + label: 'Wait for selector to load', + name: 'waitForSelector', + type: 'string', + optional: true, + additionalParams: true, + description: 'CSS selectors like .div or #div', + }, { label: 'Metadata', name: 'metadata', @@ -81,6 +119,8 @@ class Puppeteer_DocumentLoaders implements INode { const metadata = nodeData.inputs?.metadata const relativeLinksMethod = nodeData.inputs?.relativeLinksMethod as string let limit = nodeData.inputs?.limit as string + let waitUntilGoToOption = nodeData.inputs?.waitUntilGoToOption as PuppeteerLifeCycleEvent + let waitForSelector = nodeData.inputs?.waitForSelector as string let url = nodeData.inputs?.url as string url = url.trim() @@ -91,12 +131,26 @@ class Puppeteer_DocumentLoaders implements INode { async function puppeteerLoader(url: string): Promise { try { let docs = [] - const loader = new PuppeteerWebBaseLoader(url, { + const config: PuppeteerWebBaseLoaderOptions = { launchOptions: { args: ['--no-sandbox'], headless: 'new' } - }) + }; + if (waitUntilGoToOption) { + config['gotoOptions'] = { + waitUntil: waitUntilGoToOption + } + } + if (waitForSelector) { + config['evaluate'] = async (page: Page, browser: Browser): Promise => { + await page.waitForSelector(waitForSelector) + + const result = await page.evaluate(() => document.body.innerHTML) + return result + } + } + const loader = new PuppeteerWebBaseLoader(url, config) if (textSplitter) { docs = await loader.loadAndSplit(textSplitter) } else { From 8414f347def05fe405744562c28615e5bc9952d8 Mon Sep 17 00:00:00 2001 From: Atish Amte Date: Thu, 17 Aug 2023 00:36:03 +0530 Subject: [PATCH 2/4] spelling correction --- .../nodes/documentloaders/Cheerio/Cheerio.ts | 2 +- .../documentloaders/Playwright/Playwright.ts | 2 +- .../documentloaders/Puppeteer/Puppeteer.ts | 2 +- .../marketplaces/chatflows/WebPage QnA.json | 56 ++++++++++++++----- 4 files changed, 46 insertions(+), 16 deletions(-) diff --git a/packages/components/nodes/documentloaders/Cheerio/Cheerio.ts b/packages/components/nodes/documentloaders/Cheerio/Cheerio.ts index 310aa9e6..1c21c1ea 100644 --- a/packages/components/nodes/documentloaders/Cheerio/Cheerio.ts +++ b/packages/components/nodes/documentloaders/Cheerio/Cheerio.ts @@ -64,7 +64,7 @@ class Cheerio_DocumentLoaders implements INode { additionalParams: true, description: 'Only used when "Get Relative Links Method" is selected. Set 0 to retrieve all relative links, default limit is 10.', - warning: `Retreiving all links might take long time, and all links will be upserted again if the flow's state changed (eg: different URL, chunk size, etc)` + warning: `Retrieving all links might take long time, and all links will be upserted again if the flow's state changed (eg: different URL, chunk size, etc)` }, { label: 'Metadata', diff --git a/packages/components/nodes/documentloaders/Playwright/Playwright.ts b/packages/components/nodes/documentloaders/Playwright/Playwright.ts index 3399574d..2ddd6a8d 100644 --- a/packages/components/nodes/documentloaders/Playwright/Playwright.ts +++ b/packages/components/nodes/documentloaders/Playwright/Playwright.ts @@ -64,7 +64,7 @@ class Playwright_DocumentLoaders implements INode { additionalParams: true, description: 'Only used when "Get Relative Links Method" is selected. Set 0 to retrieve all relative links, default limit is 10.', - warning: `Retreiving all links might take long time, and all links will be upserted again if the flow's state changed (eg: different URL, chunk size, etc)` + warning: `Retrieving all links might take long time, and all links will be upserted again if the flow's state changed (eg: different URL, chunk size, etc)` }, { label: 'Metadata', diff --git a/packages/components/nodes/documentloaders/Puppeteer/Puppeteer.ts b/packages/components/nodes/documentloaders/Puppeteer/Puppeteer.ts index 036e4053..c3b61a2b 100644 --- a/packages/components/nodes/documentloaders/Puppeteer/Puppeteer.ts +++ b/packages/components/nodes/documentloaders/Puppeteer/Puppeteer.ts @@ -64,7 +64,7 @@ class Puppeteer_DocumentLoaders implements INode { optional: true, additionalParams: true, description: 'Only used when "Get Relative Links Method" is selected. Set 0 to retrieve all relative links, default limit is 10.', - warning: `Retreiving all links might take long time, and all links will be upserted again if the flow's state changed (eg: different URL, chunk size, etc)` + warning: `Retrieving all links might take long time, and all links will be upserted again if the flow's state changed (eg: different URL, chunk size, etc)` }, { label: 'Wait Until', diff --git a/packages/server/marketplaces/chatflows/WebPage QnA.json b/packages/server/marketplaces/chatflows/WebPage QnA.json index 8197c20a..09246150 100644 --- a/packages/server/marketplaces/chatflows/WebPage QnA.json +++ b/packages/server/marketplaces/chatflows/WebPage QnA.json @@ -16,7 +16,11 @@ "version": 1, "name": "chatOpenAI", "type": "ChatOpenAI", - "baseClasses": ["ChatOpenAI", "BaseChatModel", "BaseLanguageModel"], + "baseClasses": [ + "ChatOpenAI", + "BaseChatModel", + "BaseLanguageModel" + ], "category": "Chat Models", "description": "Wrapper around OpenAI large language models that use the Chat endpoint", "inputParams": [ @@ -24,7 +28,9 @@ "label": "Connect Credential", "name": "credential", "type": "credential", - "credentialNames": ["openAIApi"], + "credentialNames": [ + "openAIApi" + ], "id": "chatOpenAI_0-input-credential-credential" }, { @@ -170,7 +176,10 @@ "version": 1, "name": "openAIEmbeddings", "type": "OpenAIEmbeddings", - "baseClasses": ["OpenAIEmbeddings", "Embeddings"], + "baseClasses": [ + "OpenAIEmbeddings", + "Embeddings" + ], "category": "Embeddings", "description": "OpenAI API to generate embeddings for a given text", "inputParams": [ @@ -178,7 +187,9 @@ "label": "Connect Credential", "name": "credential", "type": "credential", - "credentialNames": ["openAIApi"], + "credentialNames": [ + "openAIApi" + ], "id": "openAIEmbeddings_0-input-credential-credential" }, { @@ -318,7 +329,10 @@ "version": 1, "name": "conversationalRetrievalQAChain", "type": "ConversationalRetrievalQAChain", - "baseClasses": ["ConversationalRetrievalQAChain", "BaseChain"], + "baseClasses": [ + "ConversationalRetrievalQAChain", + "BaseChain" + ], "category": "Chains", "description": "Document QA - built on RetrievalQAChain to provide a chat history component", "inputParams": [ @@ -428,7 +442,9 @@ "version": 1, "name": "cheerioWebScraper", "type": "Document", - "baseClasses": ["Document"], + "baseClasses": [ + "Document" + ], "category": "Document Loaders", "description": "Load data from webpages", "inputParams": [ @@ -466,7 +482,7 @@ "optional": true, "additionalParams": true, "description": "Only used when \"Get Relative Links Method\" is selected. Set 0 to retrieve all relative links, default limit is 10.", - "warning": "Retreiving all links might take long time, and all links will be upserted again if the flow's state changed (eg: different URL, chunk size, etc)", + "warning": "Retrieving all links might take long time, and all links will be upserted again if the flow's state changed (eg: different URL, chunk size, etc)", "id": "cheerioWebScraper_0-input-limit-number" }, { @@ -527,7 +543,11 @@ "version": 1, "name": "pineconeUpsert", "type": "Pinecone", - "baseClasses": ["Pinecone", "VectorStoreRetriever", "BaseRetriever"], + "baseClasses": [ + "Pinecone", + "VectorStoreRetriever", + "BaseRetriever" + ], "category": "Vector Stores", "description": "Upsert documents to Pinecone", "inputParams": [ @@ -535,7 +555,9 @@ "label": "Connect Credential", "name": "credential", "type": "credential", - "credentialNames": ["pineconeApi"], + "credentialNames": [ + "pineconeApi" + ], "id": "pineconeUpsert_0-input-credential-credential" }, { @@ -580,7 +602,9 @@ } ], "inputs": { - "document": ["{{cheerioWebScraper_0.data.instance}}"], + "document": [ + "{{cheerioWebScraper_0.data.instance}}" + ], "embeddings": "{{openAIEmbeddings_0.data.instance}}", "pineconeIndex": "", "pineconeNamespace": "", @@ -635,7 +659,11 @@ "version": 1, "name": "motorheadMemory", "type": "MotorheadMemory", - "baseClasses": ["MotorheadMemory", "BaseChatMemory", "BaseMemory"], + "baseClasses": [ + "MotorheadMemory", + "BaseChatMemory", + "BaseMemory" + ], "category": "Memory", "description": "Use Motorhead Memory to store chat conversations", "inputParams": [ @@ -645,7 +673,9 @@ "type": "credential", "optional": true, "description": "Only needed when using hosted solution - https://getmetal.io", - "credentialNames": ["motorheadMemoryApi"], + "credentialNames": [ + "motorheadMemoryApi" + ], "id": "motorheadMemory_0-input-credential-credential" }, { @@ -768,4 +798,4 @@ } } ] -} +} \ No newline at end of file From 338082f0aa6e7bfc7d61077d03b0ff10253c3d9b Mon Sep 17 00:00:00 2001 From: Atish Amte Date: Thu, 17 Aug 2023 00:52:35 +0530 Subject: [PATCH 3/4] playwright config --- .../documentloaders/Playwright/Playwright.ts | 63 ++++++++++++++++++- 1 file changed, 61 insertions(+), 2 deletions(-) diff --git a/packages/components/nodes/documentloaders/Playwright/Playwright.ts b/packages/components/nodes/documentloaders/Playwright/Playwright.ts index 2ddd6a8d..b376c05b 100644 --- a/packages/components/nodes/documentloaders/Playwright/Playwright.ts +++ b/packages/components/nodes/documentloaders/Playwright/Playwright.ts @@ -1,6 +1,6 @@ import { INode, INodeData, INodeParams } from '../../../src/Interface' import { TextSplitter } from 'langchain/text_splitter' -import { PlaywrightWebBaseLoader } from 'langchain/document_loaders/web/playwright' +import { Browser, Page, PlaywrightWebBaseLoader, PlaywrightWebBaseLoaderOptions } from 'langchain/document_loaders/web/playwright' import { test } from 'linkifyjs' import { webCrawl, xmlScrape } from '../../../src' @@ -66,6 +66,44 @@ class Playwright_DocumentLoaders implements INode { 'Only used when "Get Relative Links Method" is selected. Set 0 to retrieve all relative links, default limit is 10.', warning: `Retrieving all links might take long time, and all links will be upserted again if the flow's state changed (eg: different URL, chunk size, etc)` }, + { + label: 'Wait Until', + name: 'waitUntilGoToOption', + type: 'options', + description: 'Select a go to wait until option', + options: [ + { + label: 'Load', + name: 'load', + description: 'Consider operation to be finished when the load event is fired.' + }, + { + label: 'DOM Content Loaded', + name: 'domcontentloaded', + description: 'Consider operation to be finished when the DOMContentLoaded event is fired.' + }, + { + label: 'Network Idle', + name: 'networkidle', + description: 'Navigation is finished when there are no more connections for at least 500 ms.' + }, + { + label: 'Commit', + name: 'commit', + description: 'Consider operation to be finished when network response is received and the document started loading.' + } + ], + optional: true, + additionalParams: true + }, + { + label: 'Wait for selector to load', + name: 'waitForSelector', + type: 'string', + optional: true, + additionalParams: true, + description: 'CSS selectors like .div or #div', + }, { label: 'Metadata', name: 'metadata', @@ -81,6 +119,8 @@ class Playwright_DocumentLoaders implements INode { const metadata = nodeData.inputs?.metadata const relativeLinksMethod = nodeData.inputs?.relativeLinksMethod as string let limit = nodeData.inputs?.limit as string + let waitUntilGoToOption = nodeData.inputs?.waitUntilGoToOption as "load" | "domcontentloaded" | "networkidle" | "commit" | undefined + let waitForSelector = nodeData.inputs?.waitForSelector as string let url = nodeData.inputs?.url as string url = url.trim() @@ -91,7 +131,26 @@ class Playwright_DocumentLoaders implements INode { async function playwrightLoader(url: string): Promise { try { let docs = [] - const loader = new PlaywrightWebBaseLoader(url) + const config: PlaywrightWebBaseLoaderOptions = { + launchOptions: { + args: ['--no-sandbox'], + headless: true + } + }; + if (waitUntilGoToOption) { + config['gotoOptions'] = { + waitUntil: waitUntilGoToOption + } + } + if (waitForSelector) { + config['evaluate'] = async (page: Page, browser: Browser): Promise => { + await page.waitForSelector(waitForSelector) + + const result = await page.evaluate(() => document.body.innerHTML) + return result + } + } + const loader = new PlaywrightWebBaseLoader(url, config) if (textSplitter) { docs = await loader.loadAndSplit(textSplitter) } else { From 888fa356b93d2d0d2ff3b11addd11c839c5b225f Mon Sep 17 00:00:00 2001 From: Atish Amte Date: Thu, 17 Aug 2023 01:11:31 +0530 Subject: [PATCH 4/4] lint fixes --- .../documentloaders/Playwright/Playwright.ts | 8 +-- .../documentloaders/Puppeteer/Puppeteer.ts | 13 ++--- .../marketplaces/chatflows/WebPage QnA.json | 54 +++++-------------- 3 files changed, 23 insertions(+), 52 deletions(-) diff --git a/packages/components/nodes/documentloaders/Playwright/Playwright.ts b/packages/components/nodes/documentloaders/Playwright/Playwright.ts index b376c05b..eb246045 100644 --- a/packages/components/nodes/documentloaders/Playwright/Playwright.ts +++ b/packages/components/nodes/documentloaders/Playwright/Playwright.ts @@ -102,7 +102,7 @@ class Playwright_DocumentLoaders implements INode { type: 'string', optional: true, additionalParams: true, - description: 'CSS selectors like .div or #div', + description: 'CSS selectors like .div or #div' }, { label: 'Metadata', @@ -119,7 +119,7 @@ class Playwright_DocumentLoaders implements INode { const metadata = nodeData.inputs?.metadata const relativeLinksMethod = nodeData.inputs?.relativeLinksMethod as string let limit = nodeData.inputs?.limit as string - let waitUntilGoToOption = nodeData.inputs?.waitUntilGoToOption as "load" | "domcontentloaded" | "networkidle" | "commit" | undefined + let waitUntilGoToOption = nodeData.inputs?.waitUntilGoToOption as 'load' | 'domcontentloaded' | 'networkidle' | 'commit' | undefined let waitForSelector = nodeData.inputs?.waitForSelector as string let url = nodeData.inputs?.url as string @@ -136,14 +136,14 @@ class Playwright_DocumentLoaders implements INode { args: ['--no-sandbox'], headless: true } - }; + } if (waitUntilGoToOption) { config['gotoOptions'] = { waitUntil: waitUntilGoToOption } } if (waitForSelector) { - config['evaluate'] = async (page: Page, browser: Browser): Promise => { + config['evaluate'] = async (page: Page, _: Browser): Promise => { await page.waitForSelector(waitForSelector) const result = await page.evaluate(() => document.body.innerHTML) diff --git a/packages/components/nodes/documentloaders/Puppeteer/Puppeteer.ts b/packages/components/nodes/documentloaders/Puppeteer/Puppeteer.ts index c3b61a2b..4691eb94 100644 --- a/packages/components/nodes/documentloaders/Puppeteer/Puppeteer.ts +++ b/packages/components/nodes/documentloaders/Puppeteer/Puppeteer.ts @@ -63,7 +63,8 @@ class Puppeteer_DocumentLoaders implements INode { type: 'number', optional: true, additionalParams: true, - description: 'Only used when "Get Relative Links Method" is selected. Set 0 to retrieve all relative links, default limit is 10.', + description: + 'Only used when "Get Relative Links Method" is selected. Set 0 to retrieve all relative links, default limit is 10.', warning: `Retrieving all links might take long time, and all links will be upserted again if the flow's state changed (eg: different URL, chunk size, etc)` }, { @@ -75,12 +76,12 @@ class Puppeteer_DocumentLoaders implements INode { { label: 'Load', name: 'load', - description: `When the initial HTML document\'s DOM has been loaded and parsed` + description: `When the initial HTML document's DOM has been loaded and parsed` }, { label: 'DOM Content Loaded', name: 'domcontentloaded', - description: `When the complete HTML document\'s DOM has been loaded and parsed` + description: `When the complete HTML document's DOM has been loaded and parsed` }, { label: 'Network Idle 0', @@ -102,7 +103,7 @@ class Puppeteer_DocumentLoaders implements INode { type: 'string', optional: true, additionalParams: true, - description: 'CSS selectors like .div or #div', + description: 'CSS selectors like .div or #div' }, { label: 'Metadata', @@ -136,14 +137,14 @@ class Puppeteer_DocumentLoaders implements INode { args: ['--no-sandbox'], headless: 'new' } - }; + } if (waitUntilGoToOption) { config['gotoOptions'] = { waitUntil: waitUntilGoToOption } } if (waitForSelector) { - config['evaluate'] = async (page: Page, browser: Browser): Promise => { + config['evaluate'] = async (page: Page, _: Browser): Promise => { await page.waitForSelector(waitForSelector) const result = await page.evaluate(() => document.body.innerHTML) diff --git a/packages/server/marketplaces/chatflows/WebPage QnA.json b/packages/server/marketplaces/chatflows/WebPage QnA.json index 09246150..812f0bd5 100644 --- a/packages/server/marketplaces/chatflows/WebPage QnA.json +++ b/packages/server/marketplaces/chatflows/WebPage QnA.json @@ -16,11 +16,7 @@ "version": 1, "name": "chatOpenAI", "type": "ChatOpenAI", - "baseClasses": [ - "ChatOpenAI", - "BaseChatModel", - "BaseLanguageModel" - ], + "baseClasses": ["ChatOpenAI", "BaseChatModel", "BaseLanguageModel"], "category": "Chat Models", "description": "Wrapper around OpenAI large language models that use the Chat endpoint", "inputParams": [ @@ -28,9 +24,7 @@ "label": "Connect Credential", "name": "credential", "type": "credential", - "credentialNames": [ - "openAIApi" - ], + "credentialNames": ["openAIApi"], "id": "chatOpenAI_0-input-credential-credential" }, { @@ -176,10 +170,7 @@ "version": 1, "name": "openAIEmbeddings", "type": "OpenAIEmbeddings", - "baseClasses": [ - "OpenAIEmbeddings", - "Embeddings" - ], + "baseClasses": ["OpenAIEmbeddings", "Embeddings"], "category": "Embeddings", "description": "OpenAI API to generate embeddings for a given text", "inputParams": [ @@ -187,9 +178,7 @@ "label": "Connect Credential", "name": "credential", "type": "credential", - "credentialNames": [ - "openAIApi" - ], + "credentialNames": ["openAIApi"], "id": "openAIEmbeddings_0-input-credential-credential" }, { @@ -329,10 +318,7 @@ "version": 1, "name": "conversationalRetrievalQAChain", "type": "ConversationalRetrievalQAChain", - "baseClasses": [ - "ConversationalRetrievalQAChain", - "BaseChain" - ], + "baseClasses": ["ConversationalRetrievalQAChain", "BaseChain"], "category": "Chains", "description": "Document QA - built on RetrievalQAChain to provide a chat history component", "inputParams": [ @@ -442,9 +428,7 @@ "version": 1, "name": "cheerioWebScraper", "type": "Document", - "baseClasses": [ - "Document" - ], + "baseClasses": ["Document"], "category": "Document Loaders", "description": "Load data from webpages", "inputParams": [ @@ -543,11 +527,7 @@ "version": 1, "name": "pineconeUpsert", "type": "Pinecone", - "baseClasses": [ - "Pinecone", - "VectorStoreRetriever", - "BaseRetriever" - ], + "baseClasses": ["Pinecone", "VectorStoreRetriever", "BaseRetriever"], "category": "Vector Stores", "description": "Upsert documents to Pinecone", "inputParams": [ @@ -555,9 +535,7 @@ "label": "Connect Credential", "name": "credential", "type": "credential", - "credentialNames": [ - "pineconeApi" - ], + "credentialNames": ["pineconeApi"], "id": "pineconeUpsert_0-input-credential-credential" }, { @@ -602,9 +580,7 @@ } ], "inputs": { - "document": [ - "{{cheerioWebScraper_0.data.instance}}" - ], + "document": ["{{cheerioWebScraper_0.data.instance}}"], "embeddings": "{{openAIEmbeddings_0.data.instance}}", "pineconeIndex": "", "pineconeNamespace": "", @@ -659,11 +635,7 @@ "version": 1, "name": "motorheadMemory", "type": "MotorheadMemory", - "baseClasses": [ - "MotorheadMemory", - "BaseChatMemory", - "BaseMemory" - ], + "baseClasses": ["MotorheadMemory", "BaseChatMemory", "BaseMemory"], "category": "Memory", "description": "Use Motorhead Memory to store chat conversations", "inputParams": [ @@ -673,9 +645,7 @@ "type": "credential", "optional": true, "description": "Only needed when using hosted solution - https://getmetal.io", - "credentialNames": [ - "motorheadMemoryApi" - ], + "credentialNames": ["motorheadMemoryApi"], "id": "motorheadMemory_0-input-credential-credential" }, { @@ -798,4 +768,4 @@ } } ] -} \ No newline at end of file +}