From e7edbc695cd41bf59cfb206d1bb0b15a77c0bcb8 Mon Sep 17 00:00:00 2001 From: Ilango Date: Fri, 19 Jan 2024 12:29:41 +0530 Subject: [PATCH 01/13] Add api endpoint for fetching links from a url --- packages/server/src/index.ts | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/packages/server/src/index.ts b/packages/server/src/index.ts index 94a3b538..c7079ed9 100644 --- a/packages/server/src/index.ts +++ b/packages/server/src/index.ts @@ -57,7 +57,7 @@ import { Tool } from './database/entities/Tool' import { Assistant } from './database/entities/Assistant' import { ChatflowPool } from './ChatflowPool' import { CachePool } from './CachePool' -import { ICommonObject, IMessage, INodeOptionsValue, handleEscapeCharacters } from 'flowise-components' +import { ICommonObject, IMessage, INodeOptionsValue, handleEscapeCharacters, webCrawl, xmlScrape } from 'flowise-components' import { createRateLimiter, getRateLimiter, initializeRateLimiter } from './utils/rateLimit' import { addAPIKey, compareKeys, deleteAPIKey, getApiKey, getAPIKeys, updateAPIKey } from './utils/apiKey' import { sanitizeMiddleware } from './utils/XSS' @@ -1087,6 +1087,19 @@ export class App { } }) + // ---------------------------------------- + // Scraper + // ---------------------------------------- + + this.app.get('/api/v1/fetch-links', async (req: Request, res: Response) => { + const url = decodeURIComponent(req.query.url as string) + const relativeLinksMethod = req.query.relativeLinksMethod as string + if (process.env.DEBUG === 'true') console.info(`Start ${relativeLinksMethod}`) + const links: string[] = relativeLinksMethod === 'webCrawl' ? await webCrawl(url, 0) : await xmlScrape(url, 0) + + res.json({ status: 'OK', links }) + }) + // ---------------------------------------- // Upsert // ---------------------------------------- From 1b8813a8b92df702fc26922dd361f018b486b4ee Mon Sep 17 00:00:00 2001 From: Ilango Date: Fri, 19 Jan 2024 12:32:58 +0530 Subject: [PATCH 02/13] Show a manage links button for web scraper nodes - cheerio, puppeteer, playwright --- .../ui/src/views/canvas/NodeInputHandler.js | 58 ++++++++++++++++++- 1 file changed, 55 insertions(+), 3 deletions(-) diff --git a/packages/ui/src/views/canvas/NodeInputHandler.js b/packages/ui/src/views/canvas/NodeInputHandler.js index a673d6b7..6c7a277e 100644 --- a/packages/ui/src/views/canvas/NodeInputHandler.js +++ b/packages/ui/src/views/canvas/NodeInputHandler.js @@ -28,6 +28,8 @@ import ToolDialog from 'views/tools/ToolDialog' import AssistantDialog from 'views/assistants/AssistantDialog' import ExpandTextDialog from 'ui-component/dialog/ExpandTextDialog' import FormatPromptValuesDialog from 'ui-component/dialog/FormatPromptValuesDialog' +import PromptLangsmithHubDialog from 'ui-component/dialog/PromptLangsmithHubDialog' +import ManageScrapedLinksDialog from 'ui-component/dialog/ManageScrapedLinksDialog' import CredentialInputHandler from './CredentialInputHandler' // utils @@ -35,7 +37,6 @@ import { getInputVariables } from 'utils/genericHelper' // const import { FLOWISE_CREDENTIAL_ID } from 'store/constant' -import PromptLangsmithHubDialog from '../../ui-component/dialog/PromptLangsmithHubDialog' const EDITABLE_OPTIONS = ['selectedTool', 'selectedAssistant'] @@ -62,22 +63,25 @@ const NodeInputHandler = ({ inputAnchor, inputParam, data, disabled = false, isA const [showFormatPromptValuesDialog, setShowFormatPromptValuesDialog] = useState(false) const [formatPromptValuesDialogProps, setFormatPromptValuesDialogProps] = useState({}) const [showPromptHubDialog, setShowPromptHubDialog] = useState(false) + const [showManageScrapedLinksDialog, setShowManageScrapedLinksDialog] = useState(false) + const [manageScrapedLinksDialogProps, setManageScrapedLinksDialogProps] = useState({}) const onExpandDialogClicked = (value, inputParam) => { - const dialogProp = { + const dialogProps = { value, inputParam, disabled, confirmButtonName: 'Save', cancelButtonName: 'Cancel' } - setExpandDialogProps(dialogProp) + setExpandDialogProps(dialogProps) setShowExpandDialog(true) } const onShowPromptHubButtonClicked = () => { setShowPromptHubDialog(true) } + const onShowPromptHubButtonSubmit = (templates) => { setShowPromptHubDialog(false) for (const t of templates) { @@ -86,6 +90,23 @@ const NodeInputHandler = ({ inputAnchor, inputParam, data, disabled = false, isA } } } + + const onManageLinksDialogClicked = (url, selectedLinks) => { + const dialogProps = { + url, + selectedLinks, + confirmButtonName: 'Save', + cancelButtonName: 'Cancel' + } + setManageScrapedLinksDialogProps(dialogProps) + setShowManageScrapedLinksDialog(true) + } + + const onManageLinksDialogSave = (links) => { + setShowManageScrapedLinksDialog(false) + data.inputs.selectedLinks = links + } + const onEditJSONClicked = (value, inputParam) => { // Preset values if the field is format prompt values let inputValue = value @@ -436,6 +457,37 @@ const NodeInputHandler = ({ inputAnchor, inputParam, data, disabled = false, isA )} + {(data.name === 'cheerioWebScraper' || + data.name === 'puppeteerWebScraper' || + data.name === 'playwrightWebScraper') && + inputParam.name === 'url' && ( + <> + + setShowManageScrapedLinksDialog(false)} + onSave={onManageLinksDialogSave} + /> + + )} )} From 9637c122974e8236a8fcc8d9487cfd5cdbd64e72 Mon Sep 17 00:00:00 2001 From: Ilango Date: Fri, 19 Jan 2024 12:33:54 +0530 Subject: [PATCH 03/13] Show a dialog to fetch and manage links in web scraper nodes --- .../dialog/ManageScrapedLinksDialog.js | 184 ++++++++++++++++++ 1 file changed, 184 insertions(+) create mode 100644 packages/ui/src/ui-component/dialog/ManageScrapedLinksDialog.js diff --git a/packages/ui/src/ui-component/dialog/ManageScrapedLinksDialog.js b/packages/ui/src/ui-component/dialog/ManageScrapedLinksDialog.js new file mode 100644 index 00000000..ecfcd403 --- /dev/null +++ b/packages/ui/src/ui-component/dialog/ManageScrapedLinksDialog.js @@ -0,0 +1,184 @@ +import PropTypes from 'prop-types' +import { createPortal } from 'react-dom' +import { useDispatch } from 'react-redux' +import { useState, useEffect } from 'react' + +import { + Box, + Button, + Dialog, + DialogActions, + DialogContent, + DialogTitle, + FormControl, + IconButton, + OutlinedInput, + Stack, + Typography +} from '@mui/material' +import { IconTrash } from '@tabler/icons' +import PerfectScrollbar from 'react-perfect-scrollbar' + +import { BackdropLoader } from 'ui-component/loading/BackdropLoader' +import { StyledButton } from 'ui-component/button/StyledButton' + +import scraperApi from 'api/scraper' + +import { HIDE_CANVAS_DIALOG, SHOW_CANVAS_DIALOG } from 'store/actions' + +const ManageScrapedLinksDialog = ({ show, dialogProps, onCancel, onSave }) => { + const portalElement = document.getElementById('portal') + const dispatch = useDispatch() + + const [loading, setLoading] = useState(false) + const [selectedLinks, setSelectedLinks] = useState([]) + const [url, setUrl] = useState('') + + useEffect(() => { + if (dialogProps.url) setUrl(dialogProps.url) + if (dialogProps.selectedLinks) setSelectedLinks(dialogProps.selectedLinks) + + return () => { + setLoading(false) + setSelectedLinks([]) + setUrl('') + } + }, [dialogProps]) + + useEffect(() => { + if (show) dispatch({ type: SHOW_CANVAS_DIALOG }) + else dispatch({ type: HIDE_CANVAS_DIALOG }) + return () => dispatch({ type: HIDE_CANVAS_DIALOG }) + }, [show, dispatch]) + + const handleFetchLinks = async () => { + setLoading(true) + const fetchLinksResp = await scraperApi.fetchAllLinks(url, 'webCrawl') + if (fetchLinksResp.data) { + setSelectedLinks(fetchLinksResp.data.links) + } + setLoading(false) + } + + const handleChangeLink = (index, event) => { + const { value } = event.target + const links = [...selectedLinks] + links[index] = value + setSelectedLinks(links) + } + + const handleRemoveLink = (index) => { + const links = [...selectedLinks] + links.splice(index, 1) + setSelectedLinks(links) + } + + const handleSaveLinks = () => { + onSave(selectedLinks) + } + + const component = show ? ( + + + {dialogProps.title || `Manage Scraped Links - ${url}`} + + + + + + { + setUrl(e.target.value) + }} + /> + + + + + Scraped Links + {selectedLinks.length > 0 ? ( + + {selectedLinks.map((link, index) => ( +
+ + handleChangeLink(index, e)} + size='small' + value={link} + name={`link_${index}`} + /> + + + handleRemoveLink(index)} + edge='end' + > + + + +
+ ))} +
+ ) : ( + <> + {loading && } +
+ Links scraped from the URL will appear here +
+ + )} +
+ + + + Save + + +
+ ) : null + + return createPortal(component, portalElement) +} + +ManageScrapedLinksDialog.propTypes = { + show: PropTypes.bool, + dialogProps: PropTypes.object, + onCancel: PropTypes.func, + onSave: PropTypes.func +} + +export default ManageScrapedLinksDialog From 43fa1166df61f1c5027429f99c8f41833227f4cb Mon Sep 17 00:00:00 2001 From: Ilango Date: Fri, 19 Jan 2024 12:34:22 +0530 Subject: [PATCH 04/13] Add interface for fetching links from server --- packages/ui/src/api/scraper.js | 8 ++++++++ 1 file changed, 8 insertions(+) create mode 100644 packages/ui/src/api/scraper.js diff --git a/packages/ui/src/api/scraper.js b/packages/ui/src/api/scraper.js new file mode 100644 index 00000000..382a9263 --- /dev/null +++ b/packages/ui/src/api/scraper.js @@ -0,0 +1,8 @@ +import client from './client' + +const fetchAllLinks = (url, relativeLinksMethod) => + client.get(`/fetch-links?url=${encodeURIComponent(url)}&relativeLinksMethod=${relativeLinksMethod}`) + +export default { + fetchAllLinks +} From bfa26a72c4fd6b373aea58a6f5e88a8842367685 Mon Sep 17 00:00:00 2001 From: Ilango Date: Fri, 19 Jan 2024 14:25:04 +0530 Subject: [PATCH 05/13] Use selected links if available when scraping in cheerio, puppeteer, and playwright nodes --- .../nodes/documentloaders/Cheerio/Cheerio.ts | 12 +++++++++++- .../nodes/documentloaders/Playwright/Playwright.ts | 12 +++++++++++- .../nodes/documentloaders/Puppeteer/Puppeteer.ts | 12 +++++++++++- 3 files changed, 33 insertions(+), 3 deletions(-) diff --git a/packages/components/nodes/documentloaders/Cheerio/Cheerio.ts b/packages/components/nodes/documentloaders/Cheerio/Cheerio.ts index aa899bcb..e883c097 100644 --- a/packages/components/nodes/documentloaders/Cheerio/Cheerio.ts +++ b/packages/components/nodes/documentloaders/Cheerio/Cheerio.ts @@ -90,6 +90,7 @@ class Cheerio_DocumentLoaders implements INode { const textSplitter = nodeData.inputs?.textSplitter as TextSplitter const metadata = nodeData.inputs?.metadata const relativeLinksMethod = nodeData.inputs?.relativeLinksMethod as string + const selectedLinks = nodeData.inputs?.selectedLinks as string[] let limit = nodeData.inputs?.limit as string let url = nodeData.inputs?.url as string @@ -127,13 +128,22 @@ class Cheerio_DocumentLoaders implements INode { if (!limit) limit = '10' else if (parseInt(limit) < 0) throw new Error('Limit cannot be less than 0') const pages: string[] = - relativeLinksMethod === 'webCrawl' ? await webCrawl(url, parseInt(limit)) : await xmlScrape(url, parseInt(limit)) + selectedLinks && selectedLinks.length > 0 + ? selectedLinks.slice(0, parseInt(limit)) + : relativeLinksMethod === 'webCrawl' + ? await webCrawl(url, parseInt(limit)) + : await xmlScrape(url, parseInt(limit)) if (process.env.DEBUG === 'true') console.info(`pages: ${JSON.stringify(pages)}, length: ${pages.length}`) if (!pages || pages.length === 0) throw new Error('No relative links found') for (const page of pages) { docs.push(...(await cheerioLoader(page))) } if (process.env.DEBUG === 'true') console.info(`Finish ${relativeLinksMethod}`) + } else if (selectedLinks && selectedLinks.length > 0) { + if (process.env.DEBUG === 'true') console.info(`pages: ${JSON.stringify(selectedLinks)}, length: ${selectedLinks.length}`) + for (const page of selectedLinks) { + docs.push(...(await cheerioLoader(page))) + } } else { docs = await cheerioLoader(url) } diff --git a/packages/components/nodes/documentloaders/Playwright/Playwright.ts b/packages/components/nodes/documentloaders/Playwright/Playwright.ts index eb246045..65be3ce7 100644 --- a/packages/components/nodes/documentloaders/Playwright/Playwright.ts +++ b/packages/components/nodes/documentloaders/Playwright/Playwright.ts @@ -118,6 +118,7 @@ class Playwright_DocumentLoaders implements INode { const textSplitter = nodeData.inputs?.textSplitter as TextSplitter const metadata = nodeData.inputs?.metadata const relativeLinksMethod = nodeData.inputs?.relativeLinksMethod as string + const selectedLinks = nodeData.inputs?.selectedLinks as string[] let limit = nodeData.inputs?.limit as string let waitUntilGoToOption = nodeData.inputs?.waitUntilGoToOption as 'load' | 'domcontentloaded' | 'networkidle' | 'commit' | undefined let waitForSelector = nodeData.inputs?.waitForSelector as string @@ -168,13 +169,22 @@ class Playwright_DocumentLoaders implements INode { if (!limit) limit = '10' else if (parseInt(limit) < 0) throw new Error('Limit cannot be less than 0') const pages: string[] = - relativeLinksMethod === 'webCrawl' ? await webCrawl(url, parseInt(limit)) : await xmlScrape(url, parseInt(limit)) + selectedLinks && selectedLinks.length > 0 + ? selectedLinks.slice(0, parseInt(limit)) + : relativeLinksMethod === 'webCrawl' + ? await webCrawl(url, parseInt(limit)) + : await xmlScrape(url, parseInt(limit)) if (process.env.DEBUG === 'true') console.info(`pages: ${JSON.stringify(pages)}, length: ${pages.length}`) if (!pages || pages.length === 0) throw new Error('No relative links found') for (const page of pages) { docs.push(...(await playwrightLoader(page))) } if (process.env.DEBUG === 'true') console.info(`Finish ${relativeLinksMethod}`) + } else if (selectedLinks && selectedLinks.length > 0) { + if (process.env.DEBUG === 'true') console.info(`pages: ${JSON.stringify(selectedLinks)}, length: ${selectedLinks.length}`) + for (const page of selectedLinks) { + docs.push(...(await playwrightLoader(page))) + } } else { docs = await playwrightLoader(url) } diff --git a/packages/components/nodes/documentloaders/Puppeteer/Puppeteer.ts b/packages/components/nodes/documentloaders/Puppeteer/Puppeteer.ts index 4691eb94..d5539659 100644 --- a/packages/components/nodes/documentloaders/Puppeteer/Puppeteer.ts +++ b/packages/components/nodes/documentloaders/Puppeteer/Puppeteer.ts @@ -119,6 +119,7 @@ class Puppeteer_DocumentLoaders implements INode { const textSplitter = nodeData.inputs?.textSplitter as TextSplitter const metadata = nodeData.inputs?.metadata const relativeLinksMethod = nodeData.inputs?.relativeLinksMethod as string + const selectedLinks = nodeData.inputs?.selectedLinks as string[] let limit = nodeData.inputs?.limit as string let waitUntilGoToOption = nodeData.inputs?.waitUntilGoToOption as PuppeteerLifeCycleEvent let waitForSelector = nodeData.inputs?.waitForSelector as string @@ -169,13 +170,22 @@ class Puppeteer_DocumentLoaders implements INode { if (!limit) limit = '10' else if (parseInt(limit) < 0) throw new Error('Limit cannot be less than 0') const pages: string[] = - relativeLinksMethod === 'webCrawl' ? await webCrawl(url, parseInt(limit)) : await xmlScrape(url, parseInt(limit)) + selectedLinks && selectedLinks.length > 0 + ? selectedLinks.slice(0, parseInt(limit)) + : relativeLinksMethod === 'webCrawl' + ? await webCrawl(url, parseInt(limit)) + : await xmlScrape(url, parseInt(limit)) if (process.env.DEBUG === 'true') console.info(`pages: ${JSON.stringify(pages)}, length: ${pages.length}`) if (!pages || pages.length === 0) throw new Error('No relative links found') for (const page of pages) { docs.push(...(await puppeteerLoader(page))) } if (process.env.DEBUG === 'true') console.info(`Finish ${relativeLinksMethod}`) + } else if (selectedLinks && selectedLinks.length > 0) { + if (process.env.DEBUG === 'true') console.info(`pages: ${JSON.stringify(selectedLinks)}, length: ${selectedLinks.length}`) + for (const page of selectedLinks) { + docs.push(...(await puppeteerLoader(page))) + } } else { docs = await puppeteerLoader(url) } From 76cb8794bf0255cc7bb6f00bcdcd0ee2d13f0e99 Mon Sep 17 00:00:00 2001 From: Ilango Date: Mon, 22 Jan 2024 08:19:20 +0530 Subject: [PATCH 06/13] Update where loader is rendered in manage links dialog --- .../dialog/ManageScrapedLinksDialog.js | 86 +++++++++---------- 1 file changed, 43 insertions(+), 43 deletions(-) diff --git a/packages/ui/src/ui-component/dialog/ManageScrapedLinksDialog.js b/packages/ui/src/ui-component/dialog/ManageScrapedLinksDialog.js index ecfcd403..443ef094 100644 --- a/packages/ui/src/ui-component/dialog/ManageScrapedLinksDialog.js +++ b/packages/ui/src/ui-component/dialog/ManageScrapedLinksDialog.js @@ -115,52 +115,52 @@ const ManageScrapedLinksDialog = ({ show, dialogProps, onCancel, onSave }) => { Scraped Links - {selectedLinks.length > 0 ? ( - - {selectedLinks.map((link, index) => ( -
- - handleChangeLink(index, e)} - size='small' - value={link} - name={`link_${index}`} - /> - - - handleRemoveLink(index)} - edge='end' - > - - - -
- ))} -
- ) : ( - <> - {loading && } + <> + {loading && } + {selectedLinks.length > 0 ? ( + + {selectedLinks.map((link, index) => ( +
+ + handleChangeLink(index, e)} + size='small' + value={link} + name={`link_${index}`} + /> + + + handleRemoveLink(index)} + edge='end' + > + + + +
+ ))} +
+ ) : (
Links scraped from the URL will appear here
- - )} + )} + From 62ec17d6841db4f687a76bf9ede68622dd17c89b Mon Sep 17 00:00:00 2001 From: Ilango Date: Mon, 22 Jan 2024 08:19:41 +0530 Subject: [PATCH 07/13] Update manage links button variant --- packages/ui/src/views/canvas/NodeInputHandler.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/ui/src/views/canvas/NodeInputHandler.js b/packages/ui/src/views/canvas/NodeInputHandler.js index 6c7a277e..f7309bd3 100644 --- a/packages/ui/src/views/canvas/NodeInputHandler.js +++ b/packages/ui/src/views/canvas/NodeInputHandler.js @@ -470,7 +470,7 @@ const NodeInputHandler = ({ inputAnchor, inputParam, data, disabled = false, isA }} disabled={disabled} sx={{ borderRadius: '12px', width: '100%', mt: 1 }} - variant='contained' + variant='outlined' onClick={() => onManageLinksDialogClicked( data.inputs[inputParam.name] ?? inputParam.default ?? '', From bf60a1a2a929840d185f161d78f16567dcc8e8bb Mon Sep 17 00:00:00 2001 From: Ilango Date: Mon, 22 Jan 2024 08:30:36 +0530 Subject: [PATCH 08/13] Fix multiple calls to parseInt --- .../nodes/documentloaders/Cheerio/Cheerio.ts | 12 ++++++------ .../nodes/documentloaders/Playwright/Playwright.ts | 12 ++++++------ .../nodes/documentloaders/Puppeteer/Puppeteer.ts | 12 ++++++------ 3 files changed, 18 insertions(+), 18 deletions(-) diff --git a/packages/components/nodes/documentloaders/Cheerio/Cheerio.ts b/packages/components/nodes/documentloaders/Cheerio/Cheerio.ts index e883c097..28069c22 100644 --- a/packages/components/nodes/documentloaders/Cheerio/Cheerio.ts +++ b/packages/components/nodes/documentloaders/Cheerio/Cheerio.ts @@ -91,7 +91,7 @@ class Cheerio_DocumentLoaders implements INode { const metadata = nodeData.inputs?.metadata const relativeLinksMethod = nodeData.inputs?.relativeLinksMethod as string const selectedLinks = nodeData.inputs?.selectedLinks as string[] - let limit = nodeData.inputs?.limit as string + let limit = parseInt(nodeData.inputs?.limit as string) let url = nodeData.inputs?.url as string url = url.trim() @@ -125,14 +125,14 @@ class Cheerio_DocumentLoaders implements INode { let docs = [] if (relativeLinksMethod) { if (process.env.DEBUG === 'true') console.info(`Start ${relativeLinksMethod}`) - if (!limit) limit = '10' - else if (parseInt(limit) < 0) throw new Error('Limit cannot be less than 0') + if (!limit) limit = 10 + else if (limit < 0) throw new Error('Limit cannot be less than 0') const pages: string[] = selectedLinks && selectedLinks.length > 0 - ? selectedLinks.slice(0, parseInt(limit)) + ? selectedLinks.slice(0, limit) : relativeLinksMethod === 'webCrawl' - ? await webCrawl(url, parseInt(limit)) - : await xmlScrape(url, parseInt(limit)) + ? await webCrawl(url, limit) + : await xmlScrape(url, limit) if (process.env.DEBUG === 'true') console.info(`pages: ${JSON.stringify(pages)}, length: ${pages.length}`) if (!pages || pages.length === 0) throw new Error('No relative links found') for (const page of pages) { diff --git a/packages/components/nodes/documentloaders/Playwright/Playwright.ts b/packages/components/nodes/documentloaders/Playwright/Playwright.ts index 65be3ce7..fd4650c4 100644 --- a/packages/components/nodes/documentloaders/Playwright/Playwright.ts +++ b/packages/components/nodes/documentloaders/Playwright/Playwright.ts @@ -119,7 +119,7 @@ class Playwright_DocumentLoaders implements INode { const metadata = nodeData.inputs?.metadata const relativeLinksMethod = nodeData.inputs?.relativeLinksMethod as string const selectedLinks = nodeData.inputs?.selectedLinks as string[] - let limit = nodeData.inputs?.limit as string + let limit = parseInt(nodeData.inputs?.limit as string) let waitUntilGoToOption = nodeData.inputs?.waitUntilGoToOption as 'load' | 'domcontentloaded' | 'networkidle' | 'commit' | undefined let waitForSelector = nodeData.inputs?.waitForSelector as string @@ -166,14 +166,14 @@ class Playwright_DocumentLoaders implements INode { let docs = [] if (relativeLinksMethod) { if (process.env.DEBUG === 'true') console.info(`Start ${relativeLinksMethod}`) - if (!limit) limit = '10' - else if (parseInt(limit) < 0) throw new Error('Limit cannot be less than 0') + if (!limit) limit = 10 + else if (limit < 0) throw new Error('Limit cannot be less than 0') const pages: string[] = selectedLinks && selectedLinks.length > 0 - ? selectedLinks.slice(0, parseInt(limit)) + ? selectedLinks.slice(0, limit) : relativeLinksMethod === 'webCrawl' - ? await webCrawl(url, parseInt(limit)) - : await xmlScrape(url, parseInt(limit)) + ? await webCrawl(url, limit) + : await xmlScrape(url, limit) if (process.env.DEBUG === 'true') console.info(`pages: ${JSON.stringify(pages)}, length: ${pages.length}`) if (!pages || pages.length === 0) throw new Error('No relative links found') for (const page of pages) { diff --git a/packages/components/nodes/documentloaders/Puppeteer/Puppeteer.ts b/packages/components/nodes/documentloaders/Puppeteer/Puppeteer.ts index d5539659..ed004b6d 100644 --- a/packages/components/nodes/documentloaders/Puppeteer/Puppeteer.ts +++ b/packages/components/nodes/documentloaders/Puppeteer/Puppeteer.ts @@ -120,7 +120,7 @@ class Puppeteer_DocumentLoaders implements INode { const metadata = nodeData.inputs?.metadata const relativeLinksMethod = nodeData.inputs?.relativeLinksMethod as string const selectedLinks = nodeData.inputs?.selectedLinks as string[] - let limit = nodeData.inputs?.limit as string + let limit = parseInt(nodeData.inputs?.limit as string) let waitUntilGoToOption = nodeData.inputs?.waitUntilGoToOption as PuppeteerLifeCycleEvent let waitForSelector = nodeData.inputs?.waitForSelector as string @@ -167,14 +167,14 @@ class Puppeteer_DocumentLoaders implements INode { let docs = [] if (relativeLinksMethod) { if (process.env.DEBUG === 'true') console.info(`Start ${relativeLinksMethod}`) - if (!limit) limit = '10' - else if (parseInt(limit) < 0) throw new Error('Limit cannot be less than 0') + if (!limit) limit = 10 + else if (limit < 0) throw new Error('Limit cannot be less than 0') const pages: string[] = selectedLinks && selectedLinks.length > 0 - ? selectedLinks.slice(0, parseInt(limit)) + ? selectedLinks.slice(0, limit) : relativeLinksMethod === 'webCrawl' - ? await webCrawl(url, parseInt(limit)) - : await xmlScrape(url, parseInt(limit)) + ? await webCrawl(url, limit) + : await xmlScrape(url, limit) if (process.env.DEBUG === 'true') console.info(`pages: ${JSON.stringify(pages)}, length: ${pages.length}`) if (!pages || pages.length === 0) throw new Error('No relative links found') for (const page of pages) { From c24708f53bcc830b96ba7634b0919f04fd846c80 Mon Sep 17 00:00:00 2001 From: Ilango Date: Mon, 22 Jan 2024 08:42:44 +0530 Subject: [PATCH 09/13] Set default value for get links limit in web scraper nodes - cheerio, playwright, and puppeteer --- packages/components/nodes/documentloaders/Cheerio/Cheerio.ts | 1 + .../components/nodes/documentloaders/Playwright/Playwright.ts | 1 + packages/components/nodes/documentloaders/Puppeteer/Puppeteer.ts | 1 + 3 files changed, 3 insertions(+) diff --git a/packages/components/nodes/documentloaders/Cheerio/Cheerio.ts b/packages/components/nodes/documentloaders/Cheerio/Cheerio.ts index 28069c22..2f0bd8b6 100644 --- a/packages/components/nodes/documentloaders/Cheerio/Cheerio.ts +++ b/packages/components/nodes/documentloaders/Cheerio/Cheerio.ts @@ -63,6 +63,7 @@ class Cheerio_DocumentLoaders implements INode { name: 'limit', type: 'number', optional: true, + default: '10', additionalParams: true, description: 'Only used when "Get Relative Links Method" is selected. Set 0 to retrieve all relative links, default limit is 10.', diff --git a/packages/components/nodes/documentloaders/Playwright/Playwright.ts b/packages/components/nodes/documentloaders/Playwright/Playwright.ts index fd4650c4..cb27f1c4 100644 --- a/packages/components/nodes/documentloaders/Playwright/Playwright.ts +++ b/packages/components/nodes/documentloaders/Playwright/Playwright.ts @@ -61,6 +61,7 @@ class Playwright_DocumentLoaders implements INode { name: 'limit', type: 'number', optional: true, + default: '10', additionalParams: true, description: 'Only used when "Get Relative Links Method" is selected. Set 0 to retrieve all relative links, default limit is 10.', diff --git a/packages/components/nodes/documentloaders/Puppeteer/Puppeteer.ts b/packages/components/nodes/documentloaders/Puppeteer/Puppeteer.ts index ed004b6d..fe7d4f8a 100644 --- a/packages/components/nodes/documentloaders/Puppeteer/Puppeteer.ts +++ b/packages/components/nodes/documentloaders/Puppeteer/Puppeteer.ts @@ -62,6 +62,7 @@ class Puppeteer_DocumentLoaders implements INode { name: 'limit', type: 'number', optional: true, + default: '10', additionalParams: true, description: 'Only used when "Get Relative Links Method" is selected. Set 0 to retrieve all relative links, default limit is 10.', From 193e5c4640d96b73499beb032e6e8e092b2dfa7d Mon Sep 17 00:00:00 2001 From: Ilango Date: Mon, 22 Jan 2024 08:49:04 +0530 Subject: [PATCH 10/13] Update console statements to use logger --- .../nodes/documentloaders/Cheerio/Cheerio.ts | 15 ++++++++------- .../documentloaders/Playwright/Playwright.ts | 15 ++++++++------- .../nodes/documentloaders/Puppeteer/Puppeteer.ts | 15 ++++++++------- 3 files changed, 24 insertions(+), 21 deletions(-) diff --git a/packages/components/nodes/documentloaders/Cheerio/Cheerio.ts b/packages/components/nodes/documentloaders/Cheerio/Cheerio.ts index 2f0bd8b6..3eba0ece 100644 --- a/packages/components/nodes/documentloaders/Cheerio/Cheerio.ts +++ b/packages/components/nodes/documentloaders/Cheerio/Cheerio.ts @@ -1,4 +1,4 @@ -import { INode, INodeData, INodeParams } from '../../../src/Interface' +import { ICommonObject, INode, INodeData, INodeParams } from '../../../src/Interface' import { TextSplitter } from 'langchain/text_splitter' import { CheerioWebBaseLoader, WebBaseLoaderParams } from 'langchain/document_loaders/web/cheerio' import { test } from 'linkifyjs' @@ -87,7 +87,7 @@ class Cheerio_DocumentLoaders implements INode { ] } - async init(nodeData: INodeData): Promise { + async init(nodeData: INodeData, _: string, options: ICommonObject): Promise { const textSplitter = nodeData.inputs?.textSplitter as TextSplitter const metadata = nodeData.inputs?.metadata const relativeLinksMethod = nodeData.inputs?.relativeLinksMethod as string @@ -119,13 +119,13 @@ class Cheerio_DocumentLoaders implements INode { } return docs } catch (err) { - if (process.env.DEBUG === 'true') console.error(`error in CheerioWebBaseLoader: ${err.message}, on page: ${url}`) + if (process.env.DEBUG === 'true') options.logger.error(`error in CheerioWebBaseLoader: ${err.message}, on page: ${url}`) } } let docs = [] if (relativeLinksMethod) { - if (process.env.DEBUG === 'true') console.info(`Start ${relativeLinksMethod}`) + if (process.env.DEBUG === 'true') options.logger.info(`Start ${relativeLinksMethod}`) if (!limit) limit = 10 else if (limit < 0) throw new Error('Limit cannot be less than 0') const pages: string[] = @@ -134,14 +134,15 @@ class Cheerio_DocumentLoaders implements INode { : relativeLinksMethod === 'webCrawl' ? await webCrawl(url, limit) : await xmlScrape(url, limit) - if (process.env.DEBUG === 'true') console.info(`pages: ${JSON.stringify(pages)}, length: ${pages.length}`) + if (process.env.DEBUG === 'true') options.logger.info(`pages: ${JSON.stringify(pages)}, length: ${pages.length}`) if (!pages || pages.length === 0) throw new Error('No relative links found') for (const page of pages) { docs.push(...(await cheerioLoader(page))) } - if (process.env.DEBUG === 'true') console.info(`Finish ${relativeLinksMethod}`) + if (process.env.DEBUG === 'true') options.logger.info(`Finish ${relativeLinksMethod}`) } else if (selectedLinks && selectedLinks.length > 0) { - if (process.env.DEBUG === 'true') console.info(`pages: ${JSON.stringify(selectedLinks)}, length: ${selectedLinks.length}`) + if (process.env.DEBUG === 'true') + options.logger.info(`pages: ${JSON.stringify(selectedLinks)}, length: ${selectedLinks.length}`) for (const page of selectedLinks) { docs.push(...(await cheerioLoader(page))) } diff --git a/packages/components/nodes/documentloaders/Playwright/Playwright.ts b/packages/components/nodes/documentloaders/Playwright/Playwright.ts index cb27f1c4..2de166ce 100644 --- a/packages/components/nodes/documentloaders/Playwright/Playwright.ts +++ b/packages/components/nodes/documentloaders/Playwright/Playwright.ts @@ -1,4 +1,4 @@ -import { INode, INodeData, INodeParams } from '../../../src/Interface' +import { ICommonObject, INode, INodeData, INodeParams } from '../../../src/Interface' import { TextSplitter } from 'langchain/text_splitter' import { Browser, Page, PlaywrightWebBaseLoader, PlaywrightWebBaseLoaderOptions } from 'langchain/document_loaders/web/playwright' import { test } from 'linkifyjs' @@ -115,7 +115,7 @@ class Playwright_DocumentLoaders implements INode { ] } - async init(nodeData: INodeData): Promise { + async init(nodeData: INodeData, _: string, options: ICommonObject): Promise { const textSplitter = nodeData.inputs?.textSplitter as TextSplitter const metadata = nodeData.inputs?.metadata const relativeLinksMethod = nodeData.inputs?.relativeLinksMethod as string @@ -160,13 +160,13 @@ class Playwright_DocumentLoaders implements INode { } return docs } catch (err) { - if (process.env.DEBUG === 'true') console.error(`error in PlaywrightWebBaseLoader: ${err.message}, on page: ${url}`) + if (process.env.DEBUG === 'true') options.logger.error(`error in PlaywrightWebBaseLoader: ${err.message}, on page: ${url}`) } } let docs = [] if (relativeLinksMethod) { - if (process.env.DEBUG === 'true') console.info(`Start ${relativeLinksMethod}`) + if (process.env.DEBUG === 'true') options.logger.info(`Start ${relativeLinksMethod}`) if (!limit) limit = 10 else if (limit < 0) throw new Error('Limit cannot be less than 0') const pages: string[] = @@ -175,14 +175,15 @@ class Playwright_DocumentLoaders implements INode { : relativeLinksMethod === 'webCrawl' ? await webCrawl(url, limit) : await xmlScrape(url, limit) - if (process.env.DEBUG === 'true') console.info(`pages: ${JSON.stringify(pages)}, length: ${pages.length}`) + if (process.env.DEBUG === 'true') options.logger.info(`pages: ${JSON.stringify(pages)}, length: ${pages.length}`) if (!pages || pages.length === 0) throw new Error('No relative links found') for (const page of pages) { docs.push(...(await playwrightLoader(page))) } - if (process.env.DEBUG === 'true') console.info(`Finish ${relativeLinksMethod}`) + if (process.env.DEBUG === 'true') options.logger.info(`Finish ${relativeLinksMethod}`) } else if (selectedLinks && selectedLinks.length > 0) { - if (process.env.DEBUG === 'true') console.info(`pages: ${JSON.stringify(selectedLinks)}, length: ${selectedLinks.length}`) + if (process.env.DEBUG === 'true') + options.logger.info(`pages: ${JSON.stringify(selectedLinks)}, length: ${selectedLinks.length}`) for (const page of selectedLinks) { docs.push(...(await playwrightLoader(page))) } diff --git a/packages/components/nodes/documentloaders/Puppeteer/Puppeteer.ts b/packages/components/nodes/documentloaders/Puppeteer/Puppeteer.ts index fe7d4f8a..3d28f310 100644 --- a/packages/components/nodes/documentloaders/Puppeteer/Puppeteer.ts +++ b/packages/components/nodes/documentloaders/Puppeteer/Puppeteer.ts @@ -1,4 +1,4 @@ -import { INode, INodeData, INodeParams } from '../../../src/Interface' +import { ICommonObject, INode, INodeData, INodeParams } from '../../../src/Interface' import { TextSplitter } from 'langchain/text_splitter' import { Browser, Page, PuppeteerWebBaseLoader, PuppeteerWebBaseLoaderOptions } from 'langchain/document_loaders/web/puppeteer' import { test } from 'linkifyjs' @@ -116,7 +116,7 @@ class Puppeteer_DocumentLoaders implements INode { ] } - async init(nodeData: INodeData): Promise { + async init(nodeData: INodeData, _: string, options: ICommonObject): Promise { const textSplitter = nodeData.inputs?.textSplitter as TextSplitter const metadata = nodeData.inputs?.metadata const relativeLinksMethod = nodeData.inputs?.relativeLinksMethod as string @@ -161,13 +161,13 @@ class Puppeteer_DocumentLoaders implements INode { } return docs } catch (err) { - if (process.env.DEBUG === 'true') console.error(`error in PuppeteerWebBaseLoader: ${err.message}, on page: ${url}`) + if (process.env.DEBUG === 'true') options.logger.error(`error in PuppeteerWebBaseLoader: ${err.message}, on page: ${url}`) } } let docs = [] if (relativeLinksMethod) { - if (process.env.DEBUG === 'true') console.info(`Start ${relativeLinksMethod}`) + if (process.env.DEBUG === 'true') options.logger.info(`Start ${relativeLinksMethod}`) if (!limit) limit = 10 else if (limit < 0) throw new Error('Limit cannot be less than 0') const pages: string[] = @@ -176,14 +176,15 @@ class Puppeteer_DocumentLoaders implements INode { : relativeLinksMethod === 'webCrawl' ? await webCrawl(url, limit) : await xmlScrape(url, limit) - if (process.env.DEBUG === 'true') console.info(`pages: ${JSON.stringify(pages)}, length: ${pages.length}`) + if (process.env.DEBUG === 'true') options.logger.info(`pages: ${JSON.stringify(pages)}, length: ${pages.length}`) if (!pages || pages.length === 0) throw new Error('No relative links found') for (const page of pages) { docs.push(...(await puppeteerLoader(page))) } - if (process.env.DEBUG === 'true') console.info(`Finish ${relativeLinksMethod}`) + if (process.env.DEBUG === 'true') options.logger.info(`Finish ${relativeLinksMethod}`) } else if (selectedLinks && selectedLinks.length > 0) { - if (process.env.DEBUG === 'true') console.info(`pages: ${JSON.stringify(selectedLinks)}, length: ${selectedLinks.length}`) + if (process.env.DEBUG === 'true') + options.logger.info(`pages: ${JSON.stringify(selectedLinks)}, length: ${selectedLinks.length}`) for (const page of selectedLinks) { docs.push(...(await puppeteerLoader(page))) } From 6395b121b47d4194dda06edbfca55d3478ecbb0c Mon Sep 17 00:00:00 2001 From: Ilango Date: Tue, 23 Jan 2024 16:59:05 +0530 Subject: [PATCH 11/13] Update input url if user changed the url in manage links dialog --- .../ui/src/ui-component/dialog/ManageScrapedLinksDialog.js | 2 +- packages/ui/src/views/canvas/NodeInputHandler.js | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/packages/ui/src/ui-component/dialog/ManageScrapedLinksDialog.js b/packages/ui/src/ui-component/dialog/ManageScrapedLinksDialog.js index 443ef094..a707d82e 100644 --- a/packages/ui/src/ui-component/dialog/ManageScrapedLinksDialog.js +++ b/packages/ui/src/ui-component/dialog/ManageScrapedLinksDialog.js @@ -74,7 +74,7 @@ const ManageScrapedLinksDialog = ({ show, dialogProps, onCancel, onSave }) => { } const handleSaveLinks = () => { - onSave(selectedLinks) + onSave(url, selectedLinks) } const component = show ? ( diff --git a/packages/ui/src/views/canvas/NodeInputHandler.js b/packages/ui/src/views/canvas/NodeInputHandler.js index f7309bd3..bc877c9f 100644 --- a/packages/ui/src/views/canvas/NodeInputHandler.js +++ b/packages/ui/src/views/canvas/NodeInputHandler.js @@ -102,8 +102,9 @@ const NodeInputHandler = ({ inputAnchor, inputParam, data, disabled = false, isA setShowManageScrapedLinksDialog(true) } - const onManageLinksDialogSave = (links) => { + const onManageLinksDialogSave = (url, links) => { setShowManageScrapedLinksDialog(false) + data.inputs.url = url data.inputs.selectedLinks = links } From 3abfa13587cad2d484449052b3b7a7f49fce6d97 Mon Sep 17 00:00:00 2001 From: Ilango Date: Thu, 25 Jan 2024 11:23:11 +0530 Subject: [PATCH 12/13] Add condition to skip initializing web scraper nodes during prediction --- packages/server/src/utils/index.ts | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/packages/server/src/utils/index.ts b/packages/server/src/utils/index.ts index dafe612c..6864cf28 100644 --- a/packages/server/src/utils/index.ts +++ b/packages/server/src/utils/index.ts @@ -328,6 +328,14 @@ export const buildLangchain = async ( logger.debug(`[server]: Finished upserting ${reactFlowNode.data.label} (${reactFlowNode.data.id})`) break } else { + // skip initializing web scraper nodes during prediction since they would have already run as a part of upsert + if ( + reactFlowNode.data.name === 'cheerioWebScraper' || + reactFlowNode.data.name === 'playwrightWebScraper' || + reactFlowNode.data.name === 'puppeteerWebScraper' + ) { + continue + } logger.debug(`[server]: Initializing ${reactFlowNode.data.label} (${reactFlowNode.data.id})`) let outputResult = await newNodeInstance.init(reactFlowNodeData, question, { chatId, From 98acb353764d3e63d8263552dab0b1c2be9902e1 Mon Sep 17 00:00:00 2001 From: Ilango Date: Fri, 26 Jan 2024 03:55:59 +0530 Subject: [PATCH 13/13] Revert adding condition to skip initialization of web scraper nodes --- packages/server/src/utils/index.ts | 8 -------- 1 file changed, 8 deletions(-) diff --git a/packages/server/src/utils/index.ts b/packages/server/src/utils/index.ts index 6864cf28..dafe612c 100644 --- a/packages/server/src/utils/index.ts +++ b/packages/server/src/utils/index.ts @@ -328,14 +328,6 @@ export const buildLangchain = async ( logger.debug(`[server]: Finished upserting ${reactFlowNode.data.label} (${reactFlowNode.data.id})`) break } else { - // skip initializing web scraper nodes during prediction since they would have already run as a part of upsert - if ( - reactFlowNode.data.name === 'cheerioWebScraper' || - reactFlowNode.data.name === 'playwrightWebScraper' || - reactFlowNode.data.name === 'puppeteerWebScraper' - ) { - continue - } logger.debug(`[server]: Initializing ${reactFlowNode.data.label} (${reactFlowNode.data.id})`) let outputResult = await newNodeInstance.init(reactFlowNodeData, question, { chatId,