diff --git a/packages/components/nodes/documentloaders/Cheerio/Cheerio.ts b/packages/components/nodes/documentloaders/Cheerio/Cheerio.ts index aa899bcb..3eba0ece 100644 --- a/packages/components/nodes/documentloaders/Cheerio/Cheerio.ts +++ b/packages/components/nodes/documentloaders/Cheerio/Cheerio.ts @@ -1,4 +1,4 @@ -import { INode, INodeData, INodeParams } from '../../../src/Interface' +import { ICommonObject, INode, INodeData, INodeParams } from '../../../src/Interface' import { TextSplitter } from 'langchain/text_splitter' import { CheerioWebBaseLoader, WebBaseLoaderParams } from 'langchain/document_loaders/web/cheerio' import { test } from 'linkifyjs' @@ -63,6 +63,7 @@ class Cheerio_DocumentLoaders implements INode { name: 'limit', type: 'number', optional: true, + default: '10', additionalParams: true, description: 'Only used when "Get Relative Links Method" is selected. Set 0 to retrieve all relative links, default limit is 10.', @@ -86,11 +87,12 @@ class Cheerio_DocumentLoaders implements INode { ] } - async init(nodeData: INodeData): Promise { + async init(nodeData: INodeData, _: string, options: ICommonObject): Promise { const textSplitter = nodeData.inputs?.textSplitter as TextSplitter const metadata = nodeData.inputs?.metadata const relativeLinksMethod = nodeData.inputs?.relativeLinksMethod as string - let limit = nodeData.inputs?.limit as string + const selectedLinks = nodeData.inputs?.selectedLinks as string[] + let limit = parseInt(nodeData.inputs?.limit as string) let url = nodeData.inputs?.url as string url = url.trim() @@ -117,23 +119,33 @@ class Cheerio_DocumentLoaders implements INode { } return docs } catch (err) { - if (process.env.DEBUG === 'true') console.error(`error in CheerioWebBaseLoader: ${err.message}, on page: ${url}`) + if (process.env.DEBUG === 'true') options.logger.error(`error in CheerioWebBaseLoader: ${err.message}, on page: ${url}`) } } let docs = [] if (relativeLinksMethod) { - if (process.env.DEBUG === 'true') console.info(`Start ${relativeLinksMethod}`) - if (!limit) limit = '10' - else if (parseInt(limit) < 0) throw new Error('Limit cannot be less than 0') + if (process.env.DEBUG === 'true') options.logger.info(`Start ${relativeLinksMethod}`) + if (!limit) limit = 10 + else if (limit < 0) throw new Error('Limit cannot be less than 0') const pages: string[] = - relativeLinksMethod === 'webCrawl' ? await webCrawl(url, parseInt(limit)) : await xmlScrape(url, parseInt(limit)) - if (process.env.DEBUG === 'true') console.info(`pages: ${JSON.stringify(pages)}, length: ${pages.length}`) + selectedLinks && selectedLinks.length > 0 + ? selectedLinks.slice(0, limit) + : relativeLinksMethod === 'webCrawl' + ? await webCrawl(url, limit) + : await xmlScrape(url, limit) + if (process.env.DEBUG === 'true') options.logger.info(`pages: ${JSON.stringify(pages)}, length: ${pages.length}`) if (!pages || pages.length === 0) throw new Error('No relative links found') for (const page of pages) { docs.push(...(await cheerioLoader(page))) } - if (process.env.DEBUG === 'true') console.info(`Finish ${relativeLinksMethod}`) + if (process.env.DEBUG === 'true') options.logger.info(`Finish ${relativeLinksMethod}`) + } else if (selectedLinks && selectedLinks.length > 0) { + if (process.env.DEBUG === 'true') + options.logger.info(`pages: ${JSON.stringify(selectedLinks)}, length: ${selectedLinks.length}`) + for (const page of selectedLinks) { + docs.push(...(await cheerioLoader(page))) + } } else { docs = await cheerioLoader(url) } diff --git a/packages/components/nodes/documentloaders/Playwright/Playwright.ts b/packages/components/nodes/documentloaders/Playwright/Playwright.ts index eb246045..2de166ce 100644 --- a/packages/components/nodes/documentloaders/Playwright/Playwright.ts +++ b/packages/components/nodes/documentloaders/Playwright/Playwright.ts @@ -1,4 +1,4 @@ -import { INode, INodeData, INodeParams } from '../../../src/Interface' +import { ICommonObject, INode, INodeData, INodeParams } from '../../../src/Interface' import { TextSplitter } from 'langchain/text_splitter' import { Browser, Page, PlaywrightWebBaseLoader, PlaywrightWebBaseLoaderOptions } from 'langchain/document_loaders/web/playwright' import { test } from 'linkifyjs' @@ -61,6 +61,7 @@ class Playwright_DocumentLoaders implements INode { name: 'limit', type: 'number', optional: true, + default: '10', additionalParams: true, description: 'Only used when "Get Relative Links Method" is selected. Set 0 to retrieve all relative links, default limit is 10.', @@ -114,11 +115,12 @@ class Playwright_DocumentLoaders implements INode { ] } - async init(nodeData: INodeData): Promise { + async init(nodeData: INodeData, _: string, options: ICommonObject): Promise { const textSplitter = nodeData.inputs?.textSplitter as TextSplitter const metadata = nodeData.inputs?.metadata const relativeLinksMethod = nodeData.inputs?.relativeLinksMethod as string - let limit = nodeData.inputs?.limit as string + const selectedLinks = nodeData.inputs?.selectedLinks as string[] + let limit = parseInt(nodeData.inputs?.limit as string) let waitUntilGoToOption = nodeData.inputs?.waitUntilGoToOption as 'load' | 'domcontentloaded' | 'networkidle' | 'commit' | undefined let waitForSelector = nodeData.inputs?.waitForSelector as string @@ -158,23 +160,33 @@ class Playwright_DocumentLoaders implements INode { } return docs } catch (err) { - if (process.env.DEBUG === 'true') console.error(`error in PlaywrightWebBaseLoader: ${err.message}, on page: ${url}`) + if (process.env.DEBUG === 'true') options.logger.error(`error in PlaywrightWebBaseLoader: ${err.message}, on page: ${url}`) } } let docs = [] if (relativeLinksMethod) { - if (process.env.DEBUG === 'true') console.info(`Start ${relativeLinksMethod}`) - if (!limit) limit = '10' - else if (parseInt(limit) < 0) throw new Error('Limit cannot be less than 0') + if (process.env.DEBUG === 'true') options.logger.info(`Start ${relativeLinksMethod}`) + if (!limit) limit = 10 + else if (limit < 0) throw new Error('Limit cannot be less than 0') const pages: string[] = - relativeLinksMethod === 'webCrawl' ? await webCrawl(url, parseInt(limit)) : await xmlScrape(url, parseInt(limit)) - if (process.env.DEBUG === 'true') console.info(`pages: ${JSON.stringify(pages)}, length: ${pages.length}`) + selectedLinks && selectedLinks.length > 0 + ? selectedLinks.slice(0, limit) + : relativeLinksMethod === 'webCrawl' + ? await webCrawl(url, limit) + : await xmlScrape(url, limit) + if (process.env.DEBUG === 'true') options.logger.info(`pages: ${JSON.stringify(pages)}, length: ${pages.length}`) if (!pages || pages.length === 0) throw new Error('No relative links found') for (const page of pages) { docs.push(...(await playwrightLoader(page))) } - if (process.env.DEBUG === 'true') console.info(`Finish ${relativeLinksMethod}`) + if (process.env.DEBUG === 'true') options.logger.info(`Finish ${relativeLinksMethod}`) + } else if (selectedLinks && selectedLinks.length > 0) { + if (process.env.DEBUG === 'true') + options.logger.info(`pages: ${JSON.stringify(selectedLinks)}, length: ${selectedLinks.length}`) + for (const page of selectedLinks) { + docs.push(...(await playwrightLoader(page))) + } } else { docs = await playwrightLoader(url) } diff --git a/packages/components/nodes/documentloaders/Puppeteer/Puppeteer.ts b/packages/components/nodes/documentloaders/Puppeteer/Puppeteer.ts index 4691eb94..3d28f310 100644 --- a/packages/components/nodes/documentloaders/Puppeteer/Puppeteer.ts +++ b/packages/components/nodes/documentloaders/Puppeteer/Puppeteer.ts @@ -1,4 +1,4 @@ -import { INode, INodeData, INodeParams } from '../../../src/Interface' +import { ICommonObject, INode, INodeData, INodeParams } from '../../../src/Interface' import { TextSplitter } from 'langchain/text_splitter' import { Browser, Page, PuppeteerWebBaseLoader, PuppeteerWebBaseLoaderOptions } from 'langchain/document_loaders/web/puppeteer' import { test } from 'linkifyjs' @@ -62,6 +62,7 @@ class Puppeteer_DocumentLoaders implements INode { name: 'limit', type: 'number', optional: true, + default: '10', additionalParams: true, description: 'Only used when "Get Relative Links Method" is selected. Set 0 to retrieve all relative links, default limit is 10.', @@ -115,11 +116,12 @@ class Puppeteer_DocumentLoaders implements INode { ] } - async init(nodeData: INodeData): Promise { + async init(nodeData: INodeData, _: string, options: ICommonObject): Promise { const textSplitter = nodeData.inputs?.textSplitter as TextSplitter const metadata = nodeData.inputs?.metadata const relativeLinksMethod = nodeData.inputs?.relativeLinksMethod as string - let limit = nodeData.inputs?.limit as string + const selectedLinks = nodeData.inputs?.selectedLinks as string[] + let limit = parseInt(nodeData.inputs?.limit as string) let waitUntilGoToOption = nodeData.inputs?.waitUntilGoToOption as PuppeteerLifeCycleEvent let waitForSelector = nodeData.inputs?.waitForSelector as string @@ -159,23 +161,33 @@ class Puppeteer_DocumentLoaders implements INode { } return docs } catch (err) { - if (process.env.DEBUG === 'true') console.error(`error in PuppeteerWebBaseLoader: ${err.message}, on page: ${url}`) + if (process.env.DEBUG === 'true') options.logger.error(`error in PuppeteerWebBaseLoader: ${err.message}, on page: ${url}`) } } let docs = [] if (relativeLinksMethod) { - if (process.env.DEBUG === 'true') console.info(`Start ${relativeLinksMethod}`) - if (!limit) limit = '10' - else if (parseInt(limit) < 0) throw new Error('Limit cannot be less than 0') + if (process.env.DEBUG === 'true') options.logger.info(`Start ${relativeLinksMethod}`) + if (!limit) limit = 10 + else if (limit < 0) throw new Error('Limit cannot be less than 0') const pages: string[] = - relativeLinksMethod === 'webCrawl' ? await webCrawl(url, parseInt(limit)) : await xmlScrape(url, parseInt(limit)) - if (process.env.DEBUG === 'true') console.info(`pages: ${JSON.stringify(pages)}, length: ${pages.length}`) + selectedLinks && selectedLinks.length > 0 + ? selectedLinks.slice(0, limit) + : relativeLinksMethod === 'webCrawl' + ? await webCrawl(url, limit) + : await xmlScrape(url, limit) + if (process.env.DEBUG === 'true') options.logger.info(`pages: ${JSON.stringify(pages)}, length: ${pages.length}`) if (!pages || pages.length === 0) throw new Error('No relative links found') for (const page of pages) { docs.push(...(await puppeteerLoader(page))) } - if (process.env.DEBUG === 'true') console.info(`Finish ${relativeLinksMethod}`) + if (process.env.DEBUG === 'true') options.logger.info(`Finish ${relativeLinksMethod}`) + } else if (selectedLinks && selectedLinks.length > 0) { + if (process.env.DEBUG === 'true') + options.logger.info(`pages: ${JSON.stringify(selectedLinks)}, length: ${selectedLinks.length}`) + for (const page of selectedLinks) { + docs.push(...(await puppeteerLoader(page))) + } } else { docs = await puppeteerLoader(url) } diff --git a/packages/server/src/index.ts b/packages/server/src/index.ts index 6d4c1887..0adad721 100644 --- a/packages/server/src/index.ts +++ b/packages/server/src/index.ts @@ -59,7 +59,7 @@ import { Tool } from './database/entities/Tool' import { Assistant } from './database/entities/Assistant' import { ChatflowPool } from './ChatflowPool' import { CachePool } from './CachePool' -import { ICommonObject, IMessage, INodeOptionsValue, handleEscapeCharacters } from 'flowise-components' +import { ICommonObject, IMessage, INodeOptionsValue, handleEscapeCharacters, webCrawl, xmlScrape } from 'flowise-components' import { createRateLimiter, getRateLimiter, initializeRateLimiter } from './utils/rateLimit' import { addAPIKey, compareKeys, deleteAPIKey, getApiKey, getAPIKeys, updateAPIKey } from './utils/apiKey' import { sanitizeMiddleware } from './utils/XSS' @@ -1117,6 +1117,19 @@ export class App { } }) + // ---------------------------------------- + // Scraper + // ---------------------------------------- + + this.app.get('/api/v1/fetch-links', async (req: Request, res: Response) => { + const url = decodeURIComponent(req.query.url as string) + const relativeLinksMethod = req.query.relativeLinksMethod as string + if (process.env.DEBUG === 'true') console.info(`Start ${relativeLinksMethod}`) + const links: string[] = relativeLinksMethod === 'webCrawl' ? await webCrawl(url, 0) : await xmlScrape(url, 0) + + res.json({ status: 'OK', links }) + }) + // ---------------------------------------- // Upsert // ---------------------------------------- diff --git a/packages/ui/src/api/scraper.js b/packages/ui/src/api/scraper.js new file mode 100644 index 00000000..382a9263 --- /dev/null +++ b/packages/ui/src/api/scraper.js @@ -0,0 +1,8 @@ +import client from './client' + +const fetchAllLinks = (url, relativeLinksMethod) => + client.get(`/fetch-links?url=${encodeURIComponent(url)}&relativeLinksMethod=${relativeLinksMethod}`) + +export default { + fetchAllLinks +} diff --git a/packages/ui/src/ui-component/dialog/ManageScrapedLinksDialog.js b/packages/ui/src/ui-component/dialog/ManageScrapedLinksDialog.js new file mode 100644 index 00000000..a707d82e --- /dev/null +++ b/packages/ui/src/ui-component/dialog/ManageScrapedLinksDialog.js @@ -0,0 +1,184 @@ +import PropTypes from 'prop-types' +import { createPortal } from 'react-dom' +import { useDispatch } from 'react-redux' +import { useState, useEffect } from 'react' + +import { + Box, + Button, + Dialog, + DialogActions, + DialogContent, + DialogTitle, + FormControl, + IconButton, + OutlinedInput, + Stack, + Typography +} from '@mui/material' +import { IconTrash } from '@tabler/icons' +import PerfectScrollbar from 'react-perfect-scrollbar' + +import { BackdropLoader } from 'ui-component/loading/BackdropLoader' +import { StyledButton } from 'ui-component/button/StyledButton' + +import scraperApi from 'api/scraper' + +import { HIDE_CANVAS_DIALOG, SHOW_CANVAS_DIALOG } from 'store/actions' + +const ManageScrapedLinksDialog = ({ show, dialogProps, onCancel, onSave }) => { + const portalElement = document.getElementById('portal') + const dispatch = useDispatch() + + const [loading, setLoading] = useState(false) + const [selectedLinks, setSelectedLinks] = useState([]) + const [url, setUrl] = useState('') + + useEffect(() => { + if (dialogProps.url) setUrl(dialogProps.url) + if (dialogProps.selectedLinks) setSelectedLinks(dialogProps.selectedLinks) + + return () => { + setLoading(false) + setSelectedLinks([]) + setUrl('') + } + }, [dialogProps]) + + useEffect(() => { + if (show) dispatch({ type: SHOW_CANVAS_DIALOG }) + else dispatch({ type: HIDE_CANVAS_DIALOG }) + return () => dispatch({ type: HIDE_CANVAS_DIALOG }) + }, [show, dispatch]) + + const handleFetchLinks = async () => { + setLoading(true) + const fetchLinksResp = await scraperApi.fetchAllLinks(url, 'webCrawl') + if (fetchLinksResp.data) { + setSelectedLinks(fetchLinksResp.data.links) + } + setLoading(false) + } + + const handleChangeLink = (index, event) => { + const { value } = event.target + const links = [...selectedLinks] + links[index] = value + setSelectedLinks(links) + } + + const handleRemoveLink = (index) => { + const links = [...selectedLinks] + links.splice(index, 1) + setSelectedLinks(links) + } + + const handleSaveLinks = () => { + onSave(url, selectedLinks) + } + + const component = show ? ( + + + {dialogProps.title || `Manage Scraped Links - ${url}`} + + + + + + { + setUrl(e.target.value) + }} + /> + + + + + Scraped Links + <> + {loading && } + {selectedLinks.length > 0 ? ( + + {selectedLinks.map((link, index) => ( +
+ + handleChangeLink(index, e)} + size='small' + value={link} + name={`link_${index}`} + /> + + + handleRemoveLink(index)} + edge='end' + > + + + +
+ ))} +
+ ) : ( +
+ Links scraped from the URL will appear here +
+ )} + +
+ + + + Save + + +
+ ) : null + + return createPortal(component, portalElement) +} + +ManageScrapedLinksDialog.propTypes = { + show: PropTypes.bool, + dialogProps: PropTypes.object, + onCancel: PropTypes.func, + onSave: PropTypes.func +} + +export default ManageScrapedLinksDialog diff --git a/packages/ui/src/views/canvas/NodeInputHandler.js b/packages/ui/src/views/canvas/NodeInputHandler.js index a673d6b7..bc877c9f 100644 --- a/packages/ui/src/views/canvas/NodeInputHandler.js +++ b/packages/ui/src/views/canvas/NodeInputHandler.js @@ -28,6 +28,8 @@ import ToolDialog from 'views/tools/ToolDialog' import AssistantDialog from 'views/assistants/AssistantDialog' import ExpandTextDialog from 'ui-component/dialog/ExpandTextDialog' import FormatPromptValuesDialog from 'ui-component/dialog/FormatPromptValuesDialog' +import PromptLangsmithHubDialog from 'ui-component/dialog/PromptLangsmithHubDialog' +import ManageScrapedLinksDialog from 'ui-component/dialog/ManageScrapedLinksDialog' import CredentialInputHandler from './CredentialInputHandler' // utils @@ -35,7 +37,6 @@ import { getInputVariables } from 'utils/genericHelper' // const import { FLOWISE_CREDENTIAL_ID } from 'store/constant' -import PromptLangsmithHubDialog from '../../ui-component/dialog/PromptLangsmithHubDialog' const EDITABLE_OPTIONS = ['selectedTool', 'selectedAssistant'] @@ -62,22 +63,25 @@ const NodeInputHandler = ({ inputAnchor, inputParam, data, disabled = false, isA const [showFormatPromptValuesDialog, setShowFormatPromptValuesDialog] = useState(false) const [formatPromptValuesDialogProps, setFormatPromptValuesDialogProps] = useState({}) const [showPromptHubDialog, setShowPromptHubDialog] = useState(false) + const [showManageScrapedLinksDialog, setShowManageScrapedLinksDialog] = useState(false) + const [manageScrapedLinksDialogProps, setManageScrapedLinksDialogProps] = useState({}) const onExpandDialogClicked = (value, inputParam) => { - const dialogProp = { + const dialogProps = { value, inputParam, disabled, confirmButtonName: 'Save', cancelButtonName: 'Cancel' } - setExpandDialogProps(dialogProp) + setExpandDialogProps(dialogProps) setShowExpandDialog(true) } const onShowPromptHubButtonClicked = () => { setShowPromptHubDialog(true) } + const onShowPromptHubButtonSubmit = (templates) => { setShowPromptHubDialog(false) for (const t of templates) { @@ -86,6 +90,24 @@ const NodeInputHandler = ({ inputAnchor, inputParam, data, disabled = false, isA } } } + + const onManageLinksDialogClicked = (url, selectedLinks) => { + const dialogProps = { + url, + selectedLinks, + confirmButtonName: 'Save', + cancelButtonName: 'Cancel' + } + setManageScrapedLinksDialogProps(dialogProps) + setShowManageScrapedLinksDialog(true) + } + + const onManageLinksDialogSave = (url, links) => { + setShowManageScrapedLinksDialog(false) + data.inputs.url = url + data.inputs.selectedLinks = links + } + const onEditJSONClicked = (value, inputParam) => { // Preset values if the field is format prompt values let inputValue = value @@ -436,6 +458,37 @@ const NodeInputHandler = ({ inputAnchor, inputParam, data, disabled = false, isA )} + {(data.name === 'cheerioWebScraper' || + data.name === 'puppeteerWebScraper' || + data.name === 'playwrightWebScraper') && + inputParam.name === 'url' && ( + <> + + setShowManageScrapedLinksDialog(false)} + onSave={onManageLinksDialogSave} + /> + + )} )}