Merge pull request #1566 from 0xi4o/feature/scrapped-links

FEATURE: Select which links should be used in web scraper nodes (cheerio, puppeteer, and playwright)
This commit is contained in:
Ilango
2024-01-26 04:08:34 +05:30
committed by GitHub
7 changed files with 328 additions and 34 deletions
@@ -1,4 +1,4 @@
import { INode, INodeData, INodeParams } from '../../../src/Interface'
import { ICommonObject, INode, INodeData, INodeParams } from '../../../src/Interface'
import { TextSplitter } from 'langchain/text_splitter'
import { CheerioWebBaseLoader, WebBaseLoaderParams } from 'langchain/document_loaders/web/cheerio'
import { test } from 'linkifyjs'
@@ -63,6 +63,7 @@ class Cheerio_DocumentLoaders implements INode {
name: 'limit',
type: 'number',
optional: true,
default: '10',
additionalParams: true,
description:
'Only used when "Get Relative Links Method" is selected. Set 0 to retrieve all relative links, default limit is 10.',
@@ -86,11 +87,12 @@ class Cheerio_DocumentLoaders implements INode {
]
}
async init(nodeData: INodeData): Promise<any> {
async init(nodeData: INodeData, _: string, options: ICommonObject): Promise<any> {
const textSplitter = nodeData.inputs?.textSplitter as TextSplitter
const metadata = nodeData.inputs?.metadata
const relativeLinksMethod = nodeData.inputs?.relativeLinksMethod as string
let limit = nodeData.inputs?.limit as string
const selectedLinks = nodeData.inputs?.selectedLinks as string[]
let limit = parseInt(nodeData.inputs?.limit as string)
let url = nodeData.inputs?.url as string
url = url.trim()
@@ -117,23 +119,33 @@ class Cheerio_DocumentLoaders implements INode {
}
return docs
} catch (err) {
if (process.env.DEBUG === 'true') console.error(`error in CheerioWebBaseLoader: ${err.message}, on page: ${url}`)
if (process.env.DEBUG === 'true') options.logger.error(`error in CheerioWebBaseLoader: ${err.message}, on page: ${url}`)
}
}
let docs = []
if (relativeLinksMethod) {
if (process.env.DEBUG === 'true') console.info(`Start ${relativeLinksMethod}`)
if (!limit) limit = '10'
else if (parseInt(limit) < 0) throw new Error('Limit cannot be less than 0')
if (process.env.DEBUG === 'true') options.logger.info(`Start ${relativeLinksMethod}`)
if (!limit) limit = 10
else if (limit < 0) throw new Error('Limit cannot be less than 0')
const pages: string[] =
relativeLinksMethod === 'webCrawl' ? await webCrawl(url, parseInt(limit)) : await xmlScrape(url, parseInt(limit))
if (process.env.DEBUG === 'true') console.info(`pages: ${JSON.stringify(pages)}, length: ${pages.length}`)
selectedLinks && selectedLinks.length > 0
? selectedLinks.slice(0, limit)
: relativeLinksMethod === 'webCrawl'
? await webCrawl(url, limit)
: await xmlScrape(url, limit)
if (process.env.DEBUG === 'true') options.logger.info(`pages: ${JSON.stringify(pages)}, length: ${pages.length}`)
if (!pages || pages.length === 0) throw new Error('No relative links found')
for (const page of pages) {
docs.push(...(await cheerioLoader(page)))
}
if (process.env.DEBUG === 'true') console.info(`Finish ${relativeLinksMethod}`)
if (process.env.DEBUG === 'true') options.logger.info(`Finish ${relativeLinksMethod}`)
} else if (selectedLinks && selectedLinks.length > 0) {
if (process.env.DEBUG === 'true')
options.logger.info(`pages: ${JSON.stringify(selectedLinks)}, length: ${selectedLinks.length}`)
for (const page of selectedLinks) {
docs.push(...(await cheerioLoader(page)))
}
} else {
docs = await cheerioLoader(url)
}
@@ -1,4 +1,4 @@
import { INode, INodeData, INodeParams } from '../../../src/Interface'
import { ICommonObject, INode, INodeData, INodeParams } from '../../../src/Interface'
import { TextSplitter } from 'langchain/text_splitter'
import { Browser, Page, PlaywrightWebBaseLoader, PlaywrightWebBaseLoaderOptions } from 'langchain/document_loaders/web/playwright'
import { test } from 'linkifyjs'
@@ -61,6 +61,7 @@ class Playwright_DocumentLoaders implements INode {
name: 'limit',
type: 'number',
optional: true,
default: '10',
additionalParams: true,
description:
'Only used when "Get Relative Links Method" is selected. Set 0 to retrieve all relative links, default limit is 10.',
@@ -114,11 +115,12 @@ class Playwright_DocumentLoaders implements INode {
]
}
async init(nodeData: INodeData): Promise<any> {
async init(nodeData: INodeData, _: string, options: ICommonObject): Promise<any> {
const textSplitter = nodeData.inputs?.textSplitter as TextSplitter
const metadata = nodeData.inputs?.metadata
const relativeLinksMethod = nodeData.inputs?.relativeLinksMethod as string
let limit = nodeData.inputs?.limit as string
const selectedLinks = nodeData.inputs?.selectedLinks as string[]
let limit = parseInt(nodeData.inputs?.limit as string)
let waitUntilGoToOption = nodeData.inputs?.waitUntilGoToOption as 'load' | 'domcontentloaded' | 'networkidle' | 'commit' | undefined
let waitForSelector = nodeData.inputs?.waitForSelector as string
@@ -158,23 +160,33 @@ class Playwright_DocumentLoaders implements INode {
}
return docs
} catch (err) {
if (process.env.DEBUG === 'true') console.error(`error in PlaywrightWebBaseLoader: ${err.message}, on page: ${url}`)
if (process.env.DEBUG === 'true') options.logger.error(`error in PlaywrightWebBaseLoader: ${err.message}, on page: ${url}`)
}
}
let docs = []
if (relativeLinksMethod) {
if (process.env.DEBUG === 'true') console.info(`Start ${relativeLinksMethod}`)
if (!limit) limit = '10'
else if (parseInt(limit) < 0) throw new Error('Limit cannot be less than 0')
if (process.env.DEBUG === 'true') options.logger.info(`Start ${relativeLinksMethod}`)
if (!limit) limit = 10
else if (limit < 0) throw new Error('Limit cannot be less than 0')
const pages: string[] =
relativeLinksMethod === 'webCrawl' ? await webCrawl(url, parseInt(limit)) : await xmlScrape(url, parseInt(limit))
if (process.env.DEBUG === 'true') console.info(`pages: ${JSON.stringify(pages)}, length: ${pages.length}`)
selectedLinks && selectedLinks.length > 0
? selectedLinks.slice(0, limit)
: relativeLinksMethod === 'webCrawl'
? await webCrawl(url, limit)
: await xmlScrape(url, limit)
if (process.env.DEBUG === 'true') options.logger.info(`pages: ${JSON.stringify(pages)}, length: ${pages.length}`)
if (!pages || pages.length === 0) throw new Error('No relative links found')
for (const page of pages) {
docs.push(...(await playwrightLoader(page)))
}
if (process.env.DEBUG === 'true') console.info(`Finish ${relativeLinksMethod}`)
if (process.env.DEBUG === 'true') options.logger.info(`Finish ${relativeLinksMethod}`)
} else if (selectedLinks && selectedLinks.length > 0) {
if (process.env.DEBUG === 'true')
options.logger.info(`pages: ${JSON.stringify(selectedLinks)}, length: ${selectedLinks.length}`)
for (const page of selectedLinks) {
docs.push(...(await playwrightLoader(page)))
}
} else {
docs = await playwrightLoader(url)
}
@@ -1,4 +1,4 @@
import { INode, INodeData, INodeParams } from '../../../src/Interface'
import { ICommonObject, INode, INodeData, INodeParams } from '../../../src/Interface'
import { TextSplitter } from 'langchain/text_splitter'
import { Browser, Page, PuppeteerWebBaseLoader, PuppeteerWebBaseLoaderOptions } from 'langchain/document_loaders/web/puppeteer'
import { test } from 'linkifyjs'
@@ -62,6 +62,7 @@ class Puppeteer_DocumentLoaders implements INode {
name: 'limit',
type: 'number',
optional: true,
default: '10',
additionalParams: true,
description:
'Only used when "Get Relative Links Method" is selected. Set 0 to retrieve all relative links, default limit is 10.',
@@ -115,11 +116,12 @@ class Puppeteer_DocumentLoaders implements INode {
]
}
async init(nodeData: INodeData): Promise<any> {
async init(nodeData: INodeData, _: string, options: ICommonObject): Promise<any> {
const textSplitter = nodeData.inputs?.textSplitter as TextSplitter
const metadata = nodeData.inputs?.metadata
const relativeLinksMethod = nodeData.inputs?.relativeLinksMethod as string
let limit = nodeData.inputs?.limit as string
const selectedLinks = nodeData.inputs?.selectedLinks as string[]
let limit = parseInt(nodeData.inputs?.limit as string)
let waitUntilGoToOption = nodeData.inputs?.waitUntilGoToOption as PuppeteerLifeCycleEvent
let waitForSelector = nodeData.inputs?.waitForSelector as string
@@ -159,23 +161,33 @@ class Puppeteer_DocumentLoaders implements INode {
}
return docs
} catch (err) {
if (process.env.DEBUG === 'true') console.error(`error in PuppeteerWebBaseLoader: ${err.message}, on page: ${url}`)
if (process.env.DEBUG === 'true') options.logger.error(`error in PuppeteerWebBaseLoader: ${err.message}, on page: ${url}`)
}
}
let docs = []
if (relativeLinksMethod) {
if (process.env.DEBUG === 'true') console.info(`Start ${relativeLinksMethod}`)
if (!limit) limit = '10'
else if (parseInt(limit) < 0) throw new Error('Limit cannot be less than 0')
if (process.env.DEBUG === 'true') options.logger.info(`Start ${relativeLinksMethod}`)
if (!limit) limit = 10
else if (limit < 0) throw new Error('Limit cannot be less than 0')
const pages: string[] =
relativeLinksMethod === 'webCrawl' ? await webCrawl(url, parseInt(limit)) : await xmlScrape(url, parseInt(limit))
if (process.env.DEBUG === 'true') console.info(`pages: ${JSON.stringify(pages)}, length: ${pages.length}`)
selectedLinks && selectedLinks.length > 0
? selectedLinks.slice(0, limit)
: relativeLinksMethod === 'webCrawl'
? await webCrawl(url, limit)
: await xmlScrape(url, limit)
if (process.env.DEBUG === 'true') options.logger.info(`pages: ${JSON.stringify(pages)}, length: ${pages.length}`)
if (!pages || pages.length === 0) throw new Error('No relative links found')
for (const page of pages) {
docs.push(...(await puppeteerLoader(page)))
}
if (process.env.DEBUG === 'true') console.info(`Finish ${relativeLinksMethod}`)
if (process.env.DEBUG === 'true') options.logger.info(`Finish ${relativeLinksMethod}`)
} else if (selectedLinks && selectedLinks.length > 0) {
if (process.env.DEBUG === 'true')
options.logger.info(`pages: ${JSON.stringify(selectedLinks)}, length: ${selectedLinks.length}`)
for (const page of selectedLinks) {
docs.push(...(await puppeteerLoader(page)))
}
} else {
docs = await puppeteerLoader(url)
}
+14 -1
View File
@@ -59,7 +59,7 @@ import { Tool } from './database/entities/Tool'
import { Assistant } from './database/entities/Assistant'
import { ChatflowPool } from './ChatflowPool'
import { CachePool } from './CachePool'
import { ICommonObject, IMessage, INodeOptionsValue, handleEscapeCharacters } from 'flowise-components'
import { ICommonObject, IMessage, INodeOptionsValue, handleEscapeCharacters, webCrawl, xmlScrape } from 'flowise-components'
import { createRateLimiter, getRateLimiter, initializeRateLimiter } from './utils/rateLimit'
import { addAPIKey, compareKeys, deleteAPIKey, getApiKey, getAPIKeys, updateAPIKey } from './utils/apiKey'
import { sanitizeMiddleware } from './utils/XSS'
@@ -1117,6 +1117,19 @@ export class App {
}
})
// ----------------------------------------
// Scraper
// ----------------------------------------
this.app.get('/api/v1/fetch-links', async (req: Request, res: Response) => {
const url = decodeURIComponent(req.query.url as string)
const relativeLinksMethod = req.query.relativeLinksMethod as string
if (process.env.DEBUG === 'true') console.info(`Start ${relativeLinksMethod}`)
const links: string[] = relativeLinksMethod === 'webCrawl' ? await webCrawl(url, 0) : await xmlScrape(url, 0)
res.json({ status: 'OK', links })
})
// ----------------------------------------
// Upsert
// ----------------------------------------
+8
View File
@@ -0,0 +1,8 @@
import client from './client'
const fetchAllLinks = (url, relativeLinksMethod) =>
client.get(`/fetch-links?url=${encodeURIComponent(url)}&relativeLinksMethod=${relativeLinksMethod}`)
export default {
fetchAllLinks
}
@@ -0,0 +1,184 @@
import PropTypes from 'prop-types'
import { createPortal } from 'react-dom'
import { useDispatch } from 'react-redux'
import { useState, useEffect } from 'react'
import {
Box,
Button,
Dialog,
DialogActions,
DialogContent,
DialogTitle,
FormControl,
IconButton,
OutlinedInput,
Stack,
Typography
} from '@mui/material'
import { IconTrash } from '@tabler/icons'
import PerfectScrollbar from 'react-perfect-scrollbar'
import { BackdropLoader } from 'ui-component/loading/BackdropLoader'
import { StyledButton } from 'ui-component/button/StyledButton'
import scraperApi from 'api/scraper'
import { HIDE_CANVAS_DIALOG, SHOW_CANVAS_DIALOG } from 'store/actions'
const ManageScrapedLinksDialog = ({ show, dialogProps, onCancel, onSave }) => {
const portalElement = document.getElementById('portal')
const dispatch = useDispatch()
const [loading, setLoading] = useState(false)
const [selectedLinks, setSelectedLinks] = useState([])
const [url, setUrl] = useState('')
useEffect(() => {
if (dialogProps.url) setUrl(dialogProps.url)
if (dialogProps.selectedLinks) setSelectedLinks(dialogProps.selectedLinks)
return () => {
setLoading(false)
setSelectedLinks([])
setUrl('')
}
}, [dialogProps])
useEffect(() => {
if (show) dispatch({ type: SHOW_CANVAS_DIALOG })
else dispatch({ type: HIDE_CANVAS_DIALOG })
return () => dispatch({ type: HIDE_CANVAS_DIALOG })
}, [show, dispatch])
const handleFetchLinks = async () => {
setLoading(true)
const fetchLinksResp = await scraperApi.fetchAllLinks(url, 'webCrawl')
if (fetchLinksResp.data) {
setSelectedLinks(fetchLinksResp.data.links)
}
setLoading(false)
}
const handleChangeLink = (index, event) => {
const { value } = event.target
const links = [...selectedLinks]
links[index] = value
setSelectedLinks(links)
}
const handleRemoveLink = (index) => {
const links = [...selectedLinks]
links.splice(index, 1)
setSelectedLinks(links)
}
const handleSaveLinks = () => {
onSave(url, selectedLinks)
}
const component = show ? (
<Dialog
onClose={onCancel}
open={show}
fullWidth
maxWidth='sm'
aria-labelledby='manage-scraped-links-dialog-title'
aria-describedby='manage-scraped-links-dialog-description'
>
<DialogTitle sx={{ fontSize: '1rem' }} id='manage-scraped-links-dialog-title'>
{dialogProps.title || `Manage Scraped Links - ${url}`}
</DialogTitle>
<DialogContent>
<Box sx={{ mb: 4 }}>
<Stack flexDirection='row' gap={1} sx={{ width: '100%' }}>
<FormControl sx={{ mt: 1, width: '100%', display: 'flex', flexShrink: 1 }} size='small'>
<OutlinedInput
id='url'
size='small'
type='text'
value={url}
name='url'
onChange={(e) => {
setUrl(e.target.value)
}}
/>
</FormControl>
<Button
sx={{ borderRadius: '12px', mt: 1, display: 'flex', flexShrink: 0 }}
size='small'
variant='contained'
onClick={handleFetchLinks}
>
Fetch Links
</Button>
</Stack>
</Box>
<Typography sx={{ mb: 2, fontWeight: 500 }}>Scraped Links</Typography>
<>
{loading && <BackdropLoader open={loading} />}
{selectedLinks.length > 0 ? (
<PerfectScrollbar
style={{
height: '100%',
maxHeight: '320px',
overflowX: 'hidden',
display: 'flex',
flexDirection: 'column',
gap: 4
}}
>
{selectedLinks.map((link, index) => (
<div key={index} style={{ display: 'flex', width: '100%' }}>
<Box sx={{ display: 'flex', width: '100%' }}>
<OutlinedInput
sx={{ width: '100%' }}
key={index}
type='text'
onChange={(e) => handleChangeLink(index, e)}
size='small'
value={link}
name={`link_${index}`}
/>
</Box>
<Box sx={{ width: 'auto', flexGrow: 1 }}>
<IconButton
sx={{ height: 30, width: 30 }}
size='small'
color='error'
onClick={() => handleRemoveLink(index)}
edge='end'
>
<IconTrash />
</IconButton>
</Box>
</div>
))}
</PerfectScrollbar>
) : (
<div style={{ display: 'flex', alignItems: 'center', justifyContent: 'center' }}>
<Typography sx={{ my: 2 }}>Links scraped from the URL will appear here</Typography>
</div>
)}
</>
</DialogContent>
<DialogActions>
<Button onClick={onCancel}>Cancel</Button>
<StyledButton variant='contained' onClick={handleSaveLinks}>
Save
</StyledButton>
</DialogActions>
</Dialog>
) : null
return createPortal(component, portalElement)
}
ManageScrapedLinksDialog.propTypes = {
show: PropTypes.bool,
dialogProps: PropTypes.object,
onCancel: PropTypes.func,
onSave: PropTypes.func
}
export default ManageScrapedLinksDialog
@@ -28,6 +28,8 @@ import ToolDialog from 'views/tools/ToolDialog'
import AssistantDialog from 'views/assistants/AssistantDialog'
import ExpandTextDialog from 'ui-component/dialog/ExpandTextDialog'
import FormatPromptValuesDialog from 'ui-component/dialog/FormatPromptValuesDialog'
import PromptLangsmithHubDialog from 'ui-component/dialog/PromptLangsmithHubDialog'
import ManageScrapedLinksDialog from 'ui-component/dialog/ManageScrapedLinksDialog'
import CredentialInputHandler from './CredentialInputHandler'
// utils
@@ -35,7 +37,6 @@ import { getInputVariables } from 'utils/genericHelper'
// const
import { FLOWISE_CREDENTIAL_ID } from 'store/constant'
import PromptLangsmithHubDialog from '../../ui-component/dialog/PromptLangsmithHubDialog'
const EDITABLE_OPTIONS = ['selectedTool', 'selectedAssistant']
@@ -62,22 +63,25 @@ const NodeInputHandler = ({ inputAnchor, inputParam, data, disabled = false, isA
const [showFormatPromptValuesDialog, setShowFormatPromptValuesDialog] = useState(false)
const [formatPromptValuesDialogProps, setFormatPromptValuesDialogProps] = useState({})
const [showPromptHubDialog, setShowPromptHubDialog] = useState(false)
const [showManageScrapedLinksDialog, setShowManageScrapedLinksDialog] = useState(false)
const [manageScrapedLinksDialogProps, setManageScrapedLinksDialogProps] = useState({})
const onExpandDialogClicked = (value, inputParam) => {
const dialogProp = {
const dialogProps = {
value,
inputParam,
disabled,
confirmButtonName: 'Save',
cancelButtonName: 'Cancel'
}
setExpandDialogProps(dialogProp)
setExpandDialogProps(dialogProps)
setShowExpandDialog(true)
}
const onShowPromptHubButtonClicked = () => {
setShowPromptHubDialog(true)
}
const onShowPromptHubButtonSubmit = (templates) => {
setShowPromptHubDialog(false)
for (const t of templates) {
@@ -86,6 +90,24 @@ const NodeInputHandler = ({ inputAnchor, inputParam, data, disabled = false, isA
}
}
}
const onManageLinksDialogClicked = (url, selectedLinks) => {
const dialogProps = {
url,
selectedLinks,
confirmButtonName: 'Save',
cancelButtonName: 'Cancel'
}
setManageScrapedLinksDialogProps(dialogProps)
setShowManageScrapedLinksDialog(true)
}
const onManageLinksDialogSave = (url, links) => {
setShowManageScrapedLinksDialog(false)
data.inputs.url = url
data.inputs.selectedLinks = links
}
const onEditJSONClicked = (value, inputParam) => {
// Preset values if the field is format prompt values
let inputValue = value
@@ -436,6 +458,37 @@ const NodeInputHandler = ({ inputAnchor, inputParam, data, disabled = false, isA
</div>
</>
)}
{(data.name === 'cheerioWebScraper' ||
data.name === 'puppeteerWebScraper' ||
data.name === 'playwrightWebScraper') &&
inputParam.name === 'url' && (
<>
<Button
style={{
display: 'flex',
flexDirection: 'row',
width: '100%'
}}
disabled={disabled}
sx={{ borderRadius: '12px', width: '100%', mt: 1 }}
variant='outlined'
onClick={() =>
onManageLinksDialogClicked(
data.inputs[inputParam.name] ?? inputParam.default ?? '',
data.inputs.selectedLinks
)
}
>
Manage Links
</Button>
<ManageScrapedLinksDialog
show={showManageScrapedLinksDialog}
dialogProps={manageScrapedLinksDialogProps}
onCancel={() => setShowManageScrapedLinksDialog(false)}
onSave={onManageLinksDialogSave}
/>
</>
)}
</Box>
</>
)}