mirror of
https://github.com/farcasclaudiu/Flowise.git
synced 2026-06-22 07:01:07 +03:00
Puppeteer / Playwright web crawler bug fixes/improvements (#4998)
* feature/bugfix: added optional CSS selector to puppeteer web scraper, fixed error when puppeteerLoader does not work. * feature: added button to add empty link in web scraper tools * feature: added custom executable file path as an input to puppeteer to fix issues when puppeteer cannot find/launch the browser. * feature: added new puppeteer features to playwright as well. * fixed review comments
This commit is contained in:
@@ -1,14 +1,15 @@
|
||||
import { omit } from 'lodash'
|
||||
import { ICommonObject, IDocument, INode, INodeData, INodeParams } from '../../../src/Interface'
|
||||
import { TextSplitter } from 'langchain/text_splitter'
|
||||
import {
|
||||
Browser,
|
||||
Page,
|
||||
PlaywrightWebBaseLoader,
|
||||
PlaywrightWebBaseLoaderOptions
|
||||
} from '@langchain/community/document_loaders/web/playwright'
|
||||
import { Document } from '@langchain/core/documents'
|
||||
import { TextSplitter } from 'langchain/text_splitter'
|
||||
import { test } from 'linkifyjs'
|
||||
import { omit } from 'lodash'
|
||||
import { handleEscapeCharacters, INodeOutputsValue, webCrawl, xmlScrape } from '../../../src'
|
||||
import { ICommonObject, INode, INodeData, INodeParams } from '../../../src/Interface'
|
||||
|
||||
class Playwright_DocumentLoaders implements INode {
|
||||
label: string
|
||||
@@ -113,6 +114,14 @@ class Playwright_DocumentLoaders implements INode {
|
||||
additionalParams: true,
|
||||
description: 'CSS selectors like .div or #div'
|
||||
},
|
||||
{
|
||||
label: 'CSS Selector (Optional)',
|
||||
name: 'cssSelector',
|
||||
type: 'string',
|
||||
description: 'Only content inside this selector will be extracted. Leave empty to use the entire page body.',
|
||||
optional: true,
|
||||
additionalParams: true
|
||||
},
|
||||
{
|
||||
label: 'Additional Metadata',
|
||||
name: 'metadata',
|
||||
@@ -155,8 +164,14 @@ class Playwright_DocumentLoaders implements INode {
|
||||
const relativeLinksMethod = nodeData.inputs?.relativeLinksMethod as string
|
||||
const selectedLinks = nodeData.inputs?.selectedLinks as string[]
|
||||
let limit = parseInt(nodeData.inputs?.limit as string)
|
||||
let waitUntilGoToOption = nodeData.inputs?.waitUntilGoToOption as 'load' | 'domcontentloaded' | 'networkidle' | 'commit' | undefined
|
||||
let waitForSelector = nodeData.inputs?.waitForSelector as string
|
||||
const waitUntilGoToOption = nodeData.inputs?.waitUntilGoToOption as
|
||||
| 'load'
|
||||
| 'domcontentloaded'
|
||||
| 'networkidle'
|
||||
| 'commit'
|
||||
| undefined
|
||||
const waitForSelector = nodeData.inputs?.waitForSelector as string
|
||||
const cssSelector = nodeData.inputs?.cssSelector as string
|
||||
const _omitMetadataKeys = nodeData.inputs?.omitMetadataKeys as string
|
||||
const output = nodeData.outputs?.output as string
|
||||
const orgId = options.orgId
|
||||
@@ -172,13 +187,14 @@ class Playwright_DocumentLoaders implements INode {
|
||||
throw new Error('Invalid URL')
|
||||
}
|
||||
|
||||
async function playwrightLoader(url: string): Promise<any> {
|
||||
async function playwrightLoader(url: string): Promise<Document[] | undefined> {
|
||||
try {
|
||||
let docs = []
|
||||
const config: PlaywrightWebBaseLoaderOptions = {
|
||||
launchOptions: {
|
||||
args: ['--no-sandbox'],
|
||||
headless: true
|
||||
headless: true,
|
||||
executablePath: process.env.PLAYWRIGHT_EXECUTABLE_FILE_PATH
|
||||
}
|
||||
}
|
||||
if (waitUntilGoToOption) {
|
||||
@@ -186,12 +202,22 @@ class Playwright_DocumentLoaders implements INode {
|
||||
waitUntil: waitUntilGoToOption
|
||||
}
|
||||
}
|
||||
if (waitForSelector) {
|
||||
if (cssSelector || waitForSelector) {
|
||||
config['evaluate'] = async (page: Page, _: Browser): Promise<string> => {
|
||||
await page.waitForSelector(waitForSelector)
|
||||
if (waitForSelector) {
|
||||
await page.waitForSelector(waitForSelector)
|
||||
}
|
||||
|
||||
const result = await page.evaluate(() => document.body.innerHTML)
|
||||
return result
|
||||
if (cssSelector) {
|
||||
const selectorHandle = await page.$(cssSelector)
|
||||
const result = await page.evaluate(
|
||||
(htmlSelection) => htmlSelection?.innerHTML ?? document.body.innerHTML,
|
||||
selectorHandle
|
||||
)
|
||||
return result
|
||||
} else {
|
||||
return await page.evaluate(() => document.body.innerHTML)
|
||||
}
|
||||
}
|
||||
}
|
||||
const loader = new PlaywrightWebBaseLoader(url, config)
|
||||
@@ -208,7 +234,7 @@ class Playwright_DocumentLoaders implements INode {
|
||||
}
|
||||
}
|
||||
|
||||
let docs: IDocument[] = []
|
||||
let docs: Document[] = []
|
||||
if (relativeLinksMethod) {
|
||||
if (process.env.DEBUG === 'true') options.logger.info(`[${orgId}]: Start PlaywrightWebBaseLoader ${relativeLinksMethod}`)
|
||||
// if limit is 0 we don't want it to default to 10 so we check explicitly for null or undefined
|
||||
@@ -225,7 +251,10 @@ class Playwright_DocumentLoaders implements INode {
|
||||
options.logger.info(`[${orgId}]: PlaywrightWebBaseLoader pages: ${JSON.stringify(pages)}, length: ${pages.length}`)
|
||||
if (!pages || pages.length === 0) throw new Error('No relative links found')
|
||||
for (const page of pages) {
|
||||
docs.push(...(await playwrightLoader(page)))
|
||||
const result = await playwrightLoader(page)
|
||||
if (result) {
|
||||
docs.push(...result)
|
||||
}
|
||||
}
|
||||
if (process.env.DEBUG === 'true') options.logger.info(`[${orgId}]: Finish PlaywrightWebBaseLoader ${relativeLinksMethod}`)
|
||||
} else if (selectedLinks && selectedLinks.length > 0) {
|
||||
@@ -234,10 +263,16 @@ class Playwright_DocumentLoaders implements INode {
|
||||
`[${orgId}]: PlaywrightWebBaseLoader pages: ${JSON.stringify(selectedLinks)}, length: ${selectedLinks.length}`
|
||||
)
|
||||
for (const page of selectedLinks.slice(0, limit)) {
|
||||
docs.push(...(await playwrightLoader(page)))
|
||||
const result = await playwrightLoader(page)
|
||||
if (result) {
|
||||
docs.push(...result)
|
||||
}
|
||||
}
|
||||
} else {
|
||||
docs = await playwrightLoader(url)
|
||||
const result = await playwrightLoader(url)
|
||||
if (result) {
|
||||
docs.push(...result)
|
||||
}
|
||||
}
|
||||
|
||||
if (metadata) {
|
||||
|
||||
@@ -1,10 +1,11 @@
|
||||
import { omit } from 'lodash'
|
||||
import { ICommonObject, IDocument, INode, INodeData, INodeParams } from '../../../src/Interface'
|
||||
import { TextSplitter } from 'langchain/text_splitter'
|
||||
import { Browser, Page, PuppeteerWebBaseLoader, PuppeteerWebBaseLoaderOptions } from '@langchain/community/document_loaders/web/puppeteer'
|
||||
import { Document } from '@langchain/core/documents'
|
||||
import { TextSplitter } from 'langchain/text_splitter'
|
||||
import { test } from 'linkifyjs'
|
||||
import { handleEscapeCharacters, INodeOutputsValue, webCrawl, xmlScrape } from '../../../src'
|
||||
import { omit } from 'lodash'
|
||||
import { PuppeteerLifeCycleEvent } from 'puppeteer'
|
||||
import { handleEscapeCharacters, INodeOutputsValue, webCrawl, xmlScrape } from '../../../src'
|
||||
import { ICommonObject, INode, INodeData, INodeParams } from '../../../src/Interface'
|
||||
|
||||
class Puppeteer_DocumentLoaders implements INode {
|
||||
label: string
|
||||
@@ -109,6 +110,14 @@ class Puppeteer_DocumentLoaders implements INode {
|
||||
additionalParams: true,
|
||||
description: 'CSS selectors like .div or #div'
|
||||
},
|
||||
{
|
||||
label: 'CSS Selector (Optional)',
|
||||
name: 'cssSelector',
|
||||
type: 'string',
|
||||
description: 'Only content inside this selector will be extracted. Leave empty to use the entire page body.',
|
||||
optional: true,
|
||||
additionalParams: true
|
||||
},
|
||||
{
|
||||
label: 'Additional Metadata',
|
||||
name: 'metadata',
|
||||
@@ -151,8 +160,9 @@ class Puppeteer_DocumentLoaders implements INode {
|
||||
const relativeLinksMethod = nodeData.inputs?.relativeLinksMethod as string
|
||||
const selectedLinks = nodeData.inputs?.selectedLinks as string[]
|
||||
let limit = parseInt(nodeData.inputs?.limit as string)
|
||||
let waitUntilGoToOption = nodeData.inputs?.waitUntilGoToOption as PuppeteerLifeCycleEvent
|
||||
let waitForSelector = nodeData.inputs?.waitForSelector as string
|
||||
const waitUntilGoToOption = nodeData.inputs?.waitUntilGoToOption as PuppeteerLifeCycleEvent
|
||||
const waitForSelector = nodeData.inputs?.waitForSelector as string
|
||||
const cssSelector = nodeData.inputs?.cssSelector as string
|
||||
const _omitMetadataKeys = nodeData.inputs?.omitMetadataKeys as string
|
||||
const output = nodeData.outputs?.output as string
|
||||
const orgId = options.orgId
|
||||
@@ -168,13 +178,14 @@ class Puppeteer_DocumentLoaders implements INode {
|
||||
throw new Error('Invalid URL')
|
||||
}
|
||||
|
||||
async function puppeteerLoader(url: string): Promise<any> {
|
||||
async function puppeteerLoader(url: string): Promise<Document[] | undefined> {
|
||||
try {
|
||||
let docs = []
|
||||
let docs: Document[] = []
|
||||
const config: PuppeteerWebBaseLoaderOptions = {
|
||||
launchOptions: {
|
||||
args: ['--no-sandbox'],
|
||||
headless: 'new'
|
||||
headless: 'new',
|
||||
executablePath: process.env.PUPPETEER_EXECUTABLE_FILE_PATH
|
||||
}
|
||||
}
|
||||
if (waitUntilGoToOption) {
|
||||
@@ -182,12 +193,22 @@ class Puppeteer_DocumentLoaders implements INode {
|
||||
waitUntil: waitUntilGoToOption
|
||||
}
|
||||
}
|
||||
if (waitForSelector) {
|
||||
if (cssSelector || waitForSelector) {
|
||||
config['evaluate'] = async (page: Page, _: Browser): Promise<string> => {
|
||||
await page.waitForSelector(waitForSelector)
|
||||
if (waitForSelector) {
|
||||
await page.waitForSelector(waitForSelector)
|
||||
}
|
||||
|
||||
const result = await page.evaluate(() => document.body.innerHTML)
|
||||
return result
|
||||
if (cssSelector) {
|
||||
const selectorHandle = await page.$(cssSelector)
|
||||
const result = await page.evaluate(
|
||||
(htmlSelection) => htmlSelection?.innerHTML ?? document.body.innerHTML,
|
||||
selectorHandle
|
||||
)
|
||||
return result
|
||||
} else {
|
||||
return await page.evaluate(() => document.body.innerHTML)
|
||||
}
|
||||
}
|
||||
}
|
||||
const loader = new PuppeteerWebBaseLoader(url, config)
|
||||
@@ -204,7 +225,7 @@ class Puppeteer_DocumentLoaders implements INode {
|
||||
}
|
||||
}
|
||||
|
||||
let docs: IDocument[] = []
|
||||
let docs: Document[] = []
|
||||
if (relativeLinksMethod) {
|
||||
if (process.env.DEBUG === 'true') options.logger.info(`[${orgId}]: Start PuppeteerWebBaseLoader ${relativeLinksMethod}`)
|
||||
// if limit is 0 we don't want it to default to 10 so we check explicitly for null or undefined
|
||||
@@ -221,7 +242,10 @@ class Puppeteer_DocumentLoaders implements INode {
|
||||
options.logger.info(`[${orgId}]: PuppeteerWebBaseLoader pages: ${JSON.stringify(pages)}, length: ${pages.length}`)
|
||||
if (!pages || pages.length === 0) throw new Error('No relative links found')
|
||||
for (const page of pages) {
|
||||
docs.push(...(await puppeteerLoader(page)))
|
||||
const result = await puppeteerLoader(page)
|
||||
if (result) {
|
||||
docs.push(...result)
|
||||
}
|
||||
}
|
||||
if (process.env.DEBUG === 'true') options.logger.info(`[${orgId}]: Finish PuppeteerWebBaseLoader ${relativeLinksMethod}`)
|
||||
} else if (selectedLinks && selectedLinks.length > 0) {
|
||||
@@ -230,10 +254,16 @@ class Puppeteer_DocumentLoaders implements INode {
|
||||
`[${orgId}]: PuppeteerWebBaseLoader pages: ${JSON.stringify(selectedLinks)}, length: ${selectedLinks.length}`
|
||||
)
|
||||
for (const page of selectedLinks.slice(0, limit)) {
|
||||
docs.push(...(await puppeteerLoader(page)))
|
||||
const result = await puppeteerLoader(page)
|
||||
if (result) {
|
||||
docs.push(...result)
|
||||
}
|
||||
}
|
||||
} else {
|
||||
docs = await puppeteerLoader(url)
|
||||
const result = await puppeteerLoader(url)
|
||||
if (result) {
|
||||
docs.push(...result)
|
||||
}
|
||||
}
|
||||
|
||||
if (metadata) {
|
||||
|
||||
@@ -169,4 +169,12 @@ JWT_REFRESH_TOKEN_EXPIRY_IN_MINUTES=43200
|
||||
############################################## SECURITY ####################################################
|
||||
############################################################################################################
|
||||
|
||||
# HTTP_DENY_LIST=
|
||||
# HTTP_DENY_LIST=
|
||||
|
||||
|
||||
############################################################################################################
|
||||
########################################### DOCUMENT LOADERS ###############################################
|
||||
############################################################################################################
|
||||
|
||||
# PUPPETEER_EXECUTABLE_FILE_PATH='C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe'
|
||||
# PLAYWRIGHT_EXECUTABLE_FILE_PATH='C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe'
|
||||
@@ -1,7 +1,7 @@
|
||||
import PropTypes from 'prop-types'
|
||||
import { useEffect, useState } from 'react'
|
||||
import { createPortal } from 'react-dom'
|
||||
import { useDispatch } from 'react-redux'
|
||||
import { useState, useEffect } from 'react'
|
||||
|
||||
import {
|
||||
Box,
|
||||
@@ -16,11 +16,11 @@ import {
|
||||
Stack,
|
||||
Typography
|
||||
} from '@mui/material'
|
||||
import { IconEraser, IconTrash, IconX } from '@tabler/icons-react'
|
||||
import { IconEraser, IconPlus, IconTrash, IconX } from '@tabler/icons-react'
|
||||
import PerfectScrollbar from 'react-perfect-scrollbar'
|
||||
|
||||
import { BackdropLoader } from '@/ui-component/loading/BackdropLoader'
|
||||
import { StyledButton } from '@/ui-component/button/StyledButton'
|
||||
import { BackdropLoader } from '@/ui-component/loading/BackdropLoader'
|
||||
|
||||
import scraperApi from '@/api/scraper'
|
||||
|
||||
@@ -29,8 +29,8 @@ import useNotifier from '@/utils/useNotifier'
|
||||
import {
|
||||
HIDE_CANVAS_DIALOG,
|
||||
SHOW_CANVAS_DIALOG,
|
||||
enqueueSnackbar as enqueueSnackbarAction,
|
||||
closeSnackbar as closeSnackbarAction
|
||||
closeSnackbar as closeSnackbarAction,
|
||||
enqueueSnackbar as enqueueSnackbarAction
|
||||
} from '@/store/actions'
|
||||
|
||||
const ManageScrapedLinksDialog = ({ show, dialogProps, onCancel, onSave }) => {
|
||||
@@ -112,6 +112,10 @@ const ManageScrapedLinksDialog = ({ show, dialogProps, onCancel, onSave }) => {
|
||||
setSelectedLinks(links)
|
||||
}
|
||||
|
||||
const handleAddLink = () => {
|
||||
setSelectedLinks([...selectedLinks, ''])
|
||||
}
|
||||
|
||||
const handleRemoveAllLinks = () => {
|
||||
setSelectedLinks([])
|
||||
}
|
||||
@@ -160,6 +164,16 @@ const ManageScrapedLinksDialog = ({ show, dialogProps, onCancel, onSave }) => {
|
||||
</Box>
|
||||
<Box sx={{ display: 'flex', alignItems: 'center', justifyContent: 'space-between', mb: 1.5 }}>
|
||||
<Typography sx={{ fontWeight: 500 }}>Scraped Links</Typography>
|
||||
<Box sx={{ width: 'auto', flexGrow: 1 }}>
|
||||
<IconButton
|
||||
sx={{ height: 30, width: 30, marginLeft: '8px' }}
|
||||
size='small'
|
||||
color='primary'
|
||||
onClick={() => handleAddLink()}
|
||||
>
|
||||
<IconPlus />
|
||||
</IconButton>
|
||||
</Box>
|
||||
{selectedLinks.length > 0 ? (
|
||||
<Button
|
||||
sx={{ height: 'max-content', width: 'max-content' }}
|
||||
|
||||
Reference in New Issue
Block a user