Puppeteer / Playwright web crawler bug fixes/improvements (#4998)

* feature/bugfix: added optional CSS selector to puppeteer web scraper, fixed error when puppeteerLoader does not work.

* feature: added button to add empty link in web scraper tools

* feature: added custom executable file path as an input to puppeteer to fix issues when puppeteer cannot find/launch the browser.

* feature: added new puppeteer features to playwright as well.

* fixed review comments
This commit is contained in:
Mewyii
2025-08-08 20:46:59 +02:00
committed by GitHub
parent fddd40a5cd
commit 9c070c7205
4 changed files with 125 additions and 38 deletions
@@ -1,14 +1,15 @@
import { omit } from 'lodash'
import { ICommonObject, IDocument, INode, INodeData, INodeParams } from '../../../src/Interface'
import { TextSplitter } from 'langchain/text_splitter'
import {
Browser,
Page,
PlaywrightWebBaseLoader,
PlaywrightWebBaseLoaderOptions
} from '@langchain/community/document_loaders/web/playwright'
import { Document } from '@langchain/core/documents'
import { TextSplitter } from 'langchain/text_splitter'
import { test } from 'linkifyjs'
import { omit } from 'lodash'
import { handleEscapeCharacters, INodeOutputsValue, webCrawl, xmlScrape } from '../../../src'
import { ICommonObject, INode, INodeData, INodeParams } from '../../../src/Interface'
class Playwright_DocumentLoaders implements INode {
label: string
@@ -113,6 +114,14 @@ class Playwright_DocumentLoaders implements INode {
additionalParams: true,
description: 'CSS selectors like .div or #div'
},
{
label: 'CSS Selector (Optional)',
name: 'cssSelector',
type: 'string',
description: 'Only content inside this selector will be extracted. Leave empty to use the entire page body.',
optional: true,
additionalParams: true
},
{
label: 'Additional Metadata',
name: 'metadata',
@@ -155,8 +164,14 @@ class Playwright_DocumentLoaders implements INode {
const relativeLinksMethod = nodeData.inputs?.relativeLinksMethod as string
const selectedLinks = nodeData.inputs?.selectedLinks as string[]
let limit = parseInt(nodeData.inputs?.limit as string)
let waitUntilGoToOption = nodeData.inputs?.waitUntilGoToOption as 'load' | 'domcontentloaded' | 'networkidle' | 'commit' | undefined
let waitForSelector = nodeData.inputs?.waitForSelector as string
const waitUntilGoToOption = nodeData.inputs?.waitUntilGoToOption as
| 'load'
| 'domcontentloaded'
| 'networkidle'
| 'commit'
| undefined
const waitForSelector = nodeData.inputs?.waitForSelector as string
const cssSelector = nodeData.inputs?.cssSelector as string
const _omitMetadataKeys = nodeData.inputs?.omitMetadataKeys as string
const output = nodeData.outputs?.output as string
const orgId = options.orgId
@@ -172,13 +187,14 @@ class Playwright_DocumentLoaders implements INode {
throw new Error('Invalid URL')
}
async function playwrightLoader(url: string): Promise<any> {
async function playwrightLoader(url: string): Promise<Document[] | undefined> {
try {
let docs = []
const config: PlaywrightWebBaseLoaderOptions = {
launchOptions: {
args: ['--no-sandbox'],
headless: true
headless: true,
executablePath: process.env.PLAYWRIGHT_EXECUTABLE_FILE_PATH
}
}
if (waitUntilGoToOption) {
@@ -186,12 +202,22 @@ class Playwright_DocumentLoaders implements INode {
waitUntil: waitUntilGoToOption
}
}
if (waitForSelector) {
if (cssSelector || waitForSelector) {
config['evaluate'] = async (page: Page, _: Browser): Promise<string> => {
await page.waitForSelector(waitForSelector)
if (waitForSelector) {
await page.waitForSelector(waitForSelector)
}
const result = await page.evaluate(() => document.body.innerHTML)
return result
if (cssSelector) {
const selectorHandle = await page.$(cssSelector)
const result = await page.evaluate(
(htmlSelection) => htmlSelection?.innerHTML ?? document.body.innerHTML,
selectorHandle
)
return result
} else {
return await page.evaluate(() => document.body.innerHTML)
}
}
}
const loader = new PlaywrightWebBaseLoader(url, config)
@@ -208,7 +234,7 @@ class Playwright_DocumentLoaders implements INode {
}
}
let docs: IDocument[] = []
let docs: Document[] = []
if (relativeLinksMethod) {
if (process.env.DEBUG === 'true') options.logger.info(`[${orgId}]: Start PlaywrightWebBaseLoader ${relativeLinksMethod}`)
// if limit is 0 we don't want it to default to 10 so we check explicitly for null or undefined
@@ -225,7 +251,10 @@ class Playwright_DocumentLoaders implements INode {
options.logger.info(`[${orgId}]: PlaywrightWebBaseLoader pages: ${JSON.stringify(pages)}, length: ${pages.length}`)
if (!pages || pages.length === 0) throw new Error('No relative links found')
for (const page of pages) {
docs.push(...(await playwrightLoader(page)))
const result = await playwrightLoader(page)
if (result) {
docs.push(...result)
}
}
if (process.env.DEBUG === 'true') options.logger.info(`[${orgId}]: Finish PlaywrightWebBaseLoader ${relativeLinksMethod}`)
} else if (selectedLinks && selectedLinks.length > 0) {
@@ -234,10 +263,16 @@ class Playwright_DocumentLoaders implements INode {
`[${orgId}]: PlaywrightWebBaseLoader pages: ${JSON.stringify(selectedLinks)}, length: ${selectedLinks.length}`
)
for (const page of selectedLinks.slice(0, limit)) {
docs.push(...(await playwrightLoader(page)))
const result = await playwrightLoader(page)
if (result) {
docs.push(...result)
}
}
} else {
docs = await playwrightLoader(url)
const result = await playwrightLoader(url)
if (result) {
docs.push(...result)
}
}
if (metadata) {
@@ -1,10 +1,11 @@
import { omit } from 'lodash'
import { ICommonObject, IDocument, INode, INodeData, INodeParams } from '../../../src/Interface'
import { TextSplitter } from 'langchain/text_splitter'
import { Browser, Page, PuppeteerWebBaseLoader, PuppeteerWebBaseLoaderOptions } from '@langchain/community/document_loaders/web/puppeteer'
import { Document } from '@langchain/core/documents'
import { TextSplitter } from 'langchain/text_splitter'
import { test } from 'linkifyjs'
import { handleEscapeCharacters, INodeOutputsValue, webCrawl, xmlScrape } from '../../../src'
import { omit } from 'lodash'
import { PuppeteerLifeCycleEvent } from 'puppeteer'
import { handleEscapeCharacters, INodeOutputsValue, webCrawl, xmlScrape } from '../../../src'
import { ICommonObject, INode, INodeData, INodeParams } from '../../../src/Interface'
class Puppeteer_DocumentLoaders implements INode {
label: string
@@ -109,6 +110,14 @@ class Puppeteer_DocumentLoaders implements INode {
additionalParams: true,
description: 'CSS selectors like .div or #div'
},
{
label: 'CSS Selector (Optional)',
name: 'cssSelector',
type: 'string',
description: 'Only content inside this selector will be extracted. Leave empty to use the entire page body.',
optional: true,
additionalParams: true
},
{
label: 'Additional Metadata',
name: 'metadata',
@@ -151,8 +160,9 @@ class Puppeteer_DocumentLoaders implements INode {
const relativeLinksMethod = nodeData.inputs?.relativeLinksMethod as string
const selectedLinks = nodeData.inputs?.selectedLinks as string[]
let limit = parseInt(nodeData.inputs?.limit as string)
let waitUntilGoToOption = nodeData.inputs?.waitUntilGoToOption as PuppeteerLifeCycleEvent
let waitForSelector = nodeData.inputs?.waitForSelector as string
const waitUntilGoToOption = nodeData.inputs?.waitUntilGoToOption as PuppeteerLifeCycleEvent
const waitForSelector = nodeData.inputs?.waitForSelector as string
const cssSelector = nodeData.inputs?.cssSelector as string
const _omitMetadataKeys = nodeData.inputs?.omitMetadataKeys as string
const output = nodeData.outputs?.output as string
const orgId = options.orgId
@@ -168,13 +178,14 @@ class Puppeteer_DocumentLoaders implements INode {
throw new Error('Invalid URL')
}
async function puppeteerLoader(url: string): Promise<any> {
async function puppeteerLoader(url: string): Promise<Document[] | undefined> {
try {
let docs = []
let docs: Document[] = []
const config: PuppeteerWebBaseLoaderOptions = {
launchOptions: {
args: ['--no-sandbox'],
headless: 'new'
headless: 'new',
executablePath: process.env.PUPPETEER_EXECUTABLE_FILE_PATH
}
}
if (waitUntilGoToOption) {
@@ -182,12 +193,22 @@ class Puppeteer_DocumentLoaders implements INode {
waitUntil: waitUntilGoToOption
}
}
if (waitForSelector) {
if (cssSelector || waitForSelector) {
config['evaluate'] = async (page: Page, _: Browser): Promise<string> => {
await page.waitForSelector(waitForSelector)
if (waitForSelector) {
await page.waitForSelector(waitForSelector)
}
const result = await page.evaluate(() => document.body.innerHTML)
return result
if (cssSelector) {
const selectorHandle = await page.$(cssSelector)
const result = await page.evaluate(
(htmlSelection) => htmlSelection?.innerHTML ?? document.body.innerHTML,
selectorHandle
)
return result
} else {
return await page.evaluate(() => document.body.innerHTML)
}
}
}
const loader = new PuppeteerWebBaseLoader(url, config)
@@ -204,7 +225,7 @@ class Puppeteer_DocumentLoaders implements INode {
}
}
let docs: IDocument[] = []
let docs: Document[] = []
if (relativeLinksMethod) {
if (process.env.DEBUG === 'true') options.logger.info(`[${orgId}]: Start PuppeteerWebBaseLoader ${relativeLinksMethod}`)
// if limit is 0 we don't want it to default to 10 so we check explicitly for null or undefined
@@ -221,7 +242,10 @@ class Puppeteer_DocumentLoaders implements INode {
options.logger.info(`[${orgId}]: PuppeteerWebBaseLoader pages: ${JSON.stringify(pages)}, length: ${pages.length}`)
if (!pages || pages.length === 0) throw new Error('No relative links found')
for (const page of pages) {
docs.push(...(await puppeteerLoader(page)))
const result = await puppeteerLoader(page)
if (result) {
docs.push(...result)
}
}
if (process.env.DEBUG === 'true') options.logger.info(`[${orgId}]: Finish PuppeteerWebBaseLoader ${relativeLinksMethod}`)
} else if (selectedLinks && selectedLinks.length > 0) {
@@ -230,10 +254,16 @@ class Puppeteer_DocumentLoaders implements INode {
`[${orgId}]: PuppeteerWebBaseLoader pages: ${JSON.stringify(selectedLinks)}, length: ${selectedLinks.length}`
)
for (const page of selectedLinks.slice(0, limit)) {
docs.push(...(await puppeteerLoader(page)))
const result = await puppeteerLoader(page)
if (result) {
docs.push(...result)
}
}
} else {
docs = await puppeteerLoader(url)
const result = await puppeteerLoader(url)
if (result) {
docs.push(...result)
}
}
if (metadata) {
+9 -1
View File
@@ -169,4 +169,12 @@ JWT_REFRESH_TOKEN_EXPIRY_IN_MINUTES=43200
############################################## SECURITY ####################################################
############################################################################################################
# HTTP_DENY_LIST=
# HTTP_DENY_LIST=
############################################################################################################
########################################### DOCUMENT LOADERS ###############################################
############################################################################################################
# PUPPETEER_EXECUTABLE_FILE_PATH='C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe'
# PLAYWRIGHT_EXECUTABLE_FILE_PATH='C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe'
@@ -1,7 +1,7 @@
import PropTypes from 'prop-types'
import { useEffect, useState } from 'react'
import { createPortal } from 'react-dom'
import { useDispatch } from 'react-redux'
import { useState, useEffect } from 'react'
import {
Box,
@@ -16,11 +16,11 @@ import {
Stack,
Typography
} from '@mui/material'
import { IconEraser, IconTrash, IconX } from '@tabler/icons-react'
import { IconEraser, IconPlus, IconTrash, IconX } from '@tabler/icons-react'
import PerfectScrollbar from 'react-perfect-scrollbar'
import { BackdropLoader } from '@/ui-component/loading/BackdropLoader'
import { StyledButton } from '@/ui-component/button/StyledButton'
import { BackdropLoader } from '@/ui-component/loading/BackdropLoader'
import scraperApi from '@/api/scraper'
@@ -29,8 +29,8 @@ import useNotifier from '@/utils/useNotifier'
import {
HIDE_CANVAS_DIALOG,
SHOW_CANVAS_DIALOG,
enqueueSnackbar as enqueueSnackbarAction,
closeSnackbar as closeSnackbarAction
closeSnackbar as closeSnackbarAction,
enqueueSnackbar as enqueueSnackbarAction
} from '@/store/actions'
const ManageScrapedLinksDialog = ({ show, dialogProps, onCancel, onSave }) => {
@@ -112,6 +112,10 @@ const ManageScrapedLinksDialog = ({ show, dialogProps, onCancel, onSave }) => {
setSelectedLinks(links)
}
// Appends an empty link entry to the selection.
// Uses the functional updater form so the new entry is appended to the latest
// state rather than to the `selectedLinks` value captured when this handler was
// created; otherwise several clicks handled in the same batch would each spread
// the same stale array and all but one addition would be lost.
const handleAddLink = () => {
    setSelectedLinks((currentLinks) => [...currentLinks, ''])
}
// Clears the selection by replacing it with a fresh empty array.
const handleRemoveAllLinks = () => {
    const noLinks = []
    setSelectedLinks(noLinks)
}
@@ -160,6 +164,16 @@ const ManageScrapedLinksDialog = ({ show, dialogProps, onCancel, onSave }) => {
</Box>
<Box sx={{ display: 'flex', alignItems: 'center', justifyContent: 'space-between', mb: 1.5 }}>
<Typography sx={{ fontWeight: 500 }}>Scraped Links</Typography>
<Box sx={{ width: 'auto', flexGrow: 1 }}>
<IconButton
sx={{ height: 30, width: 30, marginLeft: '8px' }}
size='small'
color='primary'
onClick={() => handleAddLink()}
>
<IconPlus />
</IconButton>
</Box>
{selectedLinks.length > 0 ? (
<Button
sx={{ height: 'max-content', width: 'max-content' }}