Puppeteer / Playwright web crawler bug fixes/improvements (#4998)

* feature/bugfix: added otpional css selector to puppeteer web scraper, fixed error when puppeteerLoader does not work.

* feature: added button to add empty link in web scraper tools

* feature: added custom executable file path as an input to puppeteer to fix issues when puppeteer can not find/launch the browser.

* feature: added new puppeteer features to playwright aswell.

* fixed review comments
This commit is contained in:
Mewyii
2025-08-08 20:46:59 +02:00
committed by GitHub
parent fddd40a5cd
commit 9c070c7205
4 changed files with 125 additions and 38 deletions
@@ -1,14 +1,15 @@
import { omit } from 'lodash'
import { ICommonObject, IDocument, INode, INodeData, INodeParams } from '../../../src/Interface'
import { TextSplitter } from 'langchain/text_splitter'
import {
Browser,
Page,
PlaywrightWebBaseLoader,
PlaywrightWebBaseLoaderOptions
} from '@langchain/community/document_loaders/web/playwright'
import { Document } from '@langchain/core/documents'
import { TextSplitter } from 'langchain/text_splitter'
import { test } from 'linkifyjs'
import { omit } from 'lodash'
import { handleEscapeCharacters, INodeOutputsValue, webCrawl, xmlScrape } from '../../../src'
import { ICommonObject, INode, INodeData, INodeParams } from '../../../src/Interface'
class Playwright_DocumentLoaders implements INode {
label: string
@@ -113,6 +114,14 @@ class Playwright_DocumentLoaders implements INode {
additionalParams: true,
description: 'CSS selectors like .div or #div'
},
{
label: 'CSS Selector (Optional)',
name: 'cssSelector',
type: 'string',
description: 'Only content inside this selector will be extracted. Leave empty to use the entire page body.',
optional: true,
additionalParams: true
},
{
label: 'Additional Metadata',
name: 'metadata',
@@ -155,8 +164,14 @@ class Playwright_DocumentLoaders implements INode {
const relativeLinksMethod = nodeData.inputs?.relativeLinksMethod as string
const selectedLinks = nodeData.inputs?.selectedLinks as string[]
let limit = parseInt(nodeData.inputs?.limit as string)
let waitUntilGoToOption = nodeData.inputs?.waitUntilGoToOption as 'load' | 'domcontentloaded' | 'networkidle' | 'commit' | undefined
let waitForSelector = nodeData.inputs?.waitForSelector as string
const waitUntilGoToOption = nodeData.inputs?.waitUntilGoToOption as
| 'load'
| 'domcontentloaded'
| 'networkidle'
| 'commit'
| undefined
const waitForSelector = nodeData.inputs?.waitForSelector as string
const cssSelector = nodeData.inputs?.cssSelector as string
const _omitMetadataKeys = nodeData.inputs?.omitMetadataKeys as string
const output = nodeData.outputs?.output as string
const orgId = options.orgId
@@ -172,13 +187,14 @@ class Playwright_DocumentLoaders implements INode {
throw new Error('Invalid URL')
}
async function playwrightLoader(url: string): Promise<any> {
async function playwrightLoader(url: string): Promise<Document[] | undefined> {
try {
let docs = []
const config: PlaywrightWebBaseLoaderOptions = {
launchOptions: {
args: ['--no-sandbox'],
headless: true
headless: true,
executablePath: process.env.PLAYWRIGHT_EXECUTABLE_FILE_PATH
}
}
if (waitUntilGoToOption) {
@@ -186,12 +202,22 @@ class Playwright_DocumentLoaders implements INode {
waitUntil: waitUntilGoToOption
}
}
if (waitForSelector) {
if (cssSelector || waitForSelector) {
config['evaluate'] = async (page: Page, _: Browser): Promise<string> => {
await page.waitForSelector(waitForSelector)
if (waitForSelector) {
await page.waitForSelector(waitForSelector)
}
const result = await page.evaluate(() => document.body.innerHTML)
return result
if (cssSelector) {
const selectorHandle = await page.$(cssSelector)
const result = await page.evaluate(
(htmlSelection) => htmlSelection?.innerHTML ?? document.body.innerHTML,
selectorHandle
)
return result
} else {
return await page.evaluate(() => document.body.innerHTML)
}
}
}
const loader = new PlaywrightWebBaseLoader(url, config)
@@ -208,7 +234,7 @@ class Playwright_DocumentLoaders implements INode {
}
}
let docs: IDocument[] = []
let docs: Document[] = []
if (relativeLinksMethod) {
if (process.env.DEBUG === 'true') options.logger.info(`[${orgId}]: Start PlaywrightWebBaseLoader ${relativeLinksMethod}`)
// if limit is 0 we don't want it to default to 10 so we check explicitly for null or undefined
@@ -225,7 +251,10 @@ class Playwright_DocumentLoaders implements INode {
options.logger.info(`[${orgId}]: PlaywrightWebBaseLoader pages: ${JSON.stringify(pages)}, length: ${pages.length}`)
if (!pages || pages.length === 0) throw new Error('No relative links found')
for (const page of pages) {
docs.push(...(await playwrightLoader(page)))
const result = await playwrightLoader(page)
if (result) {
docs.push(...result)
}
}
if (process.env.DEBUG === 'true') options.logger.info(`[${orgId}]: Finish PlaywrightWebBaseLoader ${relativeLinksMethod}`)
} else if (selectedLinks && selectedLinks.length > 0) {
@@ -234,10 +263,16 @@ class Playwright_DocumentLoaders implements INode {
`[${orgId}]: PlaywrightWebBaseLoader pages: ${JSON.stringify(selectedLinks)}, length: ${selectedLinks.length}`
)
for (const page of selectedLinks.slice(0, limit)) {
docs.push(...(await playwrightLoader(page)))
const result = await playwrightLoader(page)
if (result) {
docs.push(...result)
}
}
} else {
docs = await playwrightLoader(url)
const result = await playwrightLoader(url)
if (result) {
docs.push(...result)
}
}
if (metadata) {
@@ -1,10 +1,11 @@
import { omit } from 'lodash'
import { ICommonObject, IDocument, INode, INodeData, INodeParams } from '../../../src/Interface'
import { TextSplitter } from 'langchain/text_splitter'
import { Browser, Page, PuppeteerWebBaseLoader, PuppeteerWebBaseLoaderOptions } from '@langchain/community/document_loaders/web/puppeteer'
import { Document } from '@langchain/core/documents'
import { TextSplitter } from 'langchain/text_splitter'
import { test } from 'linkifyjs'
import { handleEscapeCharacters, INodeOutputsValue, webCrawl, xmlScrape } from '../../../src'
import { omit } from 'lodash'
import { PuppeteerLifeCycleEvent } from 'puppeteer'
import { handleEscapeCharacters, INodeOutputsValue, webCrawl, xmlScrape } from '../../../src'
import { ICommonObject, INode, INodeData, INodeParams } from '../../../src/Interface'
class Puppeteer_DocumentLoaders implements INode {
label: string
@@ -109,6 +110,14 @@ class Puppeteer_DocumentLoaders implements INode {
additionalParams: true,
description: 'CSS selectors like .div or #div'
},
{
label: 'CSS Selector (Optional)',
name: 'cssSelector',
type: 'string',
description: 'Only content inside this selector will be extracted. Leave empty to use the entire page body.',
optional: true,
additionalParams: true
},
{
label: 'Additional Metadata',
name: 'metadata',
@@ -151,8 +160,9 @@ class Puppeteer_DocumentLoaders implements INode {
const relativeLinksMethod = nodeData.inputs?.relativeLinksMethod as string
const selectedLinks = nodeData.inputs?.selectedLinks as string[]
let limit = parseInt(nodeData.inputs?.limit as string)
let waitUntilGoToOption = nodeData.inputs?.waitUntilGoToOption as PuppeteerLifeCycleEvent
let waitForSelector = nodeData.inputs?.waitForSelector as string
const waitUntilGoToOption = nodeData.inputs?.waitUntilGoToOption as PuppeteerLifeCycleEvent
const waitForSelector = nodeData.inputs?.waitForSelector as string
const cssSelector = nodeData.inputs?.cssSelector as string
const _omitMetadataKeys = nodeData.inputs?.omitMetadataKeys as string
const output = nodeData.outputs?.output as string
const orgId = options.orgId
@@ -168,13 +178,14 @@ class Puppeteer_DocumentLoaders implements INode {
throw new Error('Invalid URL')
}
async function puppeteerLoader(url: string): Promise<any> {
async function puppeteerLoader(url: string): Promise<Document[] | undefined> {
try {
let docs = []
let docs: Document[] = []
const config: PuppeteerWebBaseLoaderOptions = {
launchOptions: {
args: ['--no-sandbox'],
headless: 'new'
headless: 'new',
executablePath: process.env.PUPPETEER_EXECUTABLE_FILE_PATH
}
}
if (waitUntilGoToOption) {
@@ -182,12 +193,22 @@ class Puppeteer_DocumentLoaders implements INode {
waitUntil: waitUntilGoToOption
}
}
if (waitForSelector) {
if (cssSelector || waitForSelector) {
config['evaluate'] = async (page: Page, _: Browser): Promise<string> => {
await page.waitForSelector(waitForSelector)
if (waitForSelector) {
await page.waitForSelector(waitForSelector)
}
const result = await page.evaluate(() => document.body.innerHTML)
return result
if (cssSelector) {
const selectorHandle = await page.$(cssSelector)
const result = await page.evaluate(
(htmlSelection) => htmlSelection?.innerHTML ?? document.body.innerHTML,
selectorHandle
)
return result
} else {
return await page.evaluate(() => document.body.innerHTML)
}
}
}
const loader = new PuppeteerWebBaseLoader(url, config)
@@ -204,7 +225,7 @@ class Puppeteer_DocumentLoaders implements INode {
}
}
let docs: IDocument[] = []
let docs: Document[] = []
if (relativeLinksMethod) {
if (process.env.DEBUG === 'true') options.logger.info(`[${orgId}]: Start PuppeteerWebBaseLoader ${relativeLinksMethod}`)
// if limit is 0 we don't want it to default to 10 so we check explicitly for null or undefined
@@ -221,7 +242,10 @@ class Puppeteer_DocumentLoaders implements INode {
options.logger.info(`[${orgId}]: PuppeteerWebBaseLoader pages: ${JSON.stringify(pages)}, length: ${pages.length}`)
if (!pages || pages.length === 0) throw new Error('No relative links found')
for (const page of pages) {
docs.push(...(await puppeteerLoader(page)))
const result = await puppeteerLoader(page)
if (result) {
docs.push(...result)
}
}
if (process.env.DEBUG === 'true') options.logger.info(`[${orgId}]: Finish PuppeteerWebBaseLoader ${relativeLinksMethod}`)
} else if (selectedLinks && selectedLinks.length > 0) {
@@ -230,10 +254,16 @@ class Puppeteer_DocumentLoaders implements INode {
`[${orgId}]: PuppeteerWebBaseLoader pages: ${JSON.stringify(selectedLinks)}, length: ${selectedLinks.length}`
)
for (const page of selectedLinks.slice(0, limit)) {
docs.push(...(await puppeteerLoader(page)))
const result = await puppeteerLoader(page)
if (result) {
docs.push(...result)
}
}
} else {
docs = await puppeteerLoader(url)
const result = await puppeteerLoader(url)
if (result) {
docs.push(...result)
}
}
if (metadata) {