From 9c070c720525460b7c7e0073faee13b1cc8ceea3 Mon Sep 17 00:00:00 2001 From: Mewyii <60858755+Mewyii@users.noreply.github.com> Date: Fri, 8 Aug 2025 20:46:59 +0200 Subject: [PATCH] Puppeteer / Playwright web crawler bug fixes/improvements (#4998) * feature/bugfix: added optional css selector to puppeteer web scraper, fixed error when puppeteerLoader does not work. * feature: added button to add empty link in web scraper tools * feature: added custom executable file path as an input to puppeteer to fix issues when puppeteer cannot find/launch the browser. * feature: added new puppeteer features to playwright as well. * fixed review comments --- .../documentloaders/Playwright/Playwright.ts | 65 ++++++++++++++----- .../documentloaders/Puppeteer/Puppeteer.ts | 64 +++++++++++++----- packages/server/.env.example | 10 ++- .../dialog/ManageScrapedLinksDialog.jsx | 24 +++++-- 4 files changed, 125 insertions(+), 38 deletions(-) diff --git a/packages/components/nodes/documentloaders/Playwright/Playwright.ts b/packages/components/nodes/documentloaders/Playwright/Playwright.ts index ba44ee7f..c3b090e8 100644 --- a/packages/components/nodes/documentloaders/Playwright/Playwright.ts +++ b/packages/components/nodes/documentloaders/Playwright/Playwright.ts @@ -1,14 +1,15 @@ -import { omit } from 'lodash' -import { ICommonObject, IDocument, INode, INodeData, INodeParams } from '../../../src/Interface' -import { TextSplitter } from 'langchain/text_splitter' import { Browser, Page, PlaywrightWebBaseLoader, PlaywrightWebBaseLoaderOptions } from '@langchain/community/document_loaders/web/playwright' +import { Document } from '@langchain/core/documents' +import { TextSplitter } from 'langchain/text_splitter' import { test } from 'linkifyjs' +import { omit } from 'lodash' import { handleEscapeCharacters, INodeOutputsValue, webCrawl, xmlScrape } from '../../../src' +import { ICommonObject, INode, INodeData, INodeParams } from '../../../src/Interface' class Playwright_DocumentLoaders
implements INode { label: string @@ -113,6 +114,14 @@ class Playwright_DocumentLoaders implements INode { additionalParams: true, description: 'CSS selectors like .div or #div' }, + { + label: 'CSS Selector (Optional)', + name: 'cssSelector', + type: 'string', + description: 'Only content inside this selector will be extracted. Leave empty to use the entire page body.', + optional: true, + additionalParams: true + }, { label: 'Additional Metadata', name: 'metadata', @@ -155,8 +164,14 @@ class Playwright_DocumentLoaders implements INode { const relativeLinksMethod = nodeData.inputs?.relativeLinksMethod as string const selectedLinks = nodeData.inputs?.selectedLinks as string[] let limit = parseInt(nodeData.inputs?.limit as string) - let waitUntilGoToOption = nodeData.inputs?.waitUntilGoToOption as 'load' | 'domcontentloaded' | 'networkidle' | 'commit' | undefined - let waitForSelector = nodeData.inputs?.waitForSelector as string + const waitUntilGoToOption = nodeData.inputs?.waitUntilGoToOption as + | 'load' + | 'domcontentloaded' + | 'networkidle' + | 'commit' + | undefined + const waitForSelector = nodeData.inputs?.waitForSelector as string + const cssSelector = nodeData.inputs?.cssSelector as string const _omitMetadataKeys = nodeData.inputs?.omitMetadataKeys as string const output = nodeData.outputs?.output as string const orgId = options.orgId @@ -172,13 +187,14 @@ class Playwright_DocumentLoaders implements INode { throw new Error('Invalid URL') } - async function playwrightLoader(url: string): Promise { + async function playwrightLoader(url: string): Promise { try { let docs = [] const config: PlaywrightWebBaseLoaderOptions = { launchOptions: { args: ['--no-sandbox'], - headless: true + headless: true, + executablePath: process.env.PLAYWRIGHT_EXECUTABLE_FILE_PATH } } if (waitUntilGoToOption) { @@ -186,12 +202,22 @@ class Playwright_DocumentLoaders implements INode { waitUntil: waitUntilGoToOption } } - if (waitForSelector) { + if (cssSelector || 
waitForSelector) { config['evaluate'] = async (page: Page, _: Browser): Promise => { - await page.waitForSelector(waitForSelector) + if (waitForSelector) { + await page.waitForSelector(waitForSelector) + } - const result = await page.evaluate(() => document.body.innerHTML) - return result + if (cssSelector) { + const selectorHandle = await page.$(cssSelector) + const result = await page.evaluate( + (htmlSelection) => htmlSelection?.innerHTML ?? document.body.innerHTML, + selectorHandle + ) + return result + } else { + return await page.evaluate(() => document.body.innerHTML) + } } } const loader = new PlaywrightWebBaseLoader(url, config) @@ -208,7 +234,7 @@ class Playwright_DocumentLoaders implements INode { } } - let docs: IDocument[] = [] + let docs: Document[] = [] if (relativeLinksMethod) { if (process.env.DEBUG === 'true') options.logger.info(`[${orgId}]: Start PlaywrightWebBaseLoader ${relativeLinksMethod}`) // if limit is 0 we don't want it to default to 10 so we check explicitly for null or undefined @@ -225,7 +251,10 @@ class Playwright_DocumentLoaders implements INode { options.logger.info(`[${orgId}]: PlaywrightWebBaseLoader pages: ${JSON.stringify(pages)}, length: ${pages.length}`) if (!pages || pages.length === 0) throw new Error('No relative links found') for (const page of pages) { - docs.push(...(await playwrightLoader(page))) + const result = await playwrightLoader(page) + if (result) { + docs.push(...result) + } } if (process.env.DEBUG === 'true') options.logger.info(`[${orgId}]: Finish PlaywrightWebBaseLoader ${relativeLinksMethod}`) } else if (selectedLinks && selectedLinks.length > 0) { @@ -234,10 +263,16 @@ class Playwright_DocumentLoaders implements INode { `[${orgId}]: PlaywrightWebBaseLoader pages: ${JSON.stringify(selectedLinks)}, length: ${selectedLinks.length}` ) for (const page of selectedLinks.slice(0, limit)) { - docs.push(...(await playwrightLoader(page))) + const result = await playwrightLoader(page) + if (result) { + 
docs.push(...result) + } } } else { - docs = await playwrightLoader(url) + const result = await playwrightLoader(url) + if (result) { + docs.push(...result) + } } if (metadata) { diff --git a/packages/components/nodes/documentloaders/Puppeteer/Puppeteer.ts b/packages/components/nodes/documentloaders/Puppeteer/Puppeteer.ts index 607a9935..5409ef4f 100644 --- a/packages/components/nodes/documentloaders/Puppeteer/Puppeteer.ts +++ b/packages/components/nodes/documentloaders/Puppeteer/Puppeteer.ts @@ -1,10 +1,11 @@ -import { omit } from 'lodash' -import { ICommonObject, IDocument, INode, INodeData, INodeParams } from '../../../src/Interface' -import { TextSplitter } from 'langchain/text_splitter' import { Browser, Page, PuppeteerWebBaseLoader, PuppeteerWebBaseLoaderOptions } from '@langchain/community/document_loaders/web/puppeteer' +import { Document } from '@langchain/core/documents' +import { TextSplitter } from 'langchain/text_splitter' import { test } from 'linkifyjs' -import { handleEscapeCharacters, INodeOutputsValue, webCrawl, xmlScrape } from '../../../src' +import { omit } from 'lodash' import { PuppeteerLifeCycleEvent } from 'puppeteer' +import { handleEscapeCharacters, INodeOutputsValue, webCrawl, xmlScrape } from '../../../src' +import { ICommonObject, INode, INodeData, INodeParams } from '../../../src/Interface' class Puppeteer_DocumentLoaders implements INode { label: string @@ -109,6 +110,14 @@ class Puppeteer_DocumentLoaders implements INode { additionalParams: true, description: 'CSS selectors like .div or #div' }, + { + label: 'CSS Selector (Optional)', + name: 'cssSelector', + type: 'string', + description: 'Only content inside this selector will be extracted. 
Leave empty to use the entire page body.', + optional: true, + additionalParams: true + }, { label: 'Additional Metadata', name: 'metadata', @@ -151,8 +160,9 @@ class Puppeteer_DocumentLoaders implements INode { const relativeLinksMethod = nodeData.inputs?.relativeLinksMethod as string const selectedLinks = nodeData.inputs?.selectedLinks as string[] let limit = parseInt(nodeData.inputs?.limit as string) - let waitUntilGoToOption = nodeData.inputs?.waitUntilGoToOption as PuppeteerLifeCycleEvent - let waitForSelector = nodeData.inputs?.waitForSelector as string + const waitUntilGoToOption = nodeData.inputs?.waitUntilGoToOption as PuppeteerLifeCycleEvent + const waitForSelector = nodeData.inputs?.waitForSelector as string + const cssSelector = nodeData.inputs?.cssSelector as string const _omitMetadataKeys = nodeData.inputs?.omitMetadataKeys as string const output = nodeData.outputs?.output as string const orgId = options.orgId @@ -168,13 +178,14 @@ class Puppeteer_DocumentLoaders implements INode { throw new Error('Invalid URL') } - async function puppeteerLoader(url: string): Promise { + async function puppeteerLoader(url: string): Promise { try { - let docs = [] + let docs: Document[] = [] const config: PuppeteerWebBaseLoaderOptions = { launchOptions: { args: ['--no-sandbox'], - headless: 'new' + headless: 'new', + executablePath: process.env.PUPPETEER_EXECUTABLE_FILE_PATH } } if (waitUntilGoToOption) { @@ -182,12 +193,22 @@ class Puppeteer_DocumentLoaders implements INode { waitUntil: waitUntilGoToOption } } - if (waitForSelector) { + if (cssSelector || waitForSelector) { config['evaluate'] = async (page: Page, _: Browser): Promise => { - await page.waitForSelector(waitForSelector) + if (waitForSelector) { + await page.waitForSelector(waitForSelector) + } - const result = await page.evaluate(() => document.body.innerHTML) - return result + if (cssSelector) { + const selectorHandle = await page.$(cssSelector) + const result = await page.evaluate( + (htmlSelection) 
=> htmlSelection?.innerHTML ?? document.body.innerHTML, + selectorHandle + ) + return result + } else { + return await page.evaluate(() => document.body.innerHTML) + } } } const loader = new PuppeteerWebBaseLoader(url, config) @@ -204,7 +225,7 @@ class Puppeteer_DocumentLoaders implements INode { } } - let docs: IDocument[] = [] + let docs: Document[] = [] if (relativeLinksMethod) { if (process.env.DEBUG === 'true') options.logger.info(`[${orgId}]: Start PuppeteerWebBaseLoader ${relativeLinksMethod}`) // if limit is 0 we don't want it to default to 10 so we check explicitly for null or undefined @@ -221,7 +242,10 @@ class Puppeteer_DocumentLoaders implements INode { options.logger.info(`[${orgId}]: PuppeteerWebBaseLoader pages: ${JSON.stringify(pages)}, length: ${pages.length}`) if (!pages || pages.length === 0) throw new Error('No relative links found') for (const page of pages) { - docs.push(...(await puppeteerLoader(page))) + const result = await puppeteerLoader(page) + if (result) { + docs.push(...result) + } } if (process.env.DEBUG === 'true') options.logger.info(`[${orgId}]: Finish PuppeteerWebBaseLoader ${relativeLinksMethod}`) } else if (selectedLinks && selectedLinks.length > 0) { @@ -230,10 +254,16 @@ class Puppeteer_DocumentLoaders implements INode { `[${orgId}]: PuppeteerWebBaseLoader pages: ${JSON.stringify(selectedLinks)}, length: ${selectedLinks.length}` ) for (const page of selectedLinks.slice(0, limit)) { - docs.push(...(await puppeteerLoader(page))) + const result = await puppeteerLoader(page) + if (result) { + docs.push(...result) + } } } else { - docs = await puppeteerLoader(url) + const result = await puppeteerLoader(url) + if (result) { + docs.push(...result) + } } if (metadata) { diff --git a/packages/server/.env.example b/packages/server/.env.example index 22713805..fe47880b 100644 --- a/packages/server/.env.example +++ b/packages/server/.env.example @@ -169,4 +169,12 @@ JWT_REFRESH_TOKEN_EXPIRY_IN_MINUTES=43200 
############################################## SECURITY #################################################### ############################################################################################################ -# HTTP_DENY_LIST= \ No newline at end of file +# HTTP_DENY_LIST= + + +############################################################################################################ +########################################### DOCUMENT LOADERS ############################################### +############################################################################################################ + +# PUPPETEER_EXECUTABLE_FILE_PATH='C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe' +# PLAYWRIGHT_EXECUTABLE_FILE_PATH='C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe' \ No newline at end of file diff --git a/packages/ui/src/ui-component/dialog/ManageScrapedLinksDialog.jsx b/packages/ui/src/ui-component/dialog/ManageScrapedLinksDialog.jsx index 788c1998..55d2a00c 100644 --- a/packages/ui/src/ui-component/dialog/ManageScrapedLinksDialog.jsx +++ b/packages/ui/src/ui-component/dialog/ManageScrapedLinksDialog.jsx @@ -1,7 +1,7 @@ import PropTypes from 'prop-types' +import { useEffect, useState } from 'react' import { createPortal } from 'react-dom' import { useDispatch } from 'react-redux' -import { useState, useEffect } from 'react' import { Box, @@ -16,11 +16,11 @@ import { Stack, Typography } from '@mui/material' -import { IconEraser, IconTrash, IconX } from '@tabler/icons-react' +import { IconEraser, IconPlus, IconTrash, IconX } from '@tabler/icons-react' import PerfectScrollbar from 'react-perfect-scrollbar' -import { BackdropLoader } from '@/ui-component/loading/BackdropLoader' import { StyledButton } from '@/ui-component/button/StyledButton' +import { BackdropLoader } from '@/ui-component/loading/BackdropLoader' import scraperApi from '@/api/scraper' @@ -29,8 +29,8 @@ import useNotifier from '@/utils/useNotifier' import { 
HIDE_CANVAS_DIALOG, SHOW_CANVAS_DIALOG, - enqueueSnackbar as enqueueSnackbarAction, - closeSnackbar as closeSnackbarAction + closeSnackbar as closeSnackbarAction, + enqueueSnackbar as enqueueSnackbarAction } from '@/store/actions' const ManageScrapedLinksDialog = ({ show, dialogProps, onCancel, onSave }) => { @@ -112,6 +112,10 @@ const ManageScrapedLinksDialog = ({ show, dialogProps, onCancel, onSave }) => { setSelectedLinks(links) } + const handleAddLink = () => { + setSelectedLinks([...selectedLinks, '']) + } + const handleRemoveAllLinks = () => { setSelectedLinks([]) } @@ -160,6 +164,16 @@ const ManageScrapedLinksDialog = ({ show, dialogProps, onCancel, onSave }) => { Scraped Links + + handleAddLink()} + > + + + {selectedLinks.length > 0 ? (