From bfa26a72c4fd6b373aea58a6f5e88a8842367685 Mon Sep 17 00:00:00 2001 From: Ilango Date: Fri, 19 Jan 2024 14:25:04 +0530 Subject: [PATCH] Use selected links if available when scraping in cheerio, puppeteer, and playwright nodes --- .../nodes/documentloaders/Cheerio/Cheerio.ts | 12 +++++++++++- .../nodes/documentloaders/Playwright/Playwright.ts | 12 +++++++++++- .../nodes/documentloaders/Puppeteer/Puppeteer.ts | 12 +++++++++++- 3 files changed, 33 insertions(+), 3 deletions(-) diff --git a/packages/components/nodes/documentloaders/Cheerio/Cheerio.ts b/packages/components/nodes/documentloaders/Cheerio/Cheerio.ts index aa899bcb..e883c097 100644 --- a/packages/components/nodes/documentloaders/Cheerio/Cheerio.ts +++ b/packages/components/nodes/documentloaders/Cheerio/Cheerio.ts @@ -90,6 +90,7 @@ class Cheerio_DocumentLoaders implements INode { const textSplitter = nodeData.inputs?.textSplitter as TextSplitter const metadata = nodeData.inputs?.metadata const relativeLinksMethod = nodeData.inputs?.relativeLinksMethod as string + const selectedLinks = nodeData.inputs?.selectedLinks as string[] let limit = nodeData.inputs?.limit as string let url = nodeData.inputs?.url as string @@ -127,13 +128,22 @@ class Cheerio_DocumentLoaders implements INode { if (!limit) limit = '10' else if (parseInt(limit) < 0) throw new Error('Limit cannot be less than 0') const pages: string[] = - relativeLinksMethod === 'webCrawl' ? await webCrawl(url, parseInt(limit)) : await xmlScrape(url, parseInt(limit)) + selectedLinks && selectedLinks.length > 0 + ? selectedLinks.slice(0, parseInt(limit)) + : relativeLinksMethod === 'webCrawl' + ? await webCrawl(url, parseInt(limit)) + : await xmlScrape(url, parseInt(limit)) if (process.env.DEBUG === 'true') console.info(`pages: ${JSON.stringify(pages)}, length: ${pages.length}`) if (!pages || pages.length === 0) throw new Error('No relative links found') for (const page of pages) { docs.push(...(await cheerioLoader(page))) } if (process.env.DEBUG === 'true') console.info(`Finish ${relativeLinksMethod}`) + } else if (selectedLinks && selectedLinks.length > 0) { + if (process.env.DEBUG === 'true') console.info(`pages: ${JSON.stringify(selectedLinks)}, length: ${selectedLinks.length}`) + for (const page of selectedLinks) { + docs.push(...(await cheerioLoader(page))) + } } else { docs = await cheerioLoader(url) } diff --git a/packages/components/nodes/documentloaders/Playwright/Playwright.ts b/packages/components/nodes/documentloaders/Playwright/Playwright.ts index eb246045..65be3ce7 100644 --- a/packages/components/nodes/documentloaders/Playwright/Playwright.ts +++ b/packages/components/nodes/documentloaders/Playwright/Playwright.ts @@ -118,6 +118,7 @@ class Playwright_DocumentLoaders implements INode { const textSplitter = nodeData.inputs?.textSplitter as TextSplitter const metadata = nodeData.inputs?.metadata const relativeLinksMethod = nodeData.inputs?.relativeLinksMethod as string + const selectedLinks = nodeData.inputs?.selectedLinks as string[] let limit = nodeData.inputs?.limit as string let waitUntilGoToOption = nodeData.inputs?.waitUntilGoToOption as 'load' | 'domcontentloaded' | 'networkidle' | 'commit' | undefined let waitForSelector = nodeData.inputs?.waitForSelector as string @@ -168,13 +169,22 @@ class Playwright_DocumentLoaders implements INode { if (!limit) limit = '10' else if (parseInt(limit) < 0) throw new Error('Limit cannot be less than 0') const pages: string[] = - relativeLinksMethod === 'webCrawl' ? await webCrawl(url, parseInt(limit)) : await xmlScrape(url, parseInt(limit)) + selectedLinks && selectedLinks.length > 0 + ? selectedLinks.slice(0, parseInt(limit)) + : relativeLinksMethod === 'webCrawl' + ? await webCrawl(url, parseInt(limit)) + : await xmlScrape(url, parseInt(limit)) if (process.env.DEBUG === 'true') console.info(`pages: ${JSON.stringify(pages)}, length: ${pages.length}`) if (!pages || pages.length === 0) throw new Error('No relative links found') for (const page of pages) { docs.push(...(await playwrightLoader(page))) } if (process.env.DEBUG === 'true') console.info(`Finish ${relativeLinksMethod}`) + } else if (selectedLinks && selectedLinks.length > 0) { + if (process.env.DEBUG === 'true') console.info(`pages: ${JSON.stringify(selectedLinks)}, length: ${selectedLinks.length}`) + for (const page of selectedLinks) { + docs.push(...(await playwrightLoader(page))) + } } else { docs = await playwrightLoader(url) } diff --git a/packages/components/nodes/documentloaders/Puppeteer/Puppeteer.ts b/packages/components/nodes/documentloaders/Puppeteer/Puppeteer.ts index 4691eb94..d5539659 100644 --- a/packages/components/nodes/documentloaders/Puppeteer/Puppeteer.ts +++ b/packages/components/nodes/documentloaders/Puppeteer/Puppeteer.ts @@ -119,6 +119,7 @@ class Puppeteer_DocumentLoaders implements INode { const textSplitter = nodeData.inputs?.textSplitter as TextSplitter const metadata = nodeData.inputs?.metadata const relativeLinksMethod = nodeData.inputs?.relativeLinksMethod as string + const selectedLinks = nodeData.inputs?.selectedLinks as string[] let limit = nodeData.inputs?.limit as string let waitUntilGoToOption = nodeData.inputs?.waitUntilGoToOption as PuppeteerLifeCycleEvent let waitForSelector = nodeData.inputs?.waitForSelector as string @@ -169,13 +170,22 @@ class Puppeteer_DocumentLoaders implements INode { if (!limit) limit = '10' else if (parseInt(limit) < 0) throw new Error('Limit cannot be less than 0') const pages: string[] = - relativeLinksMethod === 'webCrawl' ? await webCrawl(url, parseInt(limit)) : await xmlScrape(url, parseInt(limit)) + selectedLinks && selectedLinks.length > 0 + ? selectedLinks.slice(0, parseInt(limit)) + : relativeLinksMethod === 'webCrawl' + ? await webCrawl(url, parseInt(limit)) + : await xmlScrape(url, parseInt(limit)) if (process.env.DEBUG === 'true') console.info(`pages: ${JSON.stringify(pages)}, length: ${pages.length}`) if (!pages || pages.length === 0) throw new Error('No relative links found') for (const page of pages) { docs.push(...(await puppeteerLoader(page))) } if (process.env.DEBUG === 'true') console.info(`Finish ${relativeLinksMethod}`) + } else if (selectedLinks && selectedLinks.length > 0) { + if (process.env.DEBUG === 'true') console.info(`pages: ${JSON.stringify(selectedLinks)}, length: ${selectedLinks.length}`) + for (const page of selectedLinks) { + docs.push(...(await puppeteerLoader(page))) + } } else { docs = await puppeteerLoader(url) }