From c18e98761af3dd908df5ae6969289c3dc331bf28 Mon Sep 17 00:00:00 2001 From: chungyau97 Date: Wed, 5 Jul 2023 17:07:45 +0800 Subject: [PATCH] modify puppeteer web crawl --- .../documentloaders/Puppeteer/Puppeteer.ts | 53 ++++++++++--------- packages/components/src/utils.ts | 2 +- 2 files changed, 29 insertions(+), 26 deletions(-) diff --git a/packages/components/nodes/documentloaders/Puppeteer/Puppeteer.ts b/packages/components/nodes/documentloaders/Puppeteer/Puppeteer.ts index 1331c736..3f27dc03 100644 --- a/packages/components/nodes/documentloaders/Puppeteer/Puppeteer.ts +++ b/packages/components/nodes/documentloaders/Puppeteer/Puppeteer.ts @@ -2,7 +2,7 @@ import { INode, INodeData, INodeParams } from '../../../src/Interface' import { TextSplitter } from 'langchain/text_splitter' import { PuppeteerWebBaseLoader } from 'langchain/document_loaders/web/puppeteer' import { test } from 'linkifyjs' -import { getAvailableURLs } from '../../../src' +import { webCrawl } from '../../../src' class Puppeteer_DocumentLoaders implements INode { label: string @@ -35,19 +35,20 @@ class Puppeteer_DocumentLoaders implements INode { optional: true }, { - label: 'Web Scrape for Relative Links', - name: 'webScrape', + label: 'Web Crawl for Relative Links', + name: 'boolWebCrawl', type: 'boolean', optional: true, additionalParams: true }, { - label: 'Web Scrape Links Limit', + label: 'Web Crawl Links Limit', name: 'limit', type: 'number', default: 10, optional: true, - additionalParams: true + additionalParams: true, + description: 'Set 0 to crawl all relative links' }, { label: 'Metadata', @@ -62,7 +63,7 @@ class Puppeteer_DocumentLoaders implements INode { async init(nodeData: INodeData): Promise { const textSplitter = nodeData.inputs?.textSplitter as TextSplitter const metadata = nodeData.inputs?.metadata - const webScrape = nodeData.inputs?.webScrape as boolean + const boolWebCrawl = nodeData.inputs?.boolWebCrawl as boolean let limit = nodeData.inputs?.limit as string let url = nodeData.inputs?.url as string @@ -71,30 +72,32 @@ class Puppeteer_DocumentLoaders implements INode { throw new Error('Invalid URL') } - const puppeteerLoader = async (url: string): Promise => { - let docs = [] - const loader = new PuppeteerWebBaseLoader(url) - if (textSplitter) { - docs = await loader.loadAndSplit(textSplitter) - } else { - docs = await loader.load() + async function puppeteerLoader(url: string): Promise { + try { + let docs = [] + const loader = new PuppeteerWebBaseLoader(url) + if (textSplitter) { + docs = await loader.loadAndSplit(textSplitter) + } else { + docs = await loader.load() + } + return docs + } catch (err) { + if (process.env.DEBUG === 'true') console.error(`error in CheerioWebBaseLoader: ${err.message}, on page: ${url}`) } - return docs } - let availableUrls: string[] let docs = [] - if (webScrape) { - if (!limit) limit = '10' - availableUrls = await getAvailableURLs(url, parseInt(limit)) - for (let i = 0; i < availableUrls.length; i++) { - try { - docs.push(...(await puppeteerLoader(availableUrls[i]))) - } catch (error) { - console.error('Error loading url with puppeteer. URL: ', availableUrls[i], 'Error: ', error) - continue - } + if (boolWebCrawl) { + if (process.env.DEBUG === 'true') console.info('Start Web Crawl') + if (!limit) throw new Error('Please set a limit to crawl') + else if (parseInt(limit) < 0) throw new Error('Limit cannot be less than 0') + const pages: string[] = await webCrawl(url, parseInt(limit)) + if (process.env.DEBUG === 'true') console.info(`pages: ${JSON.stringify(pages)}, length: ${pages.length}`) + for (const page of pages) { + docs.push(...(await puppeteerLoader(page))) } + if (process.env.DEBUG === 'true') console.info('Finish Web Crawl') } else { docs = await puppeteerLoader(url) } diff --git a/packages/components/src/utils.ts b/packages/components/src/utils.ts index 63bb3969..d99517f0 100644 --- a/packages/components/src/utils.ts +++ b/packages/components/src/utils.ts @@ -242,7 +242,7 @@ export async function crawl(baseURL: string, currentURL: string, pages: string[] const baseURLObj = new URL(baseURL) const currentURLObj = new URL(currentURL) - if (limit !== 0) if (pages.length === limit) return pages + if (limit !== 0 && pages.length === limit) return pages if (baseURLObj.hostname !== currentURLObj.hostname) return pages