From 6bd44e2d47581849024d4f829bd4e2a361199d69 Mon Sep 17 00:00:00 2001 From: chungyau97 Date: Fri, 7 Jul 2023 00:10:12 +0800 Subject: [PATCH] cheerio add xml scraper --- .../nodes/documentloaders/Cheerio/Cheerio.ts | 36 +++++++++++------ packages/components/src/utils.ts | 39 +++++++++++++++++++ 2 files changed, 63 insertions(+), 12 deletions(-) diff --git a/packages/components/nodes/documentloaders/Cheerio/Cheerio.ts b/packages/components/nodes/documentloaders/Cheerio/Cheerio.ts index 10eff77e..2106b86f 100644 --- a/packages/components/nodes/documentloaders/Cheerio/Cheerio.ts +++ b/packages/components/nodes/documentloaders/Cheerio/Cheerio.ts @@ -2,7 +2,7 @@ import { INode, INodeData, INodeParams } from '../../../src/Interface' import { TextSplitter } from 'langchain/text_splitter' import { CheerioWebBaseLoader } from 'langchain/document_loaders/web/cheerio' import { test } from 'linkifyjs' -import { webCrawl } from '../../../src' +import { webCrawl, xmlScrape } from '../../../src' class Cheerio_DocumentLoaders implements INode { label: string @@ -35,20 +35,30 @@ class Cheerio_DocumentLoaders implements INode { optional: true }, { - label: 'Web Crawl for Relative Links', - name: 'boolWebCrawl', - type: 'boolean', + label: 'Get Relative Links Method', + name: 'relativeLinksMethod', + type: 'options', + options: [ + { + label: 'Web Crawl', + name: 'webCrawl' + }, + { + label: 'Scrape XML Sitemap', + name: 'scrapeXMLSitemap' + } + ], optional: true, additionalParams: true }, { - label: 'Web Crawl Links Limit', + label: 'Crawl/Scrape Links Limit', name: 'limit', type: 'number', default: 10, optional: true, additionalParams: true, - description: 'Set 0 to crawl all relative links' + description: 'Set 0 to crawl/scrape all relative links' }, { label: 'Metadata', @@ -63,7 +73,7 @@ class Cheerio_DocumentLoaders implements INode { async init(nodeData: INodeData): Promise { const textSplitter = nodeData.inputs?.textSplitter as TextSplitter const metadata = nodeData.inputs?.metadata - const boolWebCrawl = nodeData.inputs?.boolWebCrawl as boolean + const relativeLinksMethod = nodeData.inputs?.relativeLinksMethod as string let limit = nodeData.inputs?.limit as string let url = nodeData.inputs?.url as string @@ -88,16 +98,18 @@ class Cheerio_DocumentLoaders implements INode { } let docs = [] - if (boolWebCrawl) { - if (process.env.DEBUG === 'true') console.info('Start Web Crawl') - if (!limit) throw new Error('Please set a limit to crawl') + if (relativeLinksMethod) { + if (process.env.DEBUG === 'true') console.info(`Start ${relativeLinksMethod}`) + if (!limit) throw new Error('Please set a limit to crawl/scrape') else if (parseInt(limit) < 0) throw new Error('Limit cannot be less than 0') - const pages: string[] = await webCrawl(url, parseInt(limit)) + const pages: string[] = + relativeLinksMethod === 'webCrawl' ? await webCrawl(url, parseInt(limit)) : await xmlScrape(url, parseInt(limit)) if (process.env.DEBUG === 'true') console.info(`pages: ${JSON.stringify(pages)}, length: ${pages.length}`) + if (!pages || pages.length === 0) throw new Error('No relative links found') for (const page of pages) { docs.push(...(await cheerioLoader(page))) } - if (process.env.DEBUG === 'true') console.info('Finish Web Crawl') + if (process.env.DEBUG === 'true') console.info(`Finish ${relativeLinksMethod}`) } else { docs = await cheerioLoader(url) } diff --git a/packages/components/src/utils.ts b/packages/components/src/utils.ts index 39d7e333..a5f69ad9 100644 --- a/packages/components/src/utils.ts +++ b/packages/components/src/utils.ts @@ -297,6 +297,45 @@ export async function webCrawl(stringURL: string, limit: number): Promise { + let urls: string[] = [] + if (process.env.DEBUG === 'true') console.info(`actively scarping ${currentURL}`) + try { + const resp = await fetch(currentURL) + + if (resp.status > 399) { + if (process.env.DEBUG === 'true') console.error(`error in fetch with status code: ${resp.status}, on page: ${currentURL}`) + return urls + } + + const contentType: string | null = resp.headers.get('content-type') + if ((contentType && !contentType.includes('application/xml')) || !contentType) { + if (process.env.DEBUG === 'true') console.error(`non xml response, content type: ${contentType}, on page: ${currentURL}`) + return urls + } + + const xmlBody = await resp.text() + urls = getURLsFromXML(xmlBody, limit) + } catch (err) { + if (process.env.DEBUG === 'true') console.error(`error in fetch url: ${err.message}, on page: ${currentURL}`) + } + return urls +} + /** * Custom chain handler class */