diff --git a/packages/components/nodes/documentloaders/Cheerio/Cheerio.ts b/packages/components/nodes/documentloaders/Cheerio/Cheerio.ts index 4f4f1841..20096ec6 100644 --- a/packages/components/nodes/documentloaders/Cheerio/Cheerio.ts +++ b/packages/components/nodes/documentloaders/Cheerio/Cheerio.ts @@ -1,7 +1,7 @@ import { INode, INodeData, INodeParams } from '../../../src/Interface' import { TextSplitter } from 'langchain/text_splitter' import { CheerioWebBaseLoader } from 'langchain/document_loaders/web/cheerio' - +import { test } from 'linkifyjs' class Cheerio_DocumentLoaders implements INode { label: string name: string @@ -47,18 +47,12 @@ class Cheerio_DocumentLoaders implements INode { const metadata = nodeData.inputs?.metadata let url = nodeData.inputs?.url as string + url = url.trim() + if (!test(url)) { + throw new Error('Invalid URL') + } - var urlPattern = new RegExp( - '^(https?:\\/\\/)?' + // validate protocol - '((([a-z\\d]([a-z\\d-]*[a-z\\d])*)\\.)+[a-z]{2,}|' + // validate domain name - '((\\d{1,3}\\.){3}\\d{1,3}))' + // validate OR ip (v4) address - '(\\:\\d+)?(\\/[-a-z\\d%_.~+]*)*' + // validate port and path - '(\\?[;&a-z\\d%_.~+=-]*)?' + // validate query string - '(\\#[-a-z\\d_]*)?$', - 'i' - ) // validate fragment locator - - const loader = new CheerioWebBaseLoader(urlPattern.test(url.trim()) ? url.trim() : '') + const loader = new CheerioWebBaseLoader(url) let docs = [] if (textSplitter) { diff --git a/packages/components/package.json b/packages/components/package.json index 0e13c4fb..b4c56155 100644 --- a/packages/components/package.json +++ b/packages/components/package.json @@ -30,6 +30,7 @@ "form-data": "^4.0.0", "graphql": "^16.6.0", "langchain": "^0.0.73", + "linkifyjs": "^4.1.1", "mammoth": "^1.5.1", "moment": "^2.29.3", "node-fetch": "2",