Merge pull request #107 from FlowiseAI/bugfix/UrlParseError

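bugfix/UrlParseError: validate the Cheerio document loader's URL with `test` from linkifyjs and throw 'Invalid URL' on failure, replacing the hand-written URL regular expression; adds linkifyjs ^4.1.1 as a dependency.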
This commit is contained in:
Henry Heng
2023-05-15 10:13:34 +01:00
committed by GitHub
2 changed files with 7 additions and 12 deletions
@@ -1,7 +1,7 @@
import { INode, INodeData, INodeParams } from '../../../src/Interface' import { INode, INodeData, INodeParams } from '../../../src/Interface'
import { TextSplitter } from 'langchain/text_splitter' import { TextSplitter } from 'langchain/text_splitter'
import { CheerioWebBaseLoader } from 'langchain/document_loaders/web/cheerio' import { CheerioWebBaseLoader } from 'langchain/document_loaders/web/cheerio'
import { test } from 'linkifyjs'
class Cheerio_DocumentLoaders implements INode { class Cheerio_DocumentLoaders implements INode {
label: string label: string
name: string name: string
@@ -47,18 +47,12 @@ class Cheerio_DocumentLoaders implements INode {
const metadata = nodeData.inputs?.metadata const metadata = nodeData.inputs?.metadata
let url = nodeData.inputs?.url as string let url = nodeData.inputs?.url as string
url = url.trim()
if (!test(url)) {
throw new Error('Invalid URL')
}
var urlPattern = new RegExp( const loader = new CheerioWebBaseLoader(url)
'^(https?:\\/\\/)?' + // validate protocol
'((([a-z\\d]([a-z\\d-]*[a-z\\d])*)\\.)+[a-z]{2,}|' + // validate domain name
'((\\d{1,3}\\.){3}\\d{1,3}))' + // validate OR ip (v4) address
'(\\:\\d+)?(\\/[-a-z\\d%_.~+]*)*' + // validate port and path
'(\\?[;&a-z\\d%_.~+=-]*)?' + // validate query string
'(\\#[-a-z\\d_]*)?$',
'i'
) // validate fragment locator
const loader = new CheerioWebBaseLoader(urlPattern.test(url.trim()) ? url.trim() : '')
let docs = [] let docs = []
if (textSplitter) { if (textSplitter) {
+1
View File
@@ -30,6 +30,7 @@
"form-data": "^4.0.0", "form-data": "^4.0.0",
"graphql": "^16.6.0", "graphql": "^16.6.0",
"langchain": "^0.0.73", "langchain": "^0.0.73",
"linkifyjs": "^4.1.1",
"mammoth": "^1.5.1", "mammoth": "^1.5.1",
"moment": "^2.29.3", "moment": "^2.29.3",
"node-fetch": "2", "node-fetch": "2",