mirror of
https://github.com/farcasclaudiu/Flowise.git
synced 2026-06-28 13:00:56 +03:00
Merge branch 'main' into bugfix/MultipleDocuments
# Conflicts: # packages/components/src/utils.ts
This commit is contained in:
@@ -1,3 +1,5 @@
|
||||
import axios from 'axios'
|
||||
import { load } from 'cheerio'
|
||||
import * as fs from 'fs'
|
||||
import * as path from 'path'
|
||||
|
||||
@@ -149,3 +151,42 @@ export const getInputVariables = (paramValue: string): string[] => {
|
||||
}
|
||||
return inputVariables
|
||||
}
|
||||
|
||||
/**
 * Fetches `url` and collects the absolute URLs of the links found on that page.
 *
 * The returned list always starts with `url` itself, followed by up to `limit`
 * unique URLs resolved from anchors whose `href` begins with `/`. Only the page
 * at `url` is fetched; the discovered links are not crawled further.
 *
 * @param url - Page to fetch; also used as the base when resolving each href.
 * @param limit - Maximum number of discovered links to return in addition to `url`.
 * @returns `url` followed by the unique absolute URLs discovered, in document order.
 * @throws Error prefixed with `getAvailableURLs:` when the request, the HTML
 * parsing, or the URL resolution fails.
 */
export const getAvailableURLs = async (url: string, limit: number): Promise<string[]> => {
    try {
        const availableUrls: string[] = []

        console.info(`Crawling: ${url}`)
        availableUrls.push(url)

        const response = await axios.get(url)
        const $ = load(response.data)

        // NOTE(review): this selector also matches protocol-relative hrefs ("//host/path"),
        // which resolve to a different host - confirm whether that is intended.
        const relativeLinks = $("a[href^='/']")
        console.info(`Available Relative Links: ${relativeLinks.length}`)
        if (relativeLinks.length === 0) return availableUrls

        // The start URL occupies the first slot, so the list may hold `limit` more
        // entries, and never more than one entry per anchor found on the page.
        const maxUrls = Math.min(limit + 1, relativeLinks.length + 1)
        console.info(`True Limit: ${maxUrls}`)

        // Tracks what has already been collected so duplicates are skipped cheaply.
        const seen = new Set<string>(availableUrls)

        // Scan every anchor until the cap is reached. Duplicates and empty hrefs do
        // not consume a slot, so the scan continues past them instead of stopping early.
        for (let i = 0; i < relativeLinks.length && availableUrls.length < maxUrls; i++) {
            console.info(`index: ${i}`)

            const relativeUrl = $(relativeLinks[i]).attr('href')
            if (!relativeUrl) continue

            const absoluteUrl = new URL(relativeUrl, url).toString()
            if (!seen.has(absoluteUrl)) {
                seen.add(absoluteUrl)
                availableUrls.push(absoluteUrl)
                console.info(`Found unique relative link: ${absoluteUrl}`)
            }
        }

        return availableUrls
    } catch (err: unknown) {
        // A catch variable is `unknown` under strict settings; narrow before reading `.message`.
        const message = err instanceof Error ? err.message : String(err)
        throw new Error(`getAvailableURLs: ${message}`)
    }
}
|
||||
|
||||
Reference in New Issue
Block a user