mirror of
https://github.com/farcasclaudiu/Flowise.git
synced 2026-06-28 23:01:09 +03:00
add function desc
This commit is contained in:
@@ -202,6 +202,9 @@ export const getAvailableURLs = async (url: string, limit: number) => {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Search the htmlBody string for href links
|
||||||
|
*/
|
||||||
function getURLsFromHTML(htmlBody: string, baseURL: string): string[] {
|
function getURLsFromHTML(htmlBody: string, baseURL: string): string[] {
|
||||||
const dom = new JSDOM(htmlBody)
|
const dom = new JSDOM(htmlBody)
|
||||||
const linkElements = dom.window.document.querySelectorAll('a')
|
const linkElements = dom.window.document.querySelectorAll('a')
|
||||||
@@ -228,6 +231,9 @@ function getURLsFromHTML(htmlBody: string, baseURL: string): string[] {
|
|||||||
return urls
|
return urls
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Normalize the URL so that the same page is not crawled more than once
|
||||||
|
*/
|
||||||
function normalizeURL(urlString: string): string {
|
function normalizeURL(urlString: string): string {
|
||||||
const urlObj = new URL(urlString)
|
const urlObj = new URL(urlString)
|
||||||
const hostPath = urlObj.hostname + urlObj.pathname
|
const hostPath = urlObj.hostname + urlObj.pathname
|
||||||
@@ -238,7 +244,10 @@ function normalizeURL(urlString: string): string {
|
|||||||
return hostPath
|
return hostPath
|
||||||
}
|
}
|
||||||
|
|
||||||
export async function crawl(baseURL: string, currentURL: string, pages: string[], limit: number): Promise<string[]> {
|
/**
|
||||||
|
* Recursive crawl using normalizeURL and getURLsFromHTML
|
||||||
|
*/
|
||||||
|
async function crawl(baseURL: string, currentURL: string, pages: string[], limit: number): Promise<string[]> {
|
||||||
const baseURLObj = new URL(baseURL)
|
const baseURLObj = new URL(baseURL)
|
||||||
const currentURLObj = new URL(currentURL)
|
const currentURLObj = new URL(currentURL)
|
||||||
|
|
||||||
@@ -279,6 +288,9 @@ export async function crawl(baseURL: string, currentURL: string, pages: string[]
|
|||||||
return pages
|
return pages
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Prep the URL before passing it into the recursive crawl function
|
||||||
|
*/
|
||||||
export async function webCrawl(stringURL: string, limit: number): Promise<string[]> {
|
export async function webCrawl(stringURL: string, limit: number): Promise<string[]> {
|
||||||
const URLObj = new URL(stringURL)
|
const URLObj = new URL(stringURL)
|
||||||
const modifyURL = stringURL.slice(-1) === '/' ? stringURL.slice(0, -1) : stringURL
|
const modifyURL = stringURL.slice(-1) === '/' ? stringURL.slice(0, -1) : stringURL
|
||||||
|
|||||||
Reference in New Issue
Block a user