From af67d70cfcd0514e3462070a531d539630827d12 Mon Sep 17 00:00:00 2001 From: chungyau97 Date: Wed, 5 Jul 2023 17:28:59 +0800 Subject: [PATCH] add function desc --- packages/components/src/utils.ts | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/packages/components/src/utils.ts b/packages/components/src/utils.ts index d99517f0..39d7e333 100644 --- a/packages/components/src/utils.ts +++ b/packages/components/src/utils.ts @@ -202,6 +202,9 @@ export const getAvailableURLs = async (url: string, limit: number) => { } } +/** + * Search for href through htmlBody string + */ function getURLsFromHTML(htmlBody: string, baseURL: string): string[] { const dom = new JSDOM(htmlBody) const linkElements = dom.window.document.querySelectorAll('a') @@ -228,6 +231,9 @@ function getURLsFromHTML(htmlBody: string, baseURL: string): string[] { return urls } +/** + * Normalize URL to prevent crawling the same page + */ function normalizeURL(urlString: string): string { const urlObj = new URL(urlString) const hostPath = urlObj.hostname + urlObj.pathname @@ -238,7 +244,10 @@ function normalizeURL(urlString: string): string { return hostPath } -export async function crawl(baseURL: string, currentURL: string, pages: string[], limit: number): Promise { +/** + * Recursive crawl using normalizeURL and getURLsFromHTML + */ +async function crawl(baseURL: string, currentURL: string, pages: string[], limit: number): Promise { const baseURLObj = new URL(baseURL) const currentURLObj = new URL(currentURL) @@ -279,6 +288,9 @@ export async function crawl(baseURL: string, currentURL: string, pages: string[] return pages } +/** + * Prep URL before passing into recursive crawl function + */ export async function webCrawl(stringURL: string, limit: number): Promise { const URLObj = new URL(stringURL) const modifyURL = stringURL.slice(-1) === '/' ? stringURL.slice(0, -1) : stringURL