add function desc

This commit is contained in:
chungyau97
2023-07-05 17:28:59 +08:00
parent 607d4a3394
commit af67d70cfc
+13 -1
View File
@@ -202,6 +202,9 @@ export const getAvailableURLs = async (url: string, limit: number) => {
} }
} }
/**
* Collect the href of each anchor (<a>) element found in the htmlBody string
*/
function getURLsFromHTML(htmlBody: string, baseURL: string): string[] { function getURLsFromHTML(htmlBody: string, baseURL: string): string[] {
const dom = new JSDOM(htmlBody) const dom = new JSDOM(htmlBody)
const linkElements = dom.window.document.querySelectorAll('a') const linkElements = dom.window.document.querySelectorAll('a')
@@ -228,6 +231,9 @@ function getURLsFromHTML(htmlBody: string, baseURL: string): string[] {
return urls return urls
} }
/**
* Normalize URL to prevent crawling the same page
*/
function normalizeURL(urlString: string): string { function normalizeURL(urlString: string): string {
const urlObj = new URL(urlString) const urlObj = new URL(urlString)
const hostPath = urlObj.hostname + urlObj.pathname const hostPath = urlObj.hostname + urlObj.pathname
@@ -238,7 +244,10 @@ function normalizeURL(urlString: string): string {
return hostPath return hostPath
} }
export async function crawl(baseURL: string, currentURL: string, pages: string[], limit: number): Promise<string[]> { /**
* Recursive crawl using normalizeURL and getURLsFromHTML
*/
async function crawl(baseURL: string, currentURL: string, pages: string[], limit: number): Promise<string[]> {
const baseURLObj = new URL(baseURL) const baseURLObj = new URL(baseURL)
const currentURLObj = new URL(currentURL) const currentURLObj = new URL(currentURL)
@@ -279,6 +288,9 @@ export async function crawl(baseURL: string, currentURL: string, pages: string[]
return pages return pages
} }
/**
* Prep URL before passing into recursive crawl function
*/
export async function webCrawl(stringURL: string, limit: number): Promise<string[]> { export async function webCrawl(stringURL: string, limit: number): Promise<string[]> {
const URLObj = new URL(stringURL) const URLObj = new URL(stringURL)
const modifyURL = stringURL.slice(-1) === '/' ? stringURL.slice(0, -1) : stringURL const modifyURL = stringURL.slice(-1) === '/' ? stringURL.slice(0, -1) : stringURL