From bf7a7f8a96dc4bd4ad28887a1da8420a08c9b419 Mon Sep 17 00:00:00 2001 From: toi500 <138339291+toi500@users.noreply.github.com> Date: Fri, 9 Aug 2024 12:35:18 +0200 Subject: [PATCH] Spider Search / Scrape as a Tool (#2972) adding tools Co-authored-by: toi500 --- .../server/marketplaces/tools/Spider Web Scraper.json | 8 ++++++++ .../marketplaces/tools/Spider Web Search & Scrape.json | 8 ++++++++ 2 files changed, 16 insertions(+) create mode 100644 packages/server/marketplaces/tools/Spider Web Scraper.json create mode 100644 packages/server/marketplaces/tools/Spider Web Search & Scrape.json diff --git a/packages/server/marketplaces/tools/Spider Web Scraper.json b/packages/server/marketplaces/tools/Spider Web Scraper.json new file mode 100644 index 00000000..06ad3d23 --- /dev/null +++ b/packages/server/marketplaces/tools/Spider Web Scraper.json @@ -0,0 +1,8 @@ +{ + "name": "webpage_scraper", + "description": "This tool is useful for extracting up-to-date information (text) from web pages, making it ideal for gathering data for analysis. If the user provides multiple URLs, process each one separately and then synthesize the extracted information into a single, comprehensive response. Make sure to add the HTTP protocol (https://) to website URLs if the user forgets to do so.\n\nImportant: The webpage_scraper function retrieves the raw text content of any webpage. 
It does not provide any structural information like headings, paragraphs, or specific elements.", + "color": "linear-gradient(rgb(75,205,223), rgb(4,90,12))", + "iconSrc": "https://raw.githubusercontent.com/FlowiseAI/Flowise/main/packages/components/nodes/documentloaders/Spider/spider.svg", + "schema": "[{\"id\":0,\"property\":\"url\",\"description\":\"This is the URL provided by the user\",\"type\":\"string\",\"required\":true}]", + "func": "const fetch = require('node-fetch');\nconst targetUrl = $url;\nconst data = {\n \"depth\": 1,\n \"limit\": 1,\n \"proxy_enabled\": true,\n \"anti_bot\": true,\n \"request\": \"smart\",\n \"return_format\": \"text\",\n \"cache\": true,\n \"store_data\":true,\n \"url\": `${targetUrl}`\n};\n\nconst url = 'https://api.spider.cloud/crawl';\n\ntry {\n const response = await fetch(url, {\n method: 'POST',\n headers: {\n 'Authorization': `Bearer SPIDER_API_KEY`,\n 'Content-Type': 'application/json'\n },\n body: JSON.stringify(data)\n });\n if (!response.ok) {\n console.error('Network response was not ok:', response.statusText);\n return `Error: ${response.statusText}`; \n }\n const text = await response.text(); \n return text; \n} catch (error) {\n console.error(error);\n return ''; \n}\n\n/*\n * Works well with OpenAI models (gpt-4o and gpt-4o-mini). 
\n * Inconsistencies may occur with Google models (Gemini 1.5, 1.5 Flash).\n * Other models are untested.\n *\n * Setup:\n * Replace the SPIDER_API_KEY placeholder in the Authorization header with your Spider API key.\n *\n * For Scraping:\n * depth (number): The maximum scrape depth (0 for no limit).\n * limit (number): The maximum number of pages to scrape per website.\n * proxy_enabled (boolean): Enables the use of premium proxies for scraping.\n * anti_bot (boolean): Enables anti-bot mode, which uses techniques to increase the chance of success.\n * request (string): The request type: 'http', 'chrome', or 'smart'.\n * return_format (string): The format for the returned data.\n * cache (boolean): Use HTTP caching for the crawl to speed up repeated runs.\n * store_data (boolean): Collects resources to download and re-use later on.\n * url (string): The URI of the resource to scrape.\n * \n * For more options:\n * https://spider.cloud/docs/api\n */\n" +} diff --git a/packages/server/marketplaces/tools/Spider Web Search & Scrape.json b/packages/server/marketplaces/tools/Spider Web Search & Scrape.json new file mode 100644 index 00000000..788bdcf2 --- /dev/null +++ b/packages/server/marketplaces/tools/Spider Web Search & Scrape.json @@ -0,0 +1,8 @@ +{ + "name": "metasearch_engine", + "description": "This tool provides real-time information from the internet using a Metasearch Engine, ensuring up-to-date and relevant responses. Use it to research complex topics by strategically breaking them down into multiple, targeted search queries, exploring different facets and subtopics to gather a comprehensive understanding. If needed, you can use this tool multiple times, but refine your queries based on previous results rather than repeating the same search. 
Before using the tool, make sure to improve the user's search query to make it clear, thorough, and optimized for the most relevant results.\n", + "color": "linear-gradient(rgb(181,220,163), rgb(216,1,106))", + "iconSrc": "https://raw.githubusercontent.com/FlowiseAI/Flowise/main/packages/components/nodes/documentloaders/Spider/spider.svg", + "schema": "[{\"id\":0,\"property\":\"query\",\"description\":\"This is the search query\",\"type\":\"string\",\"required\":true}]", + "func": "const fetch = require('node-fetch');\nconst searchQuery = $query;\nconst data = {\n \"search\": `${searchQuery}`,\n \"country\": \"us\",\n \"language\":\"en\",\n \"cache\": true,\n \"store_data\": true,\n \"search_limit\": 3,\n \"depth\": 1,\n \"limit\": 1,\n \"proxy_enabled\": true,\n \"anti_bot\": true,\n \"request\": \"smart\",\n \"return_format\": \"text\"\n};\n\nconst url = 'https://api.spider.cloud/search';\n\ntry {\n const response = await fetch(url, {\n method: 'POST',\n headers: {\n 'Authorization': `Bearer SPIDER_API_KEY`, \n 'Content-Type': 'application/json'\n },\n body: JSON.stringify(data)\n });\n if (!response.ok) {\n console.error('Network response was not ok:', response.statusText);\n return `Error: ${response.statusText}`; \n }\n const text = await response.text(); \n return text; \n} catch (error) {\n console.error(error);\n return ''; \n}\n\n/*\n * This tool performs a meta search on any given topic and immediately scrapes \n * the discovered URLs in a single API call.\n * \n * Works well with OpenAI models (gpt-4o and gpt-4o-mini). 
\n * Inconsistencies may occur with Google models (Gemini).\n * Other models are untested.\n *\n * Setup:\n * Replace the SPIDER_API_KEY placeholder in the Authorization header with your Spider API key.\n *\n * For Searching:\n * search (string): The query to search for.\n * country (string): The two-letter country code to use for the search.\n * language (string): The language to use for the search.\n * cache (boolean): Use HTTP caching for the crawl to speed up repeated runs.\n * store_data (boolean): Collects resources to download and re-use later on.\n * search_limit (number): The maximum number of URLs to fetch from the search results.\n *\n * For Scraping:\n * depth (number): The maximum scrape depth (0 for no limit).\n * limit (number): The maximum number of pages to scrape per website.\n * proxy_enabled (boolean): Enables the use of premium proxies for scraping.\n * anti_bot (boolean): Enables anti-bot mode, which uses techniques to increase the chance of success.\n * request (string): The request type: 'http', 'chrome', or 'smart'.\n * return_format (string): The format for the returned data.\n *\n * For more options:\n * https://spider.cloud/docs/api\n */" +}