mirror of
https://github.com/farcasclaudiu/Flowise.git
synced 2026-06-28 17:01:00 +03:00
Feature/Add Azure Cognitive speech-to-text functionality (#3718)
* feat: Add Azure Cognitive Services integration for speech-to-text functionality
  - Introduced a new credential class for Azure Cognitive Services.
  - Updated speech-to-text processing to support Azure Cognitive Services as a provider.
  - Enhanced UI components to include Azure Cognitive Services options and inputs for configuration.
  - Added necessary imports and error handling for Azure API requests.
* Update SpeechToText.jsx linting
* refactor: Update audio file handling in SpeechToText component
  - Removed the dependency on 'form-data' and replaced it with a Blob for audio file uploads.
  - Simplified the audio file appending process to the form data.
  - Cleaned up the headers in the Axios request by removing unnecessary form data headers.
  This change enhances the efficiency of audio file processing in the speech-to-text functionality.
---------
Co-authored-by: Henry Heng <henryheng@flowiseai.com>
Co-authored-by: Henry <hzj94@hotmail.com>
This commit is contained in:
committed by
GitHub
parent
fff6319f5d
commit
2360f5fdeb
@@ -0,0 +1,39 @@
|
|||||||
|
import { INodeParams, INodeCredential } from '../src/Interface'
|
||||||
|
|
||||||
|
/**
 * Credential definition for Azure Cognitive Services.
 *
 * Describes the form fields collected for this credential:
 *  - `azureSubscriptionKey`: the subscription key (rendered as a password field)
 *  - `serviceRegion`: the Azure region, e.g. "westus"
 *  - `apiVersion`: the API version string, pre-filled with "2024-05-15-preview"
 *
 * The `name` value ('azureCognitiveServices') is the identifier other
 * components use to refer to this credential type.
 */
class AzureCognitiveServices implements INodeCredential {
    label: string
    name: string
    version: number
    inputs: INodeParams[]

    constructor() {
        this.label = 'Azure Cognitive Services'
        this.name = 'azureCognitiveServices'
        this.version = 1.0
        this.inputs = [
            {
                label: 'Azure Subscription Key',
                name: 'azureSubscriptionKey',
                // 'password' keeps the key masked in the credential form
                type: 'password',
                description: 'Your Azure Cognitive Services subscription key'
            },
            {
                label: 'Service Region',
                name: 'serviceRegion',
                type: 'string',
                description: 'The Azure service region (e.g., "westus", "eastus")',
                placeholder: 'westus'
            },
            {
                label: 'API Version',
                name: 'apiVersion',
                type: 'string',
                description: 'The API version to use (e.g., "2024-05-15-preview")',
                placeholder: '2024-05-15-preview',
                default: '2024-05-15-preview'
            }
        ]
    }
}
|
||||||
|
|
||||||
|
module.exports = { credClass: AzureCognitiveServices }
|
||||||
@@ -3,12 +3,14 @@ import { getCredentialData } from './utils'
|
|||||||
import { type ClientOptions, OpenAIClient, toFile } from '@langchain/openai'
|
import { type ClientOptions, OpenAIClient, toFile } from '@langchain/openai'
|
||||||
import { AssemblyAI } from 'assemblyai'
|
import { AssemblyAI } from 'assemblyai'
|
||||||
import { getFileFromStorage } from './storageUtils'
|
import { getFileFromStorage } from './storageUtils'
|
||||||
|
import axios from 'axios'
|
||||||
import Groq from 'groq-sdk'
|
import Groq from 'groq-sdk'
|
||||||
|
|
||||||
/**
 * Identifiers of the supported speech-to-text providers.
 * Each value is the provider name that the speech-to-text configuration is matched against.
 */
const SpeechToTextType = {
    OPENAI_WHISPER: 'openAIWhisper',
    ASSEMBLYAI_TRANSCRIBE: 'assemblyAiTranscribe',
    LOCALAI_STT: 'localAISTT',
    AZURE_COGNITIVE: 'azureCognitive',
    GROQ_WHISPER: 'groqWhisper'
}
|
||||||
|
|
||||||
@@ -72,6 +74,40 @@ export const convertSpeechToText = async (upload: IFileUpload, speechToTextConfi
|
|||||||
}
|
}
|
||||||
break
|
break
|
||||||
}
|
}
|
||||||
|
case SpeechToTextType.AZURE_COGNITIVE: {
    // Sends the audio plus a JSON `definition` part as one multipart POST to the
    // regional `speechtotext/transcriptions:transcribe` endpoint and returns the
    // text of the first combined phrase ('' when the response carries none).
    // On failure the Azure response body is rethrown when present, otherwise the original error.
    try {
        // The region becomes part of the request host, so require a plain host label
        // instead of silently calling "https://undefined.cognitiveservices.azure.com".
        const serviceRegion = String(credentialData.serviceRegion ?? '').trim()
        if (!/^[a-z0-9-]+$/i.test(serviceRegion)) {
            throw new Error('Azure Cognitive Services: a valid service region (e.g. "westus") is required')
        }
        const baseUrl = `https://${serviceRegion}.cognitiveservices.azure.com/speechtotext/transcriptions:transcribe`
        const apiVersion = credentialData.apiVersion || '2024-05-15-preview'

        const formData = new FormData()
        const audioBlob = new Blob([audio_file], { type: upload.type })
        formData.append('audio', audioBlob, upload.name)

        // Parse "0, 1"-style input defensively: blank pieces would otherwise become 0 and
        // non-numeric pieces NaN (serialised as null in the JSON definition).
        const requestedChannels = String(speechToTextConfig.channels || '0,1')
            .split(',')
            .map((part) => part.trim())
            .filter((part) => part !== '')
            .map(Number)
            .filter((channel) => Number.isInteger(channel) && channel >= 0)
        // Fall back to the documented default when nothing usable was supplied
        const channels = requestedChannels.length > 0 ? requestedChannels : [0, 1]

        const definition = {
            locales: [speechToTextConfig.language || 'en-US'],
            profanityFilterMode: speechToTextConfig.profanityFilterMode || 'Masked',
            channels
        }
        formData.append('definition', JSON.stringify(definition))

        const response = await axios.post(`${baseUrl}?api-version=${encodeURIComponent(apiVersion)}`, formData, {
            headers: {
                'Ocp-Apim-Subscription-Key': credentialData.azureSubscriptionKey,
                Accept: 'application/json'
            }
        })

        // `combinedPhrases` may be absent or empty; treat both as "no transcript"
        // rather than throwing a TypeError on `.length`.
        // NOTE(review): only the first combined phrase is returned - confirm whether
        // multi-channel responses should have their phrases joined instead.
        return response.data?.combinedPhrases?.[0]?.text || ''
    } catch (error) {
        // Prefer the API's error payload so callers see Azure's own error details
        throw error.response?.data || error
    }
}
|
||||||
case SpeechToTextType.GROQ_WHISPER: {
|
case SpeechToTextType.GROQ_WHISPER: {
|
||||||
const groqClient = new Groq({
|
const groqClient = new Groq({
|
||||||
apiKey: credentialData.groqApiKey
|
apiKey: credentialData.groqApiKey
|
||||||
|
|||||||
@@ -17,6 +17,7 @@ import { Dropdown } from '@/ui-component/dropdown/Dropdown'
|
|||||||
import openAISVG from '@/assets/images/openai.svg'
|
import openAISVG from '@/assets/images/openai.svg'
|
||||||
import assemblyAIPng from '@/assets/images/assemblyai.png'
|
import assemblyAIPng from '@/assets/images/assemblyai.png'
|
||||||
import localAiPng from '@/assets/images/localai.png'
|
import localAiPng from '@/assets/images/localai.png'
|
||||||
|
import azureSvg from '@/assets/images/azure_openai.svg'
|
||||||
import groqPng from '@/assets/images/groq.png'
|
import groqPng from '@/assets/images/groq.png'
|
||||||
|
|
||||||
// store
|
// store
|
||||||
@@ -31,6 +32,7 @@ const SpeechToTextType = {
|
|||||||
OPENAI_WHISPER: 'openAIWhisper',
|
OPENAI_WHISPER: 'openAIWhisper',
|
||||||
ASSEMBLYAI_TRANSCRIBE: 'assemblyAiTranscribe',
|
ASSEMBLYAI_TRANSCRIBE: 'assemblyAiTranscribe',
|
||||||
LOCALAI_STT: 'localAISTT',
|
LOCALAI_STT: 'localAISTT',
|
||||||
|
AZURE_COGNITIVE: 'azureCognitive',
|
||||||
GROQ_WHISPER: 'groqWhisper'
|
GROQ_WHISPER: 'groqWhisper'
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -142,6 +144,58 @@ const speechToTextProviders = {
|
|||||||
}
|
}
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
[SpeechToTextType.AZURE_COGNITIVE]: {
|
||||||
|
label: 'Azure Cognitive Services',
|
||||||
|
name: SpeechToTextType.AZURE_COGNITIVE,
|
||||||
|
icon: azureSvg,
|
||||||
|
url: 'https://azure.microsoft.com/en-us/products/cognitive-services/speech-services',
|
||||||
|
inputs: [
|
||||||
|
{
|
||||||
|
label: 'Connect Credential',
|
||||||
|
name: 'credential',
|
||||||
|
type: 'credential',
|
||||||
|
credentialNames: ['azureCognitiveServices']
|
||||||
|
},
|
||||||
|
{
|
||||||
|
label: 'Language',
|
||||||
|
name: 'language',
|
||||||
|
type: 'string',
|
||||||
|
description: 'The recognition language (e.g., "en-US", "es-ES")',
|
||||||
|
placeholder: 'en-US',
|
||||||
|
optional: true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
label: 'Profanity Filter Mode',
|
||||||
|
name: 'profanityFilterMode',
|
||||||
|
type: 'options',
|
||||||
|
description: 'How to handle profanity in the transcription',
|
||||||
|
options: [
|
||||||
|
{
|
||||||
|
label: 'None',
|
||||||
|
name: 'None'
|
||||||
|
},
|
||||||
|
{
|
||||||
|
label: 'Masked',
|
||||||
|
name: 'Masked'
|
||||||
|
},
|
||||||
|
{
|
||||||
|
label: 'Removed',
|
||||||
|
name: 'Removed'
|
||||||
|
}
|
||||||
|
],
|
||||||
|
default: 'Masked',
|
||||||
|
optional: true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
label: 'Audio Channels',
|
||||||
|
name: 'channels',
|
||||||
|
type: 'string',
|
||||||
|
description: 'Comma-separated list of audio channels to process (e.g., "0,1")',
|
||||||
|
placeholder: '0,1',
|
||||||
|
default: '0,1'
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
[SpeechToTextType.GROQ_WHISPER]: {
|
[SpeechToTextType.GROQ_WHISPER]: {
|
||||||
label: 'Groq Whisper',
|
label: 'Groq Whisper',
|
||||||
name: SpeechToTextType.GROQ_WHISPER,
|
name: SpeechToTextType.GROQ_WHISPER,
|
||||||
|
|||||||
Reference in New Issue
Block a user