mirror of
https://github.com/farcasclaudiu/openclaw.git
synced 2026-06-28 15:01:41 +03:00
feat: move TTS into core (#1559) (thanks @Glucksberg)
This commit is contained in:
@@ -0,0 +1,234 @@
|
||||
import { describe, expect, it, vi, beforeEach, afterEach } from "vitest";
|
||||
|
||||
import { _test } from "./tts.js";
|
||||
|
||||
const {
|
||||
isValidVoiceId,
|
||||
isValidOpenAIVoice,
|
||||
isValidOpenAIModel,
|
||||
OPENAI_TTS_MODELS,
|
||||
OPENAI_TTS_VOICES,
|
||||
summarizeText,
|
||||
resolveOutputFormat,
|
||||
} = _test;
|
||||
|
||||
describe("tts", () => {
|
||||
describe("isValidVoiceId", () => {
|
||||
it("accepts valid ElevenLabs voice IDs", () => {
|
||||
expect(isValidVoiceId("pMsXgVXv3BLzUgSXRplE")).toBe(true);
|
||||
expect(isValidVoiceId("21m00Tcm4TlvDq8ikWAM")).toBe(true);
|
||||
expect(isValidVoiceId("EXAVITQu4vr4xnSDxMaL")).toBe(true);
|
||||
});
|
||||
|
||||
it("accepts voice IDs of varying valid lengths", () => {
|
||||
expect(isValidVoiceId("a1b2c3d4e5")).toBe(true);
|
||||
expect(isValidVoiceId("a".repeat(40))).toBe(true);
|
||||
});
|
||||
|
||||
it("rejects too short voice IDs", () => {
|
||||
expect(isValidVoiceId("")).toBe(false);
|
||||
expect(isValidVoiceId("abc")).toBe(false);
|
||||
expect(isValidVoiceId("123456789")).toBe(false);
|
||||
});
|
||||
|
||||
it("rejects too long voice IDs", () => {
|
||||
expect(isValidVoiceId("a".repeat(41))).toBe(false);
|
||||
expect(isValidVoiceId("a".repeat(100))).toBe(false);
|
||||
});
|
||||
|
||||
it("rejects voice IDs with invalid characters", () => {
|
||||
expect(isValidVoiceId("pMsXgVXv3BLz-gSXRplE")).toBe(false);
|
||||
expect(isValidVoiceId("pMsXgVXv3BLz_gSXRplE")).toBe(false);
|
||||
expect(isValidVoiceId("pMsXgVXv3BLz gSXRplE")).toBe(false);
|
||||
expect(isValidVoiceId("../../../etc/passwd")).toBe(false);
|
||||
expect(isValidVoiceId("voice?param=value")).toBe(false);
|
||||
});
|
||||
});
|
||||
|
||||
describe("isValidOpenAIVoice", () => {
|
||||
it("accepts all valid OpenAI voices", () => {
|
||||
for (const voice of OPENAI_TTS_VOICES) {
|
||||
expect(isValidOpenAIVoice(voice)).toBe(true);
|
||||
}
|
||||
});
|
||||
|
||||
it("rejects invalid voice names", () => {
|
||||
expect(isValidOpenAIVoice("invalid")).toBe(false);
|
||||
expect(isValidOpenAIVoice("")).toBe(false);
|
||||
expect(isValidOpenAIVoice("ALLOY")).toBe(false);
|
||||
expect(isValidOpenAIVoice("alloy ")).toBe(false);
|
||||
expect(isValidOpenAIVoice(" alloy")).toBe(false);
|
||||
});
|
||||
});
|
||||
|
||||
describe("isValidOpenAIModel", () => {
|
||||
it("accepts gpt-4o-mini-tts model", () => {
|
||||
expect(isValidOpenAIModel("gpt-4o-mini-tts")).toBe(true);
|
||||
});
|
||||
|
||||
it("rejects other models", () => {
|
||||
expect(isValidOpenAIModel("tts-1")).toBe(false);
|
||||
expect(isValidOpenAIModel("tts-1-hd")).toBe(false);
|
||||
expect(isValidOpenAIModel("invalid")).toBe(false);
|
||||
expect(isValidOpenAIModel("")).toBe(false);
|
||||
expect(isValidOpenAIModel("gpt-4")).toBe(false);
|
||||
});
|
||||
});
|
||||
|
||||
describe("OPENAI_TTS_MODELS", () => {
|
||||
it("contains only gpt-4o-mini-tts", () => {
|
||||
expect(OPENAI_TTS_MODELS).toContain("gpt-4o-mini-tts");
|
||||
expect(OPENAI_TTS_MODELS).toHaveLength(1);
|
||||
});
|
||||
|
||||
it("is a non-empty array", () => {
|
||||
expect(Array.isArray(OPENAI_TTS_MODELS)).toBe(true);
|
||||
expect(OPENAI_TTS_MODELS.length).toBeGreaterThan(0);
|
||||
});
|
||||
});
|
||||
|
||||
describe("resolveOutputFormat", () => {
|
||||
it("uses Opus for Telegram", () => {
|
||||
const output = resolveOutputFormat("telegram");
|
||||
expect(output.openai).toBe("opus");
|
||||
expect(output.elevenlabs).toBe("opus_48000_64");
|
||||
expect(output.extension).toBe(".opus");
|
||||
expect(output.voiceCompatible).toBe(true);
|
||||
});
|
||||
|
||||
it("uses MP3 for other channels", () => {
|
||||
const output = resolveOutputFormat("discord");
|
||||
expect(output.openai).toBe("mp3");
|
||||
expect(output.elevenlabs).toBe("mp3_44100_128");
|
||||
expect(output.extension).toBe(".mp3");
|
||||
expect(output.voiceCompatible).toBe(false);
|
||||
});
|
||||
});
|
||||
|
||||
describe("summarizeText", () => {
|
||||
const mockApiKey = "test-api-key";
|
||||
const originalFetch = globalThis.fetch;
|
||||
|
||||
beforeEach(() => {
|
||||
vi.useFakeTimers({ shouldAdvanceTime: true });
|
||||
});
|
||||
|
||||
afterEach(() => {
|
||||
globalThis.fetch = originalFetch;
|
||||
vi.useRealTimers();
|
||||
});
|
||||
|
||||
it("summarizes text and returns result with metrics", async () => {
|
||||
const mockSummary = "This is a summarized version of the text.";
|
||||
globalThis.fetch = vi.fn().mockResolvedValue({
|
||||
ok: true,
|
||||
json: () =>
|
||||
Promise.resolve({
|
||||
choices: [{ message: { content: mockSummary } }],
|
||||
}),
|
||||
});
|
||||
|
||||
const longText = "A".repeat(2000);
|
||||
const result = await summarizeText(longText, 1500, mockApiKey, 30_000);
|
||||
|
||||
expect(result.summary).toBe(mockSummary);
|
||||
expect(result.inputLength).toBe(2000);
|
||||
expect(result.outputLength).toBe(mockSummary.length);
|
||||
expect(result.latencyMs).toBeGreaterThanOrEqual(0);
|
||||
expect(globalThis.fetch).toHaveBeenCalledTimes(1);
|
||||
});
|
||||
|
||||
it("calls OpenAI API with correct parameters", async () => {
|
||||
globalThis.fetch = vi.fn().mockResolvedValue({
|
||||
ok: true,
|
||||
json: () =>
|
||||
Promise.resolve({
|
||||
choices: [{ message: { content: "Summary" } }],
|
||||
}),
|
||||
});
|
||||
|
||||
await summarizeText("Long text to summarize", 500, mockApiKey, 30_000);
|
||||
|
||||
expect(globalThis.fetch).toHaveBeenCalledWith(
|
||||
"https://api.openai.com/v1/chat/completions",
|
||||
expect.objectContaining({
|
||||
method: "POST",
|
||||
headers: {
|
||||
Authorization: `Bearer ${mockApiKey}`,
|
||||
"Content-Type": "application/json",
|
||||
},
|
||||
}),
|
||||
);
|
||||
|
||||
const callArgs = (globalThis.fetch as ReturnType<typeof vi.fn>).mock.calls[0];
|
||||
const body = JSON.parse(callArgs[1].body);
|
||||
expect(body.model).toBe("gpt-4o-mini");
|
||||
expect(body.temperature).toBe(0.3);
|
||||
expect(body.max_tokens).toBe(250);
|
||||
});
|
||||
|
||||
it("rejects targetLength below minimum (100)", async () => {
|
||||
await expect(summarizeText("text", 99, mockApiKey, 30_000)).rejects.toThrow(
|
||||
"Invalid targetLength: 99",
|
||||
);
|
||||
});
|
||||
|
||||
it("rejects targetLength above maximum (10000)", async () => {
|
||||
await expect(summarizeText("text", 10001, mockApiKey, 30_000)).rejects.toThrow(
|
||||
"Invalid targetLength: 10001",
|
||||
);
|
||||
});
|
||||
|
||||
it("accepts targetLength at boundaries", async () => {
|
||||
globalThis.fetch = vi.fn().mockResolvedValue({
|
||||
ok: true,
|
||||
json: () =>
|
||||
Promise.resolve({
|
||||
choices: [{ message: { content: "Summary" } }],
|
||||
}),
|
||||
});
|
||||
|
||||
await expect(summarizeText("text", 100, mockApiKey, 30_000)).resolves.toBeDefined();
|
||||
await expect(summarizeText("text", 10000, mockApiKey, 30_000)).resolves.toBeDefined();
|
||||
});
|
||||
|
||||
it("throws error when API returns non-ok response", async () => {
|
||||
globalThis.fetch = vi.fn().mockResolvedValue({
|
||||
ok: false,
|
||||
status: 500,
|
||||
});
|
||||
|
||||
await expect(summarizeText("text", 500, mockApiKey, 30_000)).rejects.toThrow(
|
||||
"Summarization service unavailable",
|
||||
);
|
||||
});
|
||||
|
||||
it("throws error when no summary is returned", async () => {
|
||||
globalThis.fetch = vi.fn().mockResolvedValue({
|
||||
ok: true,
|
||||
json: () =>
|
||||
Promise.resolve({
|
||||
choices: [],
|
||||
}),
|
||||
});
|
||||
|
||||
await expect(summarizeText("text", 500, mockApiKey, 30_000)).rejects.toThrow(
|
||||
"No summary returned",
|
||||
);
|
||||
});
|
||||
|
||||
it("throws error when summary content is empty", async () => {
|
||||
globalThis.fetch = vi.fn().mockResolvedValue({
|
||||
ok: true,
|
||||
json: () =>
|
||||
Promise.resolve({
|
||||
choices: [{ message: { content: " " } }],
|
||||
}),
|
||||
});
|
||||
|
||||
await expect(summarizeText("text", 500, mockApiKey, 30_000)).rejects.toThrow(
|
||||
"No summary returned",
|
||||
);
|
||||
});
|
||||
});
|
||||
});
|
||||
+630
@@ -0,0 +1,630 @@
|
||||
import {
|
||||
existsSync,
|
||||
mkdirSync,
|
||||
readFileSync,
|
||||
writeFileSync,
|
||||
mkdtempSync,
|
||||
rmSync,
|
||||
renameSync,
|
||||
unlinkSync,
|
||||
} from "node:fs";
|
||||
import { tmpdir } from "node:os";
|
||||
import path from "node:path";
|
||||
|
||||
import type { ReplyPayload } from "../auto-reply/types.js";
|
||||
import { normalizeChannelId } from "../channels/plugins/index.js";
|
||||
import type { ChannelId } from "../channels/plugins/types.js";
|
||||
import type { ClawdbotConfig } from "../config/config.js";
|
||||
import type { TtsConfig, TtsMode, TtsProvider } from "../config/types.tts.js";
|
||||
import { logVerbose } from "../globals.js";
|
||||
import { CONFIG_DIR, resolveUserPath } from "../utils.js";
|
||||
|
||||
const DEFAULT_TIMEOUT_MS = 30_000;
|
||||
const DEFAULT_TTS_MAX_LENGTH = 1500;
|
||||
const DEFAULT_TTS_SUMMARIZE = true;
|
||||
const DEFAULT_MAX_TEXT_LENGTH = 4000;
|
||||
const TEMP_FILE_CLEANUP_DELAY_MS = 5 * 60 * 1000; // 5 minutes
|
||||
|
||||
const DEFAULT_ELEVENLABS_VOICE_ID = "pMsXgVXv3BLzUgSXRplE";
|
||||
const DEFAULT_ELEVENLABS_MODEL_ID = "eleven_multilingual_v2";
|
||||
const DEFAULT_OPENAI_MODEL = "gpt-4o-mini-tts";
|
||||
const DEFAULT_OPENAI_VOICE = "alloy";
|
||||
|
||||
const TELEGRAM_OUTPUT = {
|
||||
openai: "opus" as const,
|
||||
// ElevenLabs output formats use codec_sample_rate_bitrate naming.
|
||||
// Opus @ 48kHz/64kbps is a good voice-note tradeoff for Telegram.
|
||||
elevenlabs: "opus_48000_64",
|
||||
extension: ".opus",
|
||||
voiceCompatible: true,
|
||||
};
|
||||
|
||||
const DEFAULT_OUTPUT = {
|
||||
openai: "mp3" as const,
|
||||
elevenlabs: "mp3_44100_128",
|
||||
extension: ".mp3",
|
||||
voiceCompatible: false,
|
||||
};
|
||||
|
||||
export type ResolvedTtsConfig = {
|
||||
enabled: boolean;
|
||||
mode: TtsMode;
|
||||
provider: TtsProvider;
|
||||
elevenlabs: {
|
||||
apiKey?: string;
|
||||
voiceId: string;
|
||||
modelId: string;
|
||||
};
|
||||
openai: {
|
||||
apiKey?: string;
|
||||
model: string;
|
||||
voice: string;
|
||||
};
|
||||
prefsPath?: string;
|
||||
maxTextLength: number;
|
||||
timeoutMs: number;
|
||||
};
|
||||
|
||||
type TtsUserPrefs = {
|
||||
tts?: {
|
||||
enabled?: boolean;
|
||||
provider?: TtsProvider;
|
||||
maxLength?: number;
|
||||
summarize?: boolean;
|
||||
};
|
||||
};
|
||||
|
||||
export type TtsResult = {
|
||||
success: boolean;
|
||||
audioPath?: string;
|
||||
error?: string;
|
||||
latencyMs?: number;
|
||||
provider?: string;
|
||||
outputFormat?: string;
|
||||
voiceCompatible?: boolean;
|
||||
};
|
||||
|
||||
type TtsStatusEntry = {
|
||||
timestamp: number;
|
||||
success: boolean;
|
||||
textLength: number;
|
||||
summarized: boolean;
|
||||
provider?: string;
|
||||
latencyMs?: number;
|
||||
error?: string;
|
||||
};
|
||||
|
||||
let lastTtsAttempt: TtsStatusEntry | undefined;
|
||||
|
||||
export function resolveTtsConfig(cfg: ClawdbotConfig): ResolvedTtsConfig {
|
||||
const raw: TtsConfig = cfg.messages?.tts ?? {};
|
||||
return {
|
||||
enabled: raw.enabled ?? false,
|
||||
mode: raw.mode ?? "final",
|
||||
provider: raw.provider ?? "elevenlabs",
|
||||
elevenlabs: {
|
||||
apiKey: raw.elevenlabs?.apiKey,
|
||||
voiceId: raw.elevenlabs?.voiceId ?? DEFAULT_ELEVENLABS_VOICE_ID,
|
||||
modelId: raw.elevenlabs?.modelId ?? DEFAULT_ELEVENLABS_MODEL_ID,
|
||||
},
|
||||
openai: {
|
||||
apiKey: raw.openai?.apiKey,
|
||||
model: raw.openai?.model ?? DEFAULT_OPENAI_MODEL,
|
||||
voice: raw.openai?.voice ?? DEFAULT_OPENAI_VOICE,
|
||||
},
|
||||
prefsPath: raw.prefsPath,
|
||||
maxTextLength: raw.maxTextLength ?? DEFAULT_MAX_TEXT_LENGTH,
|
||||
timeoutMs: raw.timeoutMs ?? DEFAULT_TIMEOUT_MS,
|
||||
};
|
||||
}
|
||||
|
||||
export function resolveTtsPrefsPath(config: ResolvedTtsConfig): string {
|
||||
if (config.prefsPath?.trim()) return resolveUserPath(config.prefsPath.trim());
|
||||
const envPath = process.env.CLAWDBOT_TTS_PREFS?.trim();
|
||||
if (envPath) return resolveUserPath(envPath);
|
||||
return path.join(CONFIG_DIR, "settings", "tts.json");
|
||||
}
|
||||
|
||||
function readPrefs(prefsPath: string): TtsUserPrefs {
|
||||
try {
|
||||
if (!existsSync(prefsPath)) return {};
|
||||
return JSON.parse(readFileSync(prefsPath, "utf8")) as TtsUserPrefs;
|
||||
} catch {
|
||||
return {};
|
||||
}
|
||||
}
|
||||
|
||||
function atomicWriteFileSync(filePath: string, content: string): void {
|
||||
const tmpPath = `${filePath}.tmp.${Date.now()}.${Math.random().toString(36).slice(2)}`;
|
||||
writeFileSync(tmpPath, content);
|
||||
try {
|
||||
renameSync(tmpPath, filePath);
|
||||
} catch (err) {
|
||||
try {
|
||||
unlinkSync(tmpPath);
|
||||
} catch {
|
||||
// ignore
|
||||
}
|
||||
throw err;
|
||||
}
|
||||
}
|
||||
|
||||
function updatePrefs(prefsPath: string, update: (prefs: TtsUserPrefs) => void): void {
|
||||
const prefs = readPrefs(prefsPath);
|
||||
update(prefs);
|
||||
mkdirSync(path.dirname(prefsPath), { recursive: true });
|
||||
atomicWriteFileSync(prefsPath, JSON.stringify(prefs, null, 2));
|
||||
}
|
||||
|
||||
export function isTtsEnabled(config: ResolvedTtsConfig, prefsPath: string): boolean {
|
||||
const prefs = readPrefs(prefsPath);
|
||||
if (prefs.tts?.enabled !== undefined) return prefs.tts.enabled === true;
|
||||
return config.enabled;
|
||||
}
|
||||
|
||||
export function setTtsEnabled(prefsPath: string, enabled: boolean): void {
|
||||
updatePrefs(prefsPath, (prefs) => {
|
||||
prefs.tts = { ...prefs.tts, enabled };
|
||||
});
|
||||
}
|
||||
|
||||
export function getTtsProvider(config: ResolvedTtsConfig, prefsPath: string): TtsProvider {
|
||||
const prefs = readPrefs(prefsPath);
|
||||
return prefs.tts?.provider ?? config.provider;
|
||||
}
|
||||
|
||||
export function setTtsProvider(prefsPath: string, provider: TtsProvider): void {
|
||||
updatePrefs(prefsPath, (prefs) => {
|
||||
prefs.tts = { ...prefs.tts, provider };
|
||||
});
|
||||
}
|
||||
|
||||
export function getTtsMaxLength(prefsPath: string): number {
|
||||
const prefs = readPrefs(prefsPath);
|
||||
return prefs.tts?.maxLength ?? DEFAULT_TTS_MAX_LENGTH;
|
||||
}
|
||||
|
||||
export function setTtsMaxLength(prefsPath: string, maxLength: number): void {
|
||||
updatePrefs(prefsPath, (prefs) => {
|
||||
prefs.tts = { ...prefs.tts, maxLength };
|
||||
});
|
||||
}
|
||||
|
||||
export function isSummarizationEnabled(prefsPath: string): boolean {
|
||||
const prefs = readPrefs(prefsPath);
|
||||
return prefs.tts?.summarize ?? DEFAULT_TTS_SUMMARIZE;
|
||||
}
|
||||
|
||||
export function setSummarizationEnabled(prefsPath: string, enabled: boolean): void {
|
||||
updatePrefs(prefsPath, (prefs) => {
|
||||
prefs.tts = { ...prefs.tts, summarize: enabled };
|
||||
});
|
||||
}
|
||||
|
||||
export function getLastTtsAttempt(): TtsStatusEntry | undefined {
|
||||
return lastTtsAttempt;
|
||||
}
|
||||
|
||||
export function setLastTtsAttempt(entry: TtsStatusEntry | undefined): void {
|
||||
lastTtsAttempt = entry;
|
||||
}
|
||||
|
||||
function resolveOutputFormat(channelId?: string | null) {
|
||||
if (channelId === "telegram") return TELEGRAM_OUTPUT;
|
||||
return DEFAULT_OUTPUT;
|
||||
}
|
||||
|
||||
function resolveChannelId(channel: string | undefined): ChannelId | null {
|
||||
return channel ? normalizeChannelId(channel) : null;
|
||||
}
|
||||
|
||||
export function resolveTtsApiKey(
|
||||
config: ResolvedTtsConfig,
|
||||
provider: TtsProvider,
|
||||
): string | undefined {
|
||||
if (provider === "elevenlabs") {
|
||||
return config.elevenlabs.apiKey || process.env.ELEVENLABS_API_KEY || process.env.XI_API_KEY;
|
||||
}
|
||||
if (provider === "openai") {
|
||||
return config.openai.apiKey || process.env.OPENAI_API_KEY;
|
||||
}
|
||||
return undefined;
|
||||
}
|
||||
|
||||
function isValidVoiceId(voiceId: string): boolean {
|
||||
return /^[a-zA-Z0-9]{10,40}$/.test(voiceId);
|
||||
}
|
||||
|
||||
export const OPENAI_TTS_MODELS = ["gpt-4o-mini-tts"] as const;
|
||||
export const OPENAI_TTS_VOICES = [
|
||||
"alloy",
|
||||
"ash",
|
||||
"coral",
|
||||
"echo",
|
||||
"fable",
|
||||
"onyx",
|
||||
"nova",
|
||||
"sage",
|
||||
"shimmer",
|
||||
] as const;
|
||||
|
||||
type OpenAiTtsVoice = (typeof OPENAI_TTS_VOICES)[number];
|
||||
|
||||
function isValidOpenAIModel(model: string): boolean {
|
||||
return OPENAI_TTS_MODELS.includes(model as (typeof OPENAI_TTS_MODELS)[number]);
|
||||
}
|
||||
|
||||
function isValidOpenAIVoice(voice: string): voice is OpenAiTtsVoice {
|
||||
return OPENAI_TTS_VOICES.includes(voice as OpenAiTtsVoice);
|
||||
}
|
||||
|
||||
type SummarizeResult = {
|
||||
summary: string;
|
||||
latencyMs: number;
|
||||
inputLength: number;
|
||||
outputLength: number;
|
||||
};
|
||||
|
||||
async function summarizeText(
|
||||
text: string,
|
||||
targetLength: number,
|
||||
apiKey: string,
|
||||
timeoutMs: number,
|
||||
): Promise<SummarizeResult> {
|
||||
if (targetLength < 100 || targetLength > 10_000) {
|
||||
throw new Error(`Invalid targetLength: ${targetLength}`);
|
||||
}
|
||||
|
||||
const startTime = Date.now();
|
||||
const controller = new AbortController();
|
||||
const timeout = setTimeout(() => controller.abort(), timeoutMs);
|
||||
|
||||
try {
|
||||
const response = await fetch("https://api.openai.com/v1/chat/completions", {
|
||||
method: "POST",
|
||||
headers: {
|
||||
Authorization: `Bearer ${apiKey}`,
|
||||
"Content-Type": "application/json",
|
||||
},
|
||||
body: JSON.stringify({
|
||||
model: "gpt-4o-mini",
|
||||
messages: [
|
||||
{
|
||||
role: "system",
|
||||
content: `You are an assistant that summarizes texts concisely while keeping the most important information. Summarize the text to approximately ${targetLength} characters. Maintain the original tone and style. Reply only with the summary, without additional explanations.`,
|
||||
},
|
||||
{
|
||||
role: "user",
|
||||
content: `<text_to_summarize>\n${text}\n</text_to_summarize>`,
|
||||
},
|
||||
],
|
||||
max_tokens: Math.ceil(targetLength / 2),
|
||||
temperature: 0.3,
|
||||
}),
|
||||
signal: controller.signal,
|
||||
});
|
||||
|
||||
if (!response.ok) {
|
||||
throw new Error("Summarization service unavailable");
|
||||
}
|
||||
|
||||
const data = (await response.json()) as {
|
||||
choices?: Array<{ message?: { content?: string } }>;
|
||||
};
|
||||
const summary = data.choices?.[0]?.message?.content?.trim();
|
||||
|
||||
if (!summary) {
|
||||
throw new Error("No summary returned");
|
||||
}
|
||||
|
||||
return {
|
||||
summary,
|
||||
latencyMs: Date.now() - startTime,
|
||||
inputLength: text.length,
|
||||
outputLength: summary.length,
|
||||
};
|
||||
} finally {
|
||||
clearTimeout(timeout);
|
||||
}
|
||||
}
|
||||
|
||||
function scheduleCleanup(tempDir: string, delayMs: number = TEMP_FILE_CLEANUP_DELAY_MS): void {
|
||||
const timer = setTimeout(() => {
|
||||
try {
|
||||
rmSync(tempDir, { recursive: true, force: true });
|
||||
} catch {
|
||||
// ignore cleanup errors
|
||||
}
|
||||
}, delayMs);
|
||||
timer.unref();
|
||||
}
|
||||
|
||||
async function elevenLabsTTS(params: {
|
||||
text: string;
|
||||
apiKey: string;
|
||||
voiceId: string;
|
||||
modelId: string;
|
||||
outputFormat: string;
|
||||
timeoutMs: number;
|
||||
}): Promise<Buffer> {
|
||||
const { text, apiKey, voiceId, modelId, outputFormat, timeoutMs } = params;
|
||||
if (!isValidVoiceId(voiceId)) {
|
||||
throw new Error("Invalid voiceId format");
|
||||
}
|
||||
|
||||
const controller = new AbortController();
|
||||
const timeout = setTimeout(() => controller.abort(), timeoutMs);
|
||||
|
||||
try {
|
||||
const url = new URL(`https://api.elevenlabs.io/v1/text-to-speech/${voiceId}`);
|
||||
if (outputFormat) {
|
||||
url.searchParams.set("output_format", outputFormat);
|
||||
}
|
||||
|
||||
const response = await fetch(url.toString(), {
|
||||
method: "POST",
|
||||
headers: {
|
||||
"xi-api-key": apiKey,
|
||||
"Content-Type": "application/json",
|
||||
Accept: "audio/mpeg",
|
||||
},
|
||||
body: JSON.stringify({
|
||||
text,
|
||||
model_id: modelId,
|
||||
voice_settings: {
|
||||
stability: 0.5,
|
||||
similarity_boost: 0.75,
|
||||
style: 0.0,
|
||||
use_speaker_boost: true,
|
||||
},
|
||||
}),
|
||||
signal: controller.signal,
|
||||
});
|
||||
|
||||
if (!response.ok) {
|
||||
throw new Error(`ElevenLabs API error (${response.status})`);
|
||||
}
|
||||
|
||||
return Buffer.from(await response.arrayBuffer());
|
||||
} finally {
|
||||
clearTimeout(timeout);
|
||||
}
|
||||
}
|
||||
|
||||
async function openaiTTS(params: {
|
||||
text: string;
|
||||
apiKey: string;
|
||||
model: string;
|
||||
voice: string;
|
||||
responseFormat: "mp3" | "opus";
|
||||
timeoutMs: number;
|
||||
}): Promise<Buffer> {
|
||||
const { text, apiKey, model, voice, responseFormat, timeoutMs } = params;
|
||||
|
||||
if (!isValidOpenAIModel(model)) {
|
||||
throw new Error(`Invalid model: ${model}`);
|
||||
}
|
||||
if (!isValidOpenAIVoice(voice)) {
|
||||
throw new Error(`Invalid voice: ${voice}`);
|
||||
}
|
||||
|
||||
const controller = new AbortController();
|
||||
const timeout = setTimeout(() => controller.abort(), timeoutMs);
|
||||
|
||||
try {
|
||||
const response = await fetch("https://api.openai.com/v1/audio/speech", {
|
||||
method: "POST",
|
||||
headers: {
|
||||
Authorization: `Bearer ${apiKey}`,
|
||||
"Content-Type": "application/json",
|
||||
},
|
||||
body: JSON.stringify({
|
||||
model,
|
||||
input: text,
|
||||
voice,
|
||||
response_format: responseFormat,
|
||||
}),
|
||||
signal: controller.signal,
|
||||
});
|
||||
|
||||
if (!response.ok) {
|
||||
throw new Error(`OpenAI TTS API error (${response.status})`);
|
||||
}
|
||||
|
||||
return Buffer.from(await response.arrayBuffer());
|
||||
} finally {
|
||||
clearTimeout(timeout);
|
||||
}
|
||||
}
|
||||
|
||||
export async function textToSpeech(params: {
|
||||
text: string;
|
||||
cfg: ClawdbotConfig;
|
||||
prefsPath?: string;
|
||||
channel?: string;
|
||||
}): Promise<TtsResult> {
|
||||
const config = resolveTtsConfig(params.cfg);
|
||||
const prefsPath = params.prefsPath ?? resolveTtsPrefsPath(config);
|
||||
const channelId = resolveChannelId(params.channel);
|
||||
const output = resolveOutputFormat(channelId);
|
||||
|
||||
if (params.text.length > config.maxTextLength) {
|
||||
return {
|
||||
success: false,
|
||||
error: `Text too long (${params.text.length} chars, max ${config.maxTextLength})`,
|
||||
};
|
||||
}
|
||||
|
||||
const userProvider = getTtsProvider(config, prefsPath);
|
||||
const providers: TtsProvider[] = [
|
||||
userProvider,
|
||||
userProvider === "openai" ? "elevenlabs" : "openai",
|
||||
];
|
||||
|
||||
let lastError: string | undefined;
|
||||
|
||||
for (const provider of providers) {
|
||||
const apiKey = resolveTtsApiKey(config, provider);
|
||||
if (!apiKey) {
|
||||
lastError = `No API key for ${provider}`;
|
||||
continue;
|
||||
}
|
||||
|
||||
const providerStart = Date.now();
|
||||
try {
|
||||
let audioBuffer: Buffer;
|
||||
if (provider === "elevenlabs") {
|
||||
audioBuffer = await elevenLabsTTS({
|
||||
text: params.text,
|
||||
apiKey,
|
||||
voiceId: config.elevenlabs.voiceId,
|
||||
modelId: config.elevenlabs.modelId,
|
||||
outputFormat: output.elevenlabs,
|
||||
timeoutMs: config.timeoutMs,
|
||||
});
|
||||
} else {
|
||||
audioBuffer = await openaiTTS({
|
||||
text: params.text,
|
||||
apiKey,
|
||||
model: config.openai.model,
|
||||
voice: config.openai.voice,
|
||||
responseFormat: output.openai,
|
||||
timeoutMs: config.timeoutMs,
|
||||
});
|
||||
}
|
||||
|
||||
const latencyMs = Date.now() - providerStart;
|
||||
|
||||
const tempDir = mkdtempSync(path.join(tmpdir(), "tts-"));
|
||||
const audioPath = path.join(tempDir, `voice-${Date.now()}${output.extension}`);
|
||||
writeFileSync(audioPath, audioBuffer);
|
||||
scheduleCleanup(tempDir);
|
||||
|
||||
return {
|
||||
success: true,
|
||||
audioPath,
|
||||
latencyMs,
|
||||
provider,
|
||||
outputFormat: provider === "openai" ? output.openai : output.elevenlabs,
|
||||
voiceCompatible: output.voiceCompatible,
|
||||
};
|
||||
} catch (err) {
|
||||
const error = err as Error;
|
||||
if (error.name === "AbortError") {
|
||||
lastError = `${provider}: request timed out`;
|
||||
} else {
|
||||
lastError = `${provider}: ${error.message}`;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return {
|
||||
success: false,
|
||||
error: `TTS conversion failed: ${lastError || "no providers available"}`,
|
||||
};
|
||||
}
|
||||
|
||||
export async function maybeApplyTtsToPayload(params: {
|
||||
payload: ReplyPayload;
|
||||
cfg: ClawdbotConfig;
|
||||
channel?: string;
|
||||
kind?: "tool" | "block" | "final";
|
||||
}): Promise<ReplyPayload> {
|
||||
const config = resolveTtsConfig(params.cfg);
|
||||
const prefsPath = resolveTtsPrefsPath(config);
|
||||
if (!isTtsEnabled(config, prefsPath)) return params.payload;
|
||||
|
||||
const mode = config.mode ?? "final";
|
||||
if (mode === "final" && params.kind && params.kind !== "final") return params.payload;
|
||||
|
||||
const text = params.payload.text ?? "";
|
||||
if (!text.trim()) return params.payload;
|
||||
if (params.payload.mediaUrl || (params.payload.mediaUrls?.length ?? 0) > 0) return params.payload;
|
||||
if (text.includes("MEDIA:")) return params.payload;
|
||||
if (text.trim().length < 10) return params.payload;
|
||||
|
||||
const maxLength = getTtsMaxLength(prefsPath);
|
||||
let textForAudio = text.trim();
|
||||
let wasSummarized = false;
|
||||
|
||||
if (textForAudio.length > maxLength) {
|
||||
if (!isSummarizationEnabled(prefsPath)) {
|
||||
logVerbose(
|
||||
`TTS: skipping long text (${textForAudio.length} > ${maxLength}), summarization disabled.`,
|
||||
);
|
||||
return params.payload;
|
||||
}
|
||||
|
||||
const openaiKey = resolveTtsApiKey(config, "openai");
|
||||
if (!openaiKey) {
|
||||
logVerbose("TTS: skipping summarization - OpenAI key missing.");
|
||||
return params.payload;
|
||||
}
|
||||
|
||||
try {
|
||||
const summary = await summarizeText(textForAudio, maxLength, openaiKey, config.timeoutMs);
|
||||
textForAudio = summary.summary;
|
||||
wasSummarized = true;
|
||||
if (textForAudio.length > config.maxTextLength) {
|
||||
logVerbose(
|
||||
`TTS: summary exceeded hard limit (${textForAudio.length} > ${config.maxTextLength}); truncating.`,
|
||||
);
|
||||
textForAudio = `${textForAudio.slice(0, config.maxTextLength - 3)}...`;
|
||||
}
|
||||
} catch (err) {
|
||||
const error = err as Error;
|
||||
logVerbose(`TTS: summarization failed: ${error.message}`);
|
||||
return params.payload;
|
||||
}
|
||||
}
|
||||
|
||||
const ttsStart = Date.now();
|
||||
const result = await textToSpeech({
|
||||
text: textForAudio,
|
||||
cfg: params.cfg,
|
||||
prefsPath,
|
||||
channel: params.channel,
|
||||
});
|
||||
|
||||
if (result.success && result.audioPath) {
|
||||
lastTtsAttempt = {
|
||||
timestamp: Date.now(),
|
||||
success: true,
|
||||
textLength: text.length,
|
||||
summarized: wasSummarized,
|
||||
provider: result.provider,
|
||||
latencyMs: result.latencyMs,
|
||||
};
|
||||
|
||||
const channelId = resolveChannelId(params.channel);
|
||||
const shouldVoice = channelId === "telegram" && result.voiceCompatible === true;
|
||||
|
||||
return {
|
||||
...params.payload,
|
||||
mediaUrl: result.audioPath,
|
||||
audioAsVoice: shouldVoice || params.payload.audioAsVoice,
|
||||
};
|
||||
}
|
||||
|
||||
lastTtsAttempt = {
|
||||
timestamp: Date.now(),
|
||||
success: false,
|
||||
textLength: text.length,
|
||||
summarized: wasSummarized,
|
||||
error: result.error,
|
||||
};
|
||||
|
||||
const latency = Date.now() - ttsStart;
|
||||
logVerbose(`TTS: conversion failed after ${latency}ms (${result.error ?? "unknown"}).`);
|
||||
return params.payload;
|
||||
}
|
||||
|
||||
export const _test = {
|
||||
isValidVoiceId,
|
||||
isValidOpenAIVoice,
|
||||
isValidOpenAIModel,
|
||||
OPENAI_TTS_MODELS,
|
||||
OPENAI_TTS_VOICES,
|
||||
summarizeText,
|
||||
resolveOutputFormat,
|
||||
};
|
||||
Reference in New Issue
Block a user