mirror of
https://github.com/farcasclaudiu/openclaw.git
synced 2026-06-29 09:02:02 +03:00
fix: fix: transcribe audio before mention check in groups with requireMention (openclaw#9973) thanks @mcinteerj
Verified: - pnpm install --frozen-lockfile - pnpm build - pnpm check - pnpm test Co-authored-by: mcinteerj <3613653+mcinteerj@users.noreply.github.com>
This commit is contained in:
@@ -107,8 +107,27 @@ Note: Binary detection is best-effort across macOS/Linux/Windows; ensure the CLI
|
|||||||
- Transcript is available to templates as `{{Transcript}}`.
|
- Transcript is available to templates as `{{Transcript}}`.
|
||||||
- CLI stdout is capped (5MB); keep CLI output concise.
|
- CLI stdout is capped (5MB); keep CLI output concise.
|
||||||
|
|
||||||
|
## Mention Detection in Groups
|
||||||
|
|
||||||
|
When `requireMention: true` is set for a group chat, OpenClaw now transcribes audio **before** checking for mentions. This allows a voice note that contains a spoken mention to pass the mention gate instead of being dropped for lacking a text mention.
|
||||||
|
|
||||||
|
**How it works:**
|
||||||
|
|
||||||
|
1. If a voice message has no text body and the group requires mentions, OpenClaw performs a "preflight" transcription.
|
||||||
|
2. The transcript is checked for mention patterns (e.g., `@BotName`, emoji triggers).
|
||||||
|
3. If a mention is found, the message proceeds through the full reply pipeline.
|
||||||
|
4. The transcript is used for mention detection so voice notes can pass the mention gate.
|
||||||
|
|
||||||
|
**Fallback behavior:**
|
||||||
|
|
||||||
|
- If transcription fails during preflight (timeout, API error, etc.), the message is processed based on text-only mention detection.
|
||||||
|
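- Mixed messages (text + audio) do not depend on preflight: it only runs when the message has no text body, and any text present is always what gets checked for mentions, so a transcription failure never causes a mixed message to be incorrectly dropped.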
|
||||||
|
|
||||||
|
**Example:** A user sends a voice note saying "Hey @Claude, what's the weather?" in a Telegram group with `requireMention: true`. The voice note is transcribed, the mention is detected, and the agent replies.
|
||||||
|
|
||||||
## Gotchas
|
## Gotchas
|
||||||
|
|
||||||
- Scope rules use first-match wins. `chatType` is normalized to `direct`, `group`, or `room`.
|
- Scope rules use first-match wins. `chatType` is normalized to `direct`, `group`, or `room`.
|
||||||
- Ensure your CLI exits 0 and prints plain text; JSON needs to be massaged via `jq -r .text`.
|
- Ensure your CLI exits 0 and prints plain text; JSON needs to be massaged via `jq -r .text`.
|
||||||
- Keep timeouts reasonable (`timeoutSeconds`, default 60s) to avoid blocking the reply queue.
|
- Keep timeouts reasonable (`timeoutSeconds`, default 60s) to avoid blocking the reply queue.
|
||||||
|
- Preflight transcription only processes the **first** audio attachment for mention detection. Additional audio is processed during the main media understanding phase.
|
||||||
|
|||||||
@@ -90,18 +90,24 @@ export function matchesMentionWithExplicit(params: {
|
|||||||
text: string;
|
text: string;
|
||||||
mentionRegexes: RegExp[];
|
mentionRegexes: RegExp[];
|
||||||
explicit?: ExplicitMentionSignal;
|
explicit?: ExplicitMentionSignal;
|
||||||
|
transcript?: string;
|
||||||
}): boolean {
|
}): boolean {
|
||||||
const cleaned = normalizeMentionText(params.text ?? "");
|
const cleaned = normalizeMentionText(params.text ?? "");
|
||||||
const explicit = params.explicit?.isExplicitlyMentioned === true;
|
const explicit = params.explicit?.isExplicitlyMentioned === true;
|
||||||
const explicitAvailable = params.explicit?.canResolveExplicit === true;
|
const explicitAvailable = params.explicit?.canResolveExplicit === true;
|
||||||
const hasAnyMention = params.explicit?.hasAnyMention === true;
|
const hasAnyMention = params.explicit?.hasAnyMention === true;
|
||||||
|
|
||||||
|
// Check transcript if text is empty and transcript is provided
|
||||||
|
const transcriptCleaned = params.transcript ? normalizeMentionText(params.transcript) : "";
|
||||||
|
const textToCheck = cleaned || transcriptCleaned;
|
||||||
|
|
||||||
if (hasAnyMention && explicitAvailable) {
|
if (hasAnyMention && explicitAvailable) {
|
||||||
return explicit || params.mentionRegexes.some((re) => re.test(cleaned));
|
return explicit || params.mentionRegexes.some((re) => re.test(textToCheck));
|
||||||
}
|
}
|
||||||
if (!cleaned) {
|
if (!textToCheck) {
|
||||||
return explicit;
|
return explicit;
|
||||||
}
|
}
|
||||||
return explicit || params.mentionRegexes.some((re) => re.test(cleaned));
|
return explicit || params.mentionRegexes.some((re) => re.test(textToCheck));
|
||||||
}
|
}
|
||||||
|
|
||||||
export function stripStructuralPrefixes(text: string): string {
|
export function stripStructuralPrefixes(text: string): string {
|
||||||
|
|||||||
@@ -242,28 +242,6 @@ export async function preflightDiscordMessage(
|
|||||||
(message.mentionedUsers?.length ?? 0) > 0 ||
|
(message.mentionedUsers?.length ?? 0) > 0 ||
|
||||||
(message.mentionedRoles?.length ?? 0) > 0),
|
(message.mentionedRoles?.length ?? 0) > 0),
|
||||||
);
|
);
|
||||||
const wasMentioned =
|
|
||||||
!isDirectMessage &&
|
|
||||||
matchesMentionWithExplicit({
|
|
||||||
text: baseText,
|
|
||||||
mentionRegexes,
|
|
||||||
explicit: {
|
|
||||||
hasAnyMention,
|
|
||||||
isExplicitlyMentioned: explicitlyMentioned,
|
|
||||||
canResolveExplicit: Boolean(botId),
|
|
||||||
},
|
|
||||||
});
|
|
||||||
const implicitMention = Boolean(
|
|
||||||
!isDirectMessage &&
|
|
||||||
botId &&
|
|
||||||
message.referencedMessage?.author?.id &&
|
|
||||||
message.referencedMessage.author.id === botId,
|
|
||||||
);
|
|
||||||
if (shouldLogVerbose()) {
|
|
||||||
logVerbose(
|
|
||||||
`discord: inbound id=${message.id} guild=${message.guild?.id ?? "dm"} channel=${message.channelId} mention=${wasMentioned ? "yes" : "no"} type=${isDirectMessage ? "dm" : isGroupDm ? "group-dm" : "guild"} content=${messageText ? "yes" : "no"}`,
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (
|
if (
|
||||||
isGuildMessage &&
|
isGuildMessage &&
|
||||||
@@ -400,6 +378,74 @@ export async function preflightDiscordMessage(
|
|||||||
channelConfig,
|
channelConfig,
|
||||||
guildInfo,
|
guildInfo,
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// Preflight audio transcription for mention detection in guilds
|
||||||
|
// This allows voice notes to be checked for mentions before being dropped
|
||||||
|
let preflightTranscript: string | undefined;
|
||||||
|
const hasAudioAttachment = message.attachments?.some((att: { contentType?: string }) =>
|
||||||
|
att.contentType?.startsWith("audio/"),
|
||||||
|
);
|
||||||
|
const needsPreflightTranscription =
|
||||||
|
!isDirectMessage &&
|
||||||
|
shouldRequireMention &&
|
||||||
|
hasAudioAttachment &&
|
||||||
|
!baseText &&
|
||||||
|
mentionRegexes.length > 0;
|
||||||
|
|
||||||
|
if (needsPreflightTranscription) {
|
||||||
|
try {
|
||||||
|
const { transcribeFirstAudio } = await import("../../media-understanding/audio-preflight.js");
|
||||||
|
const audioPaths =
|
||||||
|
message.attachments
|
||||||
|
?.filter((att: { contentType?: string; url: string }) =>
|
||||||
|
att.contentType?.startsWith("audio/"),
|
||||||
|
)
|
||||||
|
.map((att: { url: string }) => att.url) ?? [];
|
||||||
|
if (audioPaths.length > 0) {
|
||||||
|
const tempCtx = {
|
||||||
|
MediaUrls: audioPaths,
|
||||||
|
MediaTypes: message.attachments
|
||||||
|
?.filter((att: { contentType?: string; url: string }) =>
|
||||||
|
att.contentType?.startsWith("audio/"),
|
||||||
|
)
|
||||||
|
.map((att: { contentType?: string }) => att.contentType)
|
||||||
|
.filter(Boolean) as string[],
|
||||||
|
};
|
||||||
|
preflightTranscript = await transcribeFirstAudio({
|
||||||
|
ctx: tempCtx,
|
||||||
|
cfg: params.cfg,
|
||||||
|
agentDir: undefined,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
} catch (err) {
|
||||||
|
logVerbose(`discord: audio preflight transcription failed: ${String(err)}`);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
const wasMentioned =
|
||||||
|
!isDirectMessage &&
|
||||||
|
matchesMentionWithExplicit({
|
||||||
|
text: baseText,
|
||||||
|
mentionRegexes,
|
||||||
|
explicit: {
|
||||||
|
hasAnyMention,
|
||||||
|
isExplicitlyMentioned: explicitlyMentioned,
|
||||||
|
canResolveExplicit: Boolean(botId),
|
||||||
|
},
|
||||||
|
transcript: preflightTranscript,
|
||||||
|
});
|
||||||
|
const implicitMention = Boolean(
|
||||||
|
!isDirectMessage &&
|
||||||
|
botId &&
|
||||||
|
message.referencedMessage?.author?.id &&
|
||||||
|
message.referencedMessage.author.id === botId,
|
||||||
|
);
|
||||||
|
if (shouldLogVerbose()) {
|
||||||
|
logVerbose(
|
||||||
|
`discord: inbound id=${message.id} guild=${message.guild?.id ?? "dm"} channel=${message.channelId} mention=${wasMentioned ? "yes" : "no"} type=${isDirectMessage ? "dm" : isGroupDm ? "group-dm" : "guild"} content=${messageText ? "yes" : "no"}`,
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
const allowTextCommands = shouldHandleTextCommands({
|
const allowTextCommands = shouldHandleTextCommands({
|
||||||
cfg: params.cfg,
|
cfg: params.cfg,
|
||||||
surface: "discord",
|
surface: "discord",
|
||||||
|
|||||||
@@ -182,6 +182,10 @@ export function selectAttachments(params: {
|
|||||||
}): MediaAttachment[] {
|
}): MediaAttachment[] {
|
||||||
const { capability, attachments, policy } = params;
|
const { capability, attachments, policy } = params;
|
||||||
const matches = attachments.filter((item) => {
|
const matches = attachments.filter((item) => {
|
||||||
|
// Skip already-transcribed audio attachments from preflight
|
||||||
|
if (capability === "audio" && item.alreadyTranscribed) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
if (capability === "image") {
|
if (capability === "image") {
|
||||||
return isImageAttachment(item);
|
return isImageAttachment(item);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -0,0 +1,97 @@
|
|||||||
|
import type { MsgContext } from "../auto-reply/templating.js";
|
||||||
|
import type { OpenClawConfig } from "../config/config.js";
|
||||||
|
import type { MediaUnderstandingProvider } from "./types.js";
|
||||||
|
import { logVerbose, shouldLogVerbose } from "../globals.js";
|
||||||
|
import { isAudioAttachment } from "./attachments.js";
|
||||||
|
import {
|
||||||
|
type ActiveMediaModel,
|
||||||
|
buildProviderRegistry,
|
||||||
|
createMediaAttachmentCache,
|
||||||
|
normalizeMediaAttachments,
|
||||||
|
runCapability,
|
||||||
|
} from "./runner.js";
|
||||||
|
|
||||||
|
/**
 * Preflight transcription used ahead of the mention check, so that a voice
 * note in a group with `requireMention: true` can be inspected for a mention.
 *
 * Steps:
 * 1. Give up unless `cfg.tools.media.audio` exists and is not disabled.
 * 2. Normalize the context's media and pick the first audio attachment that is
 *    not already flagged `alreadyTranscribed`.
 * 3. Run the "audio" capability and take the first `audio.transcription`
 *    output that carries text.
 *
 * @param params.ctx Message context whose media is normalized into attachments.
 * @param params.cfg Config; audio settings are read from `tools.media.audio`.
 * @param params.agentDir Forwarded unchanged to the capability run.
 * @param params.providers Optional providers used to build the provider registry.
 * @param params.activeModel Forwarded unchanged to the capability run.
 * @returns The transcript text, or `undefined` when audio is not configured or
 *   disabled, no eligible audio attachment exists, no transcript text was
 *   produced, or the capability run threw (the error is only logged, and only
 *   in verbose mode).
 */
export async function transcribeFirstAudio(params: {
  ctx: MsgContext;
  cfg: OpenClawConfig;
  agentDir?: string;
  providers?: Record<string, MediaUnderstandingProvider>;
  activeModel?: ActiveMediaModel;
}): Promise<string | undefined> {
  const { ctx, cfg } = params;

  // Audio understanding must be configured and not explicitly turned off.
  const audioSettings = cfg.tools?.media?.audio;
  if (!audioSettings || audioSettings.enabled === false) {
    return undefined;
  }

  const normalized = normalizeMediaAttachments(ctx);
  if (!normalized?.length) {
    return undefined;
  }

  // First audio attachment that has not been transcribed yet.
  const target = normalized.find((candidate) => {
    if (!candidate) {
      return false;
    }
    return isAudioAttachment(candidate) && !candidate.alreadyTranscribed;
  });
  if (!target) {
    return undefined;
  }

  if (shouldLogVerbose()) {
    logVerbose(`audio-preflight: transcribing attachment ${target.index} for mention check`);
  }

  const providerRegistry = buildProviderRegistry(params.providers);
  const cache = createMediaAttachmentCache(normalized);

  try {
    const outcome = await runCapability({
      capability: "audio",
      cfg,
      ctx,
      attachments: cache,
      media: normalized,
      agentDir: params.agentDir,
      providerRegistry,
      config: audioSettings,
      activeModel: params.activeModel,
    });
    if (!outcome || outcome.outputs.length === 0) {
      return undefined;
    }

    // Only a transcription output with non-empty text counts as a result.
    const transcription = outcome.outputs.find((entry) => entry.kind === "audio.transcription");
    const transcript = transcription?.text;
    if (!transcript) {
      return undefined;
    }

    // Flag the attachment so later audio selection can skip it.
    target.alreadyTranscribed = true;

    if (shouldLogVerbose()) {
      logVerbose(
        `audio-preflight: transcribed ${transcript.length} chars from attachment ${target.index}`,
      );
    }
    return transcript;
  } catch (err) {
    // Best-effort: swallow the failure so the caller can fall back to the
    // text-only mention check.
    if (shouldLogVerbose()) {
      logVerbose(`audio-preflight: transcription failed: ${String(err)}`);
    }
    return undefined;
  } finally {
    // Runs on success, early return, and failure alike.
    await cache.cleanup();
  }
}
|
||||||
@@ -10,6 +10,7 @@ export type MediaAttachment = {
|
|||||||
url?: string;
|
url?: string;
|
||||||
mime?: string;
|
mime?: string;
|
||||||
index: number;
|
index: number;
|
||||||
|
alreadyTranscribed?: boolean;
|
||||||
};
|
};
|
||||||
|
|
||||||
export type MediaUnderstandingOutput = {
|
export type MediaUnderstandingOutput = {
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
import type { Bot } from "grammy";
|
import type { Bot } from "grammy";
|
||||||
|
import type { MsgContext } from "../auto-reply/templating.js";
|
||||||
import type { OpenClawConfig } from "../config/config.js";
|
import type { OpenClawConfig } from "../config/config.js";
|
||||||
import type { DmPolicy, TelegramGroupConfig, TelegramTopicConfig } from "../config/types.js";
|
import type { DmPolicy, TelegramGroupConfig, TelegramTopicConfig } from "../config/types.js";
|
||||||
import type { StickerMetadata, TelegramContext } from "./bot/types.js";
|
import type { StickerMetadata, TelegramContext } from "./bot/types.js";
|
||||||
@@ -203,6 +204,21 @@ export const buildTelegramMessageContext = async ({
|
|||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Compute requireMention early for preflight transcription gating
|
||||||
|
const activationOverride = resolveGroupActivation({
|
||||||
|
chatId,
|
||||||
|
messageThreadId: resolvedThreadId,
|
||||||
|
sessionKey: sessionKey,
|
||||||
|
agentId: route.agentId,
|
||||||
|
});
|
||||||
|
const baseRequireMention = resolveGroupRequireMention(chatId);
|
||||||
|
const requireMention = firstDefined(
|
||||||
|
activationOverride,
|
||||||
|
topicConfig?.requireMention,
|
||||||
|
groupConfig?.requireMention,
|
||||||
|
baseRequireMention,
|
||||||
|
);
|
||||||
|
|
||||||
const sendTyping = async () => {
|
const sendTyping = async () => {
|
||||||
await withTelegramApiErrorLogging({
|
await withTelegramApiErrorLogging({
|
||||||
operation: "sendChatAction",
|
operation: "sendChatAction",
|
||||||
@@ -370,6 +386,7 @@ export const buildTelegramMessageContext = async ({
|
|||||||
const locationText = locationData ? formatLocationText(locationData) : undefined;
|
const locationText = locationData ? formatLocationText(locationData) : undefined;
|
||||||
const rawTextSource = msg.text ?? msg.caption ?? "";
|
const rawTextSource = msg.text ?? msg.caption ?? "";
|
||||||
const rawText = expandTextLinks(rawTextSource, msg.entities ?? msg.caption_entities).trim();
|
const rawText = expandTextLinks(rawTextSource, msg.entities ?? msg.caption_entities).trim();
|
||||||
|
const hasUserText = Boolean(rawText || locationText);
|
||||||
let rawBody = [rawText, locationText].filter(Boolean).join("\n").trim();
|
let rawBody = [rawText, locationText].filter(Boolean).join("\n").trim();
|
||||||
if (!rawBody) {
|
if (!rawBody) {
|
||||||
rawBody = placeholder;
|
rawBody = placeholder;
|
||||||
@@ -386,6 +403,35 @@ export const buildTelegramMessageContext = async ({
|
|||||||
(ent) => ent.type === "mention",
|
(ent) => ent.type === "mention",
|
||||||
);
|
);
|
||||||
const explicitlyMentioned = botUsername ? hasBotMention(msg, botUsername) : false;
|
const explicitlyMentioned = botUsername ? hasBotMention(msg, botUsername) : false;
|
||||||
|
|
||||||
|
// Preflight audio transcription for mention detection in groups
|
||||||
|
// This allows voice notes to be checked for mentions before being dropped
|
||||||
|
let preflightTranscript: string | undefined;
|
||||||
|
const hasAudio = allMedia.some((media) => media.contentType?.startsWith("audio/"));
|
||||||
|
const needsPreflightTranscription =
|
||||||
|
isGroup && requireMention && hasAudio && !hasUserText && mentionRegexes.length > 0;
|
||||||
|
|
||||||
|
if (needsPreflightTranscription) {
|
||||||
|
try {
|
||||||
|
const { transcribeFirstAudio } = await import("../media-understanding/audio-preflight.js");
|
||||||
|
// Build a minimal context for transcription
|
||||||
|
const tempCtx: MsgContext = {
|
||||||
|
MediaPaths: allMedia.length > 0 ? allMedia.map((m) => m.path) : undefined,
|
||||||
|
MediaTypes:
|
||||||
|
allMedia.length > 0
|
||||||
|
? (allMedia.map((m) => m.contentType).filter(Boolean) as string[])
|
||||||
|
: undefined,
|
||||||
|
};
|
||||||
|
preflightTranscript = await transcribeFirstAudio({
|
||||||
|
ctx: tempCtx,
|
||||||
|
cfg,
|
||||||
|
agentDir: undefined,
|
||||||
|
});
|
||||||
|
} catch (err) {
|
||||||
|
logVerbose(`telegram: audio preflight transcription failed: ${String(err)}`);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
const computedWasMentioned = matchesMentionWithExplicit({
|
const computedWasMentioned = matchesMentionWithExplicit({
|
||||||
text: msg.text ?? msg.caption ?? "",
|
text: msg.text ?? msg.caption ?? "",
|
||||||
mentionRegexes,
|
mentionRegexes,
|
||||||
@@ -394,6 +440,7 @@ export const buildTelegramMessageContext = async ({
|
|||||||
isExplicitlyMentioned: explicitlyMentioned,
|
isExplicitlyMentioned: explicitlyMentioned,
|
||||||
canResolveExplicit: Boolean(botUsername),
|
canResolveExplicit: Boolean(botUsername),
|
||||||
},
|
},
|
||||||
|
transcript: preflightTranscript,
|
||||||
});
|
});
|
||||||
const wasMentioned = options?.forceWasMentioned === true ? true : computedWasMentioned;
|
const wasMentioned = options?.forceWasMentioned === true ? true : computedWasMentioned;
|
||||||
if (isGroup && commandGate.shouldBlock) {
|
if (isGroup && commandGate.shouldBlock) {
|
||||||
@@ -405,19 +452,6 @@ export const buildTelegramMessageContext = async ({
|
|||||||
});
|
});
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
const activationOverride = resolveGroupActivation({
|
|
||||||
chatId,
|
|
||||||
messageThreadId: resolvedThreadId,
|
|
||||||
sessionKey: sessionKey,
|
|
||||||
agentId: route.agentId,
|
|
||||||
});
|
|
||||||
const baseRequireMention = resolveGroupRequireMention(chatId);
|
|
||||||
const requireMention = firstDefined(
|
|
||||||
activationOverride,
|
|
||||||
topicConfig?.requireMention,
|
|
||||||
groupConfig?.requireMention,
|
|
||||||
baseRequireMention,
|
|
||||||
);
|
|
||||||
// Reply-chain detection: replying to a bot message acts like an implicit mention.
|
// Reply-chain detection: replying to a bot message acts like an implicit mention.
|
||||||
const botId = primaryCtx.me?.id;
|
const botId = primaryCtx.me?.id;
|
||||||
const replyFromId = msg.reply_to_message?.from?.id;
|
const replyFromId = msg.reply_to_message?.from?.id;
|
||||||
|
|||||||
Reference in New Issue
Block a user