mirror of
https://github.com/farcasclaudiu/openclaw.git
synced 2026-06-29 01:02:03 +03:00
fix(media): strip audio attachments after successful transcription (openclaw#9076) thanks @nobrainer-tech
Verified:
- pnpm install --frozen-lockfile
- pnpm build
- pnpm check
- pnpm test (fails in known unrelated telegram suite)
- pnpm vitest run src/auto-reply/media-note.test.ts src/auto-reply/reply.media-note.test.ts

Co-authored-by: nobrainer-tech <445466+nobrainer-tech@users.noreply.github.com>
This commit is contained in:
committed by
GitHub
parent
a6003d6711
commit
7081dee1af
@@ -106,4 +106,93 @@ describe("buildInboundMediaNote", () => {
|
|||||||
});
|
});
|
||||||
expect(note).toBe("[media attached: /tmp/b.png | https://example.com/b.png]");
|
expect(note).toBe("[media attached: /tmp/b.png | https://example.com/b.png]");
|
||||||
});
|
});
|
||||||
|
|
||||||
|
it("strips audio attachments when transcription succeeded via MediaUnderstanding (issue #4197)", () => {
  // Attachment 0 (voice note) carries an audio.transcription output;
  // attachment 1 (image) has no understanding output at all.
  const expected =
    "[media attached: /tmp/image.png (image/png) | https://example.com/image.png]";

  const result = buildInboundMediaNote({
    MediaPaths: ["/tmp/voice.ogg", "/tmp/image.png"],
    MediaUrls: ["https://example.com/voice.ogg", "https://example.com/image.png"],
    MediaTypes: ["audio/ogg", "image/png"],
    MediaUnderstanding: [
      {
        kind: "audio.transcription",
        attachmentIndex: 0,
        text: "Hello world",
        provider: "whisper",
      },
    ],
  });

  // The transcribed voice note is dropped; only the image line is left.
  expect(result).toBe(expected);
});
|
||||||
|
|
||||||
|
it("only strips audio attachments that were transcribed", () => {
  // Two voice notes arrive, but only index 0 has an audio.transcription output.
  const expected =
    "[media attached: /tmp/voice-2.ogg (audio/ogg) | https://example.com/voice-2.ogg]";

  const result = buildInboundMediaNote({
    MediaPaths: ["/tmp/voice-1.ogg", "/tmp/voice-2.ogg"],
    MediaUrls: ["https://example.com/voice-1.ogg", "https://example.com/voice-2.ogg"],
    MediaTypes: ["audio/ogg", "audio/ogg"],
    MediaUnderstanding: [
      {
        kind: "audio.transcription",
        attachmentIndex: 0,
        text: "First transcript",
        provider: "whisper",
      },
    ],
  });

  // The second, untranscribed voice note must still be listed.
  expect(result).toBe(expected);
});
|
||||||
|
|
||||||
|
it("strips audio attachments when Transcript is present (issue #4197)", () => {
  // A single audio attachment plus a non-empty Transcript, with no MediaUnderstanding entries.
  const result = buildInboundMediaNote({
    MediaPaths: ["/tmp/voice.opus"],
    MediaTypes: ["audio/opus"],
    Transcript: "Hello world from Whisper",
  });

  // With the only attachment stripped there is nothing left to report.
  expect(result).toBeUndefined();
});
|
||||||
|
|
||||||
|
it("does not strip multiple audio attachments using transcript-only fallback", () => {
  // A bare Transcript cannot be tied to one attachment, so both voice notes stay listed.
  const expectedLines = [
    "[media attached: 2 files]",
    "[media attached 1/2: /tmp/voice-1.ogg (audio/ogg)]",
    "[media attached 2/2: /tmp/voice-2.ogg (audio/ogg)]",
  ];

  const result = buildInboundMediaNote({
    MediaPaths: ["/tmp/voice-1.ogg", "/tmp/voice-2.ogg"],
    MediaTypes: ["audio/ogg", "audio/ogg"],
    Transcript: "Transcript text without per-attachment mapping",
  });

  expect(result).toBe(expectedLines.join("\n"));
});
|
||||||
|
|
||||||
|
it("strips audio by extension even without mime type (issue #4197)", () => {
  // No MediaTypes are supplied here, so the voice message can only be
  // recognised as audio from its ".ogg" file extension.
  const expected = "[media attached: /tmp/document.pdf]";

  const result = buildInboundMediaNote({
    MediaPaths: ["/tmp/voice_message.ogg", "/tmp/document.pdf"],
    MediaUnderstanding: [
      {
        kind: "audio.transcription",
        attachmentIndex: 0,
        text: "Transcribed audio content",
        provider: "whisper",
      },
    ],
  });

  // The transcribed audio is gone; the PDF is the only remaining attachment.
  expect(result).toBe(expected);
});
|
||||||
|
|
||||||
|
it("keeps audio attachments when no transcription available", () => {
  // Neither MediaUnderstanding nor Transcript is provided.
  const expected = "[media attached: /tmp/voice.ogg (audio/ogg)]";

  const result = buildInboundMediaNote({
    MediaPaths: ["/tmp/voice.ogg"],
    MediaTypes: ["audio/ogg"],
  });

  // Without a transcript the audio attachment is kept as the fallback.
  expect(result).toBe(expected);
});
|
||||||
});
|
});
|
||||||
|
|||||||
@@ -17,12 +17,45 @@ function formatMediaAttachedLine(params: {
|
|||||||
return `${prefix}${params.path}${typePart}${urlPart}]`;
|
return `${prefix}${params.path}${typePart}${urlPart}]`;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Common audio file extensions used for transcription detection.
 * Entries are lower-case and include the leading dot.
 */
const AUDIO_EXTENSIONS: ReadonlySet<string> = new Set([
  ".ogg",
  ".opus",
  ".mp3",
  ".m4a",
  ".wav",
  ".webm",
  ".flac",
  ".aac",
  ".wma",
  ".aiff",
  ".alac",
  ".oga",
]);

/**
 * Reports whether a path ends in one of the known audio file extensions.
 *
 * The check is case-insensitive and considers only the text from the last `.`
 * to the end of the string, so a dot inside a directory name
 * (e.g. `/tmp/dir.ogg/file`) does not count as an extension.
 *
 * @param path - Attachment path to classify; may be `undefined` or empty.
 * @returns `true` when the final extension is listed in `AUDIO_EXTENSIONS`,
 *   `false` otherwise (including for missing/empty paths and paths without a dot).
 */
function isAudioPath(path: string | undefined): boolean {
  if (!path) {
    return false;
  }
  const lower = path.toLowerCase();
  const dotIndex = lower.lastIndexOf(".");
  if (dotIndex === -1) {
    // No extension at all.
    return false;
  }
  // Single set lookup instead of scanning every known extension with endsWith.
  return AUDIO_EXTENSIONS.has(lower.slice(dotIndex));
}
|
||||||
|
|
||||||
export function buildInboundMediaNote(ctx: MsgContext): string | undefined {
|
export function buildInboundMediaNote(ctx: MsgContext): string | undefined {
|
||||||
// Attachment indices follow MediaPaths/MediaUrls ordering as supplied by the channel.
|
// Attachment indices follow MediaPaths/MediaUrls ordering as supplied by the channel.
|
||||||
const suppressed = new Set<number>();
|
const suppressed = new Set<number>();
|
||||||
|
const transcribedAudioIndices = new Set<number>();
|
||||||
if (Array.isArray(ctx.MediaUnderstanding)) {
|
if (Array.isArray(ctx.MediaUnderstanding)) {
|
||||||
for (const output of ctx.MediaUnderstanding) {
|
for (const output of ctx.MediaUnderstanding) {
|
||||||
suppressed.add(output.attachmentIndex);
|
suppressed.add(output.attachmentIndex);
|
||||||
|
if (output.kind === "audio.transcription") {
|
||||||
|
transcribedAudioIndices.add(output.attachmentIndex);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (Array.isArray(ctx.MediaUnderstandingDecisions)) {
|
if (Array.isArray(ctx.MediaUnderstandingDecisions)) {
|
||||||
@@ -33,6 +66,9 @@ export function buildInboundMediaNote(ctx: MsgContext): string | undefined {
|
|||||||
for (const attachment of decision.attachments) {
|
for (const attachment of decision.attachments) {
|
||||||
if (attachment.chosen?.outcome === "success") {
|
if (attachment.chosen?.outcome === "success") {
|
||||||
suppressed.add(attachment.attachmentIndex);
|
suppressed.add(attachment.attachmentIndex);
|
||||||
|
if (decision.capability === "audio") {
|
||||||
|
transcribedAudioIndices.add(attachment.attachmentIndex);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -56,6 +92,10 @@ export function buildInboundMediaNote(ctx: MsgContext): string | undefined {
|
|||||||
Array.isArray(ctx.MediaTypes) && ctx.MediaTypes.length === paths.length
|
Array.isArray(ctx.MediaTypes) && ctx.MediaTypes.length === paths.length
|
||||||
? ctx.MediaTypes
|
? ctx.MediaTypes
|
||||||
: undefined;
|
: undefined;
|
||||||
|
const hasTranscript = Boolean(ctx.Transcript?.trim());
|
||||||
|
// Transcript alone does not identify an attachment index; only use it as a fallback
|
||||||
|
// when there is a single attachment to avoid stripping unrelated audio files.
|
||||||
|
const canStripSingleAttachmentByTranscript = hasTranscript && paths.length === 1;
|
||||||
|
|
||||||
const entries = paths
|
const entries = paths
|
||||||
.map((entry, index) => ({
|
.map((entry, index) => ({
|
||||||
@@ -64,7 +104,28 @@ export function buildInboundMediaNote(ctx: MsgContext): string | undefined {
|
|||||||
url: urls?.[index] ?? ctx.MediaUrl,
|
url: urls?.[index] ?? ctx.MediaUrl,
|
||||||
index,
|
index,
|
||||||
}))
|
}))
|
||||||
.filter((entry) => !suppressed.has(entry.index));
|
.filter((entry) => {
|
||||||
|
if (suppressed.has(entry.index)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
// Strip audio attachments when transcription succeeded - the transcript is already
|
||||||
|
// available in the context, raw audio binary would only waste tokens (issue #4197)
|
||||||
|
// Note: Only trust MIME type from per-entry types array, not fallback ctx.MediaType
|
||||||
|
// which could misclassify non-audio attachments (greptile review feedback)
|
||||||
|
const hasPerEntryType = types !== undefined;
|
||||||
|
const isAudioByMime = hasPerEntryType && entry.type?.toLowerCase().startsWith("audio/");
|
||||||
|
const isAudioEntry = isAudioPath(entry.path) || isAudioByMime;
|
||||||
|
if (!isAudioEntry) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
if (
|
||||||
|
transcribedAudioIndices.has(entry.index) ||
|
||||||
|
(canStripSingleAttachmentByTranscript && entry.index === 0)
|
||||||
|
) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
});
|
||||||
if (entries.length === 0) {
|
if (entries.length === 0) {
|
||||||
return undefined;
|
return undefined;
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user