ResponsibleAI - Input Moderation - Simplifying the Options for OpenAI Moderation.

This commit is contained in:
vinodkiran
2023-11-22 19:19:59 +05:30
parent ad8281e553
commit 619fb4f5c1
2 changed files with 5 additions and 144 deletions
@@ -3,19 +3,7 @@ import { BaseLanguageModel } from 'langchain/base_language'
import { OpenAIModerationChain } from 'langchain/chains'
export class OpenAIModerationRunner implements Moderation {
private moderationConfig: string = 'useDefault'
private moderationErrorMessage: string = "Text was found that violates OpenAI's content policy."
private sexual: number = 0.01
private sexualMinors: number = 0.01
private hate: number = 0.01
private hateThreatening: number = 0.01
private harassment: number = 0.01
private harassmentThreatening: number = 0.01
private selfHarm: number = 0.01
private selfHarmIntent: number = 0.01
private selfHarmInstructions: number = 0.01
private violence: number = 0.01
private violenceGraphic: number = 0.01
async checkForViolations(llm: BaseLanguageModel, input: string): Promise<string> {
const openAIApiKey = (llm as any).openAIApiKey
@@ -31,32 +19,13 @@ export class OpenAIModerationRunner implements Moderation {
const { output: moderationOutput, results } = await moderation.call({
input: input
})
if (this.moderationConfig != 'useCustom' && results[0].flagged) {
if (results[0].flagged) {
throw Error(this.moderationErrorMessage)
}
if (this.moderationConfig != 'useDefault') {
const categoryScores = results[0].category_scores
if (
categoryScores['harassment'] > this.harassment ||
categoryScores['harassment/threatening'] > this.harassmentThreatening ||
categoryScores['self-harm'] > this.selfHarm ||
categoryScores['self-harm/intent'] > this.selfHarmIntent ||
categoryScores['self-harm/instructions'] > this.selfHarmInstructions ||
categoryScores['sexual'] > this.sexual ||
categoryScores['sexual/minors'] > this.sexualMinors ||
categoryScores['hate'] > this.hate ||
categoryScores['hate/threatening'] > this.hateThreatening ||
categoryScores['violence'] > this.violence ||
categoryScores['violence/graphic'] > this.violenceGraphic
) {
throw Error(this.moderationErrorMessage)
}
}
return moderationOutput
}
setParameter(category: string, value: number) {
// @ts-ignore
this[category] = value
/**
 * Overrides the message carried by the Error that this runner throws
 * when the moderation result is flagged.
 *
 * @param message - Text stored in `moderationErrorMessage`, replacing the current value.
 */
setErrorMessage(message: string) {
this.moderationErrorMessage = message
}
}