35.4 Escalation logic and confidence thresholds
This section explains how to turn classifier confidence scores into routing decisions — auto-reply, draft for review, or human escalation — and how to calibrate the thresholds that drive those decisions.
On this page
Understanding Confidence Scores
Confidence scores determine what action to take. But LLM confidence isn't always well-calibrated—a model might say 0.95 when it should say 0.7.
Key insight: Treat confidence as a signal for routing, not as a measure of correctness.
| Confidence | Action | Rationale |
|---|---|---|
| ≥ 0.9 | Auto-reply (if safe category) | High confidence + safe topic = minimal risk |
| 0.7 - 0.9 | Draft for review | Probably right, but human should verify |
| 0.5 - 0.7 | Route to human with context | Too uncertain for automation |
| < 0.5 | Route to human, flag for review | Model is confused—investigate why |
// confidence-based-routing.ts
/** Per-category thresholds that control how far automation may go. */
interface RoutingConfig {
  /** Confidence at or above which an auto-reply is permitted. */
  autoReplyThreshold: number;
  /** Confidence at or above which a draft is generated for human review. */
  draftThreshold: number;
  /** Hard switch — when false, auto-reply is never allowed for this category. */
  allowAutoReply: boolean;
}

// FIX: the original declared this as a bare `Record`, which does not compile
// ("Generic type 'Record<K, T>' requires 2 type argument(s)"). Typing it as
// Record<string, RoutingConfig> also gives excess-property checking on each entry.
const ROUTING_CONFIGS: Record<string, RoutingConfig> = {
  // Safe, frequent queries - allow full automation
  SHIPPING_STATUS: {
    autoReplyThreshold: 0.9,
    draftThreshold: 0.7,
    allowAutoReply: true,
  },
  BILLING_INVOICE: {
    autoReplyThreshold: 0.9,
    draftThreshold: 0.7,
    allowAutoReply: true,
  },
  TECHNICAL_LOGIN: {
    autoReplyThreshold: 0.9,
    draftThreshold: 0.7,
    allowAutoReply: true, // Password reset is safe
  },
  // Risky or complex - never auto-reply
  BILLING_REFUND: {
    autoReplyThreshold: 1.1, // Never auto-reply (impossible threshold)
    draftThreshold: 0.7,
    allowAutoReply: false,
  },
  TECHNICAL_BUG: {
    autoReplyThreshold: 1.1,
    draftThreshold: 0.8, // Higher bar for tech issues
    allowAutoReply: false,
  },
  OTHER: {
    autoReplyThreshold: 1.1,
    draftThreshold: 1.1, // Always route to human
    allowAutoReply: false,
  },
};
Calibrating Thresholds
Don't guess at thresholds—calibrate them with real data:
// threshold-calibration.ts
/** Metrics for one candidate confidence threshold. */
interface CalibrationResult {
  threshold: number;
  precision: number; // Of predictions at/above the threshold, fraction that are correct
  recall: number;    // Fraction of the WHOLE test set correctly handled at this threshold
  volume: number;    // How many tickets meet this threshold
}

/**
 * Sweep candidate thresholds over a labelled test set and report
 * precision / recall / volume for each, so thresholds are chosen from
 * data rather than guessed.
 *
 * Fixes vs. the original:
 * - return type was a bare `Promise` (missing `<CalibrationResult[]>`),
 *   which does not compile;
 * - an empty test set produced NaN recall (0/0); now reports 0.
 */
async function calibrateThresholds(
  testSet: Array<{ email: string, correctCategory: string }>,
  classifier: EmailClassifier
): Promise<CalibrationResult[]> {
  // Classify the whole test set in parallel.
  const results = await Promise.all(
    testSet.map(async (t) => ({
      predicted: await classifier.classify(t.email),
      actual: t.correctCategory
    }))
  );
  const thresholds = [0.5, 0.6, 0.7, 0.8, 0.9, 0.95];
  return thresholds.map(threshold => {
    const atOrAboveThreshold = results.filter(r => r.predicted.confidence >= threshold);
    const correct = atOrAboveThreshold.filter(r => r.predicted.category === r.actual);
    return {
      threshold,
      // Define both metrics as 0 when their denominator is empty,
      // instead of NaN, so downstream comparisons stay sane.
      precision: atOrAboveThreshold.length > 0
        ? correct.length / atOrAboveThreshold.length
        : 0,
      recall: results.length > 0 ? correct.length / results.length : 0,
      volume: atOrAboveThreshold.length
    };
  });
}
// Example output:
// threshold | precision | recall | volume
// 0.9 | 0.98 | 0.45 | 450 ← High precision, misses many
// 0.7 | 0.92 | 0.78 | 780 ← Good balance
// 0.5 | 0.85 | 0.95 | 950 ← Catches most, more errors
Launch with high thresholds (0.9+) and only auto-reply for very safe categories. As you gather data and confirm accuracy, gradually lower thresholds to automate more.
Hard Escalation Triggers
Some situations should always escalate, regardless of confidence:
// escalation-triggers.ts
/** Hard triggers that escalate a ticket to a human regardless of classifier confidence. */
const ESCALATION_RULES = {
  // Keywords that indicate legal/serious issues
  keywords: {
    legal: ['lawyer', 'sue', 'lawsuit', 'attorney', 'legal action', 'court'],
    safety: ['injury', 'hurt', 'hospital', 'allergic', 'poisoned', 'dangerous'],
    fraud: ['fraud', 'scam', 'stolen', 'identity theft', 'unauthorized'],
    media: ['journalist', 'news', 'reporter', 'viral', 'twitter', 'going public'],
    regulatory: ['fda', 'ftc', 'consumer protection', 'bbb', 'report to'],
  },
  // Sentiment-based
  sentiment: {
    angry: true, // Always escalate angry customers
    frustrated: false,
  },
  // Value-based
  orderValue: {
    immediate: 1000, // Escalate immediately if > $1000
    priority: 500, // Higher priority if > $500
  },
  // Repeat contact (customer has contacted multiple times)
  repeatContact: {
    threshold: 3, // Escalate if 3+ contacts in 7 days
    windowDays: 7,
  }
};

/** Outcome of the hard-trigger scan for one email. */
interface EscalationCheck {
  shouldEscalate: boolean;
  reason: string;
  priority: 'normal' | 'high' | 'urgent' | 'immediate';
}

/** Escape regex metacharacters so a keyword can be embedded safely in a RegExp. */
function escapeKeyword(s: string): string {
  return s.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
}

/**
 * Scan an email for hard escalation triggers: sensitive keywords,
 * customer sentiment, and repeat-contact frequency. Returns the first
 * trigger hit (keywords take precedence, then sentiment, then repeats).
 *
 * FIX: the original matched keywords with substring `includes`, so
 * "sue" fired inside "issue" and "news" inside "newsletter", causing
 * false escalations. Keywords now match on word boundaries.
 */
function checkEscalationTriggers(
  email: string,
  extractedData: ExtractedTicketData,
  customerHistory: CustomerHistory
): EscalationCheck {
  const emailLower = email.toLowerCase();
  // Check legal/sensitive keywords (whole-word match).
  for (const [category, keywords] of Object.entries(ESCALATION_RULES.keywords)) {
    const matched = keywords.find(kw =>
      new RegExp(`\\b${escapeKeyword(kw)}\\b`).test(emailLower)
    );
    if (matched) {
      return {
        shouldEscalate: true,
        reason: `Legal/sensitive keyword detected: "${matched}" (${category})`,
        priority: category === 'legal' || category === 'safety' ? 'immediate' : 'urgent'
      };
    }
  }
  // Check sentiment — consult the rule table instead of hard-coding 'angry',
  // so ESCALATION_RULES stays the single source of truth. Only 'angry' is
  // enabled, so runtime behavior is unchanged.
  const sentimentFlag =
    ESCALATION_RULES.sentiment[extractedData.sentiment as keyof typeof ESCALATION_RULES.sentiment];
  if (sentimentFlag) {
    return {
      shouldEscalate: true,
      reason: `Customer sentiment detected as ${extractedData.sentiment}`,
      priority: 'high'
    };
  }
  // Check repeat contact within the configured window.
  const recentContacts = customerHistory.contactsInLastDays(
    ESCALATION_RULES.repeatContact.windowDays
  );
  if (recentContacts >= ESCALATION_RULES.repeatContact.threshold) {
    return {
      shouldEscalate: true,
      reason: `Repeat contact: ${recentContacts} contacts in last ${ESCALATION_RULES.repeatContact.windowDays} days`,
      priority: 'high'
    };
  }
  // No trigger fired — normal routing applies.
  return {
    shouldEscalate: false,
    reason: '',
    priority: 'normal'
  };
}
Full Escalation Logic
// escalation-logic.ts
/** Final routing decision for one inbound ticket. */
interface TicketDecision {
  action: 'auto_reply' | 'draft' | 'assign_to_human';
  queue: string;
  priority: string;
  response?: GeneratedResponse;      // present for auto_reply and draft
  escalationReason?: string;         // why automation bailed out, when it did
  humanContext?: string;             // AI summary shown to the human agent
}

/**
 * End-to-end ticket triage: classify, extract, apply hard escalation
 * triggers, then route by per-category confidence thresholds.
 *
 * Fixes vs. the original:
 * - return type was a bare `Promise` (missing `<TicketDecision>`), a compile error;
 * - `queue` was read from `config.category`, but `RoutingConfig` has no
 *   `category` field (it was always `undefined`); use the classifier's category;
 * - classification and extraction are independent calls, so run them in parallel.
 */
export async function processTicket(
  email: string,
  classifier: EmailClassifier,
  extractor: TicketDataExtractor,
  responseGenerator: SupportResponseGenerator,
  customerHistory: CustomerHistory
): Promise<TicketDecision> {
  // Steps 1 + 2: classify and extract concurrently — neither depends on the other.
  const [classification, extractedData] = await Promise.all([
    classifier.classify(email),
    extractor.extract(email),
  ]);
  // Step 3: hard triggers override everything else.
  const escalationCheck = checkEscalationTriggers(email, extractedData, customerHistory);
  if (escalationCheck.shouldEscalate) {
    return {
      action: 'assign_to_human',
      queue: 'escalation',
      priority: escalationCheck.priority,
      escalationReason: escalationCheck.reason,
      humanContext: buildHumanContext(classification, extractedData)
    };
  }
  // Step 4: get routing config for this category (unknown categories fall back to OTHER).
  const config = ROUTING_CONFIGS[classification.category] || ROUTING_CONFIGS.OTHER;
  // Step 5: apply confidence-based routing.
  if (config.allowAutoReply && classification.confidence >= config.autoReplyThreshold) {
    const response = await responseGenerator.generateResponse({
      email,
      classification,
      extractedData
    });
    return {
      action: 'auto_reply',
      queue: classification.category, // FIX: was config.category (field does not exist)
      priority: 'low',
      response
    };
  }
  if (classification.confidence >= config.draftThreshold) {
    const response = await responseGenerator.generateResponse({
      email,
      classification,
      extractedData
    });
    return {
      action: 'draft',
      queue: classification.category, // FIX: was config.category (field does not exist)
      priority: 'medium',
      humanContext: `AI suggests category: ${classification.category} (${Math.round(classification.confidence * 100)}% confidence)`,
      response
    };
  }
  // Step 6: low confidence — pure human handling with full AI context attached.
  return {
    action: 'assign_to_human',
    queue: getQueueForCategory(classification.category),
    priority: extractedData.urgency,
    humanContext: buildHumanContext(classification, extractedData),
    escalationReason: `Low confidence: ${Math.round(classification.confidence * 100)}%`
  };
}
/**
 * Render a short, human-readable summary of the AI's analysis for the
 * agent who picks up the ticket.
 */
function buildHumanContext(
  classification: ClassificationResult,
  extractedData: ExtractedTicketData
): string {
  const confidencePct = Math.round(classification.confidence * 100);
  const summaryLines = [
    '**AI Analysis:**',
    `- Likely category: ${classification.category} (${confidencePct}%)`,
    `- Reasoning: ${classification.reasoning}`,
    '**Extracted Data:**',
    `- Order IDs: ${extractedData.order_ids.join(', ') || 'None'}`,
    `- Sentiment: ${extractedData.sentiment}`,
    `- Urgency: ${extractedData.urgency}`,
    `- Issue: ${extractedData.issue_description}`,
  ];
  return summaryLines.join('\n');
}
The Human Handoff
When escalating to a human agent, provide maximum context:
// human-handoff.ts
/**
 * Everything a human agent receives when a ticket is escalated.
 * The goal is maximum context so the agent can act without re-triaging.
 */
interface HandoffPacket {
  ticketId: string;
  email: string; // original customer email, verbatim
  // AI analysis
  suggestedCategory: string;
  confidence: number; // classifier confidence in suggestedCategory (0-1)
  extractedData: ExtractedTicketData;
  suggestedResponse?: string; // present only when a draft was generated
  // Escalation context
  escalationReason: string;
  priority: string;
  // Customer context
  customerHistory: {
    totalTickets: number;
    recentTickets: number;
    lifetimeValue: number; // NOTE(review): presumably a currency amount — confirm unit
    vipStatus: boolean;
  };
  // Action guidance
  suggestedActions: string[]; // concrete next steps for the agent
}

/**
 * Build the packet handed to a human agent.
 *
 * FIX: the original declared no named parameters yet referenced free
 * variables `originalEmail` and `extractedData`, which do not compile.
 * They are now explicit parameters. The remaining fields stay hard-coded
 * illustrative example values, as in the original.
 */
function buildHandoffPacket(
  originalEmail: string,
  extractedData: HandoffPacket['extractedData']
): HandoffPacket {
  return {
    ticketId: 'TKT-12345',
    email: originalEmail,
    suggestedCategory: 'BILLING_REFUND',
    confidence: 0.65,
    extractedData,
    escalationReason: 'Customer threatened legal action',
    priority: 'urgent',
    customerHistory: {
      totalTickets: 5,
      recentTickets: 3,
      lifetimeValue: 1250,
      vipStatus: true
    },
    suggestedActions: [
      'Review refund request - order is within 30-day window',
      'Customer is VIP - consider goodwill gesture',
      'De-escalation needed - customer mentioned lawyer'
    ]
  };
}
Think of the AI as a triage nurse. It handles the scrapes and bruises (order status queries), but calls the doctor for the heart attacks (legal threats, angry VIPs). The goal is appropriate routing, not replacing humans entirely.