Precision	Recall	F1-Score
{negativeLabel}	{negClass.precision.toFixed(2)}	{negClass.recall.toFixed(2)}	{negClass.f1.toFixed(2)}
{positiveLabel}	{posClass.precision.toFixed(2)}	{posClass.recall.toFixed(2)}	{posClass.f1.toFixed(2)}

Precision

Recall

F1-Score

{negativeLabel}

{negClass.precision.toFixed(2)}

{negClass.recall.toFixed(2)}

{negClass.f1.toFixed(2)}

{positiveLabel}

{posClass.precision.toFixed(2)}

{posClass.recall.toFixed(2)}

{posClass.f1.toFixed(2)}

{}

{titlePrefix}Confusion Matrix (Normalized)

{}

Predicted

{}

{displayPredictedLabels.left}

{displayPredictedLabels.right}

{}

Actual

{displayActualLabels.top}

{showCounts &&

{displayMatrix.tl.count}

}

{formatValue(displayMatrix.tl.pct)}

{showCounts &&

{displayMatrix.tr.count}

}

{formatValue(displayMatrix.tr.pct)}

{}

{displayActualLabels.bottom}

{showCounts &&

{displayMatrix.bl.count}

}

{formatValue(displayMatrix.bl.pct)}

{showCounts &&

{displayMatrix.br.count}

}

{formatValue(displayMatrix.br.pct)}

{}

{displayFormat === "fraction" ? "0.0" : "0%"}

{palette.map((color, idx) =>

)}

{displayFormat === "fraction" ? "1.0" : "100%"}

; }; export const Scale = ({low, mid, high, lowLabel = "Low", midLabel = "Mid", highLabel = "High", lowDescription, midDescription, highDescription, midColor = "yellow", inverted = false}) => { const lowColor = inverted ? "green" : "red"; const highColor = inverted ? "red" : "green"; const gradientId = inverted ? "greenToRed" : "redToGreen"; return

{low}

{mid &&

{mid}

}

{high}

{lowLabel}

{lowDescription &&

{lowDescription}

}

{mid &&

{midLabel}

{midDescription &&

{midDescription}

}

{highLabel}

{highDescription &&

{highDescription}

}

; }; export const MetricWhenToUse = ({description, useCases}) => { return

When to Use This Metric

{description} {useCases != null && useCases.map((useCase, index) =>

{useCase.title}{useCase.description ? `: ${useCase.description}` : ''}

)} ; }; export const DefinitionCard = ({children}) => { return

{children}

; };

Conversation Quality is a binary metric that assesses whether a chatbot interaction left the user feeling satisfied and positive or frustrated and dissatisfied, based on tone, engagement, and overall experience.

The Conversation Quality metric evaluates user satisfaction across an entire chatbot session by analyzing tone, engagement, and sentiment. It classifies each conversation as GOOD or BAD depending on whether the user’s overall experience reflects positive engagement or frustration directed at the bot. The metric focuses on conversational flow rather than task success, emphasizing how naturally and politely the user and bot interact. It excludes non-textual or purely action-based agent outputs (e.g., button clicks).

This is a **boolean** metric, returning a confidence score that the conversation quality is good. The score ranges from 0% (no confidence the conversation quality is good) to 100% (complete confidence that the conversation quality is good).

## Conversation Quality at a glance

| Property                       | Description                                    |
| :----------------------------- | :--------------------------------------------- |
| **Name**                       | Conversation Quality                           |
| **Category**                   | Agentic AI                                     |
| **Can be applied to**          | Session                                        |
| **LLM-as-a-judge Support**     | ✅                                             |
| **Luna Support**               | ❌                                             |
| **Protect Runtime Protection** | ❌                                             |
| **Value Type**                 | Boolean shown as a percentage confidence score |

## When to use this metric

## Score interpretation

**Expected Score:** 80%-100%.
## How to improve Conversation Quality scores

Some techniques to improve Conversation Quality scores are:

* Ensure bots provide clear, empathetic, and concise responses
* Detect and mitigate repeated clarification loops
* Train models to de-escalate external frustration effectively
* Log complete sessions to allow accurate tone assessment

Common issues that can cause low scores are:

* Mislabeling external frustration as bot-directed
* Incomplete logs
* Abrupt session truncation

## Performance Benchmarks

We evaluated Conversation Quality against human expert labels on an internal dataset of agentic conversation samples using top frontier models.

| Model                   | F1 (True) |
| :---------------------- | :-------: |
| GPT-4.1                 |   0.89    |
| GPT-4.1-mini (judges=3) |   0.85    |
| Claude Sonnet 4.5       |   0.85    |
| Gemini 3 Flash          |   0.88    |

### GPT-4.1 Classification Report

Benchmarks based on internal evaluation dataset. Performance may vary by use case.

## Related Resources

If you would like to dive deeper or start implementing Conversation Quality, check out the following resources:

### Examples

* [Conversation Quality Examples](https://app.galileo.ai) - Log in and explore the "Conversation Quality" Log Stream in the "Preset Metric Examples" Project to see this metric in action.

### How-to guides

* [Agentic AI Basic Example](/how-to-guides/agentic-ai/basic-example)
* [Creating Custom Metrics](/how-to-guides/metrics/create-local-metric/create-local-metric)

### Related Concepts

* [Agentic Metrics Overview](/concepts/metrics/agentic/agentic-overview)
* [Action Completion](/concepts/metrics/agentic/action-completion)
* [Action Advancement](/concepts/metrics/agentic/action-advancement)