> ## Documentation Index
> Fetch the complete documentation index at: https://docs.galileo.ai/llms.txt
> Use this file to discover all available pages before exploring further.

# Agent Flow

> Learn how to measure the correctness and coherence of an agentic trajectory by validating it against user-specified natural language tests

export const BooleanClassificationReport = ({report, negativeLabel = "Not Advanced", positiveLabel = "Advanced", negativeClass = "False", positiveClass = "True", maxWidth = 520}) => {
  const parseReport = reportStr => {
    const lines = reportStr.trim().split('\n').filter(line => line.trim());
    const result = {
      classes: [],
      accuracy: null,
      macroAvg: null,
      weightedAvg: null,
      totalSupport: null
    };
    for (const line of lines) {
      const parts = line.trim().split(/\s+/);
      if (parts[0] === 'precision') continue;
      if (parts.length >= 5 && !['accuracy', 'macro', 'weighted'].includes(parts[0])) {
        result.classes.push({
          name: parts[0],
          precision: parseFloat(parts[1]),
          recall: parseFloat(parts[2]),
          f1: parseFloat(parts[3]),
          support: parseInt(parts[4], 10)
        });
      }
      if (parts[0] === 'accuracy') {
        result.accuracy = parseFloat(parts[1]);
        result.totalSupport = parseInt(parts[2], 10);
      }
      if (parts[0] === 'macro' && parts[1] === 'avg') {
        result.macroAvg = {
          precision: parseFloat(parts[2]),
          recall: parseFloat(parts[3]),
          f1: parseFloat(parts[4]),
          support: parseInt(parts[5], 10)
        };
      }
      if (parts[0] === 'weighted' && parts[1] === 'avg') {
        result.weightedAvg = {
          precision: parseFloat(parts[2]),
          recall: parseFloat(parts[3]),
          f1: parseFloat(parts[4]),
          support: parseInt(parts[5], 10)
        };
      }
    }
    return result;
  };
  const parsed = parseReport(report);
  if (parsed.classes.length < 2) {
    return <div style={{
      color: "red",
      padding: "1rem",
      border: "1px solid red"
    }}>BooleanClassificationReport: Could not parse report. Expected at least 2 classes.</div>;
  }
  const negClass = parsed.classes.find(c => c.name === negativeClass) || parsed.classes[0];
  const posClass = parsed.classes.find(c => c.name === positiveClass) || parsed.classes[1];
  const tnPlusFp = negClass.support;
  const tpPlusFn = posClass.support;
  const tn = Math.round(negClass.recall * tnPlusFp);
  const fp = tnPlusFp - tn;
  const tp = Math.round(posClass.recall * tpPlusFn);
  const fn = tpPlusFn - tp;
  const tnPct = tn / tnPlusFp * 100;
  const fpPct = fp / tnPlusFp * 100;
  const fnPct = fn / tpPlusFn * 100;
  const tpPct = tp / tpPlusFn * 100;
  const rowStyle = {
    borderBottom: "1px solid rgba(148, 163, 184, 0.3)"
  };
  const cellStyle = {
    padding: "0.5rem 0.125rem"
  };
  const centerCellStyle = {
    textAlign: "center",
    padding: "0.5rem 0.125rem"
  };
  return <div>
      {}
      <table style={{
    width: "auto",
    borderCollapse: "collapse",
    marginBottom: "1.5rem",
    fontSize: "0.875rem"
  }}>
        <thead>
          <tr style={{
    borderBottom: "2px solid rgba(148, 163, 184, 0.5)"
  }}>
            <th style={{
    textAlign: "center",
    padding: "0.5rem 0.125rem",
    fontWeight: "600"
  }}></th>
            <th style={{
    textAlign: "center",
    padding: "0.5rem 0.125rem",
    fontWeight: "600"
  }}>Precision</th>
            <th style={{
    textAlign: "center",
    padding: "0.5rem 0.125rem",
    fontWeight: "600"
  }}>Recall</th>
            <th style={{
    textAlign: "center",
    padding: "0.5rem 0.125rem",
    fontWeight: "600"
  }}>F1-Score</th>
          </tr>
        </thead>
        <tbody>
          {}
          <tr style={rowStyle}>
            <td style={cellStyle}>{negativeLabel}</td>
            <td style={centerCellStyle}>{negClass.precision.toFixed(2)}</td>
            <td style={centerCellStyle}>{negClass.recall.toFixed(2)}</td>
            <td style={centerCellStyle}>{negClass.f1.toFixed(2)}</td>
          </tr>
          <tr style={rowStyle}>
            <td style={cellStyle}>{positiveLabel}</td>
            <td style={centerCellStyle}>{posClass.precision.toFixed(2)}</td>
            <td style={centerCellStyle}>{posClass.recall.toFixed(2)}</td>
            <td style={centerCellStyle}>{posClass.f1.toFixed(2)}</td>
          </tr>
          
        </tbody>
      </table>

      {}
      <BooleanConfusionMatrix actualNegativeLabel={negativeLabel} actualPositiveLabel={positiveLabel} predictedNegativeLabel={negativeLabel} predictedPositiveLabel={positiveLabel} tnPct={tnPct.toString()} fpPct={fpPct.toString()} fnPct={fnPct.toString()} tpPct={tpPct.toString()} displayFormat="fraction" maxWidth={maxWidth} />
    </div>;
};

export const BooleanConfusionMatrix = ({actualNegativeLabel = "Not Advanced", actualPositiveLabel = "Advanced", predictedNegativeLabel = "Not Advanced", predictedPositiveLabel = "Advanced", tnCount, tnPct, fpCount, fpPct, fnCount, fnPct, tpCount, tpPct, matrix, maxWidth = 520, displayFormat = "percentage", fractionDigits = 3, percentageDigits = 1, titlePrefix = ""}) => {
  const parseNum = val => val !== undefined && val !== null ? Number(val) : undefined;
  const clampPct = pct => Math.max(0, Math.min(100, Number(pct) || 0));
  const formatValue = pct => {
    const p = clampPct(pct);
    if (displayFormat === "fraction") {
      const digits = Number.isFinite(Number(fractionDigits)) ? Number(fractionDigits) : 3;
      return (p / 100).toFixed(digits);
    }
    const digits = Number.isFinite(Number(percentageDigits)) ? Number(percentageDigits) : 1;
    return `${p.toFixed(digits)}%`;
  };
  const palette = ["#f8fafc", "#eff6ff", "#dbeafe", "#bfdbfe", "#93c5fd", "#60a5fa", "#3b82f6", "#2563eb", "#1d4ed8", "#1e40af"];
  const getBg = pct => {
    const p = clampPct(pct);
    const idx = p === 100 ? 9 : Math.floor(p / 10);
    return palette[idx];
  };
  const getColor = pct => clampPct(pct) >= 60 ? "#ffffff" : "#1e3a8a";
  const rawTn = parseNum(tnCount);
  const rawFp = parseNum(fpCount);
  const rawFn = parseNum(fnCount);
  const rawTp = parseNum(tpCount);
  const rawTnPct = parseNum(tnPct);
  const rawFpPct = parseNum(fpPct);
  const rawFnPct = parseNum(fnPct);
  const rawTpPct = parseNum(tpPct);
  const hasCounts = rawTn !== undefined && rawFp !== undefined && rawFn !== undefined && rawTp !== undefined;
  const hasPcts = rawTnPct !== undefined && rawFpPct !== undefined && rawFnPct !== undefined && rawTpPct !== undefined;
  let resolvedMatrix;
  let showCounts;
  if (matrix) {
    resolvedMatrix = matrix;
    showCounts = matrix.tn?.count !== undefined;
  } else if (hasCounts) {
    const actualNegTotal = rawTn + rawFp;
    const actualPosTotal = rawFn + rawTp;
    resolvedMatrix = {
      tn: {
        count: rawTn,
        pct: actualNegTotal > 0 ? rawTn / actualNegTotal * 100 : 0
      },
      fp: {
        count: rawFp,
        pct: actualNegTotal > 0 ? rawFp / actualNegTotal * 100 : 0
      },
      fn: {
        count: rawFn,
        pct: actualPosTotal > 0 ? rawFn / actualPosTotal * 100 : 0
      },
      tp: {
        count: rawTp,
        pct: actualPosTotal > 0 ? rawTp / actualPosTotal * 100 : 0
      }
    };
    showCounts = true;
  } else if (hasPcts) {
    resolvedMatrix = {
      tn: {
        pct: rawTnPct
      },
      fp: {
        pct: rawFpPct
      },
      fn: {
        pct: rawFnPct
      },
      tp: {
        pct: rawTpPct
      }
    };
    showCounts = false;
  } else {
    return <div style={{
      color: "red",
      padding: "1rem",
      border: "1px solid red"
    }}>BooleanConfusionMatrix: Provide either all counts or all percentages</div>;
  }
  const cellStyle = pct => ({
    background: getBg(pct),
    color: getColor(pct),
    padding: "1rem",
    textAlign: "center",
    borderRadius: "8px",
    aspectRatio: "1 / 1",
    width: "100%",
    display: "flex",
    flexDirection: "column",
    alignItems: "center",
    justifyContent: "center",
    border: "1px solid rgba(148, 163, 184, 0.35)"
  });
  const displayPredictedLabels = {
    left: predictedPositiveLabel,
    right: predictedNegativeLabel
  };
  const displayActualLabels = {
    top: actualPositiveLabel,
    bottom: actualNegativeLabel
  };
  const displayMatrix = {
    tl: resolvedMatrix.tp,
    tr: resolvedMatrix.fn,
    bl: resolvedMatrix.fp,
    br: resolvedMatrix.tn
  };
  return <div style={{
    maxWidth: maxWidth + "px",
    margin: "1rem 0"
  }}>
      <div style={{
    display: "grid",
    gridTemplateColumns: "auto auto 1fr 1fr",
    gridTemplateRows: "auto auto auto 1fr 1fr auto",
    gap: "2px"
  }}>
        {}
        <div></div>
        <div></div>
        <div style={{
    gridColumn: "3 / 5",
    textAlign: "center",
    padding: "0.5rem",
    fontWeight: "600",
    fontSize: "1rem"
  }}>
          {titlePrefix}Confusion Matrix (Normalized)
        </div>

        {}
        <div></div>
        <div></div>
        <div style={{
    gridColumn: "3 / 5",
    textAlign: "center",
    padding: "0.5rem",
    fontWeight: "600",
    fontSize: "0.875rem"
  }}>
          Predicted
        </div>

        {}
        <div></div>
        <div></div>
        <div style={{
    textAlign: "center",
    padding: "0.5rem",
    fontSize: "0.75rem",
    fontWeight: "500",
    display: "flex",
    alignItems: "center",
    justifyContent: "center"
  }}>{displayPredictedLabels.left}</div>
        <div style={{
    textAlign: "center",
    padding: "0.5rem",
    fontSize: "0.75rem",
    fontWeight: "500",
    display: "flex",
    alignItems: "center",
    justifyContent: "center"
  }}>{displayPredictedLabels.right}</div>

        {}
        <div style={{
    gridRow: "4 / 6",
    writingMode: "vertical-rl",
    transform: "rotate(180deg)",
    textAlign: "center",
    fontWeight: "600",
    fontSize: "0.875rem",
    padding: "0 0.5rem",
    display: "flex",
    alignItems: "center",
    justifyContent: "center"
  }}>
          Actual
        </div>
        <div style={{
    padding: "0.5rem",
    fontSize: "0.75rem",
    fontWeight: "500",
    display: "flex",
    alignItems: "center",
    justifyContent: "flex-end"
  }}>{displayActualLabels.top}</div>
        <div style={cellStyle(displayMatrix.tl.pct)}>
          {showCounts && <div style={{
    fontSize: "1.5rem",
    fontWeight: "700",
    lineHeight: 1
  }}>{displayMatrix.tl.count}</div>}
          <div style={{
    fontSize: showCounts ? "0.75rem" : "1rem",
    fontWeight: showCounts ? "400" : "700",
    opacity: showCounts ? 0.8 : 1
  }}>{formatValue(displayMatrix.tl.pct)}</div>
        </div>
        <div style={cellStyle(displayMatrix.tr.pct)}>
          {showCounts && <div style={{
    fontSize: "1.5rem",
    fontWeight: "700",
    lineHeight: 1
  }}>{displayMatrix.tr.count}</div>}
          <div style={{
    fontSize: showCounts ? "0.75rem" : "1rem",
    fontWeight: showCounts ? "400" : "700",
    opacity: showCounts ? 0.8 : 1
  }}>{formatValue(displayMatrix.tr.pct)}</div>
        </div>

        {}
        <div style={{
    padding: "0.5rem",
    fontSize: "0.75rem",
    fontWeight: "500",
    display: "flex",
    alignItems: "center",
    justifyContent: "flex-end"
  }}>{displayActualLabels.bottom}</div>
        <div style={cellStyle(displayMatrix.bl.pct)}>
          {showCounts && <div style={{
    fontSize: "1.5rem",
    fontWeight: "700",
    lineHeight: 1
  }}>{displayMatrix.bl.count}</div>}
          <div style={{
    fontSize: showCounts ? "0.75rem" : "1rem",
    fontWeight: showCounts ? "400" : "700",
    opacity: showCounts ? 0.8 : 1
  }}>{formatValue(displayMatrix.bl.pct)}</div>
        </div>
        <div style={cellStyle(displayMatrix.br.pct)}>
          {showCounts && <div style={{
    fontSize: "1.5rem",
    fontWeight: "700",
    lineHeight: 1
  }}>{displayMatrix.br.count}</div>}
          <div style={{
    fontSize: showCounts ? "0.75rem" : "1rem",
    fontWeight: showCounts ? "400" : "700",
    opacity: showCounts ? 0.8 : 1
  }}>{formatValue(displayMatrix.br.pct)}</div>
        </div>

        {}
        <div></div>
        <div></div>
        <div style={{
    gridColumn: "3 / 5",
    marginTop: "0.5rem",
    display: "flex",
    alignItems: "center",
    gap: "0.5rem"
  }}>
          <span style={{
    fontSize: "0.75rem",
    fontWeight: "500"
  }}>{displayFormat === "fraction" ? "0.0" : "0%"}</span>
          <div style={{
    display: "flex",
    flex: 1,
    height: "12px",
    borderRadius: "4px",
    overflow: "hidden",
    border: "1px solid rgba(148, 163, 184, 0.35)"
  }}>
            {palette.map((color, idx) => <div key={idx} style={{
    flex: 1,
    height: "100%",
    background: color
  }} />)}
          </div>
          <span style={{
    fontSize: "0.75rem",
    fontWeight: "500"
  }}>{displayFormat === "fraction" ? "1.0" : "100%"}</span>
        </div>
      </div>
    </div>;
};

export const Scale = ({low, mid, high, lowLabel = "Low", midLabel = "Mid", highLabel = "High", lowDescription, midDescription, highDescription, midColor = "yellow", inverted = false}) => {
  const lowColor = inverted ? "green" : "red";
  const highColor = inverted ? "red" : "green";
  const gradientId = inverted ? "greenToRed" : "redToGreen";
  return <div style={{
    display: 'flex',
    flexDirection: 'column',
    width: '100%'
  }}>
      <svg width="100%" height="30" style={{
    marginBottom: '8px'
  }}>
        <defs>
          <linearGradient id={gradientId} x1="0%" y1="0%" x2="100%" y2="0%">
            <stop offset="0%" stopColor={lowColor} />
            <stop offset="100%" stopColor={highColor} />
          </linearGradient>
        </defs>
        <rect width="100%" height="100%" fill={`url(#${gradientId})`} rx="4" ry="4" />
      </svg>

      <div style={{
    display: 'flex',
    justifyContent: 'space-between',
    width: '100%',
    marginBottom: '16px'
  }}>
        <p style={{
    margin: 0,
    fontSize: '12px'
  }}>{low}</p>
        {mid && <p style={{
    margin: 0,
    fontSize: '12px'
  }}>{mid}</p>}
        <p style={{
    margin: 0,
    fontSize: '12px'
  }}>{high}</p>
      </div>

      <div style={{
    display: 'flex',
    justifyContent: 'space-between',
    width: '100%'
  }}>
        <div style={{
    maxWidth: '40%'
  }}>
          <div style={{
    display: 'flex',
    alignItems: 'center',
    marginBottom: '4px'
  }}>
            <div style={{
    width: '12px',
    height: '12px',
    backgroundColor: lowColor,
    borderRadius: '50%',
    marginRight: '8px'
  }}></div>
            <p style={{
    margin: 0,
    fontWeight: 'bold',
    fontSize: '14px'
  }}>{lowLabel}</p>
          </div>
          {lowDescription && <p style={{
    margin: 0,
    fontSize: '14px',
    color: '#666',
    maxWidth: '250px',
    lineHeight: '1.4'
  }}>{lowDescription}</p>}
        </div>
        {mid && <div style={{
    maxWidth: '40%',
    textAlign: 'center'
  }}>
            <div style={{
    display: 'flex',
    alignItems: 'center',
    justifyContent: 'center',
    marginBottom: '4px'
  }}>
              <div style={{
    width: '12px',
    height: '12px',
    backgroundColor: midColor,
    borderRadius: '50%',
    marginRight: '8px'
  }}></div>
              <p style={{
    margin: 0,
    fontWeight: 'bold',
    fontSize: '14px'
  }}>{midLabel}</p>
            </div>
            {midDescription && <p style={{
    margin: 0,
    fontSize: '14px',
    color: '#666',
    maxWidth: '250px',
    textAlign: 'center',
    lineHeight: '1.4'
  }}>{midDescription}</p>}
          </div>}


        <div style={{
    maxWidth: '40%',
    textAlign: 'right'
  }}>
          <div style={{
    display: 'flex',
    alignItems: 'center',
    justifyContent: 'flex-end',
    marginBottom: '4px'
  }}>
            <p style={{
    margin: 0,
    fontWeight: 'bold',
    fontSize: '14px'
  }}>{highLabel}</p>
            <div style={{
    width: '12px',
    height: '12px',
    backgroundColor: highColor,
    borderRadius: '50%',
    marginLeft: '8px'
  }}></div>
          </div>
          {highDescription && <p style={{
    margin: 0,
    fontSize: '14px',
    color: '#666',
    maxWidth: '250px',
    marginLeft: 'auto',
    lineHeight: '1.4'
  }}>{highDescription}</p>}
        </div>
      </div>
    </div>;
};

export const MetricWhenToUse = ({description, useCases}) => {
  return <Card>
      <div style={{
    display: 'flex',
    alignItems: 'center',
    gap: '0.5rem',
    marginBottom: '0.75rem'
  }}>
        <div style={{
    fontSize: '1.25rem',
    color: 'var(--primary-color)'
  }}>
          <svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor" strokeWidth="2" strokeLinecap="round" strokeLinejoin="round">
            <path d="M12 22c5.523 0 10-4.477 10-10S17.523 2 12 2 2 6.477 2 12s4.477 10 10 10z" />
            <path d="m9 12 2 2 4-4" />
          </svg>
        </div>
        <h3 style={{
    margin: 0,
    fontSize: '1.25rem',
    fontWeight: '600'
  }}>When to Use This Metric</h3>
      </div>

      {description}

      {useCases != null && useCases.map((useCase, index) => <div key={index} style={{
    marginTop: "1rem",
    paddingTop: "0.75rem",
    borderTop: "1px solid rgba(209, 213, 219, 0.33)"
  }}>
          <strong>{useCase.title}</strong>{useCase.description ? `: ${useCase.description}` : ''}
        </div>)}
    </Card>;
};

export const DefinitionCard = ({children}) => {
  return <Card variant="secondary">
    <div style={{
    padding: '0.5rem',
    border: '5px solid var(--primary-light)',
    borderRadius: '0.5rem',
    fontSize: '1.3rem',
    lineHeight: '1.4',
    boxShadow: '0 0 10px 10px var(--primary-light)'
  }}>
        {children}
      </div>

</Card>;
};

<DefinitionCard>
  <strong>Agent Flow</strong> is a binary metric that checks if an agent's behavior satisfies all user-defined natural language conditions.
</DefinitionCard>

Agent Flow is a binary evaluation metric that measures the correctness and coherence of an agentic trajectory by validating it against user-specified natural language tests.
A trajectory is said to pass the Agent Flow metric if and only if all the user-defined natural language conditions are successfully satisfied by the agent's realized behavior or output.

<Note>
  To use this metric, you will need to create a copy and edit the prompt to provide your natural language tests.
</Note>

This is a **boolean** metric, returning a confidence score that the agent flow satisfies all conditions. The score ranges from 0% (no confidence the agent flow satisfies all conditions) to 100% (complete confidence that the agent flow satisfies all conditions).

## Agent Flow at a glance

| Property                       | Description                                    |
| :----------------------------- | :--------------------------------------------- |
| **Name**                       | Agent Flow                                     |
| **Category**                   | Agentic AI                                     |
| **Can be applied to**          | Session                                        |
| **LLM-as-a-judge Support**     | ✅                                              |
| **Luna Support**               | ❌                                              |
| **Protect Runtime Protection** | ❌                                              |
| **Value Type**                 | Boolean shown as a percentage confidence score |

## When to use this metric

<MetricWhenToUse
  description="Agent Flow is useful when measuring multi-agent systems that have well-defined paths or interactions"
  useCases={[
{
  title: "Agents with multiple possible paths",
  description: "Agent Flow can evaluate an agentic application that has multiple possible paths where you know the expected behavior for each user response. You can validate that the agent performs the expected behavior."
},
{
  title: "Agents with specific intention views",
  description: "Agent flow can validate specific interaction rules. For example, ensuring the agent asks for confirmation before completing a purchase."
},
{
  title: "Agents with unconditional behaviors",
  description: "Agent flow can check for unconditional behaviors, such as verifying that the agent always calls the authentication tool during a conversation."
}
]}
/>

## Score interpretation

**Expected Score:** 80%-100%.

<Scale low="0" mid="60%" high="100%" lowLabel="Poor" midLabel="Fair" highLabel="Excellent" />

## Configure Agent Flow

This metric needs to be manually customized to include your own natural language tests.

<Steps>
  <Step title="Create a copy of the Agent Flow metric">
    From the **Metrics Hub**, select the **Agent Flow** metric. You will get a popup asking you to duplicate the metric. Select **Duplicate metric** to create a copy.

    <img src="https://mintcdn.com/v2galileo/0tSw9xJ93-LLfoKu/concepts/metrics/agentic/configure-agent-flow.webp?fit=max&auto=format&n=0tSw9xJ93-LLfoKu&q=85&s=d3fe9c806f61753fb82b1d5600eebaaa" alt="The agent flow metric with the duplicate metric popup" width="2312" height="1616" data-path="concepts/metrics/agentic/configure-agent-flow.webp" />
  </Step>

  <Step title="Locate the user defined tests section">
    Locate the user defined tests section in the prompt.

    ```xml theme={null}
    <user-defined-tests>
    {{ Add your tests here }}
    </user-defined-tests>
    ```
  </Step>

  <Step title="Customize the prompt by adding your user-defined tests">
    This prompt needs to be customized based on your application, and the inputs and outputs you are expecting. Replace `{{ Add your tests here }}` with a numbered list of tests in natural language that can be used to evaluate the agent efficiency. This can include:

    * Expected tool or agent calls, using the tool or agent names
    * Conditions on tool or agent calling (e.g. if tool x is called, don't call agent y)
    * Expectations around the input or output parameters to tools and agents
    * Limitations on the number of tool or agent calls

    For example, imagine you were creating an agent to provide advice on exercises for different body parts, such as for a physical therapy application. This has multiple tools, including `list_by_target_muscle_for_exercised`, `list_by_body_part_for_exercised`, `list_of_bodyparts_for_exercised`. Some user tests might be:

    ```output wrap theme={null}
    1. If a call to "list_by_target_muscle_for_exercised" returns an error that contains the text "target not found", the agent should subsequently attempt an alternative lookup by calling either "list_by_body_part_for_exercised" or "list_of_bodyparts_for_exercised"
    2. When the user asks for exercises that target leg muscles, the agent must call at least one of the tools ["list_by_target_muscle_for_exercised", "list_by_body_part_for_exercised"] during the conversation
    3. After receiving a successful response from "list_by_body_part_for_exercised", the agent's following natural-language message must contain at least one exercise name, the corresponding equipment, and an animated demonstration URL taken from the tool output
    4. Every invocation of the tool "list_by_body_part_for_exercised" must include the required parameter "bodypart"
    5. After receiving data from list_by_body_part_for_exercised, the agent response must include the exercise id for every exercise it presents to the user
    6. No assistant message should include more than one tool invocation
    7. The agent should conclude the conversation with a human-readable answer that summarizes the requested leg exercises using data returned from the tools
    ```
  </Step>

  <Step title="Save the metric">
    Save the metric, then turn it on for your Log Stream.
  </Step>
</Steps>

## Best practices

Trajectory tests are similar to unit tests for the agents trajectory, to check if certain conditions are followed during the agents path.

You should write all the tests in a numbered list. For example:

```md theme={null}
1. If X happens then ask the user Y and call tool Z.
2. X tool is always called before Y tool.
3. When user asks X reply with Y
4. The tool Y should be called once in the conversation.
```

Each test should check for one single condition only. Tests should be logically consistent, and well defined.

## Performance Benchmarks

We evaluated Agent Flow against human expert labels on an internal dataset of agentic conversation samples using top frontier models.

| Model                   | F1 (True) |
| :---------------------- | :-------: |
| GPT-4.1                 |    0.93   |
| GPT-4.1-mini (judges=3) |    0.92   |
| Claude Sonnet 4.5       |    0.95   |
| Gemini 3 Flash          |    0.92   |

### GPT-4.1 Classification Report

<BooleanClassificationReport
  report={`            precision    recall  f1-score   support

False       0.9452    0.9200    0.9324        75
True       0.9221    0.9467    0.9342        75

accuracy                          0.9333       150
macro avg     0.9336    0.9333    0.9333       150
weighted avg  0.9336    0.9333    0.9333       150`}
  negativeLabel="False"
  positiveLabel="True"
  negativeClass="False"
  positiveClass="True"
/>

<Note>
  Benchmarks based on internal evaluation dataset. Performance may vary by use case.
</Note>

## Related Resources

If you would like to dive deeper or start implementing Agent Flow, check out the following resources:

### How-to guides

* [Agentic AI Basic Example](/how-to-guides/agentic-ai/basic-example)

### Related Concepts

* [Agentic Metrics Overview](/concepts/metrics/agentic/agentic-overview)
* [Action Advancement](/concepts/metrics/agentic/action-advancement)
* [Action Completion](/concepts/metrics/agentic/action-completion)
