> ## Documentation Index
> Fetch the complete documentation index at: https://docs.galileo.ai/llms.txt
> Use this file to discover all available pages before exploring further.

# SQL Correctness

> Evaluate whether generated SQL queries are syntactically valid and adhere to the provided database schema

export const MultiLabelConfusionMatrix = ({report, labelOrder, labelDisplayNames = {}, decimals = 4, maxWidth = 520, microNegativeLabel = "False", microPositiveLabel = "True", showPerLabelMatrices = true}) => {
  const toNum = v => {
    if (v == null) return undefined;
    const n = Number(v);
    return Number.isFinite(n) ? n : undefined;
  };
  const clamp01 = v => Math.max(0, Math.min(1, v));
  const sumVals = (obj, keys) => (keys || Object.keys(obj || ({}))).reduce((a, k) => a + (toNum(obj?.[k]) ?? 0), 0);
  const getLabels = (lo, pcs) => {
    if (Array.isArray(lo) && lo.length) return lo;
    if (pcs && typeof pcs === "object") return Object.keys(pcs);
    return [];
  };
  const deriveCM = ({precision, recall, positiveSupport, negativeSupport}) => {
    const P = toNum(positiveSupport), N = toNum(negativeSupport), prec = toNum(precision), rec = toNum(recall);
    if (P === undefined || N === undefined || prec === undefined || rec === undefined || P < 0 || N < 0) return null;
    const tp = clamp01(rec) * P, fn = P - tp;
    let fp = clamp01(prec) > 0 ? tp / clamp01(prec) - tp : 0;
    if (!Number.isFinite(fp) || fp < 0) fp = 0;
    if (fp > N) fp = N;
    const tn = N - fp;
    return {
      tnPct: N > 0 ? tn / N * 100 : 0,
      fpPct: N > 0 ? fp / N * 100 : 0,
      fnPct: P > 0 ? fn / P * 100 : 0,
      tpPct: P > 0 ? tp / P * 100 : 0
    };
  };
  const labels = getLabels(labelOrder, report?.per_class_support);
  const perSupport = report?.per_class_support || ({});
  const perNegSupport = report?.per_class_negative_support || ({});
  if (!report || labels.length === 0) {
    return <div style={{
      color: "red",
      padding: "1rem",
      border: "1px solid red"
    }}>
        MultiLabelConfusionMatrix: Missing or invalid report/labels.
      </div>;
  }
  const totalPositiveSupport = sumVals(perSupport, labels);
  const totalNegativeSupport = sumVals(perNegSupport, labels);
  const microMatrix = deriveCM({
    precision: report.micro_precision,
    recall: report.micro_recall,
    positiveSupport: totalPositiveSupport,
    negativeSupport: totalNegativeSupport
  });
  return <div>
      {microMatrix ? <BooleanConfusionMatrix actualPositiveLabel={microPositiveLabel} actualNegativeLabel={microNegativeLabel} predictedPositiveLabel={microPositiveLabel} predictedNegativeLabel={microNegativeLabel} matrix={{
    tp: {
      pct: microMatrix.tpPct
    },
    fn: {
      pct: microMatrix.fnPct
    },
    fp: {
      pct: microMatrix.fpPct
    },
    tn: {
      pct: microMatrix.tnPct
    }
  }} displayFormat="fraction" fractionDigits={decimals} maxWidth={maxWidth} titlePrefix="Micro-Averaged " /> : <div style={{
    color: "red",
    padding: "1rem",
    border: "1px solid red"
  }}>
          MultiLabelConfusionMatrix: Could not derive micro confusion matrix from report.
        </div>}

      {showPerLabelMatrices && <>
          <div style={{
    fontWeight: "600",
    fontSize: "0.95rem",
    margin: "1.25rem 0 0.5rem"
  }}>Per-label confusion matrices</div>
          <div style={{
    display: "grid",
    gridTemplateColumns: "repeat(auto-fit, minmax(280px, 1fr))",
    gap: "1.25rem"
  }}>
            {labels.map(label => {
    const labelName = labelDisplayNames?.[label] ?? label;
    const matrix = deriveCM({
      precision: report?.per_class_precision?.[label],
      recall: report?.per_class_recall?.[label],
      positiveSupport: perSupport?.[label],
      negativeSupport: perNegSupport?.[label]
    });
    const negativeLabel = `Not ${labelName}`;
    return <div key={label}>
                  <div style={{
      fontWeight: "600",
      fontSize: "0.875rem",
      marginBottom: "0.25rem"
    }}>{labelName}</div>
                  {matrix ? <BooleanConfusionMatrix actualPositiveLabel={labelName} actualNegativeLabel={negativeLabel} predictedPositiveLabel={labelName} predictedNegativeLabel={negativeLabel} matrix={{
      tp: {
        pct: matrix.tpPct
      },
      fn: {
        pct: matrix.fnPct
      },
      fp: {
        pct: matrix.fpPct
      },
      tn: {
        pct: matrix.tnPct
      }
    }} displayFormat="fraction" fractionDigits={decimals} maxWidth={maxWidth} /> : <div style={{
      color: "red",
      padding: "0.75rem",
      border: "1px solid red"
    }}>
                      Could not derive confusion matrix for label: <code>{label}</code>
                    </div>}
                </div>;
  })}
          </div>
        </>}
    </div>;
};

export const MultiLabelClassificationReport = ({report: reportProp, labelOrder: labelOrderProp, labelDisplayNames: labelDisplayNamesProp = {}, decimals = 4, maxWidth = 520, showConfusionMatrices = true, showPerLabelMatrices = true, showAverageRows = true}) => {
  const toNum = v => {
    if (v == null) return undefined;
    const n = Number(v);
    return Number.isFinite(n) ? n : undefined;
  };
  const getLabels = (lo, pcs) => {
    if (Array.isArray(lo) && lo.length) return lo;
    if (pcs && typeof pcs === "object") return Object.keys(pcs);
    return [];
  };
  const fmtMetric = (v, d) => {
    const n = toNum(v);
    if (n === undefined) return "—";
    return n.toFixed(Number.isFinite(Number(d)) ? Number(d) : 4);
  };
  let report, labelOrder, labelDisplayNames;
  try {
    report = typeof reportProp === 'string' ? JSON.parse(reportProp) : reportProp;
    labelOrder = typeof labelOrderProp === 'string' ? JSON.parse(labelOrderProp) : labelOrderProp;
    labelDisplayNames = typeof labelDisplayNamesProp === 'string' ? JSON.parse(labelDisplayNamesProp) : labelDisplayNamesProp;
  } catch (e) {
    return <div style={{
      color: "red",
      padding: "1rem",
      border: "1px solid red"
    }}>
        MultiLabelClassificationReport: JSON parse error - {e.message}
      </div>;
  }
  const labels = getLabels(labelOrder, report?.per_class_support);
  if (!report || labels.length === 0) {
    return <div style={{
      color: "red",
      padding: "1rem",
      border: "1px solid red"
    }}>
        MultiLabelClassificationReport: Missing or invalid report/labels.
      </div>;
  }
  const rowStyle = {
    borderBottom: "1px solid rgba(148, 163, 184, 0.3)"
  };
  const cellStyle = {
    padding: "0.5rem 0.125rem"
  };
  const centerCellStyle = {
    textAlign: "center",
    padding: "0.5rem 0.125rem"
  };
  const avgRowStyle = {
    ...rowStyle,
    background: "rgba(148, 163, 184, 0.08)",
    fontWeight: 600
  };
  return <div>
      <table style={{
    width: "auto",
    borderCollapse: "collapse",
    marginBottom: "1.25rem",
    fontSize: "0.875rem"
  }}>
        <thead>
          <tr style={{
    borderBottom: "2px solid rgba(148, 163, 184, 0.5)"
  }}>
            <th style={{
    textAlign: "center",
    padding: "0.5rem 0.125rem",
    fontWeight: "600"
  }}></th>
            <th style={{
    textAlign: "center",
    padding: "0.5rem 0.125rem",
    fontWeight: "600"
  }}>Precision</th>
            <th style={{
    textAlign: "center",
    padding: "0.5rem 0.125rem",
    fontWeight: "600"
  }}>Recall</th>
            <th style={{
    textAlign: "center",
    padding: "0.5rem 0.125rem",
    fontWeight: "600"
  }}>F1-Score</th>
          </tr>
        </thead>
        <tbody>
          {labels.map(label => {
    const labelName = labelDisplayNames?.[label] ?? label;
    return <tr key={label} style={rowStyle}>
                <td style={cellStyle}>{labelName}</td>
                <td style={centerCellStyle}>{fmtMetric(report?.per_class_precision?.[label], decimals)}</td>
                <td style={centerCellStyle}>{fmtMetric(report?.per_class_recall?.[label], decimals)}</td>
                <td style={centerCellStyle}>{fmtMetric(report?.per_class_f1?.[label], decimals)}</td>
              </tr>;
  })}

          {showAverageRows && <>
              <tr style={avgRowStyle}>
                <td style={cellStyle}>Micro avg</td>
                <td style={centerCellStyle}>{fmtMetric(report.micro_precision, decimals)}</td>
                <td style={centerCellStyle}>{fmtMetric(report.micro_recall, decimals)}</td>
                <td style={centerCellStyle}>{fmtMetric(report.micro_f1, decimals)}</td>
              </tr>
              <tr style={avgRowStyle}>
                <td style={cellStyle}>Macro avg</td>
                <td style={centerCellStyle}>{fmtMetric(report.macro_precision, decimals)}</td>
                <td style={centerCellStyle}>{fmtMetric(report.macro_recall, decimals)}</td>
                <td style={centerCellStyle}>{fmtMetric(report.macro_f1, decimals)}</td>
              </tr>
              <tr style={avgRowStyle}>
                <td style={cellStyle}>Weighted avg</td>
                <td style={centerCellStyle}>{fmtMetric(report.weighted_precision, decimals)}</td>
                <td style={centerCellStyle}>{fmtMetric(report.weighted_recall, decimals)}</td>
                <td style={centerCellStyle}>{fmtMetric(report.weighted_f1, decimals)}</td>
              </tr>
            </>}
        </tbody>
      </table>

      {showConfusionMatrices && <MultiLabelConfusionMatrix report={report} labelOrder={labels} labelDisplayNames={labelDisplayNames} decimals={decimals} maxWidth={maxWidth} showPerLabelMatrices={showPerLabelMatrices} />}
    </div>;
};

export const BooleanConfusionMatrix = ({actualNegativeLabel = "Not Advanced", actualPositiveLabel = "Advanced", predictedNegativeLabel = "Not Advanced", predictedPositiveLabel = "Advanced", tnCount, tnPct, fpCount, fpPct, fnCount, fnPct, tpCount, tpPct, matrix, maxWidth = 520, displayFormat = "percentage", fractionDigits = 3, percentageDigits = 1, titlePrefix = ""}) => {
  const parseNum = val => val !== undefined && val !== null ? Number(val) : undefined;
  const clampPct = pct => Math.max(0, Math.min(100, Number(pct) || 0));
  const formatValue = pct => {
    const p = clampPct(pct);
    if (displayFormat === "fraction") {
      const digits = Number.isFinite(Number(fractionDigits)) ? Number(fractionDigits) : 3;
      return (p / 100).toFixed(digits);
    }
    const digits = Number.isFinite(Number(percentageDigits)) ? Number(percentageDigits) : 1;
    return `${p.toFixed(digits)}%`;
  };
  const palette = ["#f8fafc", "#eff6ff", "#dbeafe", "#bfdbfe", "#93c5fd", "#60a5fa", "#3b82f6", "#2563eb", "#1d4ed8", "#1e40af"];
  const getBg = pct => {
    const p = clampPct(pct);
    const idx = p === 100 ? 9 : Math.floor(p / 10);
    return palette[idx];
  };
  const getColor = pct => clampPct(pct) >= 60 ? "#ffffff" : "#1e3a8a";
  const rawTn = parseNum(tnCount);
  const rawFp = parseNum(fpCount);
  const rawFn = parseNum(fnCount);
  const rawTp = parseNum(tpCount);
  const rawTnPct = parseNum(tnPct);
  const rawFpPct = parseNum(fpPct);
  const rawFnPct = parseNum(fnPct);
  const rawTpPct = parseNum(tpPct);
  const hasCounts = rawTn !== undefined && rawFp !== undefined && rawFn !== undefined && rawTp !== undefined;
  const hasPcts = rawTnPct !== undefined && rawFpPct !== undefined && rawFnPct !== undefined && rawTpPct !== undefined;
  let resolvedMatrix;
  let showCounts;
  if (matrix) {
    resolvedMatrix = matrix;
    showCounts = matrix.tn?.count !== undefined;
  } else if (hasCounts) {
    const actualNegTotal = rawTn + rawFp;
    const actualPosTotal = rawFn + rawTp;
    resolvedMatrix = {
      tn: {
        count: rawTn,
        pct: actualNegTotal > 0 ? rawTn / actualNegTotal * 100 : 0
      },
      fp: {
        count: rawFp,
        pct: actualNegTotal > 0 ? rawFp / actualNegTotal * 100 : 0
      },
      fn: {
        count: rawFn,
        pct: actualPosTotal > 0 ? rawFn / actualPosTotal * 100 : 0
      },
      tp: {
        count: rawTp,
        pct: actualPosTotal > 0 ? rawTp / actualPosTotal * 100 : 0
      }
    };
    showCounts = true;
  } else if (hasPcts) {
    resolvedMatrix = {
      tn: {
        pct: rawTnPct
      },
      fp: {
        pct: rawFpPct
      },
      fn: {
        pct: rawFnPct
      },
      tp: {
        pct: rawTpPct
      }
    };
    showCounts = false;
  } else {
    return <div style={{
      color: "red",
      padding: "1rem",
      border: "1px solid red"
    }}>BooleanConfusionMatrix: Provide either all counts or all percentages</div>;
  }
  const cellStyle = pct => ({
    background: getBg(pct),
    color: getColor(pct),
    padding: "1rem",
    textAlign: "center",
    borderRadius: "8px",
    aspectRatio: "1 / 1",
    width: "100%",
    display: "flex",
    flexDirection: "column",
    alignItems: "center",
    justifyContent: "center",
    border: "1px solid rgba(148, 163, 184, 0.35)"
  });
  const displayPredictedLabels = {
    left: predictedPositiveLabel,
    right: predictedNegativeLabel
  };
  const displayActualLabels = {
    top: actualPositiveLabel,
    bottom: actualNegativeLabel
  };
  const displayMatrix = {
    tl: resolvedMatrix.tp,
    tr: resolvedMatrix.fn,
    bl: resolvedMatrix.fp,
    br: resolvedMatrix.tn
  };
  return <div style={{
    maxWidth: maxWidth + "px",
    margin: "1rem 0"
  }}>
      <div style={{
    display: "grid",
    gridTemplateColumns: "auto auto 1fr 1fr",
    gridTemplateRows: "auto auto auto 1fr 1fr auto",
    gap: "2px"
  }}>
        {}
        <div></div>
        <div></div>
        <div style={{
    gridColumn: "3 / 5",
    textAlign: "center",
    padding: "0.5rem",
    fontWeight: "600",
    fontSize: "1rem"
  }}>
          {titlePrefix}Confusion Matrix (Normalized)
        </div>

        {}
        <div></div>
        <div></div>
        <div style={{
    gridColumn: "3 / 5",
    textAlign: "center",
    padding: "0.5rem",
    fontWeight: "600",
    fontSize: "0.875rem"
  }}>
          Predicted
        </div>

        {}
        <div></div>
        <div></div>
        <div style={{
    textAlign: "center",
    padding: "0.5rem",
    fontSize: "0.75rem",
    fontWeight: "500",
    display: "flex",
    alignItems: "center",
    justifyContent: "center"
  }}>{displayPredictedLabels.left}</div>
        <div style={{
    textAlign: "center",
    padding: "0.5rem",
    fontSize: "0.75rem",
    fontWeight: "500",
    display: "flex",
    alignItems: "center",
    justifyContent: "center"
  }}>{displayPredictedLabels.right}</div>

        {}
        <div style={{
    gridRow: "4 / 6",
    writingMode: "vertical-rl",
    transform: "rotate(180deg)",
    textAlign: "center",
    fontWeight: "600",
    fontSize: "0.875rem",
    padding: "0 0.5rem",
    display: "flex",
    alignItems: "center",
    justifyContent: "center"
  }}>
          Actual
        </div>
        <div style={{
    padding: "0.5rem",
    fontSize: "0.75rem",
    fontWeight: "500",
    display: "flex",
    alignItems: "center",
    justifyContent: "flex-end"
  }}>{displayActualLabels.top}</div>
        <div style={cellStyle(displayMatrix.tl.pct)}>
          {showCounts && <div style={{
    fontSize: "1.5rem",
    fontWeight: "700",
    lineHeight: 1
  }}>{displayMatrix.tl.count}</div>}
          <div style={{
    fontSize: showCounts ? "0.75rem" : "1rem",
    fontWeight: showCounts ? "400" : "700",
    opacity: showCounts ? 0.8 : 1
  }}>{formatValue(displayMatrix.tl.pct)}</div>
        </div>
        <div style={cellStyle(displayMatrix.tr.pct)}>
          {showCounts && <div style={{
    fontSize: "1.5rem",
    fontWeight: "700",
    lineHeight: 1
  }}>{displayMatrix.tr.count}</div>}
          <div style={{
    fontSize: showCounts ? "0.75rem" : "1rem",
    fontWeight: showCounts ? "400" : "700",
    opacity: showCounts ? 0.8 : 1
  }}>{formatValue(displayMatrix.tr.pct)}</div>
        </div>

        {}
        <div style={{
    padding: "0.5rem",
    fontSize: "0.75rem",
    fontWeight: "500",
    display: "flex",
    alignItems: "center",
    justifyContent: "flex-end"
  }}>{displayActualLabels.bottom}</div>
        <div style={cellStyle(displayMatrix.bl.pct)}>
          {showCounts && <div style={{
    fontSize: "1.5rem",
    fontWeight: "700",
    lineHeight: 1
  }}>{displayMatrix.bl.count}</div>}
          <div style={{
    fontSize: showCounts ? "0.75rem" : "1rem",
    fontWeight: showCounts ? "400" : "700",
    opacity: showCounts ? 0.8 : 1
  }}>{formatValue(displayMatrix.bl.pct)}</div>
        </div>
        <div style={cellStyle(displayMatrix.br.pct)}>
          {showCounts && <div style={{
    fontSize: "1.5rem",
    fontWeight: "700",
    lineHeight: 1
  }}>{displayMatrix.br.count}</div>}
          <div style={{
    fontSize: showCounts ? "0.75rem" : "1rem",
    fontWeight: showCounts ? "400" : "700",
    opacity: showCounts ? 0.8 : 1
  }}>{formatValue(displayMatrix.br.pct)}</div>
        </div>

        {}
        <div></div>
        <div></div>
        <div style={{
    gridColumn: "3 / 5",
    marginTop: "0.5rem",
    display: "flex",
    alignItems: "center",
    gap: "0.5rem"
  }}>
          <span style={{
    fontSize: "0.75rem",
    fontWeight: "500"
  }}>{displayFormat === "fraction" ? "0.0" : "0%"}</span>
          <div style={{
    display: "flex",
    flex: 1,
    height: "12px",
    borderRadius: "4px",
    overflow: "hidden",
    border: "1px solid rgba(148, 163, 184, 0.35)"
  }}>
            {palette.map((color, idx) => <div key={idx} style={{
    flex: 1,
    height: "100%",
    background: color
  }} />)}
          </div>
          <span style={{
    fontSize: "0.75rem",
    fontWeight: "500"
  }}>{displayFormat === "fraction" ? "1.0" : "100%"}</span>
        </div>
      </div>
    </div>;
};

export const DefinitionCard = ({children}) => {
  return <Card variant="secondary">
    <div style={{
    padding: '0.5rem',
    border: '5px solid var(--primary-light)',
    borderRadius: '0.5rem',
    fontSize: '1.3rem',
    lineHeight: '1.4',
    boxShadow: '0 0 10px 10px var(--primary-light)'
  }}>
        {children}
      </div>

</Card>;
};

<DefinitionCard>
  <strong>SQL Correctness</strong> assesses whether a generated SQL query is grammatically correct and grounded in the provided database schema.
</DefinitionCard>

## Metric definition

SQL Correctness — A multi-label metric that evaluates the generated SQL query on two distinct dimensions: syntactic correctness and schematic correctness.

* Type: Multi-label
* Possible labels:
  * `syntactic`: The query is syntactically valid for the specified SQL dialect.
  * `schematic`: The query only references tables, columns, and data types present in the input schema.

This metric is designed for Text-to-SQL workflows where you need to validate that generated SQL queries are both grammatically correct and properly grounded in the database schema information provided.

<Note>The `syntactic` label requires the SQL dialect to be provided. The `schematic` label requires schema information to be provided. Without these inputs, the corresponding labels cannot be evaluated.</Note>

The metric produces a list of labels indicating which checks passed:

| Output Labels                | Interpretation                                                |
| :--------------------------- | :------------------------------------------------------------ |
| `["syntactic", "schematic"]` | Query passes both validations—fully correct.                  |
| `["syntactic"]`              | Syntax is correct, but uses tables/columns not in the schema. |
| `["schematic"]`              | Adheres to schema, but has syntax errors for the dialect.     |
| `[]`                         | Query fails both syntactic and schematic validation.          |

## Calculation method

SQL Correctness is computed through a multi-step evaluation process:

<Steps>
  <Step title="Model Request">
    One or more evaluation requests are sent to an LLM evaluator to analyze the generated SQL query against the dialect and schema.
  </Step>

  <Step title="Prompt Engineering">
    A specialized chain-of-thought prompt guides the model to evaluate both syntactic correctness (grammar and dialect compliance) and schematic correctness (valid table/column references).
  </Step>

  <Step title="Evaluation Process">
    The evaluator analyzes the query and produces a detailed assessment of both dimensions, checking for grammar errors, dialect-specific function usage, schema grounding, and proper alias usage.
  </Step>

  <Step title="Label Assignment">
    Based on the evaluation, labels are assigned: `syntactic` is included if the syntax is correct, and `schematic` is included if the query adheres to the schema.
  </Step>
</Steps>

<Note>This metric is computed by prompting an LLM and may require multiple LLM calls to compute, which can impact usage and billing.</Note>

## Supported nodes

* LLM span

Required inputs:

* The generated SQL query (output)
* The SQL dialect (for `syntactic` label): PostgreSQL, MySQL, SQLite, Snowflake, BigQuery, T-SQL, Spark SQL, etc.
* Database/table schema information (for `schematic` label)

Optional inputs:

* Domain knowledge or hints

## What constitutes syntactic correctness (label "syntactic")

* The query is grammatically correct for the specific SQL dialect provided.
* The SQL query does not contain grammatical errors, misplaced keywords, or use functions not supported by the specified dialect.
* Syntax is evaluated independently of schema correctness.

## What constitutes schematic correctness (label "schematic")

* The SQL query uses only the exact table names, column names, and data types defined in the provided schema information.
* The query is grounded in the database information provided—no "hallucinated" table names, column names, or data types.
* In queries with table aliases (e.g., in complex JOINs), all column references are correctly prefixed with the appropriate alias.
* Data type comparisons and aggregation functions are appropriate for the column types.

## Failure cases

### Syntactic failures

* Grammatical errors or misplaced keywords in the SQL query.
* Use of functions or syntax not supported by the specified dialect.
* No SQL dialect provided in the input.

### Schematic failures

* Reference to table names, column names, or data types not present in the schema ("schematic hallucination").
* Missing or incorrect table alias prefixes for column references in complex queries.
* Comparing a column to a literal of an incompatible data type.
* Applying an aggregation function unsuitable for a column's data type.
* No schema information provided in the input.

## Example use cases

* Catching schema hallucinations where the LLM invents table or column names that don't exist in your database.
* Validating dialect-specific syntax before executing queries against production databases (e.g., catching MySQL syntax in a PostgreSQL environment).
* Pre-execution validation to prevent runtime errors from malformed SQL reaching your database.
* Regression testing SQL generation models after prompt or model updates to ensure schema grounding doesn't degrade.

**Example**: A data warehouse assistant generates a query referencing `customer_revenue` when the actual column in your schema is `total_revenue`. SQL Correctness catches this schema hallucination before the query fails at execution time, providing immediate feedback rather than a cryptic database error.

## Best practices

<CardGroup cols={2}>
  <Card title="Provide complete schema" icon="database">
    Include comprehensive schema information with table names, column names, and data types to enable accurate schematic validation.
  </Card>

  <Card title="Combine with other Text-to-SQL metrics" icon="layer-group">
    Use alongside SQL Adherence and SQL Efficiency for comprehensive validation of generated queries.
  </Card>

  <Card title="Include domain hints" icon="lightbulb">
    Provide domain knowledge or hints (e.g., "implements daylight savings refers to daylight\_savings = 'Yes'") for more contextual evaluation.
  </Card>

  <Card title="Iterate with CLHF" icon="graduation-cap">
    Use continuous learning via human feedback to improve the evaluator's accuracy for your specific database and domain.
  </Card>
</CardGroup>

## Performance Benchmarks

We evaluated SQL Correctness against human expert labels on an internal dataset of Text-to-SQL samples using top frontier models.

| Model                  | Macro F1 |
| :--------------------- | :------: |
| GPT-4.1                |   0.92   |
| GPT-4.1 Mini           |   0.92   |
| Gemini 3 Flash Preview |   0.89   |
| Claude Sonnet 4.5      |   0.93   |

### GPT-4.1 Multi-Label Classification Report

<MultiLabelClassificationReport
  report={{
micro_f1: 0.9211,
micro_precision: 0.8912,
micro_recall: 0.9532,
macro_f1: 0.9212,
macro_precision: 0.8917,
macro_recall: 0.9532,
weighted_f1: 0.9212,
weighted_precision: 0.8917,
weighted_recall: 0.9532,
per_class_f1: { schematic: 0.9279, syntactic: 0.9145 },
per_class_precision: { schematic: 0.9104, syntactic: 0.8731 },
per_class_recall: { schematic: 0.9462, syntactic: 0.9602 },
per_class_support: { schematic: 2340, syntactic: 2335 },
per_class_negative_support: { schematic: 2340, syntactic: 2345 },
}}
  labelOrder={["schematic", "syntactic"]}
  decimals={4}
  showConfusionMatrices={true}
  showPerLabelMatrices={false}
  showAverageRows={false}
/>

<Note>
  Benchmarks based on internal evaluation dataset. Performance may vary by use case.
</Note>

## Related Resources

If you would like to dive deeper or start implementing SQL Correctness, check out the following resources:

### Examples

* [SQL Correctness Examples](https://app.galileo.ai) - Log in and explore the "SQL Correctness" Log Stream in the "Preset Metric Examples" Project to see this metric in action.

### Related Concepts

* [SQL Adherence](/concepts/metrics/text2sql/sql-adherence)
* [SQL Efficiency](/concepts/metrics/text2sql/sql-efficiency)
* [SQL Injection](/concepts/metrics/text2sql/sql-injection)
