
Experiment

Object-centric interface for Galileo experiments. An experiment represents a systematic evaluation framework for running controlled tests on datasets to measure and compare AI model performance.

Important Notes

Two-Phase Execution: Experiments are created in two phases:
  1. Create the experiment metadata (name, dataset, optional prompt)
  2. Run the experiment by creating a job that executes on the dataset
This allows you to set up the experiment structure before execution.

Prompt Settings Hierarchy: When running an experiment with a prompt template, the prompt_settings parameter passed to run() completely overrides any settings stored in the prompt template itself. The Runners service uses ONLY the settings provided at job creation time. If you don’t provide prompt_settings to run(), default values will be used. To use the template’s settings, retrieve them first using get_prompt_template_settings() and pass them explicitly.

Experiment Immutability: Once an experiment has been run and has traces, it cannot be run again. To re-run with the same configuration, create a new experiment with a different name. This keeps experiment results comparable and auditable.

Dataset Requirements: While dataset is optional during creation, it is required when running the experiment with either a prompt template or a function.

Examples
# Prompt-based experiment
experiment = Experiment(
    name="ml-expert-evaluation",
    dataset_name="ml-knowledge-dataset",
    prompt_name="ml-expert-v1",
    metrics=["correctness", "completeness"],
    project_name="My AI Project"
)
experiment.create()

# Function-based / generated-output experiment (no prompt required)
experiment = Experiment(
    name="otel-trace-eval",
    dataset_name="trace-dataset",
    metrics=["correctness"],
    project_name="My AI Project"
)
experiment.create()

# Check results after the run completes
experiment.refresh()
metrics = experiment.aggregate_metrics
print(f"Average correctness: {metrics['average_correctness']}")

# Re-run with a different name
experiment2 = Experiment(
    name=f"{experiment.name}-rerun-1",
    dataset_name=experiment.dataset_name,
    prompt_name=experiment.prompt_name,
    metrics=experiment.metrics,
    project_name=experiment.project_name,
).create()

add_tag

def add_tag(self, key: str, value: str, tag_type: str='generic') -> None
Add a tag to this experiment. Tags can be used to categorize, filter, and organize experiments. Common use cases include environment labels, version tracking, and team ownership.

Arguments
  • key (str): Tag key (e.g., “environment”, “version”, “team”)
  • value (str): Tag value (e.g., “production”, “1.0.0”, “ml-team”)
  • tag_type (str): Tag category, defaults to “generic”. Other options: “rag”
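
Examples

A minimal usage sketch based on the argument descriptions above (the key/value pairs are illustrative):
experiment.add_tag("environment", "production")
experiment.add_tag("version", "1.0.0")
experiment.add_tag("team", "ml-team")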

aggregate_metrics

def aggregate_metrics(self) -> dict[str, float] | None
Get computed aggregate metrics for this experiment. Returns aggregate metrics like average_cost, average_latency, total_responses, and quality metrics (e.g., average_factuality, average_correctness). Note: Call refresh() first to get the latest metric values after experiment completion.

Examples
experiment = Experiment.get(name="ml-evaluation", project_name="My Project")
experiment.refresh()

metrics = experiment.aggregate_metrics
if metrics:
    print(f"Average cost: ${metrics.get('average_cost', 0):.4f}")
    print(f"Total responses: {metrics.get('total_responses', 0)}")
    print(f"Average latency: {metrics.get('average_latency', 0):.2f}ms")

    # Quality metrics (if configured)
    if 'average_correctness' in metrics:
        print(f"Average correctness: {metrics['average_correctness']:.2%}")

create

def create(self) -> Experiment
Persist this experiment to the API.

Examples
experiment = Experiment(
    name="ml-evaluation",
    dataset_name="ml-dataset",
    project_name="My AI Project"
).create()
assert experiment.is_synced()

dataset

def dataset(self) -> Dataset | None
Get the dataset associated with this experiment.

delete

def delete(self) -> None
Delete this experiment. This is a destructive operation that permanently removes the experiment and all associated data (traces, spans, metrics, results) from the API. WARNING: This operation cannot be undone! After successful deletion, the object state is set to DELETED. The local object still exists in memory but no longer represents a remote resource.

Examples
# Delete an experiment
experiment = Experiment.get(
    name="old-experiment",
    project_name="My AI Project"
)
experiment.delete()
assert experiment.is_deleted()

# After deletion, the experiment no longer exists remotely
# The local object is marked as DELETED
print(experiment.sync_state)  # SyncState.DELETED

experiment_columns

def experiment_columns(self) -> ColumnCollection
Get available metric columns for this experiment. Returns a :class:~galileo.shared.column.ColumnCollection of all columns available in the experiment comparison table. Scorer-backed metric columns carry UUID-based IDs of the form "metrics/{scorer-uuid}", which map directly to the keys returned by :attr:metric_aggregates.

export_records

def export_records(self,
                   record_type: RecordType=RecordType.TRACE,
                   filters: builtins.list[FilterType] | None=None,
                   sort: LogRecordsSortClause=LogRecordsSortClause(column_id='created_at', ascending=False),
                   export_format: LLMExportFormat=LLMExportFormat.JSONL,
                   column_ids: builtins.list[str] | None=None,
                   redact: bool=True) -> Iterator[dict[str, Any]]
Export records from this experiment.

Arguments
  • record_type: The type of records to export (SPAN, TRACE, or SESSION).
  • filters: A list of filters to apply to the export.
  • sort: A sort clause to order the exported records.
  • export_format: The desired format for the exported data.
  • column_ids: A list of column IDs to include in the export.
  • redact: Redact sensitive data from the response.
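
Examples

A minimal sketch using the defaults (TRACE records, JSONL format). The signature documents the return type as an iterator of dicts; the per-record keys are not documented here, so each record is printed whole:
for record in experiment.export_records():
    print(record)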

get

def get(cls,
        *,
        name: str,
        project_id: str | None=None,
        project_name: str | None=None) -> Experiment | None
Get an existing experiment by name.

Arguments
  • name (str): The experiment name.
  • project_id (Optional[str]): The project ID. If neither project_id nor project_name is provided, falls back to GALILEO_PROJECT_ID or GALILEO_PROJECT environment variables.
  • project_name (Optional[str]): The project name. If neither project_id nor project_name is provided, falls back to GALILEO_PROJECT environment variable.

get_metric_aggregate

def get_metric_aggregate(self,
                         metric: GalileoMetrics | str) -> MetricAggregates | None
Return aggregate statistics for a specific metric. Looks up a metric by any of the following identifiers, tried in order:
  1. :class:~galileo.schema.metrics.GalileoMetrics enum value — its value IS the human-readable label (e.g. the value of GalileoMetrics.correctness is "Correctness").
  2. Scorer UUID string — direct lookup in :attr:metric_aggregates, no column resolution needed.
  3. Human-readable label string (e.g. "Correctness") — resolved via :attr:experiment_columns.
  4. Legacy metric_key_alias string (e.g. "correctness") — fallback after label matching fails.
Returns None if :attr:metric_aggregates is not yet populated (metrics still computing) or the metric is not found.

Arguments
  • metric: Any of: a :class:GalileoMetrics enum value, scorer UUID string, human-readable label, or legacy metric_key_alias.
Returns
  • MetricAggregates | None: Aggregate stats with avg, min_, max_, p50, p90, p95, p99, count, and value_distribution fields; or None if not available.
Examples
Poll until a specific metric is computed, then assert::

    import time

    from galileo.schema.metrics import GalileoMetrics

    while experiment.get_metric_aggregate(GalileoMetrics.correctness) is None:
        time.sleep(5)
        experiment.refresh()

    agg = experiment.get_metric_aggregate(GalileoMetrics.correctness)
    assert agg.avg >= 0.95

get_prompt_template_settings

def get_prompt_template_settings(self) -> PromptRunSettings | None
Get the settings from the associated prompt template. WARNING: These settings are NOT automatically used when running the experiment. The Runners service ignores template settings and only uses the prompt_settings passed to the run() method. Use this method to retrieve template settings if you want to apply them to the job.

Examples
experiment = Experiment(
    name="ml-evaluation",
    prompt_name="ml-prompt",
    dataset_name="ml-dataset",
    project_name="My Project"
).create()

# Get settings from template
template_settings = experiment.get_prompt_template_settings()

# Note: Current run() doesn't accept prompt_settings parameter
# This would require updating the run() signature

get_sessions

def get_sessions(self,
                 filters: builtins.list[FilterType] | None=None,
                 sort: LogRecordsSortClause | None=None,
                 limit: int=100,
                 starting_token: int=0) -> QueryResult
Query sessions in this experiment. This is a convenience method that queries for sessions specifically.

Arguments
  • filters: A list of filters to apply to the query.
  • sort: A sort clause to order the query results.
  • limit: The maximum number of records to return.
  • starting_token: The token for the next page of results.

get_spans

def get_spans(self,
              filters: builtins.list[FilterType] | None=None,
              sort: LogRecordsSortClause | None=None,
              limit: int=100,
              starting_token: int=0) -> QueryResult
Query spans in this experiment. This is a convenience method that queries for spans specifically.

Arguments
  • filters: A list of filters to apply to the query.
  • sort: A sort clause to order the query results.
  • limit: The maximum number of records to return.
  • starting_token: The token for the next page of results.

get_status

def get_status(self) -> ExperimentStatusInfo
Get the current status of this experiment in human-readable format.

Examples
experiment = Experiment.get(name="ml-evaluation", project_name="My AI Project")
status = experiment.get_status()

print(status)  # Human-readable status
print(f"Progress: {status.overall_progress}%")

if status.is_complete:
    print("Experiment completed!")
elif status.is_in_progress:
    print(f"Running: {status.log_generation}")

get_traces

def get_traces(self,
               filters: builtins.list[FilterType] | None=None,
               sort: LogRecordsSortClause | None=None,
               limit: int=100,
               starting_token: int=0) -> QueryResult
Query traces in this experiment. This is a convenience method that queries for traces specifically.

Arguments
  • filters: A list of filters to apply to the query.
  • sort: A sort clause to order the query results.
  • limit: The maximum number of records to return.
  • starting_token: The token for the next page of results.
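
Examples

A minimal sketch using only documented parameters; the shape of the returned QueryResult is not documented on this page, so the result is printed whole:
traces = experiment.get_traces(limit=10)
print(traces)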

has_traces

def has_traces(self) -> bool
Check if this experiment has any traces. Experiments with traces cannot have new jobs created on them. To re-run an experiment, create a new experiment with a different name.

Examples
experiment = Experiment.get(name="ml-evaluation", project_name="My Project")
if experiment.has_traces():
    print("This experiment has already been run")
    # Create a new one for re-run
    new_exp = Experiment(
        name=f"{experiment.name}-rerun-1",
        dataset_name=experiment.dataset_name,
        prompt_name=experiment.prompt_name,
        project_name=experiment.project_name
    ).create()

is_winner

def is_winner(self) -> bool
Check if this experiment is marked as the winner. The winner is the best-performing experiment in a set of comparisons, typically the one with rank=1 and the highest ranking score.

Examples
experiments = Experiment.list(project_name="My Project")
winner = next((exp for exp in experiments if exp.is_winner), None)

if winner:
    print(f"Best experiment: {winner.name}")
    print(f"Score: {winner.ranking_score}")

list

def list(cls,
         *,
         project_id: str | None=None,
         project_name: str | None=None) -> list[Experiment]
List all experiments for a project.

Arguments
  • project_id (Optional[str]): The project ID. If neither project_id nor project_name is provided, falls back to GALILEO_PROJECT_ID or GALILEO_PROJECT environment variables.
  • project_name (Optional[str]): The project name. If neither project_id nor project_name is provided, falls back to GALILEO_PROJECT environment variable.
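
Examples

A minimal listing sketch using attributes shown elsewhere on this page:
for exp in Experiment.list(project_name="My Project"):
    print(f"#{exp.rank}: {exp.name}")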

metric_aggregates

def metric_aggregates(self) -> dict[str, MetricAggregates] | None
Get structured aggregate metrics for this experiment, keyed by metric identifier. Returns full statistical aggregates (avg, min, max, sum, count, p50, p90, p95, p99, value_distribution) for each metric.

Key types

  • UUID keys (36-char strings, e.g. "550e8400-e29b-41d4-a716-446655440000") — scorer-backed metrics. The UUID matches column.id.removeprefix("metrics/") for the corresponding entry in :attr:experiment_columns.
  • Raw-string keys (e.g. "cost", "duration_ns") — system metrics computed without a scorer. These do not appear in :attr:experiment_columns.

Resolving UUIDs to human labels

Use :attr:experiment_columns to look up the display label and legacy metric name for each UUID:
cols = experiment.experiment_columns
for metric_id, agg in (experiment.metric_aggregates or {}).items():
    col = cols.get(f"metrics/{metric_id}")   # None for system metrics
    label = col.label if col else metric_id  # fall back to raw key
    print(f"{label}: avg={agg.avg:.3f}")
The MetricAggregates object exposes: avg, min_, max_, sum_, count, pct, p50, p90, p95, p99, value_distribution. For boolean metrics, value_distribution holds {"0": count_false, "1": count_true}. Note: Call :meth:refresh first to get the latest values after experiment completion.

Examples
experiment = Experiment.get(name="ml-evaluation", project_name="My Project")
experiment.refresh()

for metric_id, agg in (experiment.metric_aggregates or {}).items():
    print(f"{metric_id}: avg={agg.avg}")

model

def model(self) -> Model | None
Get the Model object for this experiment. Returns the Model if it was set during initialization, otherwise attempts to create a basic Model representation from the model_alias.

Examples
experiment = Experiment(
    name="ml-evaluation",
    dataset_name="ml-dataset",
    prompt_name="ml-prompt",
    model="gpt-4o-mini",
    project_name="My Project"
)
print(f"Model: {experiment.model.alias}")

monitor_progress

def monitor_progress(self, job_id: str | None=None) -> str
Monitor the progress of the experiment job with a progress bar.

Arguments
  • job_id: Optional job ID to monitor. If not provided, will attempt to find the primary job for this experiment.
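
Examples

A minimal sketch; the method blocks while rendering the progress bar, and the exact content of the returned string is not documented here:
status = experiment.monitor_progress()
print(status)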

playground_name

def playground_name(self) -> str | None
Get the name of the playground this experiment was created from, if any.

Returns

str | None: Playground name if this is a playground experiment, None otherwise.

project

def project(self) -> Project | None
Get the project this experiment belongs to.

prompt

def prompt(self) -> Prompt | None
Get the prompt template associated with this experiment. Note: For playground-created experiments that haven’t been run yet, the prompt information may not be available automatically. In such cases, use set_prompt() to manually set the prompt before running the experiment.

prompt_model

def prompt_model(self) -> str | None
Get the model used in the prompt for this experiment. This is the model alias that was configured in the prompt settings when the experiment was run (e.g., “Claude 3.5 Haiku”, “GPT-4o”).

Examples
experiment = Experiment.get(name="ml-evaluation", project_name="My Project")
print(f"Model used: {experiment.prompt_model}")

query

def query(self,
          record_type: RecordType,
          filters: builtins.list[FilterType] | None=None,
          sort: LogRecordsSortClause | None=None,
          limit: int=100,
          starting_token: int=0) -> QueryResult
Query records in this experiment. This method provides a convenient way to search spans, traces, or sessions within the current experiment results.

Arguments
  • record_type: The type of records to query (SPAN, TRACE, or SESSION).
  • filters: A list of filters to apply to the query.
  • sort: A sort clause to order the query results.
  • limit: The maximum number of records to return.
  • starting_token: The token for the next page of results.
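
Examples

A sketch of the general form; the import path for RecordType is an assumption (it appears on this page only in the signature above):
from galileo import RecordType  # assumed import path

traces = experiment.query(record_type=RecordType.TRACE, limit=50)
sessions = experiment.query(record_type=RecordType.SESSION, limit=50)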

rank

def rank(self) -> int | None
Get the rank of this experiment compared to others in the project. Lower rank number means better performance. Rank 1 is the best-performing experiment. Ranking is calculated based on aggregate metrics and quality scores.

Examples
experiments = Experiment.list(project_name="My Project")
for exp in sorted(experiments, key=lambda x: x.rank or float('inf')):
    print(f"#{exp.rank}: {exp.name}")

ranking_score

def ranking_score(self) -> float | None
Get the ranking score for this experiment. This score is used to compare experiments. Higher scores indicate better performance. The score is calculated based on a combination of quality metrics and efficiency metrics.

Examples
experiment = Experiment.get(name="ml-evaluation", project_name="My Project")
experiment.refresh()

if experiment.ranking_score:
    print(f"Ranking score: {experiment.ranking_score:.3f}")

refresh

def refresh(self) -> None
Refresh this experiment’s state from the API. Updates all attributes with the latest values from the remote API and sets the state to SYNCED.

Examples
experiment.refresh()
assert experiment.is_synced()

run

def run(self) -> ExperimentRunResult
Returns the experiment run result. The experiment is triggered during create() via trigger=True. This method exists for backward compatibility with the create().run() call pattern.

Returns

ExperimentRunResult: Result object with link and status.

Raises

ValueError: If the experiment has not been created yet.
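
Examples

A sketch of the create().run() call pattern described above; the result fields follow the Returns section:
result = Experiment(
    name="ml-evaluation",
    dataset_name="ml-dataset",
    prompt_name="ml-prompt",
    project_name="My Project"
).create().run()
print(result.link)
print(result.status)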

session_columns

def session_columns(self) -> ColumnCollection
Get available columns for sessions in this experiment.

Examples
experiment = Experiment.get(name="ml-evaluation", project_name="My AI Project")
columns = experiment.session_columns
model_column = columns["model"]

set_prompt

def set_prompt(self,
               *,
               prompt: Prompt | PromptTemplate | str | None=None,
               prompt_name: str | None=None,
               prompt_id: str | None=None) -> None
Set or update the prompt for this experiment. This is useful for experiments created in the playground where prompt information may not be automatically retrieved from the API.

Arguments
  • prompt: Prompt object, prompt name, or PromptTemplate object.
  • prompt_name: Name of the prompt template (alternative to prompt parameter).
  • prompt_id: ID of the prompt template (alternative to prompt parameter).
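
Examples

A minimal sketch for a playground-created experiment (the experiment and prompt names are illustrative):
experiment = Experiment.get(name="playground-experiment", project_name="My Project")
experiment.set_prompt(prompt_name="ml-prompt")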

span_columns

def span_columns(self) -> ColumnCollection
Get available columns for spans in this experiment.

Examples
experiment = Experiment.get(name="ml-evaluation", project_name="My AI Project")
columns = experiment.span_columns
input_column = columns["input"]

tags

def tags(self) -> dict[str, builtins.list[dict]] | None
Get tags associated with this experiment. Tags are organized by category (e.g., “generic”, “rag”). Each category contains a list of tag objects with key, value, and metadata.

Examples
experiment = Experiment.get(name="ml-evaluation", project_name="My Project")
tags = experiment.tags

if tags and 'generic' in tags:
    for tag in tags['generic']:
        print(f"{tag['key']}={tag['value']}")

trace_columns

def trace_columns(self) -> ColumnCollection
Get available columns for traces in this experiment.

Examples
experiment = Experiment.get(name="ml-evaluation", project_name="My AI Project")
columns = experiment.trace_columns
input_column = columns["input"]