# Experiment Tracking: WandB + MLflow

## Philosophy

Every experiment must be tracked, versioned, and comparable. We support both WandB (the default, with superior visualization) and MLflow (optional, for data sovereignty).

## Architecture: Unified Tracker Interface
```python
# src/feature_forge/experiment/tracker.py
import importlib
from abc import ABC, abstractmethod
from typing import Any


class ExperimentTracker(ABC):
    """Abstract base for experiment tracking backends."""

    @abstractmethod
    def init_run(self, config: dict[str, Any]) -> None:
        """Initialize a new run with configuration."""

    @abstractmethod
    def log_params(self, params: dict[str, Any]) -> None:
        """Log hyperparameters."""

    @abstractmethod
    def log_metrics(self, metrics: dict[str, float], step: int | None = None) -> None:
        """Log metrics, optionally with step."""

    @abstractmethod
    def log_artifact(self, path: str, artifact_type: str) -> None:
        """Log a file or directory as an artifact."""

    @abstractmethod
    def log_table(self, name: str, dataframe: Any) -> None:
        """Log a table (e.g., feature comparison)."""

    @abstractmethod
    def finish(self) -> None:
        """Finalize the run."""


class TrackerFactory:
    """Create tracker instances based on configuration."""

    _backends = {
        "wandb": "feature_forge.experiment.wandb_backend.WandBTracker",
        "mlflow": "feature_forge.experiment.mlflow_backend.MLflowTracker",
        "none": "feature_forge.experiment.tracker.NoOpTracker",
    }

    @classmethod
    def create(cls, backend: str) -> ExperimentTracker:
        if backend not in cls._backends:
            raise ValueError(f"Unknown tracker backend: {backend!r}")
        module_path, class_name = cls._backends[backend].rsplit(".", 1)
        module = importlib.import_module(module_path)
        return getattr(module, class_name)()
```
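A minimal usage sketch (the `settings` dict stands in for the parsed `config/settings.yaml` shown under Configuration below):

```python
# Pick the backend from config, then code against the unified interface.
settings = {"tracker": {"backend": "wandb", "project": "feature-forge"}}

tracker = TrackerFactory.create(settings["tracker"]["backend"])
tracker.init_run(settings)
tracker.log_params({"n_rounds": 4, "router_strategy": "hybrid"})
tracker.log_metrics({"round_1/avg_feature_gain": 0.025}, step=1)
tracker.finish()
```

Because every backend implements the same interface, swapping WandB for MLflow (or disabling tracking) is a one-line config change.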
## WandB Implementation (Default)

### Why WandB?

- Free academic tier: Pro features at no cost
- W&B Weave: Best-in-class LLM evaluation and tracing
- Superior visualization: Side-by-side run comparison, parallel coordinates
- Built-in sweeps: Bayes/Grid/Random hyperparameter search
- Artifacts: Versioned datasets and models with lineage

### Implementation
```python
# src/feature_forge/experiment/wandb_backend.py
import os

import wandb

from .tracker import ExperimentTracker


class WandBTracker(ExperimentTracker):
    """Weights & Biases experiment tracker."""

    def __init__(self):
        self.run = None

    def init_run(self, config: dict) -> None:
        self.run = wandb.init(
            project=config.get("tracker", {}).get("project", "feature-forge"),
            entity=config.get("tracker", {}).get("entity"),
            config=config,
            job_type="feature-engineering",
        )

    def log_params(self, params: dict) -> None:
        wandb.config.update(params)

    def log_metrics(self, metrics: dict, step: int | None = None) -> None:
        wandb.log(metrics, step=step)

    def log_artifact(self, path: str, artifact_type: str) -> None:
        art = wandb.Artifact(
            name=f"{artifact_type}-{self.run.id}",
            type=artifact_type,
        )
        if os.path.isfile(path):
            art.add_file(path)
        else:
            art.add_dir(path)
        wandb.log_artifact(art)

    def log_table(self, name: str, dataframe) -> None:
        table = wandb.Table(dataframe=dataframe)
        wandb.log({name: table})

    def finish(self) -> None:
        if self.run:
            self.run.finish()
```
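A quick way to exercise the tracker without a network connection is WandB's offline mode; run files land in `./wandb/` and can be synced later with `wandb sync`:

```python
import os

# Offline mode writes the run to ./wandb/ instead of the cloud.
os.environ["WANDB_MODE"] = "offline"

tracker = WandBTracker()
tracker.init_run({"tracker": {"project": "feature-forge"}})
tracker.log_metrics({"round_1/avg_feature_gain": 0.025}, step=1)
tracker.finish()
```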
### WandB + Sklearn Integration

```python
# Log sklearn pipeline performance
# (X_train, X_test, y_train, y_test are assumed to be prepared elsewhere)
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
import wandb

from feature_forge import MALMASFeatureEngineer  # adjust to the actual import path

with wandb.init(project="feature-forge") as run:
    pipeline = Pipeline([
        ("fe", MALMASFeatureEngineer(task="classification")),
        ("clf", XGBClassifier()),
    ])
    pipeline.fit(X_train, y_train)
    score = pipeline.score(X_test, y_test)
    wandb.log({"test_accuracy": score})

    # plot_classifier also needs predictions, probabilities, and class labels
    y_pred = pipeline.predict(X_test)
    y_probas = pipeline.predict_proba(X_test)
    wandb.sklearn.plot_classifier(
        pipeline.named_steps["clf"],
        X_train, X_test, y_train, y_test,
        y_pred, y_probas,
        labels=sorted(set(y_train)),
        model_name="XGBoost",
    )
```
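For `task="regression"`, `wandb.sklearn.plot_regressor` provides the analogous diagnostics; a minimal sketch, assuming an `XGBRegressor` in place of the classifier:

```python
from xgboost import XGBRegressor

with wandb.init(project="feature-forge") as run:
    reg_pipeline = Pipeline([
        ("fe", MALMASFeatureEngineer(task="regression")),
        ("reg", XGBRegressor()),
    ])
    reg_pipeline.fit(X_train, y_train)
    wandb.sklearn.plot_regressor(
        reg_pipeline.named_steps["reg"],
        X_train, X_test, y_train, y_test,
        model_name="XGBoost",
    )
```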
### WandB Sweeps for Hyperparameter Search

```python
# Define sweep configuration
sweep_config = {
    "method": "bayes",
    "metric": {"name": "final_auc", "goal": "maximize"},
    "parameters": {
        "n_rounds": {"values": [1, 2, 4, 6]},
        "llm_temperature": {"distribution": "uniform", "min": 0.0, "max": 1.0},
        "router_strategy": {"values": ["data_driven", "performance_driven", "hybrid"]},
    },
    "early_terminate": {"type": "hyperband", "min_iter": 2},
}

sweep_id = wandb.sweep(sweep_config, project="feature-forge")


def train():
    with wandb.init() as run:
        config = run.config
        fe = MALMASFeatureEngineer(
            n_rounds=config.n_rounds,
            router_strategy=config.router_strategy,
            # config.llm_temperature would be passed through to the LLM client
        )
        # ... run experiment; `auc` is the resulting evaluation score ...
        wandb.log({"final_auc": auc})


wandb.agent(sweep_id, function=train, count=20)
```
### LLM Cost Tracking with WandB

```python
# Track per-experiment LLM costs
wandb.log({
    "llm_cost_total_usd": 1.25,
    "llm_cost_per_round": [0.30, 0.35, 0.35, 0.25],
    "llm_tokens_total": 45000,
    "llm_tokens_per_agent": {
        "unary": 12000,
        "cross": 15000,
        "aggregation": 8000,
    },
})
```
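Logging the per-round cost with an explicit `step` also lets WandB chart cost over rounds, which compares across runs more easily than a single list value:

```python
# Same numbers as above, but one point per round on a shared x-axis.
for round_idx, cost in enumerate([0.30, 0.35, 0.35, 0.25], start=1):
    wandb.log({"llm_cost_usd": cost}, step=round_idx)
```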
## MLflow Implementation (Optional)

### Why MLflow?

- Data sovereignty: Everything stays on your servers
- Open source: No vendor lock-in
- Model registry: Strong model lifecycle management
- Local-first: Works offline

### Implementation
```python
# src/feature_forge/experiment/mlflow_backend.py
import os
import tempfile

import mlflow

from .tracker import ExperimentTracker


class MLflowTracker(ExperimentTracker):
    """MLflow experiment tracker."""

    def __init__(self, tracking_uri: str | None = None):
        if tracking_uri:
            mlflow.set_tracking_uri(tracking_uri)
        self.run = None

    def init_run(self, config: dict) -> None:
        experiment_name = config.get("tracker", {}).get("project", "feature-forge")
        mlflow.set_experiment(experiment_name)
        self.run = mlflow.start_run()
        mlflow.log_params(self._flatten_dict(config))

    def log_params(self, params: dict) -> None:
        mlflow.log_params(params)

    def log_metrics(self, metrics: dict, step: int | None = None) -> None:
        mlflow.log_metrics(metrics, step=step)

    def log_artifact(self, path: str, artifact_type: str) -> None:
        # Group artifacts by type inside the run's artifact store
        mlflow.log_artifact(path, artifact_path=artifact_type)

    def log_table(self, name: str, dataframe) -> None:
        # Use the OS temp dir rather than hard-coding /tmp (portable on Windows)
        path = os.path.join(tempfile.gettempdir(), f"{name}.csv")
        dataframe.to_csv(path, index=False)
        mlflow.log_artifact(path)

    def finish(self) -> None:
        mlflow.end_run()

    def _flatten_dict(self, d: dict, parent_key: str = "", sep: str = ".") -> dict:
        """Flatten nested dicts for MLflow params."""
        items = []
        for k, v in d.items():
            new_key = f"{parent_key}{sep}{k}" if parent_key else k
            if isinstance(v, dict):
                items.extend(self._flatten_dict(v, new_key, sep).items())
            else:
                items.append((new_key, v))
        return dict(items)
```
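Used directly, the tracker targets whatever server `tracking_uri` names; a short sketch against the local server set up in the next subsection:

```python
tracker = MLflowTracker(tracking_uri="http://localhost:5000")
tracker.init_run({"tracker": {"project": "feature-forge"}, "n_rounds": 4})
tracker.log_metrics({"final/malmas_auc": 0.87})
tracker.finish()
```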
### MLflow Local Server

```bash
# Start local MLflow tracking server
mlflow server --host 0.0.0.0 --port 5000

# Configure feature_forge to use it
export FF_TRACKER__BACKEND=mlflow
export FF_TRACKER__PROJECT=feature-forge
export MLFLOW_TRACKING_URI=http://localhost:5000
```
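Before a long run it can be worth confirming the server answers; `mlflow.search_experiments()` fails fast if the URI is unreachable:

```python
import mlflow

mlflow.set_tracking_uri("http://localhost:5000")
# Raises quickly if the tracking server is down; lists experiments otherwise.
print(mlflow.search_experiments())
```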
## No-Op Tracker

For local development or when tracking is disabled:

```python
class NoOpTracker(ExperimentTracker):
    """No-op tracker for when tracking is disabled."""

    def init_run(self, config): pass
    def log_params(self, params): pass
    def log_metrics(self, metrics, step=None): pass
    def log_artifact(self, path, artifact_type): pass
    def log_table(self, name, dataframe): pass
    def finish(self): pass
```
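Selecting it through the factory keeps calling code identical, which is handy in unit tests:

```python
tracker = TrackerFactory.create("none")  # every call below is a harmless no-op
tracker.init_run({})
tracker.log_metrics({"final_auc": 0.87})
tracker.finish()
```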
## Tracking Schema

### Parameters (Logged Once)

```python
{
    "dataset": "titanic",
    "task": "classification",
    "metric": "auc",
    "n_rounds": 4,
    "llm_model": "deepseek-chat",
    "llm_temperature": 0.2,
    "router_strategy": "hybrid",
    "random_state": 42,
    "agents": ["unary", "cross", "aggregation", "temporal"],
}
```
### Metrics (Logged Per Round)

```python
{
    # Per-round metrics
    "round_1/n_features_generated": 8,
    "round_1/n_effective_features": 3,
    "round_1/avg_feature_gain": 0.025,
    "round_1/best_feature_gain": 0.04,
    "round_1/llm_cost_usd": 0.15,
    "round_1/latency_seconds": 45.2,
    # Cumulative metrics
    "cumulative/n_features_total": 12,
    "cumulative/auc_improvement": 0.05,
    # Final metrics
    "final/base_auc": 0.82,
    "final/malmas_auc": 0.87,
    "final/improvement": 0.05,
}
```
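A small helper keeps the `round_{i}/` namespacing consistent across backends (the helper itself is a sketch, not part of the codebase above):

```python
def round_metrics(round_idx: int, metrics: dict[str, float]) -> dict[str, float]:
    """Prefix metric names with their round, e.g. 'round_1/llm_cost_usd'."""
    return {f"round_{round_idx}/{k}": v for k, v in metrics.items()}


tracker.log_metrics(
    round_metrics(1, {"n_features_generated": 8, "llm_cost_usd": 0.15}),
    step=1,
)
```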
### Artifacts

| Artifact | Type | Content |
|---|---|---|
| `generated_features` | dataset | CSV of all generated features |
| `agent_memories` | memory | JSON of all agent memories |
| `feature_importance` | plot | Bar chart of feature gains |
| `router_history` | json | Agent selection per round |
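Each row maps onto a `log_artifact` call; the file paths here are illustrative:

```python
# Hypothetical output paths; artifact_type matches the table above.
tracker.log_artifact("outputs/generated_features.csv", artifact_type="dataset")
tracker.log_artifact("outputs/agent_memories.json", artifact_type="memory")
tracker.log_artifact("outputs/feature_importance.png", artifact_type="plot")
tracker.log_artifact("outputs/router_history.json", artifact_type="json")
```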
## Configuration

```yaml
# config/settings.yaml
tracker:
  backend: "wandb"  # wandb | mlflow | none
  project: "feature-forge"
  entity: "your-wandb-team"  # WandB only
```

```bash
# Environment variables
export WANDB_API_KEY=...
export WANDB_PROJECT=feature-forge
export WANDB_ENTITY=your-team

# Or for MLflow
export MLFLOW_TRACKING_URI=http://localhost:5000
```
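The `FF_TRACKER__BACKEND` naming suggests nested settings split on a double underscore; a minimal sketch of that mapping with pydantic-settings (an assumption about the config loader, not confirmed by this document):

```python
from pydantic import BaseModel
from pydantic_settings import BaseSettings, SettingsConfigDict


class TrackerSettings(BaseModel):
    backend: str = "wandb"  # wandb | mlflow | none
    project: str = "feature-forge"
    entity: str | None = None


class Settings(BaseSettings):
    # FF_TRACKER__BACKEND=mlflow  ->  Settings().tracker.backend == "mlflow"
    model_config = SettingsConfigDict(env_prefix="FF_", env_nested_delimiter="__")
    tracker: TrackerSettings = TrackerSettings()


tracker = TrackerFactory.create(Settings().tracker.backend)
```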
## Comparison: WandB vs MLflow for feature_forge

| Dimension | WandB (Default) | MLflow (Optional) |
|---|---|---|
| Setup | Cloud, zero infra | Local server or self-hosted |
| LLM Tracking | W&B Weave (excellent) | AI Gateway + basic tracing |
| Sweeps | Built-in Bayes/Grid | Requires Optuna |
| Visualization | Best-in-class | Basic |
| Artifacts | Versioned with lineage | Manual S3/Azure config |
| Academic | Free Pro tier | Always free |
| Data Sovereignty | Cloud-hosted | Fully local |
| Integration | `wandb.log()` | `mlflow.log_metric()` |
Recommendation: Use WandB for daily research (superior UX, free for academics). Use MLflow for sensitive data or production deployments.