Evaluation & Sandboxed Execution

Cross-validated feature evaluation, sandboxed code execution, and the model factory.
Published

May 11, 2026

Introduction

Feature Forge evaluates generated features using k-fold cross-validation and executes LLM-generated code in a sandbox. This notebook explores CVEvaluator, SandboxedExecutor, ModelFactory, and the metrics registry.

Setup

Code
import os
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

print("Evaluation & Sandbox Demo")
Evaluation & Sandbox Demo

Load Data

Code
X, y = make_classification(
    n_samples=300, n_features=6, n_informative=4, random_state=42
)
df = pd.DataFrame(X, columns=[f"f{i+1}" for i in range(X.shape[1])])
df["target"] = y

X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=["target"]), df["target"],
    test_size=0.3, random_state=42, stratify=df["target"]
)

CVEvaluator

Code
from feature_forge.config import Settings
from feature_forge.evaluation import CVEvaluator

config = Settings(task="classification", metric="auc")
evaluator = CVEvaluator(config=config)

baseline = evaluator.evaluate_baseline(X_train, y_train)
print(f"Baseline AUC (5-fold CV): {baseline:.4f}")
{"score": 0.931947, "metric": "auc", "folds": 5, "event": "cv_baseline_score", "level": "info", "timestamp": "2026-05-11T13:06:17.480347Z", "span": null}
Baseline AUC (5-fold CV): 0.9319
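
For intuition, the baseline number amounts to an ordinary cross-validation over the original columns. Below is a minimal scikit-learn equivalent; the estimator choice here is an illustration, not necessarily the model CVEvaluator builds internally.

Code
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score

# Stratified 5-fold AUC over the original features only (illustrative model).
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
ref = cross_val_score(
    RandomForestClassifier(random_state=42),
    X_train, y_train, cv=cv, scoring="roc_auc",
)
print(f"Reference AUC (plain sklearn): {ref.mean():.4f}")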

Evaluate a Synthetic Feature

Code
# Create a synthetic engineered feature
feature_df = pd.DataFrame({
    "f1_times_f2": X_train["f1"] * X_train["f2"],
})

gain = evaluator.evaluate_feature(
    X_base=X_train,
    y=y_train,
    feature_df=feature_df,
    baseline_score=baseline,
)
print(f"Gain from f1*f2: {gain:+.4f}")
{"gain": 0.001813, "new_score": 0.93376, "baseline_score": 0.931947, "event": "cv_feature_gain", "level": "debug", "timestamp": "2026-05-11T13:06:17.775012Z", "span": null}
Gain from f1*f2: +0.0018
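
The log line above makes the arithmetic explicit: gain = new_score - baseline_score. A hand-rolled equivalent, reusing the cv splitter and ref scores from the sklearn sketch above (again an assumption about the internals, not Feature Forge's code):

Code
# Append the candidate column and re-run the same CV; the gain is the difference.
X_aug = pd.concat([X_train, feature_df], axis=1)
aug = cross_val_score(
    RandomForestClassifier(random_state=42),
    X_aug, y_train, cv=cv, scoring="roc_auc",
)
print(f"Hand-rolled gain: {aug.mean() - ref.mean():+.4f}")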

ModelFactory

ModelFactory maps a model name and a task ("classification" or "regression") to a concrete estimator, so every experiment instantiates models the same way; note that get_model requires both arguments.

Code
from feature_forge.evaluation import ModelFactory

factory = ModelFactory()
for name in ["xgboost", "lightgbm", "random_forest", "logistic_regression"]:
    try:
        # get_model requires the task as well as the model name
        model = factory.get_model(name, task="classification")
        print(f"  {name}: {model.__class__.__name__}")
    except Exception as exc:
        print(f"  {name}: unavailable ({exc})")
With the task argument supplied, each name resolves to its backend's estimator class when that backend is installed; names whose optional dependency is missing fall through to the unavailable branch.
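
For intuition, a factory like this usually amounts to a lookup from (name, task) to a constructor. The sketch below is hypothetical: get_model_sketch and its table are stand-ins, not Feature Forge's real code.

Code
# Hypothetical sketch of the factory pattern (not feature_forge's implementation).
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression

_MODELS = {
    ("random_forest", "classification"): RandomForestClassifier,
    ("random_forest", "regression"): RandomForestRegressor,
    ("logistic_regression", "classification"): LogisticRegression,
    ("linear_regression", "regression"): LinearRegression,
}

def get_model_sketch(name: str, task: str):
    """Resolve (name, task) to a fresh estimator instance."""
    try:
        return _MODELS[(name, task)]()
    except KeyError:
        raise ValueError(f"Unknown model: {name} for task {task}")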

Metrics Registry

Metric lookup goes through a small registry keyed by short canonical names. Two of the probes below (accuracy, mse) are not registered names: the registry knows acc rather than accuracy and has no mse entry (rmse covers the squared-error case); an unknown name raises an error listing what is available.

Code
from feature_forge.evaluation.metrics import get_metric

for metric_name in ["auc", "accuracy", "f1", "mse", "rmse", "mae"]:
    try:
        fn = get_metric(metric_name)
        print(f"  {metric_name}: {fn.__name__}")
    except Exception as exc:
        print(f"  {metric_name}: unavailable ({exc})")
  auc: auc_score
  accuracy: unavailable (Unknown metric: accuracy. Available: ['auc', 'acc', 'f1', 'rmse', 'mae', 'r2', 'nrmse'])
  f1: f1_score_metric
  mse: unavailable (Unknown metric: mse. Available: ['auc', 'acc', 'f1', 'rmse', 'mae', 'r2', 'nrmse'])
  rmse: rmse_score
  mae: mae_score
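
The registry itself is little more than a mapping from canonical names to scoring callables. A hypothetical sketch of the pattern, with get_metric_sketch standing in for the real get_metric:

Code
# Hypothetical registry: canonical name -> scoring callable.
from sklearn.metrics import mean_absolute_error, roc_auc_score

_METRICS = {
    "auc": roc_auc_score,
    "mae": mean_absolute_error,
}

def get_metric_sketch(name: str):
    """Look up a metric, listing the registered names on a miss."""
    try:
        return _METRICS[name]
    except KeyError:
        raise ValueError(f"Unknown metric: {name}. Available: {sorted(_METRICS)}")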

Sandboxed Execution

Code
from feature_forge.evaluation.sandbox import SandboxedExecutor

sandbox = SandboxedExecutor(
    timeout_seconds=10,
    max_memory_mb=512,
)

# Valid code — must define generate_features(df)
code_valid = """
import pandas as pd

def generate_features(df):
    return pd.DataFrame({'squared_f1': df['f1'] ** 2})
"""

output = sandbox.execute(code_valid, X_train.copy())
print(f"Sandbox output shape: {output.shape}")
print(output.head())
{"code_length": 104, "input_shape": [210, 6], "event": "sandbox_execute_start", "level": "info", "timestamp": "2026-05-11T13:06:17.802908Z", "span": null}
{"result_shape": [210, 1], "latency_ms": 1564.5, "event": "sandbox_execute_complete", "level": "info", "timestamp": "2026-05-11T13:06:19.367441Z", "span": null}
Sandbox output shape: (210, 1)
     squared_f1
14     9.548707
249    1.886342
121    0.020246
257    8.441031
11     1.646172
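
The timeout_seconds budget is typically enforced by running the generated code in a child process and killing it when time runs out. A minimal sketch of that pattern follows; result plumbing is omitted, and this is not necessarily how SandboxedExecutor does it.

Code
import multiprocessing as mp
import time

def _runaway_job():
    # Stands in for generated code that never finishes.
    time.sleep(60)

def run_with_timeout(target, timeout_seconds):
    """Run target in a child process and terminate it if the budget elapses."""
    proc = mp.Process(target=target)
    proc.start()
    proc.join(timeout_seconds)
    if proc.is_alive():
        proc.terminate()
        proc.join()
        raise TimeoutError(f"exceeded {timeout_seconds}s")

if __name__ == "__main__":
    try:
        run_with_timeout(_runaway_job, timeout_seconds=2)
    except TimeoutError as exc:
        print(f"killed runaway code: {exc}")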

Sandbox Safety

Code
# Attempt dangerous code — should be caught
code_bad = """
import os

def generate_features(df):
    os.system('echo pwned')
    return df
"""

try:
    sandbox.execute(code_bad, X_train.copy())
except Exception as exc:
    print(f"Sandbox correctly rejected unsafe code: {type(exc).__name__}")
{"code_length": 81, "input_shape": [210, 6], "event": "sandbox_execute_start", "level": "info", "timestamp": "2026-05-11T13:06:19.377387Z", "span": null}
{"reason": "import_not_allowed: os", "event": "sandbox_validation_blocked", "level": "warning", "timestamp": "2026-05-11T13:06:19.377969Z", "span": null}
Sandbox correctly rejected unsafe code: SandboxValidationError
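
The sandbox_validation_blocked log suggests the check happens before anything runs: the source is parsed and its import statements inspected. A minimal illustration with Python's ast module, using an assumed blocklist and a plain ValueError in place of SandboxValidationError:

Code
import ast

BLOCKED = {"os", "sys", "subprocess"}  # assumed blocklist for illustration

def validate_imports(code: str) -> None:
    """Parse the code and reject blocked imports before executing anything."""
    for node in ast.walk(ast.parse(code)):
        if isinstance(node, ast.Import):
            roots = [alias.name.split(".")[0] for alias in node.names]
        elif isinstance(node, ast.ImportFrom):
            roots = [(node.module or "").split(".")[0]]
        else:
            continue
        for root in roots:
            if root in BLOCKED:
                raise ValueError(f"import_not_allowed: {root}")

validate_imports("import pandas as pd")  # passes silently
try:
    validate_imports("import os")
except ValueError as exc:
    print(exc)  # import_not_allowed: os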

Evaluate Multiple Features

Code
features_to_test = {
    "f1_sq": X_train["f1"] ** 2,
    "f1_plus_f2": X_train["f1"] + X_train["f2"],
    "f1_div_f2": X_train["f1"] / (X_train["f2"] + 1e-6),
}

eval_results = []
for feat_name, feat_series in features_to_test.items():
    feat_df = pd.DataFrame({feat_name: feat_series})
    g = evaluator.evaluate_feature(X_train, y_train, feat_df, baseline)
    eval_results.append({"feature": feat_name, "gain": round(g, 4)})

pd.DataFrame(eval_results).sort_values("gain", ascending=False)
{"gain": 0.004987, "new_score": 0.936934, "baseline_score": 0.931947, "event": "cv_feature_gain", "level": "debug", "timestamp": "2026-05-11T13:06:19.694832Z", "span": null}
{"gain": 0.006808, "new_score": 0.938755, "baseline_score": 0.931947, "event": "cv_feature_gain", "level": "debug", "timestamp": "2026-05-11T13:06:20.000988Z", "span": null}
{"gain": 0.025866, "new_score": 0.957813, "baseline_score": 0.931947, "event": "cv_feature_gain", "level": "debug", "timestamp": "2026-05-11T13:06:20.309505Z", "span": null}
      feature    gain
2   f1_div_f2  0.0259
1  f1_plus_f2  0.0068
0       f1_sq  0.0050

Plot: Feature Gains

Code
import matplotlib.pyplot as plt

eval_df = pd.DataFrame(eval_results)
fig, ax = plt.subplots(figsize=(7, 4))
colors = ["green" if g > 0 else "red" for g in eval_df["gain"]]
ax.barh(eval_df["feature"], eval_df["gain"], color=colors)
ax.axvline(x=0, color="black", linewidth=0.8)
ax.set_title("CV Gain per Synthetic Feature")
ax.set_xlabel("AUC Gain")
plt.tight_layout()
plt.show()

Summary

  • CVEvaluator uses stratified k-fold CV for classification, k-fold for regression
  • evaluate_feature() measures gain vs. baseline (original features only)
  • SandboxedExecutor runs LLM code with timeout and memory limits
  • Dangerous imports (os, sys, subprocess) are blocked by AST validation
  • ModelFactory provides consistent model instantiation across experiments