Head-to-head benchmark of MALMAS, CAAFE, LLM-FE, and Malmus on the same dataset with side-by-side metrics.
Published

May 11, 2026

Introduction

This notebook runs all available feature engineering methods — MALMAS (full pipeline), CAAFE, LLM-FE, and Malmus — on the same synthetic classification dataset. It measures downstream model performance (AUC with XGBoost), feature counts, and latency, then produces a side-by-side comparison table, feature overlap analysis via ArtifactDiff, and a full HTML dashboard.

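This is the recommended starting point for understanding how the methods differ in practice.

Note: the outputs recorded below come from a run in which the DeepSeek API rejected the configured key (HTTP 401 authentication error). As a result MALMAS generated zero features (its AUC equals the baseline) and CAAFE, LLM-FE, and Malmus failed outright. Rerun the notebook with a valid FF_LLM__API_KEY to obtain a meaningful comparison.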

Setup

Code
# Standard library: environment access, wall-clock timing, warning control.
import os
import time
import warnings
# With no category argument this filter applies to every warning, so nothing
# raised through the warnings module is displayed for the rest of the notebook
# (presumably to keep the cell outputs readable).
warnings.filterwarnings("ignore")

# Third-party: arrays/data frames, synthetic data, metric, split, downstream model.
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

# Banner naming the four methods that are compared.
print("Method Comparison: MALMAS vs CAAFE vs LLM-FE vs Malmus")
Method Comparison: MALMAS vs CAAFE vs LLM-FE vs Malmus

Load Data

Code
# Synthetic binary-classification problem: 300 rows, 8 columns
# (5 informative, 2 redundant), seeded for reproducibility.
X, y = make_classification(
    n_samples=300,
    n_features=8,
    n_informative=5,
    n_redundant=2,
    n_classes=2,
    random_state=42,
)

# Columns are named f1..fN; the label is attached as a "target" column.
feature_names = [f"f{i+1}" for i in range(X.shape[1])]
df = pd.DataFrame(X, columns=feature_names).assign(target=y)

# Stratified 70/30 split on the label, seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    df[feature_names],
    df["target"],
    stratify=df["target"],
    test_size=0.3,
    random_state=42,
)
print(f"Train: {X_train.shape}, Test: {X_test.shape}")
Train: (210, 8), Test: (90, 8)

Baseline (No Feature Engineering)

Code
# Reference model fitted on the raw columns only; every method's AUC is later
# compared against `baseline_auc`.
baseline_clf = XGBClassifier(
    n_estimators=100,
    max_depth=4,
    random_state=42,
    eval_metric="logloss",
)
baseline_clf.fit(X_train, y_train)

# AUC is computed from column 1 of predict_proba (the second class).
baseline_proba = baseline_clf.predict_proba(X_test)
baseline_auc = roc_auc_score(y_test, baseline_proba[:, 1])
print(f"Baseline AUC (no feature engineering): {baseline_auc:.4f}")
Baseline AUC (no feature engineering): 0.8938

Instantiate Methods

Code
from feature_forge.llm.providers.deepseek import DeepSeekProvider

# Read the key once so the shared provider and the MALMAS config always use
# the same value.
api_key = os.environ.get("FF_LLM__API_KEY", "")
if not api_key:
    # Use print() rather than warnings.warn(): the setup cell installs a
    # blanket "ignore" warnings filter, so a warning would never be displayed.
    print(
        "WARNING: FF_LLM__API_KEY is not set; LLM requests are expected to "
        "fail authentication, so no method is likely to generate features."
    )

model_name = "deepseek-chat"

# One provider instance is shared by the three baseline methods below.
llm = DeepSeekProvider(
    model=model_name,
    api_key=api_key,
)

# Maps a short method key to its instantiated feature-engineering object.
methods = {}

# MALMAS (full pipeline) -- configured through Settings rather than the shared
# provider object.
from feature_forge.api import MALMASFeatureEngineer
from feature_forge.config import LLMConfig, Settings

config = Settings(
    task="classification", metric="auc", n_rounds=1,
    llm=LLMConfig(model=model_name, api_key=api_key),
)
methods["malmas"] = MALMASFeatureEngineer(config=config, mode="full")

# The baselines are best-effort: if one cannot be imported or constructed it is
# skipped with a message, and the comparison continues with the rest.

# CAAFE
try:
    from feature_forge.baselines.caafe import CAAFEBaseline
    methods["caafe"] = CAAFEBaseline(llm_client=llm, iterations=2, variant="unified")
except Exception as exc:
    print(f"CAAFE skipped: {exc}")

# LLM-FE
try:
    from feature_forge.baselines.llmfe import LLMFEBaseline
    methods["llmfe"] = LLMFEBaseline(llm_client=llm, n_features=5, mode="single_shot")
except Exception as exc:
    print(f"LLM-FE skipped: {exc}")

# Malmus
try:
    from feature_forge.baselines.malmus import MalmusBaseline
    methods["malmus"] = MalmusBaseline(llm_client=llm, n_features=5, mode="single_shot")
except Exception as exc:
    print(f"Malmus skipped: {exc}")

print(f"\nMethods to compare: {list(methods.keys())}")

Methods to compare: ['malmas', 'caafe', 'llmfe', 'malmus']

Run All Methods

Code
# Per-method outcome keyed by method name. Successful entries carry
# status="ok" plus metrics; failed entries carry status="error", the elapsed
# time up to the exception, and the error text.
results = {}
for name, method in methods.items():
    t0 = time.perf_counter()
    try:
        method.fit(X_train, y_train)
        # On success the reported latency covers fit() only.
        latency = round(time.perf_counter() - t0, 2)

        X_train_enhanced = method.transform(X_train)
        X_test_enhanced = method.transform(X_test)
        # Columns present after transform() that were not in the raw test frame.
        new_cols = [c for c in X_test_enhanced.columns if c not in X_test.columns]

        # Same hyperparameters as the baseline model so AUCs are comparable.
        clf = XGBClassifier(n_estimators=100, max_depth=4, random_state=42, eval_metric="logloss")
        clf.fit(X_train_enhanced, y_train)
        enhanced_auc = roc_auc_score(y_test, clf.predict_proba(X_test_enhanced)[:, 1])

        results[name] = {
            "status": "ok",
            "latency_s": latency,
            "n_new_features": len(new_cols),
            "enhanced_auc": enhanced_auc,
            "auc_delta": enhanced_auc - baseline_auc,
            "n_scripts": len(method.generated_scripts),
            "artifacts": method.get_artifacts(),
        }
        print(f"  {name}: {len(new_cols)} features, AUC={enhanced_auc:.4f} ({enhanced_auc - baseline_auc:+.4f}), {latency}s")
        if not new_cols:
            # A method can finish fit()/transform() without raising yet add
            # nothing (e.g. when every generation call failed internally).
            # The status stays "ok" so the later cells keep working, but the
            # degenerate result is called out instead of passing as a success.
            print(
                f"  {name}: WARNING - no new features were generated; the AUC above "
                "reflects the original columns only (check the logs for generation errors)"
            )
    except Exception as exc:
        # Best-effort benchmark: one failing method must not stop the others.
        latency = round(time.perf_counter() - t0, 2)
        results[name] = {"status": "error", "latency_s": latency, "error": str(exc)}
        print(f"  {name}: FAILED ({exc})")
{"mode": "full", "model": "deepseek-chat", "train_shape": [210, 8], "n_rounds": 1, "event": "fit_start", "level": "info", "timestamp": "2026-05-11T13:06:42.769573Z", "span": null}
{"n_rounds": 1, "task": "classification", "strategy": "hybrid", "event": "iterative_pipeline_start", "level": "info", "timestamp": "2026-05-11T13:06:42.770893Z", "span": null}
{"round_idx": 0, "total_rounds": 1, "event": "round_start", "level": "info", "timestamp": "2026-05-11T13:06:42.771290Z", "span": null}
{"strategy": "hybrid", "round_idx": 0, "selected_agents": ["unary", "cross_compositional", "aggregation", "temporal", "local_transform", "local_pattern"], "event": "router_select_agents", "level": "info", "timestamp": "2026-05-11T13:06:42.771491Z", "span": null}
{"round_idx": 0, "agents": ["unary", "cross_compositional", "aggregation", "temporal", "local_transform", "local_pattern"], "strategy": "hybrid", "event": "agents_selected", "level": "info", "timestamp": "2026-05-11T13:06:42.772625Z", "span": null}
{"path": "memory_files/agent_memories/unary_memory.json", "exists": true, "num_keys": 7, "event": "memory_load", "level": "debug", "timestamp": "2026-05-11T13:06:42.773203Z", "span": null}
{"agent": "unary", "path": "memory_files/agent_memories/unary_memory.json", "event": "agent_memory_initialized", "level": "debug", "timestamp": "2026-05-11T13:06:42.773410Z", "span": null}
{"path": "memory_files/agent_memories/cross_compositional_memory.json", "exists": true, "num_keys": 7, "event": "memory_load", "level": "debug", "timestamp": "2026-05-11T13:06:42.773959Z", "span": null}
{"agent": "cross_compositional", "path": "memory_files/agent_memories/cross_compositional_memory.json", "event": "agent_memory_initialized", "level": "debug", "timestamp": "2026-05-11T13:06:42.774166Z", "span": null}
{"path": "memory_files/agent_memories/aggregation_memory.json", "exists": true, "num_keys": 7, "event": "memory_load", "level": "debug", "timestamp": "2026-05-11T13:06:42.774564Z", "span": null}
{"agent": "aggregation", "path": "memory_files/agent_memories/aggregation_memory.json", "event": "agent_memory_initialized", "level": "debug", "timestamp": "2026-05-11T13:06:42.774744Z", "span": null}
{"path": "memory_files/agent_memories/temporal_memory.json", "exists": true, "num_keys": 7, "event": "memory_load", "level": "debug", "timestamp": "2026-05-11T13:06:42.775203Z", "span": null}
{"agent": "temporal", "path": "memory_files/agent_memories/temporal_memory.json", "event": "agent_memory_initialized", "level": "debug", "timestamp": "2026-05-11T13:06:42.775367Z", "span": null}
{"path": "memory_files/agent_memories/local_transform_memory.json", "exists": true, "num_keys": 7, "event": "memory_load", "level": "debug", "timestamp": "2026-05-11T13:06:42.775870Z", "span": null}
{"agent": "local_transform", "path": "memory_files/agent_memories/local_transform_memory.json", "event": "agent_memory_initialized", "level": "debug", "timestamp": "2026-05-11T13:06:42.776018Z", "span": null}
{"path": "memory_files/agent_memories/local_pattern_memory.json", "exists": true, "num_keys": 7, "event": "memory_load", "level": "debug", "timestamp": "2026-05-11T13:06:42.776843Z", "span": null}
{"agent": "local_pattern", "path": "memory_files/agent_memories/local_pattern_memory.json", "event": "agent_memory_initialized", "level": "debug", "timestamp": "2026-05-11T13:06:42.777046Z", "span": null}
{"agents": ["unary", "cross_compositional", "aggregation", "temporal", "local_transform", "local_pattern"], "num_agents": 6, "train_shape": [210, 8], "event": "pipeline_start", "level": "info", "timestamp": "2026-05-11T13:06:42.777340Z", "span": null}
{"agent": "unary", "num_columns": 8, "round_idx": 0, "event": "agent_generate_start", "level": "info", "timestamp": "2026-05-11T13:06:42.778535Z", "span": null}
{"provider": "deepseek", "model": "deepseek-chat", "num_messages": 2, "temperature": 0.2, "max_tokens": 4096, "event": "llm_request", "level": "info", "timestamp": "2026-05-11T13:06:42.778777Z", "span": null}
{"agent": "cross_compositional", "num_columns": 8, "round_idx": 0, "event": "agent_generate_start", "level": "info", "timestamp": "2026-05-11T13:06:42.858488Z", "span": null}
{"provider": "deepseek", "model": "deepseek-chat", "num_messages": 2, "temperature": 0.2, "max_tokens": 4096, "event": "llm_request", "level": "info", "timestamp": "2026-05-11T13:06:42.859128Z", "span": null}
{"agent": "aggregation", "num_columns": 8, "round_idx": 0, "event": "agent_generate_start", "level": "info", "timestamp": "2026-05-11T13:06:42.860590Z", "span": null}
{"provider": "deepseek", "model": "deepseek-chat", "num_messages": 2, "temperature": 0.2, "max_tokens": 4096, "event": "llm_request", "level": "info", "timestamp": "2026-05-11T13:06:42.861132Z", "span": null}
{"provider": "deepseek", "model": "deepseek-chat", "error": "Error code: 401 - {'error': {'message': 'Authentication Fails, Your api key: ****4973 is invalid', 'type': 'authentication_error', 'param': None, 'code': 'invalid_request_error'}}", "event": "llm_error", "level": "error", "timestamp": "2026-05-11T13:06:43.063488Z", "span": null}
{"agent": "unary", "error": "OpenAI API error: Error code: 401 - {'error': {'message': 'Authentication Fails, Your api key: ****4973 is invalid', 'type': 'authentication_error', 'param': None, 'code': 'invalid_request_error'}}", "event": "agent_generate_error", "level": "error", "timestamp": "2026-05-11T13:06:43.063890Z", "span": null}
{"agent": "temporal", "num_columns": 8, "round_idx": 0, "event": "agent_generate_start", "level": "info", "timestamp": "2026-05-11T13:06:43.065076Z", "span": null}
{"provider": "deepseek", "model": "deepseek-chat", "num_messages": 2, "temperature": 0.2, "max_tokens": 4096, "event": "llm_request", "level": "info", "timestamp": "2026-05-11T13:06:43.065310Z", "span": null}
{"provider": "deepseek", "model": "deepseek-chat", "error": "Error code: 401 - {'error': {'message': 'Authentication Fails, Your api key: ****4973 is invalid', 'type': 'authentication_error', 'param': None, 'code': 'invalid_request_error'}}", "event": "llm_error", "level": "error", "timestamp": "2026-05-11T13:06:43.091846Z", "span": null}
{"agent": "cross_compositional", "error": "OpenAI API error: Error code: 401 - {'error': {'message': 'Authentication Fails, Your api key: ****4973 is invalid', 'type': 'authentication_error', 'param': None, 'code': 'invalid_request_error'}}", "event": "agent_generate_error", "level": "error", "timestamp": "2026-05-11T13:06:43.092217Z", "span": null}
{"agent": "local_transform", "num_columns": 8, "round_idx": 0, "event": "agent_generate_start", "level": "info", "timestamp": "2026-05-11T13:06:43.093608Z", "span": null}
{"provider": "deepseek", "model": "deepseek-chat", "num_messages": 2, "temperature": 0.2, "max_tokens": 4096, "event": "llm_request", "level": "info", "timestamp": "2026-05-11T13:06:43.093819Z", "span": null}
{"provider": "deepseek", "model": "deepseek-chat", "error": "Error code: 401 - {'error': {'message': 'Authentication Fails, Your api key: ****4973 is invalid', 'type': 'authentication_error', 'param': None, 'code': 'invalid_request_error'}}", "event": "llm_error", "level": "error", "timestamp": "2026-05-11T13:06:43.094900Z", "span": null}
{"agent": "aggregation", "error": "OpenAI API error: Error code: 401 - {'error': {'message': 'Authentication Fails, Your api key: ****4973 is invalid', 'type': 'authentication_error', 'param': None, 'code': 'invalid_request_error'}}", "event": "agent_generate_error", "level": "error", "timestamp": "2026-05-11T13:06:43.095133Z", "span": null}
{"agent": "local_pattern", "num_columns": 8, "round_idx": 0, "event": "agent_generate_start", "level": "info", "timestamp": "2026-05-11T13:06:43.096040Z", "span": null}
{"provider": "deepseek", "model": "deepseek-chat", "num_messages": 2, "temperature": 0.2, "max_tokens": 4096, "event": "llm_request", "level": "info", "timestamp": "2026-05-11T13:06:43.096265Z", "span": null}
{"provider": "deepseek", "model": "deepseek-chat", "error": "Error code: 401 - {'error': {'message': 'Authentication Fails, Your api key: ****4973 is invalid', 'type': 'authentication_error', 'param': None, 'code': 'invalid_request_error'}}", "event": "llm_error", "level": "error", "timestamp": "2026-05-11T13:06:43.171169Z", "span": null}
{"agent": "temporal", "error": "OpenAI API error: Error code: 401 - {'error': {'message': 'Authentication Fails, Your api key: ****4973 is invalid', 'type': 'authentication_error', 'param': None, 'code': 'invalid_request_error'}}", "event": "agent_generate_error", "level": "error", "timestamp": "2026-05-11T13:06:43.171511Z", "span": null}
{"provider": "deepseek", "model": "deepseek-chat", "error": "Error code: 401 - {'error': {'message': 'Authentication Fails, Your api key: ****4973 is invalid', 'type': 'authentication_error', 'param': None, 'code': 'invalid_request_error'}}", "event": "llm_error", "level": "error", "timestamp": "2026-05-11T13:06:43.198317Z", "span": null}
{"agent": "local_transform", "error": "OpenAI API error: Error code: 401 - {'error': {'message': 'Authentication Fails, Your api key: ****4973 is invalid', 'type': 'authentication_error', 'param': None, 'code': 'invalid_request_error'}}", "event": "agent_generate_error", "level": "error", "timestamp": "2026-05-11T13:06:43.198612Z", "span": null}
{"provider": "deepseek", "model": "deepseek-chat", "error": "Error code: 401 - {'error': {'message': 'Authentication Fails, Your api key: ****4973 is invalid', 'type': 'authentication_error', 'param': None, 'code': 'invalid_request_error'}}", "event": "llm_error", "level": "error", "timestamp": "2026-05-11T13:06:43.204183Z", "span": null}
{"agent": "local_pattern", "error": "OpenAI API error: Error code: 401 - {'error': {'message': 'Authentication Fails, Your api key: ****4973 is invalid', 'type': 'authentication_error', 'param': None, 'code': 'invalid_request_error'}}", "event": "agent_generate_error", "level": "error", "timestamp": "2026-05-11T13:06:43.204420Z", "span": null}
{"agent": "unary", "error": "unary LLM call failed: OpenAI API error: Error code: 401 - {'error': {'message': 'Authentication Fails, Your api key: ****4973 is invalid', 'type': 'authentication_error', 'param': None, 'code': 'invalid_request_error'}}", "event": "agent_generation_failed", "level": "warning", "timestamp": "2026-05-11T13:06:43.204783Z", "span": null}
{"agent": "cross_compositional", "error": "cross_compositional LLM call failed: OpenAI API error: Error code: 401 - {'error': {'message': 'Authentication Fails, Your api key: ****4973 is invalid', 'type': 'authentication_error', 'param': None, 'code': 'invalid_request_error'}}", "event": "agent_generation_failed", "level": "warning", "timestamp": "2026-05-11T13:06:43.204913Z", "span": null}
{"agent": "aggregation", "error": "aggregation LLM call failed: OpenAI API error: Error code: 401 - {'error': {'message': 'Authentication Fails, Your api key: ****4973 is invalid', 'type': 'authentication_error', 'param': None, 'code': 'invalid_request_error'}}", "event": "agent_generation_failed", "level": "warning", "timestamp": "2026-05-11T13:06:43.205073Z", "span": null}
{"agent": "temporal", "error": "temporal LLM call failed: OpenAI API error: Error code: 401 - {'error': {'message': 'Authentication Fails, Your api key: ****4973 is invalid', 'type': 'authentication_error', 'param': None, 'code': 'invalid_request_error'}}", "event": "agent_generation_failed", "level": "warning", "timestamp": "2026-05-11T13:06:43.205235Z", "span": null}
{"agent": "local_transform", "error": "local_transform LLM call failed: OpenAI API error: Error code: 401 - {'error': {'message': 'Authentication Fails, Your api key: ****4973 is invalid', 'type': 'authentication_error', 'param': None, 'code': 'invalid_request_error'}}", "event": "agent_generation_failed", "level": "warning", "timestamp": "2026-05-11T13:06:43.205388Z", "span": null}
{"agent": "local_pattern", "error": "local_pattern LLM call failed: OpenAI API error: Error code: 401 - {'error': {'message': 'Authentication Fails, Your api key: ****4973 is invalid', 'type': 'authentication_error', 'param': None, 'code': 'invalid_request_error'}}", "event": "agent_generation_failed", "level": "warning", "timestamp": "2026-05-11T13:06:43.205585Z", "span": null}
{"num_specs": 0, "num_selected": 0, "reason": "no_specs_generated", "event": "pipeline_complete", "level": "info", "timestamp": "2026-05-11T13:06:43.205734Z", "span": null}
{"path": "memory_files/agent_memories/unary_memory.json", "num_keys": 7, "event": "memory_save", "level": "debug", "timestamp": "2026-05-11T13:06:43.207271Z", "span": null}
{"path": "memory_files/agent_memories/cross_compositional_memory.json", "num_keys": 7, "event": "memory_save", "level": "debug", "timestamp": "2026-05-11T13:06:43.208384Z", "span": null}
{"path": "memory_files/agent_memories/aggregation_memory.json", "num_keys": 7, "event": "memory_save", "level": "debug", "timestamp": "2026-05-11T13:06:43.209272Z", "span": null}
{"path": "memory_files/agent_memories/temporal_memory.json", "num_keys": 7, "event": "memory_save", "level": "debug", "timestamp": "2026-05-11T13:06:43.209984Z", "span": null}
{"path": "memory_files/agent_memories/local_transform_memory.json", "num_keys": 7, "event": "memory_save", "level": "debug", "timestamp": "2026-05-11T13:06:43.211318Z", "span": null}
{"path": "memory_files/agent_memories/local_pattern_memory.json", "num_keys": 7, "event": "memory_save", "level": "debug", "timestamp": "2026-05-11T13:06:43.212881Z", "span": null}
{"round_idx": 0, "features_generated": 0, "features_selected": 0, "baseline_score": 0.0, "latency_ms": 442.0, "event": "round_complete", "level": "info", "timestamp": "2026-05-11T13:06:43.213334Z", "span": null}
{"total_rounds": 1, "total_features": 0, "latency_ms": 442.7, "event": "iterative_pipeline_complete", "level": "info", "timestamp": "2026-05-11T13:06:43.213576Z", "span": null}
{"num_selected_features": 0, "latency_ms": 444.4, "event": "fit_complete", "level": "info", "timestamp": "2026-05-11T13:06:43.213951Z", "span": null}
{"input_shape": [210, 8], "num_codes": 0, "event": "transform_start", "level": "info", "timestamp": "2026-05-11T13:06:43.214185Z", "span": null}
{"output_shape": [210, 8], "latency_ms": 0.3, "num_failures": 0, "event": "transform_complete", "level": "info", "timestamp": "2026-05-11T13:06:43.214436Z", "span": null}
{"input_shape": [90, 8], "num_codes": 0, "event": "transform_start", "level": "info", "timestamp": "2026-05-11T13:06:43.214609Z", "span": null}
{"output_shape": [90, 8], "latency_ms": 0.2, "num_failures": 0, "event": "transform_complete", "level": "info", "timestamp": "2026-05-11T13:06:43.214778Z", "span": null}
  malmas: 0 features, AUC=0.8938 (+0.0000), 0.44s
{"score": 0.946915, "metric": "auc", "folds": 5, "event": "cv_baseline_score", "level": "info", "timestamp": "2026-05-11T13:06:43.657568Z", "span": null}
{"provider": "deepseek", "model": "deepseek-chat", "num_messages": 1, "temperature": 0.2, "max_tokens": 2048, "event": "llm_request", "level": "info", "timestamp": "2026-05-11T13:06:43.658564Z", "span": null}
{"provider": "deepseek", "model": "deepseek-chat", "error": "Error code: 401 - {'error': {'message': 'Authentication Fails, Your api key: ****4973 is invalid', 'type': 'authentication_error', 'param': None, 'code': 'invalid_request_error'}}", "event": "llm_error", "level": "error", "timestamp": "2026-05-11T13:06:43.794246Z", "span": null}
  caafe: FAILED (OpenAI API error: Error code: 401 - {'error': {'message': 'Authentication Fails, Your api key: ****4973 is invalid', 'type': 'authentication_error', 'param': None, 'code': 'invalid_request_error'}})
{"provider": "deepseek", "model": "deepseek-chat", "num_messages": 1, "temperature": 0.3, "max_tokens": 2048, "event": "llm_request", "level": "info", "timestamp": "2026-05-11T13:06:43.795269Z", "span": null}
{"provider": "deepseek", "model": "deepseek-chat", "error": "Error code: 401 - {'error': {'message': 'Authentication Fails, Your api key: ****4973 is invalid', 'type': 'authentication_error', 'param': None, 'code': 'invalid_request_error'}}", "event": "llm_error", "level": "error", "timestamp": "2026-05-11T13:06:43.895412Z", "span": null}
  llmfe: FAILED (OpenAI API error: Error code: 401 - {'error': {'message': 'Authentication Fails, Your api key: ****4973 is invalid', 'type': 'authentication_error', 'param': None, 'code': 'invalid_request_error'}})
{"provider": "deepseek", "model": "deepseek-chat", "num_messages": 2, "temperature": 0.3, "max_tokens": 4096, "json_mode": true, "event": "llm_request", "level": "info", "timestamp": "2026-05-11T13:06:43.896589Z", "span": null}
{"provider": "deepseek", "model": "deepseek-chat", "json_mode": true, "error": "Error code: 401 - {'error': {'message': 'Authentication Fails, Your api key: ****4973 is invalid', 'type': 'authentication_error', 'param': None, 'code': 'invalid_request_error'}}", "event": "llm_error", "level": "error", "timestamp": "2026-05-11T13:06:44.005166Z", "span": null}
  malmus: FAILED (DeepSeek JSON mode error: Error code: 401 - {'error': {'message': 'Authentication Fails, Your api key: ****4973 is invalid', 'type': 'authentication_error', 'param': None, 'code': 'invalid_request_error'}})

Comparison Summary Table

Code
# One table row per method. Failed methods show "ERROR" / "-" placeholders but
# still report how long they ran before failing.
rows = []
for name, r in results.items():
    succeeded = r["status"] == "ok"
    rows.append({
        "Method": name.upper(),
        "Features": r["n_new_features"] if succeeded else "ERROR",
        "Scripts": r["n_scripts"] if succeeded else "-",
        "AUC": f"{r['enhanced_auc']:.4f}" if succeeded else "-",
        "vs Baseline": f"{r['auc_delta']:+.4f}" if succeeded else "-",
        "Latency (s)": r["latency_s"],
    })

summary_df = pd.DataFrame(rows).set_index("Method")
print(f"Baseline AUC: {baseline_auc:.4f}\n")
# Bare expression so the notebook renders the frame as the cell result.
summary_df
Baseline AUC: 0.8938
Features Scripts AUC vs Baseline Latency (s)
Method
MALMAS 0 0 0.8938 +0.0000 0.44
CAAFE ERROR - - - 0.41
LLMFE ERROR - - - 0.10
MALMUS ERROR - - - 0.11

Artifact Diff Analysis

Code
from feature_forge.artifacts.schema import ArtifactBundle, FeatureMetadata, ProvenanceRecord
from feature_forge.artifacts.diff import ArtifactDiff

# Build one ArtifactBundle per successful method from its raw artifacts dict.
bundles = {}
for name, r in results.items():
    if r["status"] != "ok":
        continue
    arts = r["artifacts"]

    # `or []` also covers a "provenance" key that is present but None, matching
    # how "feature_codes" is handled below. Non-dict entries are ignored, and
    # the filtering is done once for both derived lists.
    prov_records = [p for p in (arts.get("provenance") or []) if isinstance(p, dict)]

    meta_list = [
        FeatureMetadata(
            name=prov.get("feature_name", ""),
            method=name,
            agent=prov.get("source_agent"),
            gain=prov.get("cv_gain"),
            round=prov.get("round_index"),
        )
        for prov in prov_records
    ]

    prov_objs = [
        ProvenanceRecord(
            feature_name=prov.get("feature_name", ""),
            # Fall back to the method key when the record names no source method.
            source_method=prov.get("source_method", name),
            source_agent=prov.get("source_agent"),
            round_index=prov.get("round_index"),
            cv_gain=prov.get("cv_gain"),
        )
        for prov in prov_records
    ]

    bundles[name] = ArtifactBundle(
        method_name=name,
        generated_scripts=arts.get("feature_codes", []) or [],
        feature_metadata=meta_list,
        provenance_records=prov_objs,
    )

# The diff is only run when at least two methods succeeded.
if len(bundles) >= 2:
    diff = ArtifactDiff(bundles)
    diff_summary = diff.summary()
    print(f"Total unique features: {diff_summary['total_unique_features']}")
    print(f"Shared across all: {diff_summary['shared_across_all']}")
    for method, data in diff_summary.get("per_method", {}).items():
        print(f"  {method}: {data['total_features']} features, {data['unique_features']} unique")

    overlap = diff.overlap_matrix()
    if not overlap.empty:
        print("\nFeature Overlap Matrix:")
        print(overlap)
else:
    print("Need >= 2 successful methods for diff analysis")
Need >= 2 successful methods for diff analysis

Plot: Overview

Code
import matplotlib.pyplot as plt


def _label_method_axis(axis, positions, labels, ylabel, title):
    """Apply the shared tick, y-label, and title styling to one panel."""
    axis.set_xticks(positions)
    axis.set_xticklabels(labels, rotation=30)
    axis.set_ylabel(ylabel)
    axis.set_title(title)


# Only methods that finished with status "ok" are plotted; nothing is drawn
# when there are none.
successful = {n: r for n, r in results.items() if r["status"] == "ok"}
if successful:
    names = list(successful)
    aucs = [successful[n]["enhanced_auc"] for n in names]
    deltas = [successful[n]["auc_delta"] for n in names]
    feat_counts = [successful[n]["n_new_features"] for n in names]
    latencies = [successful[n]["latency_s"] for n in names]

    x = range(len(names))
    tick_labels = [n.upper() for n in names]

    fig, axes = plt.subplots(1, 3, figsize=(16, 5))

    # Panel 1: downstream AUC per method, with the baseline as a dashed line.
    # Bars are blue when the method matched or beat the baseline, salmon otherwise.
    ax = axes[0]
    colors = ["steelblue" if d >= 0 else "salmon" for d in deltas]
    bars = ax.bar(x, aucs, color=colors, edgecolor="black", linewidth=0.5)
    ax.axhline(y=baseline_auc, color="gray", linestyle="--", linewidth=1, label=f"Baseline ({baseline_auc:.4f})")
    _label_method_axis(ax, x, tick_labels, "AUC", "Downstream AUC by Method")
    ax.legend(fontsize=8)
    # Print each AUC value just above its bar.
    for bar, val in zip(bars, aucs):
        label_x = bar.get_x() + bar.get_width() / 2
        label_y = bar.get_height() + 0.002
        ax.text(label_x, label_y, f"{val:.4f}", ha="center", va="bottom", fontsize=9)

    # Panel 2: number of newly generated features per method.
    ax = axes[1]
    colors = plt.cm.Set2(np.linspace(0, 1, len(names)))
    ax.bar(x, feat_counts, color=colors, edgecolor="black", linewidth=0.5)
    _label_method_axis(ax, x, tick_labels, "Number of New Features", "Features Generated by Method")

    # Panel 3: recorded latency in seconds per method.
    ax = axes[2]
    ax.bar(x, latencies, color="coral", edgecolor="black", linewidth=0.5)
    _label_method_axis(ax, x, tick_labels, "Seconds", "Fit Latency")

    plt.tight_layout()
    plt.show()

HTML Dashboard

Code
import tempfile

from feature_forge.artifacts import ArtifactDashboard

# Write an HTML dashboard covering every bundle built for a successful method.
if bundles:
    dash = ArtifactDashboard(bundles)
    # Resolve the platform's temporary directory instead of hard-coding "/tmp",
    # which does not exist on Windows. On a default Linux setup this is still
    # "/tmp", so the report lands in the same place as before.
    report_path = os.path.join(tempfile.gettempdir(), "10_method_comparison_dashboard.html")
    dash.save(report_path)
    print(f"Full dashboard: {report_path} ({os.path.getsize(report_path):,} bytes)")
else:
    print("No bundles for dashboard")
Full dashboard: /tmp/10_method_comparison_dashboard.html (2,153 bytes)

Summary

  • MALMAS, CAAFE, LLM-FE, and Malmus each take different approaches to LLM-based feature engineering
  • MALMAS uses a multi-agent router + iterative pipeline with memory
  • CAAFE iteratively prompts with feedback from previous iterations
  • LLM-FE generates all features in a single shot
  • Malmus uses structured JSON output for reliable parsing
  • ArtifactDiff reveals feature overlap and unique contributions per method
  • ArtifactDashboard generates self-contained HTML reports for offline review