Method Comparison

Head-to-head benchmark of MALMAS, CAAFE, LLM-FE, and Malmus on the same dataset with side-by-side metrics.

Published

May 11, 2026

Introduction

This notebook runs all available feature engineering methods — MALMAS (full pipeline), CAAFE, LLM-FE, and Malmus — on the same synthetic classification dataset. For each method it measures downstream model performance (XGBoost AUC on a held-out test split), the number of new features, and fit latency, then produces a side-by-side comparison table, a feature-overlap analysis via ArtifactDiff, and a full HTML dashboard.

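This is the recommended starting point for understanding how the methods differ in practice.

Note: the outputs recorded below come from a run in which every LLM request was rejected with a 401 authentication error. As a result MALMAS generated no features and CAAFE, LLM-FE, and Malmus failed, so the tables show the failure path rather than a real comparison. Re-run the notebook with a valid `FF_LLM__API_KEY` to obtain meaningful numbers.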

Setup

Code

# Standard library: environment access, wall-clock timing, warning control.
import os
import time
import warnings
# Silence all Python warnings for the rest of the session. The filter is
# installed before the third-party imports below, so warnings emitted while
# importing them are hidden as well.
warnings.filterwarnings("ignore")

# Third-party: synthetic data generation, hold-out split, AUC metric, and the
# downstream XGBoost classifier used to score every method.
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

# Banner identifying the notebook in the captured output.
print("Method Comparison: MALMAS vs CAAFE vs LLM-FE vs Malmus")

Method Comparison: MALMAS vs CAAFE vs LLM-FE vs Malmus

Load Data

Code

# Synthetic binary-classification problem: 300 rows and 8 numeric columns,
# generated with a fixed seed so every run sees the same data.
X, y = make_classification(
    n_samples=300,
    n_features=8,
    n_informative=5,
    n_redundant=2,
    n_classes=2,
    random_state=42,
)
# Columns are named f1..fN; the label is stored next to them as "target".
feature_names = [f"f{col}" for col in range(1, X.shape[1] + 1)]
df = pd.DataFrame(X, columns=feature_names).assign(target=y)

# Stratified 70/30 hold-out split shared by the baseline and by every method.
_features = df.drop(columns=["target"])
_labels = df["target"]
X_train, X_test, y_train, y_test = train_test_split(
    _features,
    _labels,
    test_size=0.3,
    random_state=42,
    stratify=_labels,
)
print(f"Train: {X_train.shape}, Test: {X_test.shape}")

Train: (210, 8), Test: (90, 8)

Baseline (No Feature Engineering)

Code

# Reference model: XGBoost fitted on the raw columns only. Its test AUC is the
# yardstick every feature-engineering method is compared against.
baseline_clf = XGBClassifier(
    n_estimators=100,
    max_depth=4,
    random_state=42,
    eval_metric="logloss",
)
baseline_clf.fit(X_train, y_train)

# AUC is computed from the predicted probability of the positive class.
_baseline_scores = baseline_clf.predict_proba(X_test)[:, 1]
baseline_auc = roc_auc_score(y_test, _baseline_scores)
print(f"Baseline AUC (no feature engineering): {baseline_auc:.4f}")

Baseline AUC (no feature engineering): 0.8938

Instantiate Methods

Code

from feature_forge.llm.providers.deepseek import DeepSeekProvider

# Read the API key once so the provider and the MALMAS settings are guaranteed
# to use the same value.
api_key = os.environ.get("FF_LLM__API_KEY", "")
if not api_key:
    # Surface a missing key up front instead of letting it show up later as a
    # stream of authentication errors. `print` is used rather than
    # `warnings.warn` because the setup cell installs
    # `warnings.filterwarnings("ignore")`, which would hide a warning.
    print("WARNING: FF_LLM__API_KEY is not set; LLM requests will be sent with an empty API key.")

# Shared LLM client handed to the CAAFE, LLM-FE, and Malmus baselines.
llm = DeepSeekProvider(
    model="deepseek-chat",
    api_key=api_key,
)

# Method name -> fitted-later feature engineer; insertion order is the run order.
methods = {}

# MALMAS (full pipeline)
from feature_forge.api import MALMASFeatureEngineer
from feature_forge.config import LLMConfig, Settings

config = Settings(
    task="classification", metric="auc", n_rounds=1,
    llm=LLMConfig(model="deepseek-chat", api_key=api_key),
)
methods["malmas"] = MALMASFeatureEngineer(config=config, mode="full")

# The baselines below are best-effort: if one cannot be imported or
# constructed it is reported and left out of the comparison.

# CAAFE
try:
    from feature_forge.baselines.caafe import CAAFEBaseline
    methods["caafe"] = CAAFEBaseline(llm_client=llm, iterations=2, variant="unified")
except Exception as exc:
    print(f"CAAFE skipped: {exc}")

# LLM-FE
try:
    from feature_forge.baselines.llmfe import LLMFEBaseline
    methods["llmfe"] = LLMFEBaseline(llm_client=llm, n_features=5, mode="single_shot")
except Exception as exc:
    print(f"LLM-FE skipped: {exc}")

# Malmus
try:
    from feature_forge.baselines.malmus import MalmusBaseline
    methods["malmus"] = MalmusBaseline(llm_client=llm, n_features=5, mode="single_shot")
except Exception as exc:
    print(f"Malmus skipped: {exc}")

print(f"\nMethods to compare: {list(methods.keys())}")


Methods to compare: ['malmas', 'caafe', 'llmfe', 'malmus']

Run All Methods

Code

# Method name -> result record. Successful runs carry metrics and artifacts;
# failed runs carry only the elapsed time and the error text.
results = {}
for name, method in methods.items():
    t0 = time.perf_counter()
    try:
        method.fit(X_train, y_train)
        # Reported latency covers fit() only, not transform or model training.
        latency = round(time.perf_counter() - t0, 2)

        X_train_enhanced = method.transform(X_train)
        X_test_enhanced = method.transform(X_test)
        # Engineered columns are those not present in the raw test frame.
        new_cols = [c for c in X_test_enhanced.columns if c not in X_test.columns]

        # Same model settings as the baseline cell, so the AUC difference
        # reflects the feature set rather than the model.
        clf = XGBClassifier(n_estimators=100, max_depth=4, random_state=42, eval_metric="logloss")
        clf.fit(X_train_enhanced, y_train)
        enhanced_auc = roc_auc_score(y_test, clf.predict_proba(X_test_enhanced)[:, 1])
        auc_delta = enhanced_auc - baseline_auc

        results[name] = {
            "status": "ok",
            "latency_s": latency,
            "n_new_features": len(new_cols),
            "enhanced_auc": enhanced_auc,
            "auc_delta": auc_delta,
            "n_scripts": len(method.generated_scripts),
            "artifacts": method.get_artifacts(),
        }
        print(f"  {name}: {len(new_cols)} features, AUC={enhanced_auc:.4f} ({auc_delta:+.4f}), {latency}s")
        if not new_cols:
            # fit() can complete without raising even when feature generation
            # failed internally; flag it so the row is not read as a real result.
            print(f"  {name}: WARNING - no new features were produced; the model was trained without any engineered columns")
    except Exception as exc:
        # Keep going so one failing method does not abort the whole comparison.
        latency = round(time.perf_counter() - t0, 2)
        results[name] = {"status": "error", "latency_s": latency, "error": str(exc)}
        print(f"  {name}: FAILED ({exc})")

{"mode": "full", "model": "deepseek-chat", "train_shape": [210, 8], "n_rounds": 1, "event": "fit_start", "level": "info", "timestamp": "2026-05-11T13:06:42.769573Z", "span": null}
{"n_rounds": 1, "task": "classification", "strategy": "hybrid", "event": "iterative_pipeline_start", "level": "info", "timestamp": "2026-05-11T13:06:42.770893Z", "span": null}
{"round_idx": 0, "total_rounds": 1, "event": "round_start", "level": "info", "timestamp": "2026-05-11T13:06:42.771290Z", "span": null}
{"strategy": "hybrid", "round_idx": 0, "selected_agents": ["unary", "cross_compositional", "aggregation", "temporal", "local_transform", "local_pattern"], "event": "router_select_agents", "level": "info", "timestamp": "2026-05-11T13:06:42.771491Z", "span": null}
{"round_idx": 0, "agents": ["unary", "cross_compositional", "aggregation", "temporal", "local_transform", "local_pattern"], "strategy": "hybrid", "event": "agents_selected", "level": "info", "timestamp": "2026-05-11T13:06:42.772625Z", "span": null}
{"path": "memory_files/agent_memories/unary_memory.json", "exists": true, "num_keys": 7, "event": "memory_load", "level": "debug", "timestamp": "2026-05-11T13:06:42.773203Z", "span": null}
{"agent": "unary", "path": "memory_files/agent_memories/unary_memory.json", "event": "agent_memory_initialized", "level": "debug", "timestamp": "2026-05-11T13:06:42.773410Z", "span": null}
{"path": "memory_files/agent_memories/cross_compositional_memory.json", "exists": true, "num_keys": 7, "event": "memory_load", "level": "debug", "timestamp": "2026-05-11T13:06:42.773959Z", "span": null}
{"agent": "cross_compositional", "path": "memory_files/agent_memories/cross_compositional_memory.json", "event": "agent_memory_initialized", "level": "debug", "timestamp": "2026-05-11T13:06:42.774166Z", "span": null}
{"path": "memory_files/agent_memories/aggregation_memory.json", "exists": true, "num_keys": 7, "event": "memory_load", "level": "debug", "timestamp": "2026-05-11T13:06:42.774564Z", "span": null}
{"agent": "aggregation", "path": "memory_files/agent_memories/aggregation_memory.json", "event": "agent_memory_initialized", "level": "debug", "timestamp": "2026-05-11T13:06:42.774744Z", "span": null}
{"path": "memory_files/agent_memories/temporal_memory.json", "exists": true, "num_keys": 7, "event": "memory_load", "level": "debug", "timestamp": "2026-05-11T13:06:42.775203Z", "span": null}
{"agent": "temporal", "path": "memory_files/agent_memories/temporal_memory.json", "event": "agent_memory_initialized", "level": "debug", "timestamp": "2026-05-11T13:06:42.775367Z", "span": null}
{"path": "memory_files/agent_memories/local_transform_memory.json", "exists": true, "num_keys": 7, "event": "memory_load", "level": "debug", "timestamp": "2026-05-11T13:06:42.775870Z", "span": null}
{"agent": "local_transform", "path": "memory_files/agent_memories/local_transform_memory.json", "event": "agent_memory_initialized", "level": "debug", "timestamp": "2026-05-11T13:06:42.776018Z", "span": null}
{"path": "memory_files/agent_memories/local_pattern_memory.json", "exists": true, "num_keys": 7, "event": "memory_load", "level": "debug", "timestamp": "2026-05-11T13:06:42.776843Z", "span": null}
{"agent": "local_pattern", "path": "memory_files/agent_memories/local_pattern_memory.json", "event": "agent_memory_initialized", "level": "debug", "timestamp": "2026-05-11T13:06:42.777046Z", "span": null}
{"agents": ["unary", "cross_compositional", "aggregation", "temporal", "local_transform", "local_pattern"], "num_agents": 6, "train_shape": [210, 8], "event": "pipeline_start", "level": "info", "timestamp": "2026-05-11T13:06:42.777340Z", "span": null}
{"agent": "unary", "num_columns": 8, "round_idx": 0, "event": "agent_generate_start", "level": "info", "timestamp": "2026-05-11T13:06:42.778535Z", "span": null}
{"provider": "deepseek", "model": "deepseek-chat", "num_messages": 2, "temperature": 0.2, "max_tokens": 4096, "event": "llm_request", "level": "info", "timestamp": "2026-05-11T13:06:42.778777Z", "span": null}
{"agent": "cross_compositional", "num_columns": 8, "round_idx": 0, "event": "agent_generate_start", "level": "info", "timestamp": "2026-05-11T13:06:42.858488Z", "span": null}
{"provider": "deepseek", "model": "deepseek-chat", "num_messages": 2, "temperature": 0.2, "max_tokens": 4096, "event": "llm_request", "level": "info", "timestamp": "2026-05-11T13:06:42.859128Z", "span": null}
{"agent": "aggregation", "num_columns": 8, "round_idx": 0, "event": "agent_generate_start", "level": "info", "timestamp": "2026-05-11T13:06:42.860590Z", "span": null}
{"provider": "deepseek", "model": "deepseek-chat", "num_messages": 2, "temperature": 0.2, "max_tokens": 4096, "event": "llm_request", "level": "info", "timestamp": "2026-05-11T13:06:42.861132Z", "span": null}
{"provider": "deepseek", "model": "deepseek-chat", "error": "Error code: 401 - {'error': {'message': 'Authentication Fails, Your api key: ****4973 is invalid', 'type': 'authentication_error', 'param': None, 'code': 'invalid_request_error'}}", "event": "llm_error", "level": "error", "timestamp": "2026-05-11T13:06:43.063488Z", "span": null}
{"agent": "unary", "error": "OpenAI API error: Error code: 401 - {'error': {'message': 'Authentication Fails, Your api key: ****4973 is invalid', 'type': 'authentication_error', 'param': None, 'code': 'invalid_request_error'}}", "event": "agent_generate_error", "level": "error", "timestamp": "2026-05-11T13:06:43.063890Z", "span": null}
{"agent": "temporal", "num_columns": 8, "round_idx": 0, "event": "agent_generate_start", "level": "info", "timestamp": "2026-05-11T13:06:43.065076Z", "span": null}
{"provider": "deepseek", "model": "deepseek-chat", "num_messages": 2, "temperature": 0.2, "max_tokens": 4096, "event": "llm_request", "level": "info", "timestamp": "2026-05-11T13:06:43.065310Z", "span": null}
{"provider": "deepseek", "model": "deepseek-chat", "error": "Error code: 401 - {'error': {'message': 'Authentication Fails, Your api key: ****4973 is invalid', 'type': 'authentication_error', 'param': None, 'code': 'invalid_request_error'}}", "event": "llm_error", "level": "error", "timestamp": "2026-05-11T13:06:43.091846Z", "span": null}
{"agent": "cross_compositional", "error": "OpenAI API error: Error code: 401 - {'error': {'message': 'Authentication Fails, Your api key: ****4973 is invalid', 'type': 'authentication_error', 'param': None, 'code': 'invalid_request_error'}}", "event": "agent_generate_error", "level": "error", "timestamp": "2026-05-11T13:06:43.092217Z", "span": null}
{"agent": "local_transform", "num_columns": 8, "round_idx": 0, "event": "agent_generate_start", "level": "info", "timestamp": "2026-05-11T13:06:43.093608Z", "span": null}
{"provider": "deepseek", "model": "deepseek-chat", "num_messages": 2, "temperature": 0.2, "max_tokens": 4096, "event": "llm_request", "level": "info", "timestamp": "2026-05-11T13:06:43.093819Z", "span": null}
{"provider": "deepseek", "model": "deepseek-chat", "error": "Error code: 401 - {'error': {'message': 'Authentication Fails, Your api key: ****4973 is invalid', 'type': 'authentication_error', 'param': None, 'code': 'invalid_request_error'}}", "event": "llm_error", "level": "error", "timestamp": "2026-05-11T13:06:43.094900Z", "span": null}
{"agent": "aggregation", "error": "OpenAI API error: Error code: 401 - {'error': {'message': 'Authentication Fails, Your api key: ****4973 is invalid', 'type': 'authentication_error', 'param': None, 'code': 'invalid_request_error'}}", "event": "agent_generate_error", "level": "error", "timestamp": "2026-05-11T13:06:43.095133Z", "span": null}
{"agent": "local_pattern", "num_columns": 8, "round_idx": 0, "event": "agent_generate_start", "level": "info", "timestamp": "2026-05-11T13:06:43.096040Z", "span": null}
{"provider": "deepseek", "model": "deepseek-chat", "num_messages": 2, "temperature": 0.2, "max_tokens": 4096, "event": "llm_request", "level": "info", "timestamp": "2026-05-11T13:06:43.096265Z", "span": null}
{"provider": "deepseek", "model": "deepseek-chat", "error": "Error code: 401 - {'error': {'message': 'Authentication Fails, Your api key: ****4973 is invalid', 'type': 'authentication_error', 'param': None, 'code': 'invalid_request_error'}}", "event": "llm_error", "level": "error", "timestamp": "2026-05-11T13:06:43.171169Z", "span": null}
{"agent": "temporal", "error": "OpenAI API error: Error code: 401 - {'error': {'message': 'Authentication Fails, Your api key: ****4973 is invalid', 'type': 'authentication_error', 'param': None, 'code': 'invalid_request_error'}}", "event": "agent_generate_error", "level": "error", "timestamp": "2026-05-11T13:06:43.171511Z", "span": null}
{"provider": "deepseek", "model": "deepseek-chat", "error": "Error code: 401 - {'error': {'message': 'Authentication Fails, Your api key: ****4973 is invalid', 'type': 'authentication_error', 'param': None, 'code': 'invalid_request_error'}}", "event": "llm_error", "level": "error", "timestamp": "2026-05-11T13:06:43.198317Z", "span": null}
{"agent": "local_transform", "error": "OpenAI API error: Error code: 401 - {'error': {'message': 'Authentication Fails, Your api key: ****4973 is invalid', 'type': 'authentication_error', 'param': None, 'code': 'invalid_request_error'}}", "event": "agent_generate_error", "level": "error", "timestamp": "2026-05-11T13:06:43.198612Z", "span": null}
{"provider": "deepseek", "model": "deepseek-chat", "error": "Error code: 401 - {'error': {'message': 'Authentication Fails, Your api key: ****4973 is invalid', 'type': 'authentication_error', 'param': None, 'code': 'invalid_request_error'}}", "event": "llm_error", "level": "error", "timestamp": "2026-05-11T13:06:43.204183Z", "span": null}
{"agent": "local_pattern", "error": "OpenAI API error: Error code: 401 - {'error': {'message': 'Authentication Fails, Your api key: ****4973 is invalid', 'type': 'authentication_error', 'param': None, 'code': 'invalid_request_error'}}", "event": "agent_generate_error", "level": "error", "timestamp": "2026-05-11T13:06:43.204420Z", "span": null}
{"agent": "unary", "error": "unary LLM call failed: OpenAI API error: Error code: 401 - {'error': {'message': 'Authentication Fails, Your api key: ****4973 is invalid', 'type': 'authentication_error', 'param': None, 'code': 'invalid_request_error'}}", "event": "agent_generation_failed", "level": "warning", "timestamp": "2026-05-11T13:06:43.204783Z", "span": null}
{"agent": "cross_compositional", "error": "cross_compositional LLM call failed: OpenAI API error: Error code: 401 - {'error': {'message': 'Authentication Fails, Your api key: ****4973 is invalid', 'type': 'authentication_error', 'param': None, 'code': 'invalid_request_error'}}", "event": "agent_generation_failed", "level": "warning", "timestamp": "2026-05-11T13:06:43.204913Z", "span": null}
{"agent": "aggregation", "error": "aggregation LLM call failed: OpenAI API error: Error code: 401 - {'error': {'message': 'Authentication Fails, Your api key: ****4973 is invalid', 'type': 'authentication_error', 'param': None, 'code': 'invalid_request_error'}}", "event": "agent_generation_failed", "level": "warning", "timestamp": "2026-05-11T13:06:43.205073Z", "span": null}
{"agent": "temporal", "error": "temporal LLM call failed: OpenAI API error: Error code: 401 - {'error': {'message': 'Authentication Fails, Your api key: ****4973 is invalid', 'type': 'authentication_error', 'param': None, 'code': 'invalid_request_error'}}", "event": "agent_generation_failed", "level": "warning", "timestamp": "2026-05-11T13:06:43.205235Z", "span": null}
{"agent": "local_transform", "error": "local_transform LLM call failed: OpenAI API error: Error code: 401 - {'error': {'message': 'Authentication Fails, Your api key: ****4973 is invalid', 'type': 'authentication_error', 'param': None, 'code': 'invalid_request_error'}}", "event": "agent_generation_failed", "level": "warning", "timestamp": "2026-05-11T13:06:43.205388Z", "span": null}
{"agent": "local_pattern", "error": "local_pattern LLM call failed: OpenAI API error: Error code: 401 - {'error': {'message': 'Authentication Fails, Your api key: ****4973 is invalid', 'type': 'authentication_error', 'param': None, 'code': 'invalid_request_error'}}", "event": "agent_generation_failed", "level": "warning", "timestamp": "2026-05-11T13:06:43.205585Z", "span": null}
{"num_specs": 0, "num_selected": 0, "reason": "no_specs_generated", "event": "pipeline_complete", "level": "info", "timestamp": "2026-05-11T13:06:43.205734Z", "span": null}
{"path": "memory_files/agent_memories/unary_memory.json", "num_keys": 7, "event": "memory_save", "level": "debug", "timestamp": "2026-05-11T13:06:43.207271Z", "span": null}
{"path": "memory_files/agent_memories/cross_compositional_memory.json", "num_keys": 7, "event": "memory_save", "level": "debug", "timestamp": "2026-05-11T13:06:43.208384Z", "span": null}
{"path": "memory_files/agent_memories/aggregation_memory.json", "num_keys": 7, "event": "memory_save", "level": "debug", "timestamp": "2026-05-11T13:06:43.209272Z", "span": null}
{"path": "memory_files/agent_memories/temporal_memory.json", "num_keys": 7, "event": "memory_save", "level": "debug", "timestamp": "2026-05-11T13:06:43.209984Z", "span": null}
{"path": "memory_files/agent_memories/local_transform_memory.json", "num_keys": 7, "event": "memory_save", "level": "debug", "timestamp": "2026-05-11T13:06:43.211318Z", "span": null}
{"path": "memory_files/agent_memories/local_pattern_memory.json", "num_keys": 7, "event": "memory_save", "level": "debug", "timestamp": "2026-05-11T13:06:43.212881Z", "span": null}
{"round_idx": 0, "features_generated": 0, "features_selected": 0, "baseline_score": 0.0, "latency_ms": 442.0, "event": "round_complete", "level": "info", "timestamp": "2026-05-11T13:06:43.213334Z", "span": null}
{"total_rounds": 1, "total_features": 0, "latency_ms": 442.7, "event": "iterative_pipeline_complete", "level": "info", "timestamp": "2026-05-11T13:06:43.213576Z", "span": null}
{"num_selected_features": 0, "latency_ms": 444.4, "event": "fit_complete", "level": "info", "timestamp": "2026-05-11T13:06:43.213951Z", "span": null}
{"input_shape": [210, 8], "num_codes": 0, "event": "transform_start", "level": "info", "timestamp": "2026-05-11T13:06:43.214185Z", "span": null}
{"output_shape": [210, 8], "latency_ms": 0.3, "num_failures": 0, "event": "transform_complete", "level": "info", "timestamp": "2026-05-11T13:06:43.214436Z", "span": null}
{"input_shape": [90, 8], "num_codes": 0, "event": "transform_start", "level": "info", "timestamp": "2026-05-11T13:06:43.214609Z", "span": null}
{"output_shape": [90, 8], "latency_ms": 0.2, "num_failures": 0, "event": "transform_complete", "level": "info", "timestamp": "2026-05-11T13:06:43.214778Z", "span": null}
  malmas: 0 features, AUC=0.8938 (+0.0000), 0.44s
{"score": 0.946915, "metric": "auc", "folds": 5, "event": "cv_baseline_score", "level": "info", "timestamp": "2026-05-11T13:06:43.657568Z", "span": null}
{"provider": "deepseek", "model": "deepseek-chat", "num_messages": 1, "temperature": 0.2, "max_tokens": 2048, "event": "llm_request", "level": "info", "timestamp": "2026-05-11T13:06:43.658564Z", "span": null}
{"provider": "deepseek", "model": "deepseek-chat", "error": "Error code: 401 - {'error': {'message': 'Authentication Fails, Your api key: ****4973 is invalid', 'type': 'authentication_error', 'param': None, 'code': 'invalid_request_error'}}", "event": "llm_error", "level": "error", "timestamp": "2026-05-11T13:06:43.794246Z", "span": null}
  caafe: FAILED (OpenAI API error: Error code: 401 - {'error': {'message': 'Authentication Fails, Your api key: ****4973 is invalid', 'type': 'authentication_error', 'param': None, 'code': 'invalid_request_error'}})
{"provider": "deepseek", "model": "deepseek-chat", "num_messages": 1, "temperature": 0.3, "max_tokens": 2048, "event": "llm_request", "level": "info", "timestamp": "2026-05-11T13:06:43.795269Z", "span": null}
{"provider": "deepseek", "model": "deepseek-chat", "error": "Error code: 401 - {'error': {'message': 'Authentication Fails, Your api key: ****4973 is invalid', 'type': 'authentication_error', 'param': None, 'code': 'invalid_request_error'}}", "event": "llm_error", "level": "error", "timestamp": "2026-05-11T13:06:43.895412Z", "span": null}
  llmfe: FAILED (OpenAI API error: Error code: 401 - {'error': {'message': 'Authentication Fails, Your api key: ****4973 is invalid', 'type': 'authentication_error', 'param': None, 'code': 'invalid_request_error'}})
{"provider": "deepseek", "model": "deepseek-chat", "num_messages": 2, "temperature": 0.3, "max_tokens": 4096, "json_mode": true, "event": "llm_request", "level": "info", "timestamp": "2026-05-11T13:06:43.896589Z", "span": null}
{"provider": "deepseek", "model": "deepseek-chat", "json_mode": true, "error": "Error code: 401 - {'error': {'message': 'Authentication Fails, Your api key: ****4973 is invalid', 'type': 'authentication_error', 'param': None, 'code': 'invalid_request_error'}}", "event": "llm_error", "level": "error", "timestamp": "2026-05-11T13:06:44.005166Z", "span": null}
  malmus: FAILED (DeepSeek JSON mode error: Error code: 401 - {'error': {'message': 'Authentication Fails, Your api key: ****4973 is invalid', 'type': 'authentication_error', 'param': None, 'code': 'invalid_request_error'}})

Comparison Summary Table

Code

def _summary_row(method_name, outcome):
    """Build one display row of the comparison table from a result record.

    Successful records expose their metrics; any other status is rendered with
    placeholder cells and only the measured latency.
    """
    succeeded = outcome["status"] == "ok"
    return {
        "Method": method_name.upper(),
        "Features": outcome["n_new_features"] if succeeded else "ERROR",
        "Scripts": outcome["n_scripts"] if succeeded else "-",
        "AUC": f"{outcome['enhanced_auc']:.4f}" if succeeded else "-",
        "vs Baseline": f"{outcome['auc_delta']:+.4f}" if succeeded else "-",
        "Latency (s)": outcome["latency_s"],
    }


rows = [_summary_row(name, r) for name, r in results.items()]

summary_df = pd.DataFrame(rows).set_index("Method")
print(f"Baseline AUC: {baseline_auc:.4f}\n")
# Last expression of the cell: rendered as the table output.
summary_df

Baseline AUC: 0.8938

	Features	Scripts	AUC	vs Baseline	Latency (s)
Method
MALMAS	0	0	0.8938	+0.0000	0.44
CAAFE	ERROR	-	-	-	0.41
LLMFE	ERROR	-	-	-	0.10
MALMUS	ERROR	-	-	-	0.11

Artifact Diff Analysis

Code

from feature_forge.artifacts.schema import ArtifactBundle, FeatureMetadata, ProvenanceRecord
from feature_forge.artifacts.diff import ArtifactDiff

# Method name -> ArtifactBundle, built only for methods that ran successfully.
bundles = {}
for name, r in results.items():
    if r["status"] != "ok":
        continue
    arts = r["artifacts"]

    # `or []` treats a present-but-None "provenance" entry like a missing one,
    # matching the guard already applied to "feature_codes" below. Non-dict
    # entries are ignored, and the filtering is done once for both lists.
    prov_records = [p for p in (arts.get("provenance") or []) if isinstance(p, dict)]

    # Per-feature metadata derived from the provenance dictionaries.
    meta_list = [
        FeatureMetadata(
            name=prov.get("feature_name", ""),
            method=name,
            agent=prov.get("source_agent"),
            gain=prov.get("cv_gain"),
            round=prov.get("round_index"),
        )
        for prov in prov_records
    ]

    # Typed provenance records; the method name is the fallback source.
    prov_objs = [
        ProvenanceRecord(
            feature_name=prov.get("feature_name", ""),
            source_method=prov.get("source_method", name),
            source_agent=prov.get("source_agent"),
            round_index=prov.get("round_index"),
            cv_gain=prov.get("cv_gain"),
        )
        for prov in prov_records
    ]

    bundles[name] = ArtifactBundle(
        method_name=name,
        generated_scripts=arts.get("feature_codes", []) or [],
        feature_metadata=meta_list,
        provenance_records=prov_objs,
    )

# A diff needs at least two bundles to compare.
if len(bundles) >= 2:
    diff = ArtifactDiff(bundles)
    diff_summary = diff.summary()
    print(f"Total unique features: {diff_summary['total_unique_features']}")
    print(f"Shared across all: {diff_summary['shared_across_all']}")
    for method, data in diff_summary.get("per_method", {}).items():
        print(f"  {method}: {data['total_features']} features, {data['unique_features']} unique")

    overlap = diff.overlap_matrix()
    if not overlap.empty:
        print("\nFeature Overlap Matrix:")
        print(overlap)
else:
    print("Need >= 2 successful methods for diff analysis")

Need >= 2 successful methods for diff analysis

Plot: Overview

Code

import matplotlib.pyplot as plt

# Only methods that completed successfully are plotted.
successful = {n: r for n, r in results.items() if r["status"] == "ok"}
if successful:
    names = list(successful)
    tick_labels = [n.upper() for n in names]
    positions = range(len(names))

    def _column(key):
        """Collect one result field for every plotted method, in plot order."""
        return [successful[n][key] for n in names]

    def _decorate(ax, ylabel, title):
        """Apply the shared tick, label, and title styling to one panel."""
        ax.set_xticks(positions)
        ax.set_xticklabels(tick_labels, rotation=30)
        ax.set_ylabel(ylabel)
        ax.set_title(title)

    aucs = _column("enhanced_auc")
    deltas = _column("auc_delta")
    feat_counts = _column("n_new_features")
    latencies = _column("latency_s")

    fig, axes = plt.subplots(1, 3, figsize=(16, 5))
    auc_ax, count_ax, latency_ax = axes

    # Panel 1: downstream AUC, coloured by whether the method matched or beat
    # the baseline, with the baseline drawn as a dashed reference line.
    delta_colors = ["steelblue" if d >= 0 else "salmon" for d in deltas]
    bars = auc_ax.bar(positions, aucs, color=delta_colors, edgecolor="black", linewidth=0.5)
    auc_ax.axhline(
        y=baseline_auc,
        color="gray",
        linestyle="--",
        linewidth=1,
        label=f"Baseline ({baseline_auc:.4f})",
    )
    _decorate(auc_ax, "AUC", "Downstream AUC by Method")
    auc_ax.legend(fontsize=8)
    for bar, val in zip(bars, aucs):
        # Value label centred just above each bar.
        label_x = bar.get_x() + bar.get_width() / 2
        label_y = bar.get_height() + 0.002
        auc_ax.text(label_x, label_y, f"{val:.4f}", ha="center", va="bottom", fontsize=9)

    # Panel 2: number of new features per method.
    palette = plt.cm.Set2(np.linspace(0, 1, len(names)))
    count_ax.bar(positions, feat_counts, color=palette, edgecolor="black", linewidth=0.5)
    _decorate(count_ax, "Number of New Features", "Features Generated by Method")

    # Panel 3: fit latency in seconds.
    latency_ax.bar(positions, latencies, color="coral", edgecolor="black", linewidth=0.5)
    _decorate(latency_ax, "Seconds", "Fit Latency")

    plt.tight_layout()
    plt.show()

HTML Dashboard

Code

import tempfile

from feature_forge.artifacts import ArtifactDashboard

# Write the HTML report only when at least one method produced a bundle.
if bundles:
    dash = ArtifactDashboard(bundles)
    # Resolve the temporary directory at run time instead of hard-coding
    # "/tmp", which does not exist on every platform.
    report_path = os.path.join(tempfile.gettempdir(), "10_method_comparison_dashboard.html")
    dash.save(report_path)
    print(f"Full dashboard: {report_path} ({os.path.getsize(report_path):,} bytes)")
else:
    print("No bundles for dashboard")

Full dashboard: /tmp/10_method_comparison_dashboard.html (2,153 bytes)

Summary

- MALMAS, CAAFE, LLM-FE, and Malmus each take a different approach to LLM-based feature engineering.
- MALMAS uses a multi-agent router and an iterative pipeline with memory.
- CAAFE iteratively prompts with feedback from previous iterations.
- LLM-FE generates all features in a single shot (the `single_shot` mode used here).
- Malmus uses structured JSON output for reliable parsing.
- ArtifactDiff reveals feature overlap and unique contributions per method; it needs at least two successful methods.
- ArtifactDashboard generates self-contained HTML reports for offline review.