Novelty Evaluation

novelentitymatcher.novelty.evaluation.evaluator

Unified novelty detection evaluator.

Supports both benchmark and research evaluation modes with comprehensive metrics and reporting.

Classes

NoveltyEvaluator(mode='benchmark', metrics=None)

Unified evaluator for novelty detection.

Supports two modes:

- benchmark: Quick evaluation on OOD splits with core metrics
- research: Comprehensive evaluation with confusion matrices and threshold sweeping

Metrics computed:

- AUROC, AUPRC
- Detection rates at 1%, 5%, 10% FPR
- Precision, Recall, F1 at optimal threshold

Parameters:

- mode (Literal['benchmark', 'research'], default 'benchmark'): Evaluation mode ('benchmark' or 'research').
- metrics (list[str] | None, default None): List of metrics to compute (None for default based on mode).
Source code in src/novelentitymatcher/novelty/evaluation/evaluator.py
def __init__(
    self,
    mode: Literal["benchmark", "research"] = "benchmark",
    metrics: list[str] | None = None,
):
    """
    Initialize the evaluator.

    Args:
        mode: Evaluation mode ('benchmark' or 'research')
        metrics: List of metrics to compute (None for default based on mode)
    """
    self.mode = mode
    self.metrics = metrics or self._default_metrics_for_mode(mode)
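A minimal construction sketch, assuming the module path shown above is importable as-is; the metric names follow those accepted by evaluate() below:

```python
from novelentitymatcher.novelty.evaluation.evaluator import NoveltyEvaluator

# Benchmark mode with the default metric set for that mode
evaluator = NoveltyEvaluator(mode="benchmark")

# Research mode with an explicit subset of metrics
research_evaluator = NoveltyEvaluator(
    mode="research",
    metrics=["auroc", "auprc", "precision", "recall", "f1"],
)
```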
Functions
evaluate(novelty_scores, is_novel_true, threshold=None)

Evaluate novelty detection performance.

Parameters:

- novelty_scores (ndarray, required): Predicted novelty scores (higher = more novel).
- is_novel_true (ndarray, required): Ground truth novelty labels (True = novel).
- threshold (float | None, default None): Optional threshold for discrete predictions.

Returns:

- dict[str, float]: Dictionary of metric name -> value.

Source code in src/novelentitymatcher/novelty/evaluation/evaluator.py
def evaluate(
    self,
    novelty_scores: np.ndarray,
    is_novel_true: np.ndarray,
    threshold: float | None = None,
) -> dict[str, float]:
    """
    Evaluate novelty detection performance.

    Args:
        novelty_scores: Predicted novelty scores (higher = more novel)
        is_novel_true: Ground truth novelty labels (True = novel)
        threshold: Optional threshold for discrete predictions

    Returns:
        Dictionary of metric name -> value
    """
    scores = np.asarray(novelty_scores)
    labels = np.asarray(is_novel_true, dtype=bool)

    results = {}

    # AUROC and AUPRC
    if "auroc" in self.metrics:
        results["auroc"] = compute_auroc(scores, labels)

    if "auprc" in self.metrics:
        results["auprc"] = compute_auprc(scores, labels)

    # Detection rates at various FPR thresholds
    if any(m.startswith("detection_rate_") for m in self.metrics):
        dr_metrics = [m for m in self.metrics if m.startswith("detection_rate_")]
        fpr_thresholds = []
        for m in dr_metrics:
            if m == "detection_rate_1":
                fpr_thresholds.append(0.01)
            elif m == "detection_rate_5":
                fpr_thresholds.append(0.05)
            elif m == "detection_rate_10":
                fpr_thresholds.append(0.10)

        if fpr_thresholds:
            detection_rates = compute_detection_rates(
                scores, labels, tuple(fpr_thresholds)
            )
            results.update(detection_rates)

    # Precision, Recall, F1
    if any(m in ["precision", "recall", "f1"] for m in self.metrics):
        prf_results = compute_precision_recall_f1(scores, labels, threshold)
        if "precision" in self.metrics:
            results["precision"] = prf_results["precision"]
        if "recall" in self.metrics:
            results["recall"] = prf_results["recall"]
        if "f1" in self.metrics:
            results["f1"] = prf_results["f1"]
        results["optimal_threshold"] = prf_results["threshold"]

    return results
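A hedged example on synthetic scores (arrays invented for illustration); exactly which keys appear depends on the metric set configured for the chosen mode:

```python
import numpy as np
from novelentitymatcher.novelty.evaluation.evaluator import NoveltyEvaluator

scores = np.array([0.1, 0.2, 0.8, 0.9, 0.3, 0.7])   # higher = more novel
is_novel = np.array([False, False, True, True, False, True])

evaluator = NoveltyEvaluator(mode="benchmark")
results = evaluator.evaluate(scores, is_novel)
print(results)  # e.g. {'auroc': 1.0, 'auprc': 1.0, ...} for this perfectly separable toy data
```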
create_report(novelty_scores, is_novel_true, threshold=None)

Create a comprehensive evaluation report.

Parameters:

- novelty_scores (ndarray, required): Predicted novelty scores (higher = more novel).
- is_novel_true (ndarray, required): Ground truth novelty labels (True = novel).
- threshold (float | None, default None): Optional threshold for discrete predictions.

Returns:

- EvaluationReport: Report with all metrics.

Source code in src/novelentitymatcher/novelty/evaluation/evaluator.py
def create_report(
    self,
    novelty_scores: np.ndarray,
    is_novel_true: np.ndarray,
    threshold: float | None = None,
) -> EvaluationReport:
    """
    Create a comprehensive evaluation report.

    Args:
        novelty_scores: Predicted novelty scores (higher = more novel)
        is_novel_true: Ground truth novelty labels (True = novel)
        threshold: Optional threshold for discrete predictions

    Returns:
        EvaluationReport with all metrics
    """
    scores = np.asarray(novelty_scores)
    labels = np.asarray(is_novel_true, dtype=bool)

    # Compute all metrics
    auroc = compute_auroc(scores, labels)
    auprc = compute_auprc(scores, labels)

    detection_rates = compute_detection_rates(scores, labels)
    dr_at_1 = detection_rates.get("detection_rate_1", 0.0)
    dr_at_5 = detection_rates.get("detection_rate_5", 0.0)
    dr_at_10 = detection_rates.get("detection_rate_10", 0.0)

    prf_results = compute_precision_recall_f1(scores, labels, threshold)
    optimal_threshold = prf_results["threshold"]

    # Confusion matrix
    cm = compute_confusion_matrix(scores, labels, optimal_threshold)

    return EvaluationReport(
        auroc=auroc,
        auprc=auprc,
        detection_rate_at_1=dr_at_1,
        detection_rate_at_5=dr_at_5,
        detection_rate_at_10=dr_at_10,
        precision=prf_results["precision"],
        recall=prf_results["recall"],
        f1=prf_results["f1"],
        optimal_threshold=optimal_threshold,
        confusion_matrix=cm,
        num_samples=len(scores),
        num_novel=int(np.sum(labels)),
        timestamp=datetime.now().isoformat(),
    )
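A short sketch of producing a full report; attribute names are taken from the EvaluationReport constructor call in the source above:

```python
import numpy as np
from novelentitymatcher.novelty.evaluation.evaluator import NoveltyEvaluator

scores = np.array([0.05, 0.2, 0.85, 0.9, 0.4, 0.75])
is_novel = np.array([False, False, True, True, False, True])

report = NoveltyEvaluator(mode="research").create_report(scores, is_novel)
print(report.auroc, report.f1, report.optimal_threshold, report.confusion_matrix)
```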
sweep_thresholds(novelty_scores, is_novel_true, num_thresholds=100)

Sweep across thresholds and compute metrics at each.

Parameters:

- novelty_scores (ndarray, required): Predicted novelty scores (higher = more novel).
- is_novel_true (ndarray, required): Ground truth novelty labels (True = novel).
- num_thresholds (int, default 100): Number of thresholds to evaluate.

Returns:

- dict[str, ndarray]: Dict with arrays for thresholds and metrics.

Source code in src/novelentitymatcher/novelty/evaluation/evaluator.py
def sweep_thresholds(
    self,
    novelty_scores: np.ndarray,
    is_novel_true: np.ndarray,
    num_thresholds: int = 100,
) -> dict[str, np.ndarray]:
    """
    Sweep across thresholds and compute metrics at each.

    Args:
        novelty_scores: Predicted novelty scores (higher = more novel)
        is_novel_true: Ground truth novelty labels (True = novel)
        num_thresholds: Number of thresholds to evaluate

    Returns:
        Dict with arrays for thresholds and metrics
    """
    from .metrics import sweep_thresholds

    thresholds = np.linspace(0, 1, num_thresholds)
    return sweep_thresholds(novelty_scores, is_novel_true, thresholds)
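A sketch of using the sweep to pick the threshold that maximizes F1 (synthetic data; key names follow the return dict of the module-level sweep_thresholds documented further below):

```python
import numpy as np
from novelentitymatcher.novelty.evaluation.evaluator import NoveltyEvaluator

rng = np.random.default_rng(1)
scores = np.concatenate([rng.uniform(0.0, 0.5, 200), rng.uniform(0.5, 1.0, 50)])
labels = np.concatenate([np.zeros(200, dtype=bool), np.ones(50, dtype=bool)])

sweep = NoveltyEvaluator(mode="research").sweep_thresholds(scores, labels, num_thresholds=50)
best = int(np.argmax(sweep["f1"]))
print(sweep["thresholds"][best], sweep["f1"][best])
```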
compare_thresholds(novelty_scores, is_novel_true, thresholds)

Compare metrics at specific thresholds.

Parameters:

- novelty_scores (ndarray, required): Predicted novelty scores (higher = more novel).
- is_novel_true (ndarray, required): Ground truth novelty labels (True = novel).
- thresholds (list[float], required): List of thresholds to evaluate.

Returns:

- list[dict[str, float]]: List of dicts with metrics at each threshold.

Source code in src/novelentitymatcher/novelty/evaluation/evaluator.py
def compare_thresholds(
    self,
    novelty_scores: np.ndarray,
    is_novel_true: np.ndarray,
    thresholds: list[float],
) -> list[dict[str, float]]:
    """
    Compare metrics at specific thresholds.

    Args:
        novelty_scores: Predicted novelty scores (higher = more novel)
        is_novel_true: Ground truth novelty labels (True = novel)
        thresholds: List of thresholds to evaluate

    Returns:
        List of dicts with metrics at each threshold
    """
    results = []
    for thresh in thresholds:
        metrics = self.evaluate(novelty_scores, is_novel_true, threshold=thresh)
        metrics["threshold"] = thresh
        results.append(metrics)
    return results
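A brief example comparing a few hand-picked thresholds; the metric list is passed explicitly so that precision, recall, and F1 are guaranteed to be computed:

```python
import numpy as np
from novelentitymatcher.novelty.evaluation.evaluator import NoveltyEvaluator

scores = np.array([0.15, 0.35, 0.55, 0.75, 0.95])
labels = np.array([False, False, True, True, True])

evaluator = NoveltyEvaluator(metrics=["precision", "recall", "f1"])
for row in evaluator.compare_thresholds(scores, labels, thresholds=[0.3, 0.5, 0.7]):
    print(row["threshold"], row["precision"], row["recall"], row["f1"])
```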


novelentitymatcher.novelty.evaluation.metrics

Metric computations for novelty detection evaluation.

Provides functions for computing AUROC, AUPRC, detection rates, precision, recall, F1, and confusion matrices.

Functions

compute_auroc(scores, labels)

Compute Area Under ROC Curve.

Parameters:

- scores (ndarray, required): Predicted novelty scores (higher = more novel).
- labels (ndarray, required): Ground truth labels (True = novel).

Returns:

- float: AUROC score (0-1, 0.5 = random).

Source code in src/novelentitymatcher/novelty/evaluation/metrics.py
def compute_auroc(scores: np.ndarray, labels: np.ndarray) -> float:
    """
    Compute Area Under ROC Curve.

    Args:
        scores: Predicted novelty scores (higher = more novel)
        labels: Ground truth labels (True = novel)

    Returns:
        AUROC score (0-1, 0.5 = random)
    """
    from sklearn.metrics import roc_auc_score

    if len(np.unique(labels)) < 2:
        return 0.5

    try:
        return float(roc_auc_score(labels, scores))
    except ValueError:
        return 0.5
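A quick numeric check, assuming numpy and scikit-learn are installed; note the single-class fallback returns 0.5:

```python
import numpy as np
from novelentitymatcher.novelty.evaluation.metrics import compute_auroc

scores = np.array([0.2, 0.4, 0.6, 0.9])
labels = np.array([False, False, True, True])

print(compute_auroc(scores, labels))              # 1.0: every novel score exceeds every known score
print(compute_auroc(scores, np.zeros(4, bool)))   # 0.5: only one class present, degenerate fallback
```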

compute_auprc(scores, labels)

Compute Area Under Precision-Recall Curve.

Parameters:

- scores (ndarray, required): Predicted novelty scores (higher = more novel).
- labels (ndarray, required): Ground truth labels (True = novel).

Returns:

- float: AUPRC score (0-1).

Source code in src/novelentitymatcher/novelty/evaluation/metrics.py
def compute_auprc(scores: np.ndarray, labels: np.ndarray) -> float:
    """
    Compute Area Under Precision-Recall Curve.

    Args:
        scores: Predicted novelty scores (higher = more novel)
        labels: Ground truth labels (True = novel)

    Returns:
        AUPRC score (0-1)
    """
    from sklearn.metrics import auc, precision_recall_curve

    if len(np.unique(labels)) < 2:
        return 0.0

    try:
        prec, rec, _ = precision_recall_curve(labels, scores)
        return float(auc(rec, prec))
    except ValueError:
        return 0.0

compute_detection_rates(scores, labels, fpr_thresholds=(0.01, 0.05, 0.1))

Compute detection rates at specific false positive rates.

Parameters:

- scores (ndarray, required): Predicted novelty scores (higher = more novel).
- labels (ndarray, required): Ground truth labels (True = novel).
- fpr_thresholds (tuple[float, ...], default (0.01, 0.05, 0.1)): FPR values to compute detection rates for.

Returns:

- dict[str, float]: Dict mapping fpr_percentage -> detection_rate (e.g., "detection_rate_1" -> 0.95 for 1% FPR).

Source code in src/novelentitymatcher/novelty/evaluation/metrics.py
def compute_detection_rates(
    scores: np.ndarray,
    labels: np.ndarray,
    fpr_thresholds: tuple[float, ...] = (0.01, 0.05, 0.10),
) -> dict[str, float]:
    """
    Compute detection rates at specific false positive rates.

    Args:
        scores: Predicted novelty scores (higher = more novel)
        labels: Ground truth labels (True = novel)
        fpr_thresholds: FPR values to compute detection rates for

    Returns:
        Dict mapping fpr_percentage -> detection_rate
        (e.g., "detection_rate_1" -> 0.95 for 1% FPR)
    """
    results = {}

    for fpr in fpr_thresholds:
        non_novel_scores = scores[~labels]
        if len(non_novel_scores) == 0:
            detection_rate = 1.0 if np.all(labels) else 0.0
        else:
            threshold = np.percentile(non_novel_scores, (1 - fpr) * 100)
            detected = np.sum((scores >= threshold) & labels)
            total_novel = np.sum(labels)
            detection_rate = detected / total_novel if total_novel > 0 else 0.0

        percentage = int(fpr * 100)
        results[f"detection_rate_{percentage}"] = float(detection_rate)

    return results
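A sketch of the computation on made-up scores: the decision threshold is placed at the (1 - FPR) percentile of the known (non-novel) scores, and the detection rate is the fraction of novel items scoring at or above it:

```python
import numpy as np
from novelentitymatcher.novelty.evaluation.metrics import compute_detection_rates

rng = np.random.default_rng(0)
known = rng.normal(0.3, 0.1, 500)    # known entities tend to score low
novel = rng.normal(0.8, 0.1, 100)    # novel entities tend to score high
scores = np.concatenate([known, novel])
labels = np.concatenate([np.zeros(500, dtype=bool), np.ones(100, dtype=bool)])

print(compute_detection_rates(scores, labels, (0.01, 0.05)))
# e.g. {'detection_rate_1': ~1.0, 'detection_rate_5': ~1.0} for this well-separated toy data
```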

compute_precision_recall_f1(scores, labels, threshold=None)

Compute precision, recall, and F1 score.

Parameters:

- scores (ndarray, required): Predicted novelty scores (higher = more novel).
- labels (ndarray, required): Ground truth labels (True = novel).
- threshold (float | None, default None): Decision threshold (if None, finds optimal).

Returns:

- dict[str, float]: Dict with precision, recall, f1, and threshold.

Source code in src/novelentitymatcher/novelty/evaluation/metrics.py
def compute_precision_recall_f1(
    scores: np.ndarray,
    labels: np.ndarray,
    threshold: float | None = None,
) -> dict[str, float]:
    """
    Compute precision, recall, and F1 score.

    Args:
        scores: Predicted novelty scores (higher = more novel)
        labels: Ground truth labels (True = novel)
        threshold: Decision threshold (if None, finds optimal)

    Returns:
        Dict with precision, recall, f1, and threshold
    """
    if threshold is None:
        threshold = find_optimal_threshold(scores, labels)

    predictions = scores >= threshold

    tp = int(np.sum(predictions & labels))
    fp = int(np.sum(predictions & ~labels))
    fn = int(np.sum(~predictions & labels))

    precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    f1 = (
        2 * precision * recall / (precision + recall)
        if (precision + recall) > 0
        else 0.0
    )

    return {
        "precision": float(precision),
        "recall": float(recall),
        "f1": float(f1),
        "threshold": float(threshold),
    }
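A worked toy example (numbers invented) to make the formulas concrete: precision = tp / (tp + fp), recall = tp / (tp + fn), and F1 is their harmonic mean:

```python
import numpy as np
from novelentitymatcher.novelty.evaluation.metrics import compute_precision_recall_f1

scores = np.array([0.9, 0.8, 0.6, 0.4, 0.3, 0.1])
labels = np.array([True, True, False, True, False, False])

# At threshold 0.5 the predictions are [T, T, T, F, F, F]: tp=2, fp=1, fn=1,
# so precision = 2/3, recall = 2/3, and f1 = 2/3.
print(compute_precision_recall_f1(scores, labels, threshold=0.5))
```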

find_optimal_threshold(scores, labels)

Find threshold that maximizes F1 score.

Parameters:

- scores (ndarray, required): Predicted novelty scores (higher = more novel).
- labels (ndarray, required): Ground truth labels (True = novel).

Returns:

- float: Optimal threshold value.

Source code in src/novelentitymatcher/novelty/evaluation/metrics.py
def find_optimal_threshold(
    scores: np.ndarray,
    labels: np.ndarray,
) -> float:
    """
    Find threshold that maximizes F1 score.

    Args:
        scores: Predicted novelty scores (higher = more novel)
        labels: Ground truth labels (True = novel)

    Returns:
        Optimal threshold value
    """
    thresholds: np.ndarray = np.percentile(scores, np.arange(5, 100, 5))
    best_f1 = 0.0
    best_thresh = 0.5

    for thresh in thresholds:
        predictions = scores >= thresh
        tp = np.sum(predictions & labels)
        fp = np.sum(predictions & ~labels)
        fn = np.sum(~predictions & labels)

        prec = tp / (tp + fp) if (tp + fp) > 0 else 0.0
        rec = tp / (tp + fn) if (tp + fn) > 0 else 0.0
        f1 = 2 * prec * rec / (prec + rec) if (prec + rec) > 0 else 0.0

        if f1 > best_f1:
            best_f1 = f1
            best_thresh = thresh

    return float(best_thresh)

compute_confusion_matrix(scores, labels, threshold)

Compute confusion matrix components.

Parameters:

- scores (ndarray, required): Predicted novelty scores (higher = more novel).
- labels (ndarray, required): Ground truth labels (True = novel).
- threshold (float, required): Decision threshold.

Returns:

- dict[str, int]: Dict with tp, tn, fp, fn counts.

Source code in src/novelentitymatcher/novelty/evaluation/metrics.py
def compute_confusion_matrix(
    scores: np.ndarray,
    labels: np.ndarray,
    threshold: float,
) -> dict[str, int]:
    """
    Compute confusion matrix components.

    Args:
        scores: Predicted novelty scores (higher = more novel)
        labels: Ground truth labels (True = novel)
        threshold: Decision threshold

    Returns:
        Dict with tp, tn, fp, fn counts
    """
    predictions = scores >= threshold

    tp = int(np.sum(predictions & labels))
    tn = int(np.sum(~predictions & ~labels))
    fp = int(np.sum(predictions & ~labels))
    fn = int(np.sum(~predictions & labels))

    return {"tp": tp, "tn": tn, "fp": fp, "fn": fn}

sweep_thresholds(scores, labels, thresholds=None)

Sweep across thresholds and compute metrics at each.

Parameters:

- scores (ndarray, required): Predicted novelty scores (higher = more novel).
- labels (ndarray, required): Ground truth labels (True = novel).
- thresholds (ndarray | None, default None): Array of thresholds to sweep (default: 101 evenly spaced values from 0 to 1).

Returns:

- dict[str, ndarray]: Dict with arrays for thresholds, precision, recall, f1, tp, fp, tn, fn.

Source code in src/novelentitymatcher/novelty/evaluation/metrics.py
def sweep_thresholds(
    scores: np.ndarray,
    labels: np.ndarray,
    thresholds: np.ndarray | None = None,
) -> dict[str, np.ndarray]:
    """
    Sweep across thresholds and compute metrics at each.

    Args:
        scores: Predicted novelty scores (higher = more novel)
        labels: Ground truth labels (True = novel)
        thresholds: Array of thresholds to sweep (default: 0-100)

    Returns:
        Dict with arrays for thresholds, precision, recall, f1, tp, fp, tn, fn
    """
    if thresholds is None:
        thresholds = np.linspace(0, 1, 101)

    precision = []
    recall = []
    f1 = []
    tp = []
    fp = []
    tn = []
    fn = []

    for thresh in thresholds:
        preds = scores >= thresh

        tp_i = np.sum(preds & labels)
        fp_i = np.sum(preds & ~labels)
        tn_i = np.sum(~preds & ~labels)
        fn_i = np.sum(~preds & labels)

        prec = tp_i / (tp_i + fp_i) if (tp_i + fp_i) > 0 else 0.0
        rec = tp_i / (tp_i + fn_i) if (tp_i + fn_i) > 0 else 0.0
        f1_i = 2 * prec * rec / (prec + rec) if (prec + rec) > 0 else 0.0

        precision.append(float(prec))
        recall.append(float(rec))
        f1.append(float(f1_i))
        tp.append(int(tp_i))
        fp.append(int(fp_i))
        tn.append(int(tn_i))
        fn.append(int(fn_i))

    return {
        "thresholds": thresholds,
        "precision": np.array(precision),
        "recall": np.array(recall),
        "f1": np.array(f1),
        "tp": np.array(tp),
        "fp": np.array(fp),
        "tn": np.array(tn),
        "fn": np.array(fn),
    }

novelentitymatcher.novelty.evaluation.splitters

Data splitters for novelty detection evaluation.

Provides utilities for creating OOD (Out-of-Distribution) splits and gradual novelty scenarios for testing.

Classes

OODSplitter(known_ratio=0.8, random_state=42)

Creates OOD (Out-of-Distribution) splits for novelty detection evaluation.

Splits data into known classes and unknown/novel classes to simulate the novelty detection scenario.

Parameters:

- known_ratio (float, default 0.8): Fraction of classes to keep as known (0-1).
- random_state (int, default 42): Random seed for reproducibility.
Source code in src/novelentitymatcher/novelty/evaluation/splitters.py
def __init__(
    self,
    known_ratio: float = 0.8,
    random_state: int = 42,
):
    """
    Initialize OOD splitter.

    Args:
        known_ratio: Fraction of classes to keep as known (0-1)
        random_state: Random seed for reproducibility
    """
    self.known_ratio = known_ratio
    self.random_state = random_state
Functions
create_split(texts, labels)

Create OOD train/test split.

Parameters:

- texts (list[str], required): List of input texts.
- labels (list[str], required): List of corresponding labels.

Returns:

- tuple[list[str], list[str], list[str], list[bool]]: Tuple of (train_texts, train_labels, test_texts, test_is_novel), where test_is_novel is True for novel (previously unknown) classes.
Source code in src/novelentitymatcher/novelty/evaluation/splitters.py
def create_split(
    self,
    texts: list[str],
    labels: list[str],
) -> tuple[list[str], list[str], list[str], list[bool]]:
    """
    Create OOD train/test split.

    Args:
        texts: List of input texts
        labels: List of corresponding labels

    Returns:
        Tuple of (train_texts, train_labels, test_texts, test_is_novel)
        - test_is_novel: True for novel (previously unknown) classes
    """
    np.random.seed(self.random_state)

    unique_labels = sorted(set(labels))
    n_classes = len(unique_labels)
    n_known = max(1, int(n_classes * self.known_ratio))

    known_classes = set(np.random.choice(unique_labels, n_known, replace=False))

    train_texts = []
    train_labels = []
    test_texts = []
    test_is_novel = []

    for text, label in zip(texts, labels, strict=False):
        if label in known_classes:
            train_texts.append(text)
            train_labels.append(label)
        else:
            test_texts.append(text)
            test_is_novel.append(True)

    return train_texts, train_labels, test_texts, test_is_novel
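A hedged usage sketch on a tiny invented dataset; with known_ratio=0.5, two of the four classes are held out as novel:

```python
from novelentitymatcher.novelty.evaluation.splitters import OODSplitter

texts = ["acme corp", "acme inc", "globex", "globex ltd",
         "initech", "initech llc", "umbrella", "umbrella co"]
labels = ["acme", "acme", "globex", "globex",
          "initech", "initech", "umbrella", "umbrella"]

splitter = OODSplitter(known_ratio=0.5, random_state=42)
train_texts, train_labels, test_texts, test_is_novel = splitter.create_split(texts, labels)
# Samples from the held-out classes land in the test set, all flagged as novel
print(len(train_texts), len(test_texts), all(test_is_novel))  # 4 4 True
```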
create_split_with_indices(texts, labels)

Create OOD split with additional metadata.

Parameters:

- texts (list[str], required): List of input texts.
- labels (list[str], required): List of corresponding labels.

Returns:

- dict[str, Any]: Dict with split data and metadata.

Source code in src/novelentitymatcher/novelty/evaluation/splitters.py
def create_split_with_indices(
    self,
    texts: list[str],
    labels: list[str],
) -> dict[str, Any]:
    """
    Create OOD split with additional metadata.

    Args:
        texts: List of input texts
        labels: List of corresponding labels

    Returns:
        Dict with split data and metadata
    """
    train_texts, train_labels, test_texts, test_is_novel = self.create_split(
        texts, labels
    )

    unique_labels = sorted(set(labels))
    known_classes = sorted(set(train_labels))
    novel_classes = sorted(set(unique_labels) - set(known_classes))

    return {
        "train_texts": train_texts,
        "train_labels": train_labels,
        "test_texts": test_texts,
        "test_is_novel": test_is_novel,
        "known_classes": known_classes,
        "novel_classes": novel_classes,
        "n_known": len(known_classes),
        "n_novel": len(novel_classes),
        "n_train": len(train_texts),
        "n_test": len(test_texts),
    }

GradualNoveltySplitter(known_ratios=None, random_state=42)

Creates multiple splits with gradually increasing novelty.

Useful for testing how novelty detection performance degrades as the number of novel classes increases.

Parameters:

- known_ratios (list[float] | None, default None): List of known ratios to create splits for.
- random_state (int, default 42): Random seed for reproducibility.
Source code in src/novelentitymatcher/novelty/evaluation/splitters.py
def __init__(
    self,
    known_ratios: list[float] | None = None,
    random_state: int = 42,
):
    """
    Initialize gradual novelty splitter.

    Args:
        known_ratios: List of known ratios to create splits for
        random_state: Random seed for reproducibility
    """
    self.known_ratios = known_ratios or [0.95, 0.9, 0.8, 0.7, 0.5]
    self.random_state = random_state
Functions
create_splits(texts, labels)

Create multiple splits with different novelty levels.

Parameters:

- texts (list[str], required): List of input texts.
- labels (list[str], required): List of corresponding labels.

Returns:

- list[dict[str, Any]]: List of split dictionaries, one per known_ratio.

Source code in src/novelentitymatcher/novelty/evaluation/splitters.py
def create_splits(
    self,
    texts: list[str],
    labels: list[str],
) -> list[dict[str, Any]]:
    """
    Create multiple splits with different novelty levels.

    Args:
        texts: List of input texts
        labels: List of corresponding labels

    Returns:
        List of split dictionaries, one per known_ratio
    """
    splits = []

    for ratio in self.known_ratios:
        splitter = OODSplitter(known_ratio=ratio, random_state=self.random_state)
        split_data = splitter.create_split_with_indices(texts, labels)
        split_data["known_ratio"] = ratio
        splits.append(split_data)

    return splits
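A sketch of sweeping novelty levels; the dataset and ratios are invented, and in practice the resulting splits would feed a detector whose scores go to NoveltyEvaluator:

```python
from novelentitymatcher.novelty.evaluation.splitters import GradualNoveltySplitter

texts = ["alpha a", "alpha b", "beta a", "beta b",
         "gamma a", "gamma b", "delta a", "delta b"]
labels = ["alpha", "alpha", "beta", "beta",
          "gamma", "gamma", "delta", "delta"]

splitter = GradualNoveltySplitter(known_ratios=[0.75, 0.5], random_state=42)
for split in splitter.create_splits(texts, labels):
    print(split["known_ratio"], split["n_known"], split["n_novel"], split["n_test"])
```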
get_novelty_progression(texts, labels)

Get summary of novelty progression across splits.

Parameters:

- texts (list[str], required): List of input texts.
- labels (list[str], required): List of corresponding labels.

Returns:

- dict[str, list]: Dict with lists for known_ratios, n_known, n_novel, n_train, and n_test.

Source code in src/novelentitymatcher/novelty/evaluation/splitters.py
def get_novelty_progression(
    self,
    texts: list[str],
    labels: list[str],
) -> dict[str, list]:
    """
    Get summary of novelty progression across splits.

    Args:
        texts: List of input texts
        labels: List of corresponding labels

    Returns:
        Dict with arrays for known_ratio, n_known, n_novel
    """
    splits = self.create_splits(texts, labels)

    return {
        "known_ratios": [s["known_ratio"] for s in splits],
        "n_known": [s["n_known"] for s in splits],
        "n_novel": [s["n_novel"] for s in splits],
        "n_train": [s["n_train"] for s in splits],
        "n_test": [s["n_test"] for s in splits],
    }