Novelty Evaluation

novelentitymatcher.novelty.evaluation.evaluator

Unified novelty detection evaluator.

Supports both benchmark and research evaluation modes with comprehensive metrics and reporting.

Classes

NoveltyEvaluator(mode='benchmark', metrics=None)

Unified evaluator for novelty detection.

Supports two modes:

- benchmark: Quick evaluation on OOD splits with core metrics
- research: Comprehensive evaluation with confusion matrices and threshold sweeping

Metrics computed:

- AUROC, AUPRC
- Detection rates at 1%, 5%, 10% FPR
- Precision, Recall, F1 at optimal threshold

Parameters:

- mode (Literal['benchmark', 'research'], default 'benchmark'): Evaluation mode ('benchmark' or 'research').
- metrics (list[str] | None, default None): List of metrics to compute (None for default based on mode).
Source code in src/novelentitymatcher/novelty/evaluation/evaluator.py
def __init__(
    self,
    mode: Literal["benchmark", "research"] = "benchmark",
    metrics: list[str] | None = None,
):
    """
    Initialize the evaluator.

    Args:
        mode: Evaluation mode ('benchmark' or 'research')
        metrics: List of metrics to compute (None for default based on mode)
    """
    self.mode = mode
    self.metrics = metrics or self._default_metrics_for_mode(mode)
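A minimal construction sketch, assuming the module path shown above is importable as-is; the metric names follow those accepted by evaluate() below:

```python
from novelentitymatcher.novelty.evaluation.evaluator import NoveltyEvaluator

# Benchmark mode with the default metric set for that mode
evaluator = NoveltyEvaluator(mode="benchmark")

# Research mode with an explicit subset of metrics
research_evaluator = NoveltyEvaluator(
    mode="research",
    metrics=["auroc", "auprc", "precision", "recall", "f1"],
)
```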
Functions
evaluate(novelty_scores, is_novel_true, threshold=None)

Evaluate novelty detection performance.

Parameters:

- novelty_scores (ndarray, required): Predicted novelty scores (higher = more novel).
- is_novel_true (ndarray, required): Ground truth novelty labels (True = novel).
- threshold (float | None, default None): Optional threshold for discrete predictions.

Returns:

- dict[str, float]: Dictionary of metric name -> value.

Source code in src/novelentitymatcher/novelty/evaluation/evaluator.py
def evaluate(
    self,
    novelty_scores: np.ndarray,
    is_novel_true: np.ndarray,
    threshold: float | None = None,
) -> dict[str, float]:
    """
    Evaluate novelty detection performance.

    Args:
        novelty_scores: Predicted novelty scores (higher = more novel)
        is_novel_true: Ground truth novelty labels (True = novel)
        threshold: Optional threshold for discrete predictions

    Returns:
        Dictionary of metric name -> value
    """
    scores = np.asarray(novelty_scores)
    labels = np.asarray(is_novel_true, dtype=bool)

    results = {}

    # AUROC and AUPRC
    if "auroc" in self.metrics:
        results["auroc"] = compute_auroc(scores, labels)

    if "auprc" in self.metrics:
        results["auprc"] = compute_auprc(scores, labels)

    # Detection rates at various FPR thresholds
    if any(m.startswith("detection_rate_") for m in self.metrics):
        dr_metrics = [m for m in self.metrics if m.startswith("detection_rate_")]
        fpr_thresholds = []
        for m in dr_metrics:
            if m == "detection_rate_1":
                fpr_thresholds.append(0.01)
            elif m == "detection_rate_5":
                fpr_thresholds.append(0.05)
            elif m == "detection_rate_10":
                fpr_thresholds.append(0.10)

        if fpr_thresholds:
            detection_rates = compute_detection_rates(
                scores, labels, tuple(fpr_thresholds)
            )
            results.update(detection_rates)

    # Precision, Recall, F1
    if any(m in ["precision", "recall", "f1"] for m in self.metrics):
        prf_results = compute_precision_recall_f1(scores, labels, threshold)
        if "precision" in self.metrics:
            results["precision"] = prf_results["precision"]
        if "recall" in self.metrics:
            results["recall"] = prf_results["recall"]
        if "f1" in self.metrics:
            results["f1"] = prf_results["f1"]
        results["optimal_threshold"] = prf_results["threshold"]

    return results
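A hedged example on synthetic scores (arrays invented for illustration); exactly which keys appear depends on the metric set configured for the chosen mode:

```python
import numpy as np
from novelentitymatcher.novelty.evaluation.evaluator import NoveltyEvaluator

scores = np.array([0.1, 0.2, 0.8, 0.9, 0.3, 0.7])   # higher = more novel
is_novel = np.array([False, False, True, True, False, True])

evaluator = NoveltyEvaluator(mode="benchmark")
results = evaluator.evaluate(scores, is_novel)
print(results)  # e.g. {'auroc': 1.0, 'auprc': 1.0, ...} for this perfectly separable toy data
```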
create_report(novelty_scores, is_novel_true, threshold=None)

Create a comprehensive evaluation report.

Parameters:

- novelty_scores (ndarray, required): Predicted novelty scores (higher = more novel).
- is_novel_true (ndarray, required): Ground truth novelty labels (True = novel).
- threshold (float | None, default None): Optional threshold for discrete predictions.

Returns:

- EvaluationReport: Report with all metrics.

Source code in src/novelentitymatcher/novelty/evaluation/evaluator.py
def create_report(
    self,
    novelty_scores: np.ndarray,
    is_novel_true: np.ndarray,
    threshold: float | None = None,
) -> EvaluationReport:
    """
    Create a comprehensive evaluation report.

    Args:
        novelty_scores: Predicted novelty scores (higher = more novel)
        is_novel_true: Ground truth novelty labels (True = novel)
        threshold: Optional threshold for discrete predictions

    Returns:
        EvaluationReport with all metrics
    """
    scores = np.asarray(novelty_scores)
    labels = np.asarray(is_novel_true, dtype=bool)

    # Compute all metrics
    auroc = compute_auroc(scores, labels)
    auprc = compute_auprc(scores, labels)

    detection_rates = compute_detection_rates(scores, labels)
    dr_at_1 = detection_rates.get("detection_rate_1", 0.0)
    dr_at_5 = detection_rates.get("detection_rate_5", 0.0)
    dr_at_10 = detection_rates.get("detection_rate_10", 0.0)

    prf_results = compute_precision_recall_f1(scores, labels, threshold)
    optimal_threshold = prf_results["threshold"]

    # Confusion matrix
    cm = compute_confusion_matrix(scores, labels, optimal_threshold)

    return EvaluationReport(
        auroc=auroc,
        auprc=auprc,
        detection_rate_at_1=dr_at_1,
        detection_rate_at_5=dr_at_5,
        detection_rate_at_10=dr_at_10,
        precision=prf_results["precision"],
        recall=prf_results["recall"],
        f1=prf_results["f1"],
        optimal_threshold=optimal_threshold,
        confusion_matrix=cm,
        num_samples=len(scores),
        num_novel=int(np.sum(labels)),
        timestamp=datetime.now().isoformat(),
    )
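A short sketch of producing a full report; attribute names are taken from the EvaluationReport constructor call in the source above:

```python
import numpy as np
from novelentitymatcher.novelty.evaluation.evaluator import NoveltyEvaluator

scores = np.array([0.05, 0.2, 0.85, 0.9, 0.4, 0.75])
is_novel = np.array([False, False, True, True, False, True])

report = NoveltyEvaluator(mode="research").create_report(scores, is_novel)
print(report.auroc, report.f1, report.optimal_threshold, report.confusion_matrix)
```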
sweep_thresholds(novelty_scores, is_novel_true, num_thresholds=100)

Sweep across thresholds and compute metrics at each.

Parameters:

- novelty_scores (ndarray, required): Predicted novelty scores (higher = more novel).
- is_novel_true (ndarray, required): Ground truth novelty labels (True = novel).
- num_thresholds (int, default 100): Number of thresholds to evaluate.

Returns:

- dict[str, ndarray]: Dict with arrays for thresholds and metrics.

Source code in src/novelentitymatcher/novelty/evaluation/evaluator.py
def sweep_thresholds(
    self,
    novelty_scores: np.ndarray,
    is_novel_true: np.ndarray,
    num_thresholds: int = 100,
) -> dict[str, np.ndarray]:
    """
    Sweep across thresholds and compute metrics at each.

    Args:
        novelty_scores: Predicted novelty scores (higher = more novel)
        is_novel_true: Ground truth novelty labels (True = novel)
        num_thresholds: Number of thresholds to evaluate

    Returns:
        Dict with arrays for thresholds and metrics
    """
    from .metrics import sweep_thresholds

    thresholds = np.linspace(0, 1, num_thresholds)
    return sweep_thresholds(novelty_scores, is_novel_true, thresholds)
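A sketch of using the sweep to pick the threshold that maximizes F1 (synthetic data; key names follow the return dict of the module-level sweep_thresholds documented further below):

```python
import numpy as np
from novelentitymatcher.novelty.evaluation.evaluator import NoveltyEvaluator

rng = np.random.default_rng(1)
scores = np.concatenate([rng.uniform(0.0, 0.5, 200), rng.uniform(0.5, 1.0, 50)])
labels = np.concatenate([np.zeros(200, dtype=bool), np.ones(50, dtype=bool)])

sweep = NoveltyEvaluator(mode="research").sweep_thresholds(scores, labels, num_thresholds=50)
best = int(np.argmax(sweep["f1"]))
print(sweep["thresholds"][best], sweep["f1"][best])
```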
compare_thresholds(novelty_scores, is_novel_true, thresholds)

Compare metrics at specific thresholds.

Parameters:

- novelty_scores (ndarray, required): Predicted novelty scores (higher = more novel).
- is_novel_true (ndarray, required): Ground truth novelty labels (True = novel).
- thresholds (list[float], required): List of thresholds to evaluate.

Returns:

- list[dict[str, float]]: List of dicts with metrics at each threshold.

Source code in src/novelentitymatcher/novelty/evaluation/evaluator.py
def compare_thresholds(
    self,
    novelty_scores: np.ndarray,
    is_novel_true: np.ndarray,
    thresholds: list[float],
) -> list[dict[str, float]]:
    """
    Compare metrics at specific thresholds.

    Args:
        novelty_scores: Predicted novelty scores (higher = more novel)
        is_novel_true: Ground truth novelty labels (True = novel)
        thresholds: List of thresholds to evaluate

    Returns:
        List of dicts with metrics at each threshold
    """
    results = []
    for thresh in thresholds:
        metrics = self.evaluate(novelty_scores, is_novel_true, threshold=thresh)
        metrics["threshold"] = thresh
        results.append(metrics)
    return results
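A brief example comparing a few hand-picked thresholds; the metric list is passed explicitly so that precision, recall, and F1 are guaranteed to be computed:

```python
import numpy as np
from novelentitymatcher.novelty.evaluation.evaluator import NoveltyEvaluator

scores = np.array([0.15, 0.35, 0.55, 0.75, 0.95])
labels = np.array([False, False, True, True, True])

evaluator = NoveltyEvaluator(metrics=["precision", "recall", "f1"])
for row in evaluator.compare_thresholds(scores, labels, thresholds=[0.3, 0.5, 0.7]):
    print(row["threshold"], row["precision"], row["recall"], row["f1"])
```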


novelentitymatcher.novelty.evaluation.metrics

Metric computations for novelty detection evaluation.

Provides functions for computing AUROC, AUPRC, detection rates, precision, recall, F1, and confusion matrices.

Functions

compute_auroc(scores, labels)

Compute Area Under ROC Curve.

Parameters:

- scores (ndarray, required): Predicted novelty scores (higher = more novel).
- labels (ndarray, required): Ground truth labels (True = novel).

Returns:

- float: AUROC score (0-1, 0.5 = random).

Source code in src/novelentitymatcher/novelty/evaluation/metrics.py
def compute_auroc(scores: np.ndarray, labels: np.ndarray) -> float:
    """
    Compute Area Under ROC Curve.

    Args:
        scores: Predicted novelty scores (higher = more novel)
        labels: Ground truth labels (True = novel)

    Returns:
        AUROC score (0-1, 0.5 = random)
    """
    from sklearn.metrics import roc_auc_score

    if len(np.unique(labels)) < 2:
        return 0.5

    try:
        return float(roc_auc_score(labels, scores))
    except ValueError:
        return 0.5
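A quick numeric check, assuming numpy and scikit-learn are installed; note the single-class fallback returns 0.5:

```python
import numpy as np
from novelentitymatcher.novelty.evaluation.metrics import compute_auroc

scores = np.array([0.2, 0.4, 0.6, 0.9])
labels = np.array([False, False, True, True])

print(compute_auroc(scores, labels))              # 1.0: every novel score exceeds every known score
print(compute_auroc(scores, np.zeros(4, bool)))   # 0.5: only one class present, degenerate fallback
```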

compute_auprc(scores, labels)

Compute Area Under Precision-Recall Curve.

Parameters:

- scores (ndarray, required): Predicted novelty scores (higher = more novel).
- labels (ndarray, required): Ground truth labels (True = novel).

Returns:

- float: AUPRC score (0-1).

Source code in src/novelentitymatcher/novelty/evaluation/metrics.py
def compute_auprc(scores: np.ndarray, labels: np.ndarray) -> float:
    """
    Compute Area Under Precision-Recall Curve.

    Args:
        scores: Predicted novelty scores (higher = more novel)
        labels: Ground truth labels (True = novel)

    Returns:
        AUPRC score (0-1)
    """
    from sklearn.metrics import auc, precision_recall_curve

    if len(np.unique(labels)) < 2:
        return 0.0

    try:
        prec, rec, _ = precision_recall_curve(labels, scores)
        return float(auc(rec, prec))
    except ValueError:
        return 0.0

compute_detection_rates(scores, labels, fpr_thresholds=(0.01, 0.05, 0.1))

Compute detection rates at specific false positive rates.

Parameters:

- scores (ndarray, required): Predicted novelty scores (higher = more novel).
- labels (ndarray, required): Ground truth labels (True = novel).
- fpr_thresholds (tuple[float, ...], default (0.01, 0.05, 0.1)): FPR values to compute detection rates for.

Returns:

- dict[str, float]: Dict mapping fpr_percentage -> detection_rate (e.g., "detection_rate_1" -> 0.95 for 1% FPR).

Source code in src/novelentitymatcher/novelty/evaluation/metrics.py
def compute_detection_rates(
    scores: np.ndarray,
    labels: np.ndarray,
    fpr_thresholds: tuple[float, ...] = (0.01, 0.05, 0.10),
) -> dict[str, float]:
    """
    Compute detection rates at specific false positive rates.

    Args:
        scores: Predicted novelty scores (higher = more novel)
        labels: Ground truth labels (True = novel)
        fpr_thresholds: FPR values to compute detection rates for

    Returns:
        Dict mapping fpr_percentage -> detection_rate
        (e.g., "detection_rate_1" -> 0.95 for 1% FPR)
    """
    results = {}

    for fpr in fpr_thresholds:
        non_novel_scores = scores[~labels]
        if len(non_novel_scores) == 0:
            detection_rate = 1.0 if np.all(labels) else 0.0
        else:
            threshold = np.percentile(non_novel_scores, (1 - fpr) * 100)
            detected = np.sum((scores >= threshold) & labels)
            total_novel = np.sum(labels)
            detection_rate = detected / total_novel if total_novel > 0 else 0.0

        percentage = int(fpr * 100)
        results[f"detection_rate_{percentage}"] = float(detection_rate)

    return results
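A sketch of the computation on made-up scores: the decision threshold is placed at the (1 - FPR) percentile of the known (non-novel) scores, and the detection rate is the fraction of novel items scoring at or above it:

```python
import numpy as np
from novelentitymatcher.novelty.evaluation.metrics import compute_detection_rates

rng = np.random.default_rng(0)
known = rng.normal(0.3, 0.1, 500)    # known entities tend to score low
novel = rng.normal(0.8, 0.1, 100)    # novel entities tend to score high
scores = np.concatenate([known, novel])
labels = np.concatenate([np.zeros(500, dtype=bool), np.ones(100, dtype=bool)])

print(compute_detection_rates(scores, labels, (0.01, 0.05)))
# e.g. {'detection_rate_1': ~1.0, 'detection_rate_5': ~1.0} for this well-separated toy data
```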

compute_precision_recall_f1(scores, labels, threshold=None)

Compute precision, recall, and F1 score.

Parameters:

- scores (ndarray, required): Predicted novelty scores (higher = more novel).
- labels (ndarray, required): Ground truth labels (True = novel).
- threshold (float | None, default None): Decision threshold (if None, finds optimal).

Returns:

- dict[str, float]: Dict with precision, recall, f1, and threshold.

Source code in src/novelentitymatcher/novelty/evaluation/metrics.py
def compute_precision_recall_f1(
    scores: np.ndarray,
    labels: np.ndarray,
    threshold: float | None = None,
) -> dict[str, float]:
    """
    Compute precision, recall, and F1 score.

    Args:
        scores: Predicted novelty scores (higher = more novel)
        labels: Ground truth labels (True = novel)
        threshold: Decision threshold (if None, finds optimal)

    Returns:
        Dict with precision, recall, f1, and threshold
    """
    if threshold is None:
        threshold = find_optimal_threshold(scores, labels)

    predictions = scores >= threshold

    tp = int(np.sum(predictions & labels))
    fp = int(np.sum(predictions & ~labels))
    fn = int(np.sum(~predictions & labels))

    precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    f1 = (
        2 * precision * recall / (precision + recall)
        if (precision + recall) > 0
        else 0.0
    )

    return {
        "precision": float(precision),
        "recall": float(recall),
        "f1": float(f1),
        "threshold": float(threshold),
    }
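A worked toy example (numbers invented) to make the formulas concrete: precision = tp / (tp + fp), recall = tp / (tp + fn), and F1 is their harmonic mean:

```python
import numpy as np
from novelentitymatcher.novelty.evaluation.metrics import compute_precision_recall_f1

scores = np.array([0.9, 0.8, 0.6, 0.4, 0.3, 0.1])
labels = np.array([True, True, False, True, False, False])

# At threshold 0.5 the predictions are [T, T, T, F, F, F]: tp=2, fp=1, fn=1,
# so precision = 2/3, recall = 2/3, and f1 = 2/3.
print(compute_precision_recall_f1(scores, labels, threshold=0.5))
```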

find_optimal_threshold(scores, labels)

Find threshold that maximizes F1 score.

Parameters:

- scores (ndarray, required): Predicted novelty scores (higher = more novel).
- labels (ndarray, required): Ground truth labels (True = novel).

Returns:

- float: Optimal threshold value.

Source code in src/novelentitymatcher/novelty/evaluation/metrics.py
def find_optimal_threshold(
    scores: np.ndarray,
    labels: np.ndarray,
) -> float:
    """
    Find threshold that maximizes F1 score.

    Args:
        scores: Predicted novelty scores (higher = more novel)
        labels: Ground truth labels (True = novel)

    Returns:
        Optimal threshold value
    """
    thresholds: np.ndarray = np.percentile(scores, np.arange(5, 100, 5))
    best_f1 = 0.0
    best_thresh = 0.5

    for thresh in thresholds:
        predictions = scores >= thresh
        tp = np.sum(predictions & labels)
        fp = np.sum(predictions & ~labels)
        fn = np.sum(~predictions & labels)

        prec = tp / (tp + fp) if (tp + fp) > 0 else 0.0
        rec = tp / (tp + fn) if (tp + fn) > 0 else 0.0
        f1 = 2 * prec * rec / (prec + rec) if (prec + rec) > 0 else 0.0

        if f1 > best_f1:
            best_f1 = f1
            best_thresh = thresh

    return float(best_thresh)

compute_confusion_matrix(scores, labels, threshold)

Compute confusion matrix components.

Parameters:

- scores (ndarray, required): Predicted novelty scores (higher = more novel).
- labels (ndarray, required): Ground truth labels (True = novel).
- threshold (float, required): Decision threshold.

Returns:

- dict[str, int]: Dict with tp, tn, fp, fn counts.

Source code in src/novelentitymatcher/novelty/evaluation/metrics.py
def compute_confusion_matrix(
    scores: np.ndarray,
    labels: np.ndarray,
    threshold: float,
) -> dict[str, int]:
    """
    Compute confusion matrix components.

    Args:
        scores: Predicted novelty scores (higher = more novel)
        labels: Ground truth labels (True = novel)
        threshold: Decision threshold

    Returns:
        Dict with tp, tn, fp, fn counts
    """
    predictions = scores >= threshold

    tp = int(np.sum(predictions & labels))
    tn = int(np.sum(~predictions & ~labels))
    fp = int(np.sum(predictions & ~labels))
    fn = int(np.sum(~predictions & labels))

    return {"tp": tp, "tn": tn, "fp": fp, "fn": fn}

sweep_thresholds(scores, labels, thresholds=None)

Sweep across thresholds and compute metrics at each.

Parameters:

- scores (ndarray, required): Predicted novelty scores (higher = more novel).
- labels (ndarray, required): Ground truth labels (True = novel).
- thresholds (ndarray | None, default None): Array of thresholds to sweep (default: 101 evenly spaced values from 0 to 1).

Returns:

- dict[str, ndarray]: Dict with arrays for thresholds, precision, recall, f1, tp, fp, tn, fn.

Source code in src/novelentitymatcher/novelty/evaluation/metrics.py
def sweep_thresholds(
    scores: np.ndarray,
    labels: np.ndarray,
    thresholds: np.ndarray | None = None,
) -> dict[str, np.ndarray]:
    """
    Sweep across thresholds and compute metrics at each.

    Args:
        scores: Predicted novelty scores (higher = more novel)
        labels: Ground truth labels (True = novel)
        thresholds: Array of thresholds to sweep (default: 0-100)

    Returns:
        Dict with arrays for thresholds, precision, recall, f1, tp, fp, tn, fn
    """
    if thresholds is None:
        thresholds = np.linspace(0, 1, 101)

    precision = []
    recall = []
    f1 = []
    tp = []
    fp = []
    tn = []
    fn = []

    for thresh in thresholds:
        preds = scores >= thresh

        tp_i = np.sum(preds & labels)
        fp_i = np.sum(preds & ~labels)
        tn_i = np.sum(~preds & ~labels)
        fn_i = np.sum(~preds & labels)

        prec = tp_i / (tp_i + fp_i) if (tp_i + fp_i) > 0 else 0.0
        rec = tp_i / (tp_i + fn_i) if (tp_i + fn_i) > 0 else 0.0
        f1_i = 2 * prec * rec / (prec + rec) if (prec + rec) > 0 else 0.0

        precision.append(float(prec))
        recall.append(float(rec))
        f1.append(float(f1_i))
        tp.append(int(tp_i))
        fp.append(int(fp_i))
        tn.append(int(tn_i))
        fn.append(int(fn_i))

    return {
        "thresholds": thresholds,
        "precision": np.array(precision),
        "recall": np.array(recall),
        "f1": np.array(f1),
        "tp": np.array(tp),
        "fp": np.array(fp),
        "tn": np.array(tn),
        "fn": np.array(fn),
    }

novelentitymatcher.novelty.evaluation.splitters

Data splitters for novelty detection evaluation.

Provides utilities for creating OOD (Out-of-Distribution) splits and gradual novelty scenarios for testing.

Classes

OODSplitter(known_ratio=0.8, random_state=42)

Creates OOD (Out-of-Distribution) splits for novelty detection evaluation.

Splits data into known classes and unknown/novel classes to simulate the novelty detection scenario.

Parameters:

- known_ratio (float, default 0.8): Fraction of classes to keep as known (0-1).
- random_state (int, default 42): Random seed for reproducibility.
Source code in src/novelentitymatcher/novelty/evaluation/splitters.py
def __init__(
    self,
    known_ratio: float = 0.8,
    random_state: int = 42,
):
    """
    Initialize OOD splitter.

    Args:
        known_ratio: Fraction of classes to keep as known (0-1)
        random_state: Random seed for reproducibility
    """
    self.known_ratio = known_ratio
    self.random_state = random_state
Functions
create_split(texts, labels)

Create OOD train/test split.

Parameters:

- texts (list[str], required): List of input texts.
- labels (list[str], required): List of corresponding labels.

Returns:

- tuple[list[str], list[str], list[str], list[bool]]: Tuple of (train_texts, train_labels, test_texts, test_is_novel), where test_is_novel is True for novel (previously unknown) classes.
Source code in src/novelentitymatcher/novelty/evaluation/splitters.py
def create_split(
    self,
    texts: list[str],
    labels: list[str],
) -> tuple[list[str], list[str], list[str], list[bool]]:
    """
    Create OOD train/test split.

    Args:
        texts: List of input texts
        labels: List of corresponding labels

    Returns:
        Tuple of (train_texts, train_labels, test_texts, test_is_novel)
        - test_is_novel: True for novel (previously unknown) classes
    """
    np.random.seed(self.random_state)

    unique_labels = sorted(set(labels))
    n_classes = len(unique_labels)
    n_known = max(1, int(n_classes * self.known_ratio))

    known_classes = set(np.random.choice(unique_labels, n_known, replace=False))

    train_texts = []
    train_labels = []
    test_texts = []
    test_is_novel = []

    for text, label in zip(texts, labels, strict=False):
        if label in known_classes:
            train_texts.append(text)
            train_labels.append(label)
        else:
            test_texts.append(text)
            test_is_novel.append(True)

    return train_texts, train_labels, test_texts, test_is_novel
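A hedged usage sketch on a tiny invented dataset; with known_ratio=0.5, two of the four classes are held out as novel:

```python
from novelentitymatcher.novelty.evaluation.splitters import OODSplitter

texts = ["acme corp", "acme inc", "globex", "globex ltd",
         "initech", "initech llc", "umbrella", "umbrella co"]
labels = ["acme", "acme", "globex", "globex",
          "initech", "initech", "umbrella", "umbrella"]

splitter = OODSplitter(known_ratio=0.5, random_state=42)
train_texts, train_labels, test_texts, test_is_novel = splitter.create_split(texts, labels)
# Samples from the held-out classes land in the test set, all flagged as novel
print(len(train_texts), len(test_texts), all(test_is_novel))  # 4 4 True
```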
create_split_with_indices(texts, labels)

Create OOD split with additional metadata.

Parameters:

- texts (list[str], required): List of input texts.
- labels (list[str], required): List of corresponding labels.

Returns:

- dict[str, Any]: Dict with split data and metadata.

Source code in src/novelentitymatcher/novelty/evaluation/splitters.py
def create_split_with_indices(
    self,
    texts: list[str],
    labels: list[str],
) -> dict[str, Any]:
    """
    Create OOD split with additional metadata.

    Args:
        texts: List of input texts
        labels: List of corresponding labels

    Returns:
        Dict with split data and metadata
    """
    train_texts, train_labels, test_texts, test_is_novel = self.create_split(
        texts, labels
    )

    unique_labels = sorted(set(labels))
    known_classes = sorted(set(train_labels))
    novel_classes = sorted(set(unique_labels) - set(known_classes))

    return {
        "train_texts": train_texts,
        "train_labels": train_labels,
        "test_texts": test_texts,
        "test_is_novel": test_is_novel,
        "known_classes": known_classes,
        "novel_classes": novel_classes,
        "n_known": len(known_classes),
        "n_novel": len(novel_classes),
        "n_train": len(train_texts),
        "n_test": len(test_texts),
    }

GradualNoveltySplitter(known_ratios=None, random_state=42)

Creates multiple splits with gradually increasing novelty.

Useful for testing how novelty detection performance degrades as the number of novel classes increases.

Parameters:

- known_ratios (list[float] | None, default None): List of known ratios to create splits for.
- random_state (int, default 42): Random seed for reproducibility.
Source code in src/novelentitymatcher/novelty/evaluation/splitters.py
def __init__(
    self,
    known_ratios: list[float] | None = None,
    random_state: int = 42,
):
    """
    Initialize gradual novelty splitter.

    Args:
        known_ratios: List of known ratios to create splits for
        random_state: Random seed for reproducibility
    """
    self.known_ratios = known_ratios or [0.95, 0.9, 0.8, 0.7, 0.5]
    self.random_state = random_state
Functions
create_splits(texts, labels)

Create multiple splits with different novelty levels.

Parameters:

- texts (list[str], required): List of input texts.
- labels (list[str], required): List of corresponding labels.

Returns:

- list[dict[str, Any]]: List of split dictionaries, one per known_ratio.

Source code in src/novelentitymatcher/novelty/evaluation/splitters.py
def create_splits(
    self,
    texts: list[str],
    labels: list[str],
) -> list[dict[str, Any]]:
    """
    Create multiple splits with different novelty levels.

    Args:
        texts: List of input texts
        labels: List of corresponding labels

    Returns:
        List of split dictionaries, one per known_ratio
    """
    splits = []

    for ratio in self.known_ratios:
        splitter = OODSplitter(known_ratio=ratio, random_state=self.random_state)
        split_data = splitter.create_split_with_indices(texts, labels)
        split_data["known_ratio"] = ratio
        splits.append(split_data)

    return splits
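A sketch of sweeping novelty levels; the dataset and ratios are invented, and in practice the resulting splits would feed a detector whose scores go to NoveltyEvaluator:

```python
from novelentitymatcher.novelty.evaluation.splitters import GradualNoveltySplitter

texts = ["alpha a", "alpha b", "beta a", "beta b",
         "gamma a", "gamma b", "delta a", "delta b"]
labels = ["alpha", "alpha", "beta", "beta",
          "gamma", "gamma", "delta", "delta"]

splitter = GradualNoveltySplitter(known_ratios=[0.75, 0.5], random_state=42)
for split in splitter.create_splits(texts, labels):
    print(split["known_ratio"], split["n_known"], split["n_novel"], split["n_test"])
```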
get_novelty_progression(texts, labels)

Get summary of novelty progression across splits.

Parameters:

- texts (list[str], required): List of input texts.
- labels (list[str], required): List of corresponding labels.

Returns:

- dict[str, list]: Dict with lists for known_ratios, n_known, n_novel, n_train, and n_test.

Source code in src/novelentitymatcher/novelty/evaluation/splitters.py
def get_novelty_progression(
    self,
    texts: list[str],
    labels: list[str],
) -> dict[str, list]:
    """
    Get summary of novelty progression across splits.

    Args:
        texts: List of input texts
        labels: List of corresponding labels

    Returns:
        Dict with arrays for known_ratio, n_known, n_novel
    """
    splits = self.create_splits(texts, labels)

    return {
        "known_ratios": [s["known_ratio"] for s in splits],
        "n_known": [s["n_known"] for s in splits],
        "n_novel": [s["n_novel"] for s in splits],
        "n_train": [s["n_train"] for s in splits],
        "n_test": [s["n_test"] for s in splits],
    }