Skip to content

Novelty Proposals

novelentitymatcher.novelty.proposal.llm

LLM-based class proposal system for novel class discovery.

Uses litellm with structured output to generate meaningful class names and descriptions for clusters of novel samples.

Classes

LLMProposalSchema

Bases: BaseModel

Schema enforcing the exact JSON structure expected from LLM proposals.

Functions
get_schema_json() classmethod

Return the JSON Schema representation.

Source code in src/novelentitymatcher/novelty/proposal/llm.py
@classmethod
def get_schema_json(cls) -> str:
    """Serialize this model's JSON Schema as a pretty-printed string."""
    schema = cls.model_json_schema()
    return json.dumps(schema, indent=2)

LLMProposalWithSchemaSchema

Bases: BaseModel

Schema for proposals that include attribute/field discovery.

Functions
get_schema_json() classmethod

Return the JSON Schema representation.

Source code in src/novelentitymatcher/novelty/proposal/llm.py
@classmethod
def get_schema_json(cls) -> str:
    """Produce this model's JSON Schema as an indented JSON string."""
    raw_schema = cls.model_json_schema()
    return json.dumps(raw_schema, indent=2)

LLMClassProposer(primary_model=None, provider=None, fallback_models=None, api_keys=None, temperature=0.3, max_tokens=4096, max_clusters_per_summary=20)

Propose new class names and descriptions using LLMs.

Uses litellm for multi-provider support with automatic fallback.

Parameters:

Name Type Description Default
primary_model str | None

Primary model to use (e.g., 'openrouter/anthropic/claude-sonnet-4')

None
provider str | None

Preferred provider when auto-selecting a default model

None
fallback_models list[str] | None

Fallback models if primary fails

None
api_keys dict[str, str] | None

API keys for providers (e.g., {'openrouter': 'sk-...'})

None
temperature float

Sampling temperature

0.3
max_tokens int

Maximum tokens in response

4096
max_clusters_per_summary int

Maximum clusters to include per LLM summary call (for hierarchical mode)

20
Source code in src/novelentitymatcher/novelty/proposal/llm.py
def __init__(
    self,
    primary_model: str | None = None,
    provider: str | None = None,
    fallback_models: list[str] | None = None,
    api_keys: dict[str, str] | None = None,
    temperature: float = 0.3,
    max_tokens: int = 4096,
    max_clusters_per_summary: int = 20,
):
    """
    Initialize LLM class proposer.

    Args:
        primary_model: Primary model to use (e.g., 'openrouter/anthropic/claude-sonnet-4').
            When None, falls back to the LLM_CLASS_PROPOSER_MODEL environment
            variable, then to a provider-derived default.
        provider: Preferred provider when auto-selecting a default model
        fallback_models: Fallback models if primary fails. Pass an empty list
            to disable fallbacks; None selects the built-in defaults.
        api_keys: API keys for providers (e.g., {'openrouter': 'sk-...'}).
            Pass an empty dict to skip environment lookup; None reads the env.
        temperature: Sampling temperature
        max_tokens: Maximum tokens in response
        max_clusters_per_summary: Maximum clusters to include per LLM summary call (for hierarchical mode)
    """
    self.primary_model = primary_model or os.getenv(
        "LLM_CLASS_PROPOSER_MODEL",
        self._default_model_for_provider(provider),
    )
    default_fallbacks = [
        model for model in DEFAULT_PROVIDERS if model != self.primary_model
    ]
    # Use `is None` checks so an explicitly-passed empty list/dict is honored
    # instead of being silently replaced by the defaults (the previous `or`
    # form conflated "not given" with "given but empty").
    self.fallback_models = (
        fallback_models if fallback_models is not None else default_fallbacks
    )
    self._api_keys = (
        api_keys if api_keys is not None else self._get_api_keys_from_env()
    )
    self.temperature = temperature
    self.max_tokens = max_tokens
    self.max_clusters_per_summary = max_clusters_per_summary

    # Load LLM configuration with environment variable support when available;
    # fall back to hard-coded production defaults if the config module is absent.
    try:
        from .config import get_llm_config

        self.config = get_llm_config()
    except ImportError:  # pragma: no cover - optional dependency
        self.config = SimpleNamespace(  # type: ignore[assignment]
            timeout=30,
            max_retries=5,
            circuit_fail_max=3,
            circuit_reset_seconds=60,
        )

    # Create circuit breaker for LLM API calls so repeated provider failures
    # short-circuit quickly instead of piling up timeouts.
    self.llm_circuit_breaker = CircuitBreaker(
        fail_max=self.config.circuit_fail_max,
        timeout_duration=timedelta(seconds=self.config.circuit_reset_seconds),
    )

    logger.info(
        f"LLMClassProposer initialized: timeout={self.config.timeout}s, "
        f"max_retries={self.config.max_retries}, "
        f"circuit_fail_max={self.config.circuit_fail_max}, "
        f"circuit_reset={self.config.circuit_reset_seconds}s"
    )
Functions
propose_classes(novel_samples, existing_classes, context=None)

Propose new classes based on novel samples.

Parameters:

Name Type Description Default
novel_samples list[NovelSampleMetadata]

List of detected novel samples

required
existing_classes list[str]

List of existing class names

required
context str | None

Optional domain context

None

Returns:

Type Description
NovelClassAnalysis

NovelClassAnalysis with proposed classes

Source code in src/novelentitymatcher/novelty/proposal/llm.py
def propose_classes(
    self,
    novel_samples: list[NovelSampleMetadata],
    existing_classes: list[str],
    context: str | None = None,
) -> NovelClassAnalysis:
    """Propose new classes for a batch of novel samples.

    Args:
        novel_samples: Detected novel samples to analyze.
        existing_classes: Names of classes already known to the system.
        context: Optional domain context passed through to the prompt.

    Returns:
        NovelClassAnalysis with proposed classes.

    Raises:
        ValueError: If ``novel_samples`` is empty.
    """
    if not novel_samples:
        raise ValueError("novel_samples cannot be empty")

    logger.info(
        f"Proposing classes for {len(novel_samples)} novel samples "
        f"using model: {self.primary_model}"
    )

    # Group samples by their assigned cluster, then drive the structured
    # proposal pipeline from the cluster-level view.
    by_cluster = self._group_by_cluster(novel_samples)
    proposal_prompt = self._build_proposal_prompt(
        novel_samples,
        existing_classes,
        by_cluster,
        context,
    )
    return self._run_structured_cluster_proposal(
        prompt=proposal_prompt,
        discovery_clusters=self._clusters_from_samples(by_cluster),
        novel_samples=novel_samples,
    )
propose_from_clusters(discovery_clusters, existing_classes, context=None, max_retries=2, hierarchical=True)

Generate proposals from cluster-level evidence.

Parameters:

Name Type Description Default
discovery_clusters list[DiscoveryCluster]

List of discovery clusters.

required
existing_classes list[str]

List of existing class names.

required
context str | None

Optional domain context.

None
max_retries int

Maximum retry attempts.

2
hierarchical bool

If True, use hierarchical summarization for large cluster sets.

True
Source code in src/novelentitymatcher/novelty/proposal/llm.py
def propose_from_clusters(
    self,
    discovery_clusters: list[DiscoveryCluster],
    existing_classes: list[str],
    context: str | None = None,
    max_retries: int = 2,
    hierarchical: bool = True,
) -> NovelClassAnalysis:
    """Generate proposals from cluster-level evidence.

    Args:
        discovery_clusters: List of discovery clusters.
        existing_classes: List of existing class names.
        context: Optional domain context.
        max_retries: Maximum retry attempts.
        hierarchical: If True, use hierarchical summarization for large cluster sets.

    Raises:
        ValueError: If ``discovery_clusters`` is empty.
    """
    if not discovery_clusters:
        raise ValueError("discovery_clusters cannot be empty")

    # Large cluster sets exceed a single prompt's budget, so summarize
    # hierarchically when allowed.
    too_many = len(discovery_clusters) > self.max_clusters_per_summary
    if hierarchical and too_many:
        return self._propose_hierarchical(
            discovery_clusters, existing_classes, context, max_retries
        )

    cluster_prompt = self._build_cluster_prompt(
        discovery_clusters=discovery_clusters,
        existing_classes=existing_classes,
        context=context,
    )
    return self._run_structured_cluster_proposal(
        prompt=cluster_prompt,
        discovery_clusters=discovery_clusters,
        max_retries=max_retries,
    )
propose_from_clusters_with_schema(discovery_clusters, existing_classes, context=None, max_retries=2, hierarchical=True, max_attributes=10)

Generate proposals with attribute/field discovery from cluster evidence.

Like propose_from_clusters but the LLM prompt requests discovery of common attributes and data structures for each proposed class.

Parameters:

Name Type Description Default
discovery_clusters list[DiscoveryCluster]

List of discovery clusters.

required
existing_classes list[str]

List of existing class names.

required
context str | None

Optional domain context.

None
max_retries int

Maximum retry attempts.

2
hierarchical bool

If True, use hierarchical summarization for large cluster sets.

True
max_attributes int

Maximum number of attributes to request per proposed class.

10
Source code in src/novelentitymatcher/novelty/proposal/llm.py
def propose_from_clusters_with_schema(
    self,
    discovery_clusters: list[DiscoveryCluster],
    existing_classes: list[str],
    context: str | None = None,
    max_retries: int = 2,
    hierarchical: bool = True,
    max_attributes: int = 10,
) -> NovelClassAnalysis:
    """Generate proposals with attribute/field discovery from cluster evidence.

    Like ``propose_from_clusters`` but the LLM prompt requests discovery of
    common attributes and data structures for each proposed class.

    Args:
        discovery_clusters: List of discovery clusters.
        existing_classes: List of existing class names.
        context: Optional domain context.
        max_retries: Maximum retry attempts.
        hierarchical: If True, use hierarchical summarization for large cluster sets.
        max_attributes: Maximum number of attributes to request per proposed class.

    Returns:
        NovelClassAnalysis; when any classes were proposed, it is passed
        through ``_enrich_proposals_with_schema`` before being returned.

    Raises:
        ValueError: If ``discovery_clusters`` is empty.
    """
    if not discovery_clusters:
        raise ValueError("discovery_clusters cannot be empty")

    # Same hierarchical threshold as propose_from_clusters, but with schema
    # discovery flags forwarded.
    if hierarchical and len(discovery_clusters) > self.max_clusters_per_summary:
        return self._propose_hierarchical(
            discovery_clusters,
            existing_classes,
            context,
            max_retries,
            include_schema_discovery=True,
            max_attributes=max_attributes,
        )

    prompt = self._build_cluster_prompt_with_schema(
        discovery_clusters=discovery_clusters,
        existing_classes=existing_classes,
        context=context,
        max_attributes=max_attributes,
    )
    # Re-prompts on validation failure use the schema-aware JSON Schema so
    # retries stay consistent with the requested output shape.
    analysis = self._run_structured_cluster_proposal(
        prompt=prompt,
        discovery_clusters=discovery_clusters,
        max_retries=max_retries,
        retry_schema_json=LLMProposalWithSchemaSchema.get_schema_json(),
    )

    if analysis.proposed_classes:
        analysis = self._enrich_proposals_with_schema(analysis)

    return analysis

Functions

novelentitymatcher.novelty.proposal.retrieval

Retrieval-Augmented LLM Class Proposer.

Enhances LLM-based class proposal with retrieval of in-context examples using dense embeddings (BGE-M3 style) for improved class naming.

Classes

RetrievalAugmentedProposer(retriever=None, llm_proposer=None, k_examples=5, k_novel_per_class=3, retrieval_metric='cosine', rerank=False)

LLM class proposer enhanced with retrieval-based in-context examples.

Retrieves most relevant examples from a corpus to include in the LLM prompt, improving class naming quality.

Parameters:

Name Type Description Default
retriever EmbeddingBackend | None

Embedding backend for retrieval (e.g., BGE-M3)

None
llm_proposer Any | None

Existing LLMClassProposer to enhance

None
k_examples int

Number of in-context examples to retrieve

5
k_novel_per_class int

Number of novel examples per proposed class

3
retrieval_metric str

Similarity metric for retrieval

'cosine'
rerank bool

Whether to use reranking for better examples

False
Source code in src/novelentitymatcher/novelty/proposal/retrieval.py
def __init__(
    self,
    retriever: EmbeddingBackend | None = None,
    llm_proposer: Any | None = None,
    k_examples: int = 5,
    k_novel_per_class: int = 3,
    retrieval_metric: str = "cosine",
    rerank: bool = False,
):
    """Set up the retrieval-augmented proposer.

    Args:
        retriever: Embedding backend used for dense retrieval (e.g., BGE-M3).
        llm_proposer: Underlying LLMClassProposer to augment.
        k_examples: How many in-context examples to retrieve per query.
        k_novel_per_class: How many novel examples to retrieve per proposed class.
        retrieval_metric: Similarity metric used during retrieval.
        rerank: Whether retrieved examples should be reranked.
    """
    # Retrieval configuration.
    self.retriever = retriever
    self.retrieval_metric = retrieval_metric
    self.rerank = rerank
    self.k_examples = k_examples
    self.k_novel_per_class = k_novel_per_class
    self.llm_proposer = llm_proposer

    # Example index starts empty; populated by index_examples().
    self._example_corpus: list[str] = []
    self._example_embeddings: Any | None = None
    self._is_indexed: bool = False
Attributes
is_ready property

Check if proposer is ready for use.

Functions
index_examples(examples, embeddings=None)

Index examples for retrieval.

Parameters:

Name Type Description Default
examples list[str]

List of example texts to index

required
embeddings Any | None

Pre-computed embeddings (if None, will compute)

None
Source code in src/novelentitymatcher/novelty/proposal/retrieval.py
def index_examples(
    self,
    examples: list[str],
    embeddings: Any | None = None,
) -> None:
    """Store *examples* (and their embeddings) for later retrieval.

    Args:
        examples: Example texts to make retrievable.
        embeddings: Optional pre-computed embeddings; when omitted they are
            computed with the configured retriever (if any).
    """
    self._example_corpus = examples

    # Prefer caller-supplied embeddings; otherwise compute them lazily when
    # a retriever is available.
    if embeddings is None and self.retriever is not None:
        self._example_embeddings = self.retriever.encode(examples)
    elif embeddings is not None:
        self._example_embeddings = embeddings

    self._is_indexed = True
    logger.info(f"Indexed {len(examples)} examples for retrieval")
retrieve(query, k=None)

Retrieve k most relevant examples for a query.

Parameters:

Name Type Description Default
query str

Query text

required
k int | None

Number of examples to retrieve (default: k_examples)

None

Returns:

Type Description
list[dict[str, Any]]

List of dicts with 'text', 'score', 'index'

Source code in src/novelentitymatcher/novelty/proposal/retrieval.py
def retrieve(
    self,
    query: str,
    k: int | None = None,
) -> list[dict[str, Any]]:
    """
    Retrieve k most relevant examples for a query.

    Args:
        query: Query text
        k: Number of examples to retrieve (default: k_examples)

    Returns:
        List of dicts with 'text', 'score', 'index'

    Raises:
        RuntimeError: If index_examples() has not been called yet.
    """
    if not self._is_indexed:
        raise RuntimeError("Must call index_examples() before retrieve()")

    # NOTE(review): truthiness check means k=0 silently falls back to
    # self.k_examples — confirm that is intended.
    k = k or self.k_examples

    # Without a retriever we cannot embed the query; degrade to no results.
    if self.retriever is None:
        logger.warning("No retriever available, returning empty results")
        return []

    query_embedding = self.retriever.encode([query])

    # Imported lazily so sklearn is only required when retrieval is used.
    from sklearn.metrics.pairwise import cosine_similarity

    # NOTE(review): cosine similarity is always used here; the configured
    # self.retrieval_metric is never consulted — confirm.
    similarities = cosine_similarity(
        query_embedding,
        self._example_embeddings,
    )[0]

    # Indices of the k highest-scoring corpus entries, best first.
    top_indices = sorted(
        range(len(similarities)),
        key=lambda i: similarities[i],
        reverse=True,
    )[:k]

    results = [
        {
            "text": self._example_corpus[idx],
            "score": float(similarities[idx]),
            "index": int(idx),
        }
        for idx in top_indices
    ]

    return results
retrieve_by_class(class_name, novel_samples, existing_classes)

Retrieve examples relevant to a proposed class.

Parameters:

Name Type Description Default
class_name str

Proposed class name

required
novel_samples list[Any]

Novel samples to find examples for

required
existing_classes list[str]

List of existing class names

required

Returns:

Type Description
dict[str, Any]

Dict with retrieved examples and metadata

Source code in src/novelentitymatcher/novelty/proposal/retrieval.py
def retrieve_by_class(
    self,
    class_name: str,
    novel_samples: list[Any],
    existing_classes: list[str],
) -> dict[str, Any]:
    """Retrieve corpus examples relevant to one proposed class.

    Args:
        class_name: Name of the proposed class.
        novel_samples: Novel samples to find examples for.
        existing_classes: Existing class names (not referenced in this body).

    Returns:
        Dict with the class name, retrieved examples, and the query used.
    """
    if not novel_samples:
        return {"examples": [], "class_name": class_name}

    def _as_text(sample: Any) -> str:
        # Samples may be rich objects with .text or plain values.
        return sample.text if hasattr(sample, "text") else str(sample)

    # Seed the query with the class name plus up to three sample previews.
    preview = ", ".join(_as_text(s) for s in novel_samples[:3])
    query = f"{class_name}: {preview}"

    return {
        "class_name": class_name,
        "examples": self.retrieve(query, k=self.k_novel_per_class),
        "query": query,
    }
build_prompt(novel_samples, existing_classes, context=None, use_retrieval=True)

Build prompt for LLM class proposal with retrieval.

Parameters:

Name Type Description Default
novel_samples list[Any]

Novel samples to propose classes for

required
existing_classes list[str]

List of existing class names

required
context str | None

Optional domain context

None
use_retrieval bool

Whether to include retrieved examples

True

Returns:

Type Description
str

Formatted prompt string

Source code in src/novelentitymatcher/novelty/proposal/retrieval.py
    def build_prompt(
        self,
        novel_samples: list[Any],
        existing_classes: list[str],
        context: str | None = None,
        use_retrieval: bool = True,
    ) -> str:
        """
        Build prompt for LLM class proposal with retrieval.

        Args:
            novel_samples: Novel samples to propose classes for
            existing_classes: List of existing class names
            context: Optional domain context
            use_retrieval: Whether to include retrieved examples

        Returns:
            Formatted prompt string
        """
        # Inline at most 20 samples verbatim; summarize the rest by count to
        # keep the prompt bounded.
        sample_texts = [
            f"- {s.text if hasattr(s, 'text') else str(s)}" for s in novel_samples[:20]
        ]
        if len(novel_samples) > 20:
            sample_texts.append(f"... and {len(novel_samples) - 20} more samples")

        samples_section = "\n".join(sample_texts)

        existing_section = ", ".join(existing_classes) if existing_classes else "None"

        context_section = f"\n\nDomain Context: {context}" if context else ""

        # Retrieval enrichment: for the first 5 novel samples, pull up to 2
        # indexed neighbors each, capped at 10 example lines overall.
        retrieval_section = ""
        if use_retrieval and self._is_indexed and self.retriever:
            retrieved_examples = []
            for sample in novel_samples[:5]:
                text = sample.text if hasattr(sample, "text") else str(sample)
                results = self.retrieve(text, k=2)
                for r in results:
                    retrieved_examples.append(
                        f'- Example: "{r["text"]}" (relevance: {r["score"]:.2f})'
                    )

            if retrieved_examples:
                retrieval_section = "\n\nRetrieved relevant examples:\n" + "\n".join(
                    retrieved_examples[:10]
                )

        # NOTE(review): the JSON keys in the schema below presumably must stay
        # in sync with the response parser — confirm before rewording.
        prompt = f"""You are analyzing text samples that don't fit well into existing categories.

Existing Classes: {existing_section}{context_section}{retrieval_section}

Novel Samples (detected as not fitting existing classes):
{samples_section}

Your task is to:
1. Analyze these samples to identify meaningful new categories
2. Propose concise, descriptive class names
3. Provide justifications for each proposal
4. Identify samples that should be rejected as noise

IMPORTANT RESPONSE FORMAT:
You must respond with a valid JSON object matching this schema:
{{
  "proposed_classes": [
    {{
      "name": "class name (2-4 words)",
      "description": "clear description of what this class represents",
      "confidence": 0.0-1.0,
      "sample_count": number of samples fitting this class,
      "example_samples": ["sample1", "sample2", "sample3"],
      "justification": "why this class makes sense",
      "suggested_parent": null or "parent class name if hierarchical"
    }}
  ],
  "rejected_as_noise": ["sample text to reject"],
  "analysis_summary": "brief summary of your analysis",
  "cluster_count": number of distinct clusters found
}}

Guidelines:
- Class names should be concise (2-4 words), descriptive
- Confidence should reflect how clearly the samples form a coherent category
- Only propose classes with at least 3 supporting samples
- Reject samples that appear to be noise, errors, or too diverse
- Return "proposed_classes": [] if no coherent new class should be created
- Consider hierarchical relationships if relevant to the domain

Provide your analysis as a JSON object:"""

        return prompt
propose_classes(novel_samples, existing_classes, context=None)

Propose new classes with retrieval-augmented prompting.

Parameters:

Name Type Description Default
novel_samples list[Any]

Novel samples to propose classes for

required
existing_classes list[str]

List of existing class names

required
context str | None

Optional domain context

None

Returns:

Type Description
Any | None

NovelClassAnalysis from LLM or None if unavailable

Source code in src/novelentitymatcher/novelty/proposal/retrieval.py
def propose_classes(
    self,
    novel_samples: list[Any],
    existing_classes: list[str],
    context: str | None = None,
) -> Any | None:
    """
    Propose new classes with retrieval-augmented prompting.

    Args:
        novel_samples: Novel samples to propose classes for
        existing_classes: List of existing class names
        context: Optional domain context

    Returns:
        NovelClassAnalysis from LLM or None if unavailable
    """
    # Without an underlying LLM proposer there is nothing to call; degrade
    # gracefully instead of raising.
    if not self.llm_proposer:
        logger.warning("No LLM proposer configured")
        return None

    prompt = self.build_prompt(
        novel_samples=novel_samples,
        existing_classes=existing_classes,
        context=context,
        use_retrieval=True,
    )

    # NOTE(review): _call_llm_with_fallback and _parse_response are expected
    # to be methods of this class but are not shown here — confirm they exist,
    # or whether these calls should delegate to self.llm_proposer instead.
    try:
        response, model_used = self._call_llm_with_fallback(prompt)
        analysis = self._parse_response(response, model_used)
        return analysis
    except (ValueError, TypeError, ConnectionError, RuntimeError) as e:
        logger.error(f"LLM proposal failed: {e}")
        return None

BGERetriever(model_name='BAAI/bge-m3', device=None, batch_size=32)

BGE-M3 style dense retriever for examples.

Simple wrapper that uses sentence-transformers for dense retrieval of in-context examples.

Parameters:

Name Type Description Default
model_name str

Model name for sentence-transformers

'BAAI/bge-m3'
device str | None

Device to use ("cuda", "cpu", or None for auto)

None
batch_size int

Batch size for encoding

32
Source code in src/novelentitymatcher/novelty/proposal/retrieval.py
def __init__(
    self,
    model_name: str = "BAAI/bge-m3",
    device: str | None = None,
    batch_size: int = 32,
):
    """Configure the retriever; the underlying model is loaded lazily.

    Args:
        model_name: sentence-transformers model identifier.
        device: Target device ("cuda", "cpu", or None for auto-selection).
        batch_size: Default batch size used when encoding.
    """
    # Plain configuration — no heavy work happens here.
    self.batch_size = batch_size
    self.device = device
    self.model_name = model_name

    # Lazy-initialization state; set by the first encode() call.
    self._model: Any | None = None
    self._is_initialized = False
Functions
encode(texts, batch_size=None)

Encode texts to embeddings.

Parameters:

Name Type Description Default
texts list[str]

List of texts to encode

required
batch_size int | None

Override batch size

None

Returns:

Type Description
Any

numpy array of embeddings (n, dim)

Source code in src/novelentitymatcher/novelty/proposal/retrieval.py
def encode(
    self,
    texts: list[str],
    batch_size: int | None = None,
) -> Any:
    """
    Encode texts to embeddings.

    Args:
        texts: List of texts to encode
        batch_size: Override batch size

    Returns:
        numpy array of embeddings (n, dim)
    """
    self._initialize()

    batch_size = batch_size or self.batch_size
    assert self._model is not None, "Model should be initialized"
    embeddings = self._model.encode(
        texts,
        batch_size=batch_size,
        show_progress_bar=False,
        normalize_embeddings=True,
    )
    return embeddings
similarity(query_embeddings, corpus_embeddings)

Compute similarity between query and corpus.

Parameters:

Name Type Description Default
query_embeddings Any

Query embeddings (n, dim)

required
corpus_embeddings Any

Corpus embeddings (m, dim)

required

Returns:

Type Description
ndarray

Similarity matrix (n, m)

Source code in src/novelentitymatcher/novelty/proposal/retrieval.py
def similarity(
    self,
    query_embeddings: Any,
    corpus_embeddings: Any,
) -> np.ndarray:
    """Cosine-similarity matrix between query and corpus embeddings.

    Args:
        query_embeddings: Query embeddings of shape (n, dim).
        corpus_embeddings: Corpus embeddings of shape (m, dim).

    Returns:
        Similarity matrix of shape (n, m).
    """
    # Imported lazily so sklearn is only required when similarity is used.
    from sklearn.metrics.pairwise import cosine_similarity

    sim_matrix = cosine_similarity(query_embeddings, corpus_embeddings)
    return sim_matrix

Functions

novelentitymatcher.novelty.proposal.config

LLM API configuration with validation and environment variable support.

Provides Pydantic-based configuration for LLM timeouts, retries, and circuit breaker settings to ensure production-ready LLM integration.

Classes

LLMConfig

Bases: BaseSettings

LLM API configuration with production-ready defaults.

Supports environment variable overrides via LLM_* prefix.

Environment Variables

LLM_TIMEOUT: Request timeout in seconds (default: 30)
LLM_MAX_RETRIES: Maximum retry attempts (default: 5)
LLM_CIRCUIT_FAIL_MAX: Consecutive failures before opening circuit (default: 3)
LLM_CIRCUIT_RESET_SECONDS: Circuit open duration (default: 60)

Functions
validate_timeout(v) classmethod

Ensure timeout is reasonable (1-600 seconds).

Source code in src/novelentitymatcher/novelty/proposal/config.py
@field_validator("timeout")
@classmethod
def validate_timeout(cls, v: int) -> int:
    """Reject timeouts outside the 1-600 second range."""
    if not 1 <= v <= 600:
        raise ValueError("timeout must be between 1 and 600 seconds")
    return v

Functions

get_llm_config()

Get or create LLMConfig singleton.

Returns:

Type Description
LLMConfig

LLMConfig instance with defaults or environment variable overrides.

Source code in src/novelentitymatcher/novelty/proposal/config.py
def get_llm_config() -> LLMConfig:
    """Get or create the LLMConfig singleton.

    Returns:
        LLMConfig instance with defaults or environment variable overrides.
    """
    global _default_config
    config = _default_config
    if config is None:
        # First call: build and cache the module-level singleton.
        config = LLMConfig()
        _default_config = config
    return config

novelentitymatcher.novelty.proposal.schema_enforcement

Schema enforcement for LLM proposal outputs.

Provides retry-aware validation of LLM-generated proposals against Pydantic schemas, with structured error feedback for re-prompting.

Classes

ValidationResult(is_valid, parsed=None, errors=None)

Result of validating raw LLM output against a schema.

Source code in src/novelentitymatcher/novelty/proposal/schema_enforcement.py
def __init__(
    self,
    is_valid: bool,
    parsed: BaseModel | None = None,
    errors: list[dict[str, Any]] | None = None,
):
    """Record the outcome of validating one LLM output.

    Args:
        is_valid: Whether the output matched the schema.
        parsed: The validated model instance, when validation succeeded.
        errors: Structured validation errors, when validation failed.
    """
    # Falsy errors (None or empty) normalize to a fresh empty list so
    # callers can always iterate self.errors.
    self.errors = errors or []
    self.parsed = parsed
    self.is_valid = is_valid

SchemaEnforcer(max_retries=2, schema_model=None)

Validate and enforce Pydantic schemas on LLM outputs with retry logic.

Usage::

enforcer = SchemaEnforcer(max_retries=2, schema_model=LLMProposalSchema)
result = enforcer.enforce(raw_output, proposer_fn, context)
Source code in src/novelentitymatcher/novelty/proposal/schema_enforcement.py
def __init__(
    self,
    max_retries: int = 2,
    schema_model: type[BaseModel] | None = None,
):
    """Configure the enforcer.

    Args:
        max_retries: Re-prompt attempts allowed after the first failure.
        schema_model: Pydantic model to validate against; None disables
            validation entirely.
    """
    self.schema_model = schema_model
    self.max_retries = max_retries
Functions
validate(raw_output)

Validate raw LLM output against the configured Pydantic schema.

Parameters:

Name Type Description Default
raw_output dict[str, Any]

Parsed JSON dict from LLM response.

required

Returns:

Type Description
ValidationResult

ValidationResult with validity status and any errors.

Source code in src/novelentitymatcher/novelty/proposal/schema_enforcement.py
def validate(self, raw_output: dict[str, Any]) -> ValidationResult:
    """Validate raw LLM output against the configured Pydantic schema.

    Args:
        raw_output: Parsed JSON dict from LLM response.

    Returns:
        ValidationResult with validity status and any errors.
    """
    # No schema configured: everything is accepted as-is.
    if self.schema_model is None:
        return ValidationResult(is_valid=True, parsed=None, errors=[])

    try:
        model_instance = self.schema_model(**raw_output)
    except ValidationError as exc:
        def _describe(err: dict[str, Any]) -> dict[str, Any]:
            # Flatten pydantic's error tuple into a dotted field path plus
            # a truncated preview of the offending input.
            loc = err.get("loc", [])
            return {
                "field": ".".join(str(part) for part in loc) if loc else "root",
                "type": err.get("type", "unknown"),
                "message": err.get("msg", ""),
                "input": str(err.get("input", ""))[:100],
            }

        return ValidationResult(
            is_valid=False,
            parsed=None,
            errors=[_describe(e) for e in exc.errors()],
        )
    return ValidationResult(is_valid=True, parsed=model_instance, errors=[])
enforce(raw_output, proposer_fn, context=None)

Validate with retry loop. On failure, re-prompt with error feedback.

Parameters:

Name Type Description Default
raw_output dict[str, Any]

Initial parsed LLM output to validate.

required
proposer_fn Callable[[str | None], dict[str, Any]]

Callable that takes an error feedback string and returns a new raw output dict from the LLM.

required
context dict[str, Any] | None

Optional context for error messages.

None

Returns:

Type Description
dict[str, Any]

Validated raw output dict (possibly from a retry).

Source code in src/novelentitymatcher/novelty/proposal/schema_enforcement.py
def enforce(
    self,
    raw_output: dict[str, Any],
    proposer_fn: Callable[[str | None], dict[str, Any]],
    context: dict[str, Any] | None = None,
) -> dict[str, Any]:
    """Validate with retry loop. On failure, re-prompt with error feedback.

    Args:
        raw_output: Initial parsed LLM output to validate.
        proposer_fn: Callable taking an error-feedback string and returning
            a fresh raw output dict from the LLM.
        context: Optional context for error messages.

    Returns:
        The first raw output that validates, or the last attempt if all
        retries are exhausted.
    """
    result = self.validate(raw_output)

    attempt = 0
    while not result.is_valid and attempt < self.max_retries:
        attempt += 1
        # Feed the structured errors back to the LLM and try again.
        feedback = self._build_feedback(result.errors, attempt)
        logger.info(
            "Schema validation failed (attempt %d/%d), re-prompting: %s",
            attempt,
            self.max_retries,
            result.error_summary(),
        )
        raw_output = proposer_fn(feedback)
        result = self.validate(raw_output)

    if not result.is_valid:
        logger.warning(
            "Schema enforcement exhausted retries (%d). Returning last output.",
            self.max_retries,
        )
    return raw_output

Functions