Novelty Dashboard
import marimo as mo
import matplotlib.pyplot as plt
from novelentitymatcher import Matcher , NovelEntityMatcher
from novelentitymatcher.novelty import DetectionConfig
from novelentitymatcher.novelty.config.strategies import (
ConfidenceConfig ,
KNNConfig ,
)
mo . md (
"""
# Novel Entity Detection Dashboard
Feed in text queries and see which ones are flagged as **novel** (not matching
any known entity). Uses confidence + KNN distance strategies.
"""
)
entities = [
{ "id" : "physics" , "name" : "Physics" },
{ "id" : "cs" , "name" : "Computer Science" },
{ "id" : "biology" , "name" : "Biology" },
{ "id" : "chemistry" , "name" : "Chemistry" },
{ "id" : "math" , "name" : "Mathematics" },
]
training_texts = [
"quantum mechanics" ,
"wave function" ,
"particle physics" ,
"thermodynamics" ,
"machine learning" ,
"neural networks" ,
"algorithm design" ,
"data structures" ,
"gene expression" ,
"protein synthesis" ,
"cell division" ,
"DNA replication" ,
"organic synthesis" ,
"molecular bonding" ,
"periodic table" ,
"reaction kinetics" ,
"linear algebra" ,
"calculus" ,
"topology" ,
"number theory" ,
]
training_labels = [
"physics" ,
"physics" ,
"physics" ,
"physics" ,
"cs" ,
"cs" ,
"cs" ,
"cs" ,
"biology" ,
"biology" ,
"biology" ,
"biology" ,
"chemistry" ,
"chemistry" ,
"chemistry" ,
"chemistry" ,
"math" ,
"math" ,
"math" ,
"math" ,
]
matcher = Matcher ( entities = entities , model = "minilm" , threshold = 0.6 )
matcher . fit ( texts = training_texts , labels = training_labels )
mo . md (
f "**Trained matcher** with { len ( training_texts ) } samples across { len ( entities ) } classes."
)
query_area = mo . ui . text_area (
value = "quantum superposition \n deep learning models \n CRISPR gene editing \n computational chemistry methods \n autonomous robotics \n discount coupon policy \n weather forecast today \n gibberish zxqv placeholder" ,
label = "Enter queries (one per line):" ,
full_width = True ,
)
query_area
_queries = [ q . strip () for q in query_area . value . strip () . split ( " \n " ) if q . strip ()]
if not _queries :
detection_output = mo . md ( "Add some queries above!" )
else :
_nm = NovelEntityMatcher (
matcher = matcher ,
detection_config = DetectionConfig (
strategies = [ "confidence" , "knn_distance" ],
confidence = ConfidenceConfig ( threshold = 0.45 ),
knn_distance = KNNConfig ( distance_threshold = 0.45 ),
combine_method = "weighted" ,
),
)
_rows = []
for _q in _queries :
_r = _nm . match ( _q )
_rows . append (
{
"query" : _q ,
"is_novel" : _r . is_novel ,
"predicted_id" : _r . predicted_id if not _r . is_novel else "—" ,
"score" : f " { _r . score : .2% } " ,
"signals" : ", " . join ( _r . signals . keys ()) if _r . signals else "—" ,
}
)
detection_output = mo . ui . table ( _rows , label = "Detection Results" )
detection_output
_queries = [ q . strip () for q in query_area . value . strip () . split ( " \n " ) if q . strip ()]
if not _queries :
chart_output = mo . md ( "No queries to plot." )
else :
_scores = []
for _q in _queries :
_r = matcher . match ( _q )
_entry = _r if isinstance ( _r , dict ) else _r
_score = _entry . get ( "score" , 0 ) if isinstance ( _entry , dict ) else 0
_scores . append ( _score )
_fig , _ax = plt . subplots ( figsize = ( 8 , 4 ))
_colors = [ "#e74c3c" if s < 0.5 else "#2ecc71" for s in _scores ]
_ax . barh ( range ( len ( _queries )), _scores , color = _colors )
_ax . set_yticks ( range ( len ( _queries )))
_ax . set_yticklabels ( _queries , fontsize = 9 )
_ax . set_xlabel ( "Match Confidence" )
_ax . set_title ( "Confidence Scores (red = likely novel)" )
_ax . axvline ( x = 0.5 , color = "gray" , linestyle = "--" , alpha = 0.7 , label = "threshold" )
_ax . legend ()
plt . tight_layout ()
chart_output = _fig
chart_output