Skip to content

Instantly share code, notes, and snippets.

@myedibleenso
Last active August 29, 2025 00:19
Show Gist options
  • Select an option

  • Save myedibleenso/c065db1a9d293a425c46bd031b2aab3b to your computer and use it in GitHub Desktop.

Select an option

Save myedibleenso/c065db1a9d293a425c46bd031b2aab3b to your computer and use it in GitHub Desktop.
Sketch for Seetha. "fuzzy" alignment to ground truth for taxonomy
from __future__ import annotations
from collections import Counter
from typing import Dict, List, Optional, Text
from dataclasses import dataclass
@dataclass
class Span:
    """A half-open ``[start, end)`` interval with optional tokens and gold label.

    ``end`` is exclusive, so ``Span(start=0, end=3)`` covers positions 0, 1
    and 2 and has length 3. All methods assume ``start <= end``.

    Equality, overlap, subsumption and scoring look only at the positions
    (``start``/``end``); ``tokens`` and ``label`` are payload and are ignored.
    Because ``__eq__`` is defined on a mutable dataclass, instances are
    unhashable.
    """

    start: int
    end: int
    # Defaults allow position-only spans such as ``Span(start=0, end=3)``.
    tokens: Optional[List[Text]] = None
    # the Gold label from the taxonomy
    label: Optional[Text] = None

    def __len__(self) -> int:
        """Return the number of positions covered (no +1: ``end`` is exclusive)."""
        return self.end - self.start

    def _intersection(self, other: "Span") -> int:
        """Return how many positions this span shares with ``other`` (0 if none)."""
        return max(0, min(self.end, other.end) - max(self.start, other.start))

    def overlaps(self, other: "Span") -> bool:
        """Return True if the two spans share at least one position.

        Spans that merely touch (``self.end == other.start``) do not overlap,
        because the end index is exclusive.
        """
        return self._intersection(other) > 0

    def subsumes(self, other: "Span") -> bool:
        """Return True if ``other`` lies entirely inside this span.

        A span subsumes itself, so equal spans satisfy this as well.
        """
        return self.start <= other.start and other.end <= self.end

    def __eq__(self, other: object) -> bool:
        """Compare by position only; ``tokens`` and ``label`` are ignored."""
        if not isinstance(other, Span):
            # Let Python try the reflected comparison instead of raising
            # AttributeError on an unrelated type.
            return NotImplemented
        return self.start == other.start and self.end == other.end

    def score_match(self, other: "Span") -> float:
        """Return a score between 0 and 1 where 1 is a perfect match.

        * identical positions -> 1.0
        * ``other`` inside this span -> fraction of this span it covers
        * partial overlap -> intersection over union of the two spans
        * no shared position -> 0.0

        The containment score is the same intersection-over-union value, so
        scores are comparable across the branches.
        """
        # if spans are equal, it's a perfect match
        if self == other:
            return 1.0
        if self.subsumes(other):
            # what percentage of our span is matched?
            return len(other) / len(self)
        if self.overlaps(other):
            # shared positions relative to the total positions covered by either
            shared = self._intersection(other)
            return shared / (len(self) + len(other) - shared)
        return 0.0
# tests
# assuming an **exclusive** interval
# tokens/label are passed explicitly so construction does not depend on Span
# providing defaults for them.
s_a = Span(start=0, end=3, tokens=None, label=None)
assert len(s_a) == 3
# FIXME: add simple tests for .overlaps(), ==, etc.
predicted = Span(start=0, end=3, tokens=None, label=None)
# gold spans
gold_candidates = [
    Span(start=0, end=1, tokens=None, label=None),
    Span(start=2, end=5, tokens=None, label=None),
]
scored_pairs = [
    (predicted.score_match(candidate), candidate)
    for candidate in gold_candidates
]
# Take the max over the scores only: max() over the (score, Span) tuples would
# fall back to comparing Span objects on a tied score, and Span has no ordering.
top_score = max(score for (score, _) in scored_pairs)
# scenario 1: find all gold spans with our highest score and create **multiple** training examples (i.e., one example for each **distinct** label)
# scenario 2: find all gold spans with our highest score and create a training example for our most frequent label with that score. Note that in the case of a tie, this may produce multiple examples for one original prediction.
highest_scoring: Counter[Optional[Text]] = Counter(
    candidate.label for (score, candidate) in scored_pairs if score == top_score
)
# most_common(1) returns a one-element list of (label, count) pairs, so take
# the count out of that single pair.
_, most_freq = highest_scoring.most_common(1)[0]
# all labels with our highest frequency
to_assign = [lbl for (lbl, cnt) in highest_scoring.most_common() if cnt == most_freq]
for lbl in to_assign:
    # TODO: process our prediction by writing the CoNLL-style sentence with the new label?
    # for example, a two column tsv file:
    # I like <span>turtles</span> . LABEL_X
    pass
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment