myedibleenso · August 29, 2025 00:19
diff --git a/align.py b/align.py
 from __future__ import annotations
 from collections import Counter
 from typing import Dict, List, Optional, Text
 from dataclasses import dataclass

 @dataclass
 class Span:
   """
   A contiguous token span using a half-open interval: `start` is inclusive
   and `end` is exclusive, so Span(start=0, end=3) covers positions 0, 1, 2.

   `tokens` and `label` are optional and default to None so that a span can
   be built from its offsets alone.  Equality looks only at the offsets.
   """
   start: int
   end: int
   tokens: Optional[List[Text]] = None
   # the Gold label from the taxonomy
   label: Optional[Text] = None

   def __len__(self) -> int:
     """
     Number of positions covered.  No +1 is needed because `end` is exclusive.
     """
     return self.end - self.start

   def overlaps(self, other: Span) -> bool:
     """
     Determine if the two spans share at least one position.
     Spans that merely touch (one ends where the other starts) do not overlap.
     """
     return max(self.start, other.start) < min(self.end, other.end)

   def subsumes(self, other: Span) -> bool:
     """
     Determine if other Span is included in this Span.
     """
     return self.start <= other.start and other.end <= self.end

   def __eq__(self, other: object) -> bool:
     """
     Two spans are equal when their offsets match; tokens and label are ignored.
     """
     if not isinstance(other, Span):
       # let Python try the reflected comparison instead of raising AttributeError
       return NotImplemented
     return (self.start == other.start and self.end == other.end)

   def score_match(self, other: Span) -> float:
     """
     Returns a score between 0 and 1 where 1 is a perfect match.

     Identical offsets score 1.0.  If this span subsumes `other`, the score is
     the fraction of this span that `other` covers.  Any other partial overlap
     is scored as intersection / union, which yields the same value as the
     subsumed case when one span contains the other.  Disjoint spans score 0.0.
     """
     # if spans are equal, it's a perfect match
     if self == other:
       return 1.0
     if self.subsumes(other):
       # what percentage of our span is matched?
       return len(other) / len(self)
     if self.overlaps(other):
       # how much do they intersect, relative to the total stretch they cover?
       intersection = min(self.end, other.end) - max(self.start, other.start)
       union = max(self.end, other.end) - min(self.start, other.start)
       return intersection / union
     return 0.0

 # tests
 # assuming a half-open interval: `start` is inclusive, `end` is **exclusive**
 # (tokens/label are passed explicitly so construction does not rely on field defaults)
 s_a = Span(start=0, end=3, tokens=None, label=None)
 assert len(s_a) == 3
 # FIXME: add simple tests for .overlaps(), ==, etc.


 predicted = Span(start=0, end=3, tokens=None, label=None)
 # gold spans
 gold_candidates = [
   Span(start=0, end=1, tokens=None, label=None),
   Span(start=2, end=5, tokens=None, label=None),
 ]

 # pair every gold candidate with its match score against the prediction
 scored_pairs = [(predicted.score_match(candidate), candidate) for candidate in gold_candidates]

 # compare on the score alone: max() over the raw (score, Span) tuples would fall
 # back to comparing the Span objects when two scores tie, and Span has no ordering
 top_score = max(score for (score, _) in scored_pairs)
 # scenario 1: find all gold spans with our highest score and create **multiple** training examples (i.e., one example for each **distinct** label)

 # scenario 2: find all gold spans with our highest score and create a training example for our most frequent label with that score.  Note that in the case of a tie, this may produce multiple examples for one original prediction.

 # label -> number of top-scoring gold spans carrying that label
 highest_scoring: Counter[Optional[Text]] = Counter(candidate.label for (score, candidate) in scored_pairs if score == top_score)

 # most_common(1) returns a one-element list holding a (label, count) pair
 [(_, most_freq)] = highest_scoring.most_common(1)

 # all labels with our highest frequency
 to_assign = [lbl for (lbl, cnt) in highest_scoring.most_common() if cnt == most_freq]

 for lbl in to_assign:
   # TODO: process our prediction by writing the CoNLL-style sentence with the new label?
   # for example, a two column tsv file:
   # I like <span>turtles</span> .          LABEL_X
   pass
	from __future__ import annotations
	from collections import Counter
	from typing import Dict, List, Optional, Text
	from dataclasses import dataclass

	@dataclass
	class Span:
		"""
		A contiguous token span using a half-open interval: `start` is inclusive
		and `end` is exclusive, so Span(start=0, end=3) covers positions 0, 1, 2.

		`tokens` and `label` are optional and default to None so that a span can
		be built from its offsets alone.  Equality looks only at the offsets.
		"""
		start: int
		end: int
		tokens: Optional[List[Text]] = None
		# the Gold label from the taxonomy
		label: Optional[Text] = None

		def __len__(self) -> int:
			"""
			Number of positions covered.  No +1 is needed because `end` is exclusive.
			"""
			return self.end - self.start

		def overlaps(self, other: Span) -> bool:
			"""
			Determine if the two spans share at least one position.
			Spans that merely touch (one ends where the other starts) do not overlap.
			"""
			return max(self.start, other.start) < min(self.end, other.end)

		def subsumes(self, other: Span) -> bool:
			"""
			Determine if other Span is included in this Span.
			"""
			return self.start <= other.start and other.end <= self.end

		def __eq__(self, other: object) -> bool:
			"""
			Two spans are equal when their offsets match; tokens and label are ignored.
			"""
			if not isinstance(other, Span):
				# let Python try the reflected comparison instead of raising AttributeError
				return NotImplemented
			return (self.start == other.start and self.end == other.end)

		def score_match(self, other: Span) -> float:
			"""
			Returns a score between 0 and 1 where 1 is a perfect match.

			Identical offsets score 1.0.  If this span subsumes `other`, the score is
			the fraction of this span that `other` covers.  Any other partial overlap
			is scored as intersection / union, which yields the same value as the
			subsumed case when one span contains the other.  Disjoint spans score 0.0.
			"""
			# if spans are equal, it's a perfect match
			if self == other:
				return 1.0
			if self.subsumes(other):
				# what percentage of our span is matched?
				return len(other) / len(self)
			if self.overlaps(other):
				# how much do they intersect, relative to the total stretch they cover?
				intersection = min(self.end, other.end) - max(self.start, other.start)
				union = max(self.end, other.end) - min(self.start, other.start)
				return intersection / union
			return 0.0

	# tests
	# assuming a half-open interval: `start` is inclusive, `end` is exclusive
	# (tokens/label are passed explicitly so construction does not rely on field defaults)
	s_a = Span(start=0, end=3, tokens=None, label=None)
	assert len(s_a) == 3
	# FIXME: add simple tests for .overlaps(), ==, etc.


	predicted = Span(start=0, end=3, tokens=None, label=None)
	# gold spans
	gold_candidates = [
		Span(start=0, end=1, tokens=None, label=None),
		Span(start=2, end=5, tokens=None, label=None),
	]

	# pair every gold candidate with its match score against the prediction
	scored_pairs = [(predicted.score_match(candidate), candidate) for candidate in gold_candidates]

	# compare on the score alone: max() over the raw (score, Span) tuples would fall
	# back to comparing the Span objects when two scores tie, and Span has no ordering
	top_score = max(score for (score, _) in scored_pairs)
	# scenario 1: find all gold spans with our highest score and create multiple training examples (i.e., one example for each distinct label)

	# scenario 2: find all gold spans with our highest score and create a training example for our most frequent label with that score. Note that in the case of a tie, this may produce multiple examples for one original prediction.

	# label -> number of top-scoring gold spans carrying that label
	highest_scoring: Counter[Optional[Text]] = Counter(candidate.label for (score, candidate) in scored_pairs if score == top_score)

	# most_common(1) returns a one-element list holding a (label, count) pair
	[(_, most_freq)] = highest_scoring.most_common(1)

	# all labels with our highest frequency
	to_assign = [lbl for (lbl, cnt) in highest_scoring.most_common() if cnt == most_freq]

	for lbl in to_assign:
		# TODO: process our prediction by writing the CoNLL-style sentence with the new label?
		# for example, a two column tsv file:
		# I like <span>turtles</span> . LABEL_X
		pass
No results found