Skip to content

Instantly share code, notes, and snippets.

@TVect
Created November 10, 2020 03:42
Show Gist options
  • Select an option

  • Save TVect/44f41d36195917a1a16ae5b09d75e925 to your computer and use it in GitHub Desktop.

Select an option

Save TVect/44f41d36195917a1a16ae5b09d75e925 to your computer and use it in GitHub Desktop.
keywords ner
"""
Named Entity Recognition based on dictionaries
reference:
https://github.com/mpuig/spacy-lookup
https://spacy.io/usage/rule-based-matching
"""
import json
from flashtext import KeywordProcessor
class EntitySpan:
    """A labelled character span over a document string (spaCy-Span-like)."""

    def __init__(self, doc, start, end, label):
        """
        :param doc (Doc): The parent document.
        :param start (int): The index of the first character of the span.
        :param end (int): The index of the first character after the span.
        :param label (uint64 or string): A label to attach to the Span,
            e.g. for named entities.
        """
        self.doc = doc
        self.start = start
        self.end = end
        self.label = label

    def merge(self, *args, **kwargs):  # real signature unknown
        """Retokenize the document so that the span becomes a single token.

        NOTE(review): currently an unimplemented stub — it always returns
        None and has no side effects.

        **attributes: Attributes to assign to the merged token. By default,
        attributes are inherited from the syntactic root token of the span.
        RETURNS (Token): The newly merged token.
        """
        pass

    def __str__(self):
        # Serialise the span (plus the matched surface text) as pretty JSON;
        # ensure_ascii=False keeps non-ASCII characters readable.
        payload = {
            "doc": self.doc,
            "start": self.start,
            "end": self.end,
            "label": self.label,
            "entity": self.doc[self.start:self.end],
        }
        return json.dumps(payload, ensure_ascii=False, indent=4)
class KeywordsNER:
    """Dictionary-based named entity recognizer built on flashtext.

    Keywords may be supplied as a flat list, as a ``{label: [keywords]}``
    mapping, or loaded from a keyword file.
    """

    def __init__(self,
                 keywords_list=None,
                 keywords_dict=None,
                 keywords_file=None,
                 case_sensitive=False):
        """Initialise the pipeline component.

        :param keywords_list (list): plain keywords (label == surface form).
        :param keywords_dict (dict): mapping of label -> list of keywords.
        :param keywords_file (str): optional path to a keyword file.
        :param case_sensitive (bool): whether matching is case sensitive.
        """
        # Fix: the original used mutable defaults ([] / {}), which are shared
        # across all calls; None sentinels give each call a fresh empty value.
        self.keyword_processor = KeywordProcessor(case_sensitive=case_sensitive)
        self.keyword_processor.add_keywords_from_list(keywords_list or [])
        self.keyword_processor.add_keywords_from_dict(keywords_dict or {})
        if keywords_file:
            self.keyword_processor.add_keyword_from_file(keywords_file)

    def extract_entities(self, in_text):
        """Find dictionary entities in *in_text*.

        :param in_text (str): the text to scan.
        :returns (list[EntitySpan]): one span per keyword match.
        """
        # TODO: confirm whether flashtext.KeywordProcessor performs
        # longest-match by default.
        # span_info=True yields (canonical_name, start, end) tuples; the
        # indices are character positions, not word/token positions.
        matches = self.keyword_processor.extract_keywords(in_text, span_info=True)
        spans = [EntitySpan(doc=in_text, start=start, end=end, label=canonical)
                 for canonical, start, end in matches]
        # Merge each span into one token after all entities are collected —
        # merging earlier would cause mismatched indices. (EntitySpan.merge
        # is currently a no-op stub, kept for spaCy-style API parity.)
        for span in spans:
            span.merge()
        return spans
if __name__ == "__main__":
    # Demo: run the dictionary NER over a sample sentence and print each span.
    recognizer = KeywordsNER(
        keywords_dict={"LOC": ["广东省", "广东"], "PER": ["省长"]})
    entities = recognizer.extract_entities("广东省长假成绩单亮眼")
    for entity in entities:
        print(entity)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment