Skip to content

Instantly share code, notes, and snippets.

@TVect
Created November 10, 2020 03:42
Show Gist options
  • Select an option

  • Save TVect/44f41d36195917a1a16ae5b09d75e925 to your computer and use it in GitHub Desktop.

Select an option

Save TVect/44f41d36195917a1a16ae5b09d75e925 to your computer and use it in GitHub Desktop.
keywords ner
"""
Named Entity Recognition based on dictionaries
reference:
https://github.com/mpuig/spacy-lookup
https://spacy.io/usage/rule-based-matching
"""
import json
from flashtext import KeywordProcessor
class EntitySpan:
    """A labelled character span over a document string (spaCy-Span-like)."""

    def __init__(self, doc, start, end, label):
        """
        :param doc (Doc): The parent document.
        :param start (int): The index of the first character of the span.
        :param end (int): The index of the first character after the span.
        :param label (uint64 or string): A label to attach to the Span,
            e.g. for named entities.
        """
        self.doc = doc
        self.start = start
        self.end = end
        self.label = label

    def merge(self, *args, **kwargs):  # real signature unknown
        """Retokenize the document so that the span becomes a single token.

        NOTE(review): currently an unimplemented stub — it always returns
        None and has no side effects.

        **attributes: Attributes to assign to the merged token. By default,
        attributes are inherited from the syntactic root token of the span.
        RETURNS (Token): The newly merged token.
        """
        pass

    def __str__(self):
        # Serialise the span (plus the matched surface text) as pretty JSON;
        # ensure_ascii=False keeps non-ASCII characters readable.
        payload = {
            "doc": self.doc,
            "start": self.start,
            "end": self.end,
            "label": self.label,
            "entity": self.doc[self.start:self.end],
        }
        return json.dumps(payload, ensure_ascii=False, indent=4)
class KeywordsNER:
    """Dictionary-based named entity recognizer built on flashtext.

    Keywords may be supplied as a flat list, as a ``{label: [keywords]}``
    mapping, or loaded from a keyword file.
    """

    def __init__(self,
                 keywords_list=None,
                 keywords_dict=None,
                 keywords_file=None,
                 case_sensitive=False):
        """Initialise the pipeline component.

        :param keywords_list (list): plain keywords (label == surface form).
        :param keywords_dict (dict): mapping of label -> list of keywords.
        :param keywords_file (str): optional path to a keyword file.
        :param case_sensitive (bool): whether matching is case sensitive.
        """
        # Fix: the original used mutable defaults ([] / {}), which are shared
        # across all calls; None sentinels give each call a fresh empty value.
        self.keyword_processor = KeywordProcessor(case_sensitive=case_sensitive)
        self.keyword_processor.add_keywords_from_list(keywords_list or [])
        self.keyword_processor.add_keywords_from_dict(keywords_dict or {})
        if keywords_file:
            self.keyword_processor.add_keyword_from_file(keywords_file)

    def extract_entities(self, in_text):
        """Find dictionary entities in *in_text*.

        :param in_text (str): the text to scan.
        :returns (list[EntitySpan]): one span per keyword match.
        """
        # TODO: confirm whether flashtext.KeywordProcessor performs
        # longest-match by default.
        # span_info=True yields (canonical_name, start, end) tuples; the
        # indices are character positions, not word/token positions.
        matches = self.keyword_processor.extract_keywords(in_text, span_info=True)
        spans = [EntitySpan(doc=in_text, start=start, end=end, label=canonical)
                 for canonical, start, end in matches]
        # Merge each span into one token after all entities are collected —
        # merging earlier would cause mismatched indices. (EntitySpan.merge
        # is currently a no-op stub, kept for spaCy-style API parity.)
        for span in spans:
            span.merge()
        return spans
if __name__ == "__main__":
    # Demo: run the dictionary NER over a sample sentence and print each span.
    recognizer = KeywordsNER(
        keywords_dict={"LOC": ["广东省", "广东"], "PER": ["省长"]})
    entities = recognizer.extract_entities("广东省长假成绩单亮眼")
    for entity in entities:
        print(entity)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment