Created
November 10, 2020 03:42
-
-
Save TVect/44f41d36195917a1a16ae5b09d75e925 to your computer and use it in GitHub Desktop.
keywords ner
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| """ | |
| Named Entity Recognition based on dictionaries | |
| reference: | |
| https://github.com/mpuig/spacy-lookup | |
| https://spacy.io/usage/rule-based-matching | |
| """ | |
| import json | |
| from flashtext import KeywordProcessor | |
class EntitySpan:
    """A character-indexed entity match inside a document.

    Mirrors the shape of a spaCy ``Span`` (see the module docstring
    references) but stores plain character offsets into a string document.
    """

    def __init__(self, doc, start, end, label):
        """
        :param doc (Doc): The parent document.
        :param start (int): The index of the first character of the span.
        :param end (int): The index of the first character after the span.
        :param label (uint64 or string): A label to attach to the Span,
            e.g. for named entities.
        """
        self.doc, self.start, self.end, self.label = doc, start, end, label

    def merge(self, *args, **kwargs):  # real signature unknown
        """
        Retokenize the document, such that the span is merged into a single
        token.

        **attributes: Attributes to assign to the merged token. By default,
        attributes are inherited from the syntactic root token of the span.
        RETURNS (Token): The newly merged token.

        NOTE(review): currently a stub — does nothing and returns None.
        """
        return None

    def __str__(self):
        # Serialize the span, including the matched text slice, as pretty JSON.
        payload = {
            "doc": self.doc,
            "start": self.start,
            "end": self.end,
            "label": self.label,
            "entity": self.doc[self.start: self.end],
        }
        return json.dumps(payload, ensure_ascii=False, indent=4)
class KeywordsNER:
    """Dictionary-based named entity recognizer backed by flashtext.

    Keywords can be supplied as a flat list, as a ``{label: [keywords]}``
    dict, or loaded from a file in flashtext's keyword-file format.
    """

    def __init__(self,
                 keywords_list=None,
                 keywords_dict=None,
                 keywords_file=None,
                 case_sensitive=False):
        """Initialise the pipeline component.

        :param keywords_list: optional flat list of keywords (each keyword
            acts as its own label).
        :param keywords_dict: optional mapping of label -> list of keywords.
        :param keywords_file: optional path to a flashtext keyword file.
        :param case_sensitive: whether matching is case sensitive.
        """
        # FIX: the original used mutable defaults ([] / {}), which are
        # shared across all calls in Python; None sentinels avoid that.
        self.keyword_processor = KeywordProcessor(case_sensitive=case_sensitive)
        if keywords_list:
            self.keyword_processor.add_keywords_from_list(keywords_list)
        if keywords_dict:
            self.keyword_processor.add_keywords_from_dict(keywords_dict)
        if keywords_file:
            self.keyword_processor.add_keyword_from_file(keywords_file)

    def extract_entities(self, in_text):
        """Find all dictionary matches in ``in_text``.

        :param in_text: the text to scan.
        :return: list of EntitySpan, one per match, carrying character
            offsets and the canonical label of the matched keyword.
        """
        # TODO confirm: flashtext.KeywordProcessor performs longest match
        # by default.
        # span_info=True yields (canonical_name, start_char, end_char)
        # tuples; offsets are character positions, not token indices.
        matches = self.keyword_processor.extract_keywords(in_text, span_info=True)
        # FIX: dropped the original `if entity_span:` guard — a plain
        # EntitySpan instance is always truthy, so the check was dead code.
        spans = [EntitySpan(doc=in_text, start=start, end=end, label=canonical)
                 for canonical, start, end in matches]
        # TODO: EntitySpan.merge is currently a no-op stub; called after all
        # spans are built so indices stay valid once merge is implemented.
        for span in spans:
            span.merge()
        return spans
if __name__ == "__main__":
    # Quick demo: build a tiny gazetteer and print every match as JSON.
    ner = KeywordsNER(keywords_dict={"LOC": ["广东省", "广东"], "PER": ["省长"]})
    results = ner.extract_entities("广东省长假成绩单亮眼")
    for ent in results:
        print(ent)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment