Skip to content

Instantly share code, notes, and snippets.

@kristiyanto
Created July 3, 2024 15:25
Show Gist options
  • Select an option

  • Save kristiyanto/a5d9ec70a04e11f1aef9a384db71a42a to your computer and use it in GitHub Desktop.

Select an option

Save kristiyanto/a5d9ec70a04e11f1aef9a384db71a42a to your computer and use it in GitHub Desktop.
NLP: NER using rule based matcher - Medium Article
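# Refer to the Jupyter Notebook and article for package imports and the complete code.
# This snippet relies on names defined there: `nlp`, `Matcher` and `data`
# (presumably a loaded spaCy pipeline, spaCy's rule-based Matcher class, and a
# pandas DataFrame with a 'description' column -- confirm against the notebook).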
def extract_keywords(text, max_keywords=10):
    """Extract the most frequent rule-matched keywords from ``text``.

    The text is run through the module-level ``nlp`` pipeline and scanned with
    a ``Matcher`` holding five groups of token patterns, registered under the
    labels ``NOUN_PHRASE``, ``GPE``, ``LOC``, ``FAC`` and ``PROXIMITY``. Every
    matched span is stripped and lower-cased, and identical ``(label, text)``
    pairs are counted.

    Args:
        text: The text to analyse. ``None`` and float ``NaN`` (the usual
            stand-ins for a missing cell when this is applied over a data
            column) produce an empty list instead of being handed to ``nlp``.
        max_keywords: Upper bound on the number of pairs returned. It is
            applied as a list slice (``[:max_keywords]``).

    Returns:
        A list of ``(label, keyword)`` tuples ordered from most to least
        frequent. Pairs with the same count keep the order in which they were
        first matched.
    """
    from collections import Counter

    # Missing values carry no text. NaN is the only float unequal to itself,
    # which lets us detect it without importing anything extra.
    if text is None or (isinstance(text, float) and text != text):
        return []

    doc = nlp(text)

    # Label -> token patterns. Insertion order is the registration order.
    labelled_patterns = {
        # Noun and noun phrases
        'NOUN_PHRASE': [
            [{'POS': 'NUM'}, {'POS': 'NOUN'}],  # example: 2 bedrooms
            [{'POS': 'ADJ', 'OP': '*'}, {'POS': 'NOUN'}],  # example: beautiful house
            [{'POS': 'NOUN', 'OP': '+'}],  # example: house
        ],
        # Geo-political entity
        'GPE': [
            [{'ENT_TYPE': 'GPE'}],  # example: Tokyo
        ],
        # Location
        'LOC': [
            [{'ENT_TYPE': 'LOC'}],  # example: downtown
        ],
        # Facility
        'FAC': [
            [{'ENT_TYPE': 'FAC'}],  # example: airport
        ],
        # Proximity
        "PROXIMITY": [
            [{'POS': 'ADJ'}, {'POS': 'ADP'},
             {'POS': 'NOUN', 'ENT_TYPE': 'FAC', 'OP': '?'}],  # example: near airport
            [{'POS': 'ADJ'}, {'POS': 'ADP'},
             {'POS': 'PROPN', 'ENT_TYPE': 'FAC', 'OP': '?'}],  # example: near to Narita
        ],
    }

    # NOTE: the matcher is rebuilt on every call because it is bound to the
    # vocabulary of whatever ``nlp`` refers to at call time.
    matcher = Matcher(nlp.vocab)
    for label, patterns in labelled_patterns.items():
        matcher.add(label, patterns)

    # Tally each (label, normalised span text) pair. Counter keeps
    # first-seen order, which the stable sort below relies on for ties.
    counts = Counter(
        (nlp.vocab.strings[match_id], doc[start:end].text.strip().lower())
        for match_id, start, end in matcher(doc)
    )

    # sorted() stays stable with reverse=True, so equally frequent keywords
    # remain in first-match order.
    ranked = sorted(counts, key=counts.get, reverse=True)
    return ranked[:max_keywords]
def extract_tokens(text):
    """Return the normalised lemmas of the kept tokens in ``text``.

    ``text`` is processed with the module-level ``nlp`` pipeline. A token is
    kept only if it is not flagged as a stop word and is flagged as ASCII; its
    lemma is then lower-cased and stripped of surrounding whitespace.

    Args:
        text: The text to tokenise.

    Returns:
        A list of lemma strings in document order.
    """
    # NOTE(review): nothing here filters punctuation or whitespace tokens, so
    # such entries (possibly empty strings after strip) may appear -- confirm
    # this is intended.
    lemmas = []
    for tok in nlp(text):
        if tok.is_stop or not tok.is_ascii:
            continue
        lemmas.append(tok.lemma_.lower().strip())
    return lemmas
# Store under data['keywords'] the result of calling extract_keywords on each
# value of data['description'] (max_keywords is left at its default of 10).
# NOTE(review): `data` is not defined in this snippet; presumably a pandas
# DataFrame loaded in the accompanying notebook -- confirm.
data['keywords'] = data['description'].apply(extract_keywords)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment