Created
July 3, 2024 15:25
-
-
Save kristiyanto/a5d9ec70a04e11f1aef9a384db71a42a to your computer and use it in GitHub Desktop.
NLP: NER using a rule-based matcher - Medium Article
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # Refer to the Jupyter Notebook and article for package imports and the complete code. | |
def extract_keywords(text, max_keywords=10):
    """Extract the most frequent rule-matched keywords from `text`.

    Parses `text` with the module-level `nlp` spaCy pipeline, runs a
    rule-based `Matcher` with patterns for noun phrases, geo-political
    entities (GPE), locations (LOC), facilities (FAC), and proximity
    phrases, and returns the most frequent matches.

    Parameters
    ----------
    text : str
        Raw text to analyze.
    max_keywords : int, optional
        Maximum number of keywords to return (default 10).

    Returns
    -------
    list[tuple[str, str]]
        (match label, lower-cased span text) pairs, most frequent first.
    """
    # Local import: the file's import cell lives in the companion notebook.
    from collections import Counter

    doc = nlp(text)
    matcher = Matcher(nlp.vocab)

    # One pattern list per match label; token specs follow spaCy's Matcher
    # syntax (POS tags, ENT_TYPE constraints, OP quantifiers).
    patterns_by_label = {
        'NOUN_PHRASE': [
            [{'POS': 'NUM'}, {'POS': 'NOUN'}],             # example: 2 bedrooms
            [{'POS': 'ADJ', 'OP': '*'}, {'POS': 'NOUN'}],  # example: beautiful house
            [{'POS': 'NOUN', 'OP': '+'}],                  # example: house
        ],
        'GPE': [
            [{'ENT_TYPE': 'GPE'}],                         # example: Tokyo
        ],
        'LOC': [
            [{'ENT_TYPE': 'LOC'}],                         # example: downtown
        ],
        'FAC': [
            [{'ENT_TYPE': 'FAC'}],                         # example: airport
        ],
        'PROXIMITY': [
            [{'POS': 'ADJ'}, {'POS': 'ADP'}, {'POS': 'NOUN', 'ENT_TYPE': 'FAC', 'OP': '?'}],   # example: near airport
            [{'POS': 'ADJ'}, {'POS': 'ADP'}, {'POS': 'PROPN', 'ENT_TYPE': 'FAC', 'OP': '?'}],  # example: near to Narita
        ],
    }
    for label, patterns in patterns_by_label.items():
        matcher.add(label, patterns)

    # Collect every (label, normalized text) match. Overlapping matches are
    # deliberately kept (no span filtering), matching the original behavior.
    keywords = [
        (nlp.vocab.strings[match_id], doc[start:end].text.strip().lower())
        for match_id, start, end in matcher(doc)
    ]

    # Counter replaces the hand-rolled frequency dict; most_common() is a
    # stable sort by count, so ties keep first-seen order exactly like
    # sorted(freq, key=freq.get, reverse=True) did.
    return [keyword for keyword, _ in Counter(keywords).most_common(max_keywords)]
def extract_tokens(text):
    """Tokenize `text` with the module-level `nlp` pipeline.

    Returns the lower-cased, stripped lemma of every token that is not a
    stop word and consists only of ASCII characters.
    """
    doc = nlp(text)
    lemmas = []
    for token in doc:
        # Skip stop words and any token containing non-ASCII characters.
        if token.is_stop or not token.is_ascii:
            continue
        lemmas.append(token.lemma_.lower().strip())
    return lemmas
| data['keywords'] = data['description'].apply(extract_keywords) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment