Skip to content

Instantly share code, notes, and snippets.

@myedibleenso
Last active August 29, 2025 00:15
Show Gist options
  • Select an option

  • Save myedibleenso/e896d93fd0826f28a565ae065adf0bd1 to your computer and use it in GitHub Desktop.

Select an option

Save myedibleenso/e896d93fd0826f28a565ae065adf0bd1 to your computer and use it in GitHub Desktop.
sklearn example
# tested with ...
# numpy==1.26.0
# scikit-learn==1.2.2
from sklearn import preprocessing
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
import numpy as np

# Demo: train a text classifier over combined word/char n-gram features,
# then inspect the top-k (feature, weight) pairs that drove one prediction.

le = preprocessing.LabelEncoder()

X_raw = [
    "Frogs are amphibians", "Turtles are amphibians",
    "Dolphins are mammals", "Cats are mammals",
    "Lizards are reptiles", "Snakes are reptiles"
]
# FIX: labels must align index-for-index with X_raw.
# The original interleaved classes ("AMPHIBIAN", "MAMMAL", "REPTILE", ...),
# which mislabeled 4 of the 6 examples (e.g. "Turtles are amphibians" -> MAMMAL).
y_raw = ["AMPHIBIAN", "AMPHIBIAN", "MAMMAL", "MAMMAL", "REPTILE", "REPTILE"]
# LabelEncoder assigns integer ids in sorted label order:
# AMPHIBIAN=0, MAMMAL=1, REPTILE=2
y = le.fit_transform(y_raw)

pipeline = Pipeline([
    ("features", FeatureUnion([
        # a feature extraction pipeline for **word** n-grams
        # and combinations (pairs) of them
        ("word_ngram_combinations", Pipeline([
            ("word_ngrams", CountVectorizer(ngram_range=(1, 1))),
            # include pairs of n-grams
            ("word_poly", PolynomialFeatures(degree=2))
        ])),
        # a feature extraction pipeline for **character** n-grams
        # and combinations (pairs) of them
        ("char_ngram_combinations", Pipeline([
            ("char_ngrams", CountVectorizer(ngram_range=(2, 4), analyzer="char")),
            # include pairs of n-grams
            ("char_poly", PolynomialFeatures(degree=2))
        ])),
        # # a third feature extraction pipeline
        # # might derive stylometric features
        # ("custom_features", Pipeline([
        #     # see the Arabic NLP example I shared
        #     ('stylometric', CustomDictVectorizer()),
        # ])
        # )
    ])),
    ("clf", LogisticRegression(multi_class="ovr"))
])

pipeline.fit(X_raw, y)
# names of every column produced by the FeatureUnion (word + char branches)
features = pipeline["features"].get_feature_names_out()

x_i_raw = "Monkeys are mammals"
# transform the single example through the feature half of the pipeline
x_i = pipeline["features"].transform([x_i_raw])
y_hat = pipeline["clf"].predict(x_i)

# classifier weights (one row per class in ovr) for our predicted class
w = pipeline["clf"].coef_[y_hat]
# get 1D array
w = w.flatten()
# dimensional gymnastics to flatten the sparse row into a dense 1D vector
x_i = np.asarray(x_i.todense()).reshape(-1)

# scale weights by x_i's feature counts
# NOTE: this will zero out weights
# where features are absent for x_i
weighted_wx_i = w * x_i

k = 10
# NOTE: argsort will go from smallest to largest
# so we need to reverse it (i.e., largest positive at the front)
top_k = np.argsort(weighted_wx_i)[::-1][:k]
# some of those top k might be zero, so let's filter them out:
# NOTE: we might want to be more aggressive (ex. > 0.5)
surviving_top_k_feature_indices = top_k[weighted_wx_i[top_k] != 0]
top_k_features = features[surviving_top_k_feature_indices]
top_k_weights = weighted_wx_i[surviving_top_k_feature_indices]
print(list(zip(top_k_features, top_k_weights)))
# tested with ...
# numpy==1.26.0
# scikit-learn==1.2.2
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
import numpy as np

# Demo (simpler variant): train a bag-of-words classifier, then inspect the
# top-k (feature, weight) pairs that drove a single prediction.

le = preprocessing.LabelEncoder()
vectorizer = CountVectorizer(ngram_range=(1, 1))
clf = LogisticRegression(multi_class="ovr")

X_raw = [
    "Frogs are amphibians", "Turtles are amphibians",
    "Dolphins are mammals", "Cats are mammals",
    "Lizards are reptiles", "Snakes are reptiles"
]
# FIX: labels must align index-for-index with X_raw.
# The original interleaved classes ("AMPHIBIAN", "MAMMAL", "REPTILE", ...),
# which mislabeled 4 of the 6 examples (e.g. "Turtles are amphibians" -> MAMMAL).
y_raw = ["AMPHIBIAN", "AMPHIBIAN", "MAMMAL", "MAMMAL", "REPTILE", "REPTILE"]

X = vectorizer.fit_transform(X_raw)
# LabelEncoder assigns integer ids in sorted label order:
# AMPHIBIAN=0, MAMMAL=1, REPTILE=2
y = le.fit_transform(y_raw)
# vocabulary terms, one per column of X
features = vectorizer.get_feature_names_out()
clf.fit(X, y)

x_i_raw = "Monkeys are mammals"
x_i = vectorizer.transform([x_i_raw])
y_hat = clf.predict(x_i)

# classifier weights (one row per class in ovr) for our predicted class
w = clf.coef_[y_hat]
# get 1D array
w = w.flatten()
# dimensional gymnastics to flatten the sparse row into a dense 1D vector
x_i = np.asarray(x_i.todense()).reshape(-1)

# scale weights by x_i's feature counts
# NOTE: this will zero out weights
# where features are absent for x_i
weighted_wx_i = w * x_i

k = 3
# NOTE: argsort will go from smallest to largest
# so we need to reverse it (i.e., largest positive at the front)
top_k = np.argsort(weighted_wx_i)[::-1][:k]
# some of those top k might be zero, so let's filter them out:
# NOTE: we might want to be more aggressive (ex. > 0.5)
surviving_top_k_feature_indices = top_k[weighted_wx_i[top_k] != 0]
top_k_features = features[surviving_top_k_feature_indices]
top_k_weights = weighted_wx_i[surviving_top_k_feature_indices]
print(list(zip(top_k_features, top_k_weights)))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment