Last active
August 29, 2025 00:15
-
-
Save myedibleenso/e896d93fd0826f28a565ae065adf0bd1 to your computer and use it in GitHub Desktop.
sklearn example
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Demonstrates how to recover the top-k most influential features for a
# single prediction from a scikit-learn Pipeline that combines word n-grams,
# character n-grams, and polynomial (pairwise) feature interactions.
#
# tested with ...
# numpy==1.26.0
# scikit-learn==1.2.2
from sklearn import preprocessing
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
import numpy as np

le = preprocessing.LabelEncoder()

X_raw = [
    "Frogs are amphibians", "Turtles are amphibians",
    "Dolphins are mammals", "Cats are mammals",
    "Lizards are reptiles", "Snakes are reptiles"
]
# FIX: labels must align element-for-element with X_raw.  The original
# listed them in the wrong order (e.g. "Turtles are amphibians" was
# labeled MAMMAL and "Dolphins are mammals" was labeled REPTILE).
y_raw = ["AMPHIBIAN", "AMPHIBIAN", "MAMMAL", "MAMMAL", "REPTILE", "REPTILE"]
y = le.fit_transform(y_raw)

pipeline = Pipeline([
    ("features", FeatureUnion([
        # a feature extraction pipeline for **word** n-grams
        # and combinations (pairs) of them
        ("word_ngram_combinations", Pipeline([
            ("word_ngrams", CountVectorizer(ngram_range=(1, 1))),
            # include pairs of n-grams
            ("word_poly", PolynomialFeatures(degree=2))
        ])
        ),
        # a feature extraction pipeline for **character** n-grams
        # and combinations (pairs) of them
        ("char_ngram_combinations", Pipeline([
            ("char_ngrams", CountVectorizer(ngram_range=(2, 4), analyzer="char")),
            # include pairs of n-grams
            ("char_poly", PolynomialFeatures(degree=2))
        ])
        ),
        # # a third feature extraction pipeline
        # # might derive stylometric features
        # ("custom_features", Pipeline([
        #     # see the Arabic NLP example I shared
        #     ('stylometric', CustomDictVectorizer()),
        # ])
        # )
    ])
    ),
    ("clf", LogisticRegression(multi_class="ovr"))
])
pipeline.fit(X_raw, y)

# names for every column produced by the FeatureUnion (word n-grams,
# char n-grams, and their pairwise products)
features = pipeline["features"].get_feature_names_out()

x_i_raw = "Monkeys are mammals"
# transform the new example with the already-fitted feature extractors
x_i = pipeline["features"].transform([x_i_raw])
y_hat = pipeline["clf"].predict(x_i)
# classifier weights for our predicted class (one-vs-rest => one row per class)
w = pipeline["clf"].coef_[y_hat]
# get 1D array
w = w.flatten()
# densify the sparse row vector and flatten to 1D
x_i = x_i.toarray().ravel()
# scale weights by the feature values of x_i
# NOTE: this will zero out weights
# where features are absent for x_i
weighted_wx_i = w * x_i

k = 10
# NOTE: argsort will go from smallest to largest
# so we need to reverse it (i.e., largest positive at the front)
top_k = np.argsort(weighted_wx_i)[::-1][:k]
# some of those top k might be zero, so let's filter them out:
# NOTE: we might want to be more aggressive (ex. > 0.5)
surviving_top_k_feature_indices = top_k[weighted_wx_i[top_k] != 0]
top_k_features = features[surviving_top_k_feature_indices]
top_k_weights = weighted_wx_i[surviving_top_k_feature_indices]
print(list(zip(top_k_features, top_k_weights)))
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Minimal version of the feature-attribution demo: a single bag-of-words
# CountVectorizer feeding a one-vs-rest LogisticRegression, then the top-k
# features driving one prediction are printed with their weighted scores.
#
# tested with ...
# numpy==1.26.0
# scikit-learn==1.2.2
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
import numpy as np

le = preprocessing.LabelEncoder()
vectorizer = CountVectorizer(ngram_range=(1, 1))
clf = LogisticRegression(multi_class="ovr")

X_raw = [
    "Frogs are amphibians", "Turtles are amphibians",
    "Dolphins are mammals", "Cats are mammals",
    "Lizards are reptiles", "Snakes are reptiles"
]
# FIX: labels must align element-for-element with X_raw.  The original
# listed them in the wrong order (e.g. "Turtles are amphibians" was
# labeled MAMMAL and "Dolphins are mammals" was labeled REPTILE).
y_raw = ["AMPHIBIAN", "AMPHIBIAN", "MAMMAL", "MAMMAL", "REPTILE", "REPTILE"]

X = vectorizer.fit_transform(X_raw)
y = le.fit_transform(y_raw)
# vocabulary terms, in column order of X
features = vectorizer.get_feature_names_out()
clf.fit(X, y)

x_i_raw = "Monkeys are mammals"
x_i = vectorizer.transform([x_i_raw])
y_hat = clf.predict(x_i)
# classifier weights for our predicted class (one-vs-rest => one row per class)
w = clf.coef_[y_hat]
# get 1D array
w = w.flatten()
# densify the sparse row vector and flatten to 1D
x_i = x_i.toarray().ravel()
# scale weights by the feature values of x_i
# NOTE: this will zero out weights
# where features are absent for x_i
weighted_wx_i = w * x_i

k = 3
# NOTE: argsort will go from smallest to largest
# so we need to reverse it (i.e., largest positive at the front)
top_k = np.argsort(weighted_wx_i)[::-1][:k]
# some of those top k might be zero, so let's filter them out:
# NOTE: we might want to be more aggressive (ex. > 0.5)
surviving_top_k_feature_indices = top_k[weighted_wx_i[top_k] != 0]
top_k_features = features[surviving_top_k_feature_indices]
top_k_weights = weighted_wx_i[surviving_top_k_feature_indices]
print(list(zip(top_k_features, top_k_weights)))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.