Last active
August 29, 2025 00:15
-
-
Save myedibleenso/e896d93fd0826f28a565ae065adf0bd1 to your computer and use it in GitHub Desktop.
sklearn example
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Demonstrates how to recover the top-k most influential features for a
# single prediction from a scikit-learn Pipeline that combines word n-grams,
# character n-grams, and polynomial (pairwise) feature interactions.
#
# tested with ...
# numpy==1.26.0
# scikit-learn==1.2.2
from sklearn import preprocessing
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
import numpy as np

le = preprocessing.LabelEncoder()

X_raw = [
    "Frogs are amphibians", "Turtles are amphibians",
    "Dolphins are mammals", "Cats are mammals",
    "Lizards are reptiles", "Snakes are reptiles"
]
# FIX: labels must align element-for-element with X_raw.  The original
# listed them in the wrong order (e.g. "Turtles are amphibians" was
# labeled MAMMAL and "Dolphins are mammals" was labeled REPTILE).
y_raw = ["AMPHIBIAN", "AMPHIBIAN", "MAMMAL", "MAMMAL", "REPTILE", "REPTILE"]
y = le.fit_transform(y_raw)

pipeline = Pipeline([
    ("features", FeatureUnion([
        # a feature extraction pipeline for **word** n-grams
        # and combinations (pairs) of them
        ("word_ngram_combinations", Pipeline([
            ("word_ngrams", CountVectorizer(ngram_range=(1, 1))),
            # include pairs of n-grams
            ("word_poly", PolynomialFeatures(degree=2))
        ])
        ),
        # a feature extraction pipeline for **character** n-grams
        # and combinations (pairs) of them
        ("char_ngram_combinations", Pipeline([
            ("char_ngrams", CountVectorizer(ngram_range=(2, 4), analyzer="char")),
            # include pairs of n-grams
            ("char_poly", PolynomialFeatures(degree=2))
        ])
        ),
        # # a third feature extraction pipeline
        # # might derive stylometric features
        # ("custom_features", Pipeline([
        #     # see the Arabic NLP example I shared
        #     ('stylometric', CustomDictVectorizer()),
        # ])
        # )
    ])
    ),
    ("clf", LogisticRegression(multi_class="ovr"))
])
pipeline.fit(X_raw, y)

# names for every column produced by the FeatureUnion (word n-grams,
# char n-grams, and their pairwise products)
features = pipeline["features"].get_feature_names_out()

x_i_raw = "Monkeys are mammals"
# transform the new example with the already-fitted feature extractors
x_i = pipeline["features"].transform([x_i_raw])
y_hat = pipeline["clf"].predict(x_i)
# classifier weights for our predicted class (one-vs-rest => one row per class)
w = pipeline["clf"].coef_[y_hat]
# get 1D array
w = w.flatten()
# densify the sparse row vector and flatten to 1D
x_i = x_i.toarray().ravel()
# scale weights by the feature values of x_i
# NOTE: this will zero out weights
# where features are absent for x_i
weighted_wx_i = w * x_i

k = 10
# NOTE: argsort will go from smallest to largest
# so we need to reverse it (i.e., largest positive at the front)
top_k = np.argsort(weighted_wx_i)[::-1][:k]
# some of those top k might be zero, so let's filter them out:
# NOTE: we might want to be more aggressive (ex. > 0.5)
surviving_top_k_feature_indices = top_k[weighted_wx_i[top_k] != 0]
top_k_features = features[surviving_top_k_feature_indices]
top_k_weights = weighted_wx_i[surviving_top_k_feature_indices]
print(list(zip(top_k_features, top_k_weights)))
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Minimal version of the feature-attribution demo: a single bag-of-words
# CountVectorizer feeding a one-vs-rest LogisticRegression, then the top-k
# features driving one prediction are printed with their weighted scores.
#
# tested with ...
# numpy==1.26.0
# scikit-learn==1.2.2
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
import numpy as np

le = preprocessing.LabelEncoder()
vectorizer = CountVectorizer(ngram_range=(1, 1))
clf = LogisticRegression(multi_class="ovr")

X_raw = [
    "Frogs are amphibians", "Turtles are amphibians",
    "Dolphins are mammals", "Cats are mammals",
    "Lizards are reptiles", "Snakes are reptiles"
]
# FIX: labels must align element-for-element with X_raw.  The original
# listed them in the wrong order (e.g. "Turtles are amphibians" was
# labeled MAMMAL and "Dolphins are mammals" was labeled REPTILE).
y_raw = ["AMPHIBIAN", "AMPHIBIAN", "MAMMAL", "MAMMAL", "REPTILE", "REPTILE"]

X = vectorizer.fit_transform(X_raw)
y = le.fit_transform(y_raw)
# vocabulary terms, in column order of X
features = vectorizer.get_feature_names_out()
clf.fit(X, y)

x_i_raw = "Monkeys are mammals"
x_i = vectorizer.transform([x_i_raw])
y_hat = clf.predict(x_i)
# classifier weights for our predicted class (one-vs-rest => one row per class)
w = clf.coef_[y_hat]
# get 1D array
w = w.flatten()
# densify the sparse row vector and flatten to 1D
x_i = x_i.toarray().ravel()
# scale weights by the feature values of x_i
# NOTE: this will zero out weights
# where features are absent for x_i
weighted_wx_i = w * x_i

k = 3
# NOTE: argsort will go from smallest to largest
# so we need to reverse it (i.e., largest positive at the front)
top_k = np.argsort(weighted_wx_i)[::-1][:k]
# some of those top k might be zero, so let's filter them out:
# NOTE: we might want to be more aggressive (ex. > 0.5)
surviving_top_k_feature_indices = top_k[weighted_wx_i[top_k] != 0]
top_k_features = features[surviving_top_k_feature_indices]
top_k_weights = weighted_wx_i[surviving_top_k_feature_indices]
print(list(zip(top_k_features, top_k_weights)))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.