Skip to content

Instantly share code, notes, and snippets.

@yoongkang
Created August 3, 2019 16:48
Show Gist options
  • Select an option

  • Save yoongkang/bfb9808691863072748a5cf95f1cc898 to your computer and use it in GitHub Desktop.

Select an option

Save yoongkang/bfb9808691863072748a5cf95f1cc898 to your computer and use it in GitHub Desktop.
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import column_or_1d
from sklearn.utils.multiclass import type_of_target
from sklearn.utils.multiclass import unique_labels
from sklearn.utils.validation import _num_samples
from sklearn.utils.validation import check_array
from sklearn.utils.validation import check_is_fitted
def _encode_check_unknown(values, uniques, return_mask=False):
    """
    Helper function to check for unknowns in values to be encoded.

    Uses pure python method for object dtype, and numpy method for
    all other dtypes.

    Parameters
    ----------
    values : array
        Values to check for unknowns.
    uniques : array
        Allowed uniques values.
    return_mask : bool, default False
        If True, return a mask of the same shape as `values` indicating
        the valid values.

    Returns
    -------
    diff : list
        The unique values present in `values` and not in `uniques` (the
        unknown values).
    valid_mask : boolean array
        Additionally returned if ``return_mask=True``.
    """
    if values.dtype == object:
        # Object arrays are compared with Python set semantics.
        uniques_set = set(uniques)
        diff = list(set(values) - uniques_set)
        if not return_mask:
            return diff
        if diff:
            valid_mask = np.array([val in uniques_set for val in values])
        else:
            # Nothing unknown: every position is valid.
            valid_mask = np.ones(len(values), dtype=bool)
        return diff, valid_mask

    unique_values = np.unique(values)
    diff = list(np.setdiff1d(unique_values, uniques, assume_unique=True))
    if not return_mask:
        return diff
    if diff:
        # np.isin replaces np.in1d, which is deprecated since NumPy 2.0.
        valid_mask = np.isin(values, uniques)
    else:
        # Nothing unknown: every position is valid.
        valid_mask = np.ones(len(values), dtype=bool)
    return diff, valid_mask
def _encode_numpy(values, uniques=None, encode=False, check_unknown=True):
    """Factorize and/or encode non-object arrays with numpy routines.

    Only used by ``_encode``. When `uniques` is None they are derived from
    `values` with ``np.unique`` (which sorts). When `uniques` is given it is
    assumed to be sorted, because codes are computed with
    ``np.searchsorted``.

    Returns `uniques` alone if ``encode`` is falsy, otherwise the tuple
    ``(uniques, encoded)``. Raises ValueError if ``check_unknown`` is set
    and `values` holds entries absent from the supplied `uniques`.
    """
    if not encode:
        # Nothing to encode: hand back the (possibly inferred) uniques.
        return np.unique(values) if uniques is None else uniques

    if uniques is None:
        classes, codes = np.unique(values, return_inverse=True)
        return classes, codes

    if check_unknown:
        unseen = _encode_check_unknown(values, uniques)
        if unseen:
            raise ValueError("y contains previously unseen labels: %s"
                             % str(unseen))
    # Relies on `uniques` being sorted; this is not verified here.
    codes = np.searchsorted(uniques, values)
    return uniques, codes
def _encode_python(values, uniques=None, encode=False):
    """Factorize and/or encode object-dtype arrays with plain Python.

    Only used by ``_encode``. When `uniques` is None, the sorted distinct
    entries of `values` are used (stored with the dtype of `values`).

    Returns `uniques` alone if ``encode`` is falsy, otherwise the tuple
    ``(uniques, encoded)``. Entries of `values` that are not found in
    `uniques` are encoded as -1 rather than raising.
    """
    if uniques is None:
        uniques = np.array(sorted(set(values)), dtype=values.dtype)

    if not encode:
        return uniques

    # Map each known label to its position in `uniques`.
    code_of = {label: code for code, label in enumerate(uniques)}
    lookup = code_of.get
    codes = np.array([lookup(label, -1) for label in values])
    return uniques, codes
def _encode(values, uniques=None, encode=False, check_unknown=True):
    """Find the unique values and optionally encode `values` as integers.

    Dispatches on dtype: object arrays go through ``_encode_python``, every
    other dtype goes through ``_encode_numpy``.

    The numpy path requires `uniques` to be sorted. This is assumed, not
    checked, so the caller must guarantee it for all non-object values.

    Parameters
    ----------
    values : array
        Values to factorize or encode.
    uniques : array, optional
        If passed, uniques are not determined from `values` (for example
        because they were already determined in fit).
    encode : bool, default False
        If True, also encode the values into integer codes based on
        `uniques`.
    check_unknown : bool, default True
        Forwarded to ``_encode_numpy`` only: if True, values missing from
        `uniques` raise an error there. It is not passed to the object-dtype
        path, where ``_encode_python`` encodes unknown values as -1.

    Returns
    -------
    uniques
        If ``encode=False``. The unique values are sorted if the `uniques`
        parameter was None (and thus inferred from the data).
    (uniques, encoded)
        If ``encode=True``.

    Raises
    ------
    TypeError
        If the object-dtype path raises TypeError; it is re-raised with the
        message "argument must be a string or number".
    """
    if values.dtype != object:
        return _encode_numpy(values, uniques, encode,
                             check_unknown=check_unknown)

    try:
        return _encode_python(values, uniques, encode)
    except TypeError:
        raise TypeError("argument must be a string or number")
class CustomLabelEncoder(LabelEncoder):
    """Label encoder built on this module's ``_encode`` helper.

    Labels are encoded as their position in the sorted ``classes_`` array.
    The difference from the parent class lies in how ``transform`` treats
    labels that were not seen during fit, as implemented by this module's
    helpers:

    * object-dtype input: unseen labels are encoded as -1;
    * any other dtype: unseen labels raise ``ValueError``.

    Attributes
    ----------
    classes_ : array of shape (n_class,)
        Holds the label for each class.

    Examples
    --------
    >>> import numpy as np
    >>> le = CustomLabelEncoder()
    >>> le.fit([1, 2, 2, 6])
    CustomLabelEncoder()
    >>> le.classes_
    array([1, 2, 6])
    >>> le.transform([1, 1, 2, 6])
    array([0, 0, 1, 2]...)
    >>> le.inverse_transform([0, 0, 1, 2])
    array([1, 1, 2, 6])

    Unseen object-dtype labels map to -1:

    >>> le = CustomLabelEncoder()
    >>> le.fit(np.array(["paris", "tokyo"], dtype=object))
    CustomLabelEncoder()
    >>> le.transform(np.array(["tokyo", "berlin"], dtype=object))
    array([ 1, -1])

    See also
    --------
    sklearn.preprocessing.OrdinalEncoder : encode categorical features
        using an ordinal encoding scheme.
    """

    def fit(self, y):
        """Learn the set of classes from `y`.

        Parameters
        ----------
        y : array-like of shape (n_samples,)
            Target values.

        Returns
        -------
        self : returns an instance of self.
        """
        labels = column_or_1d(y, warn=True)
        self.classes_ = _encode(labels)
        return self

    def fit_transform(self, y):
        """Learn the classes from `y` and return its encoded form.

        Parameters
        ----------
        y : array-like of shape [n_samples]
            Target values.

        Returns
        -------
        y : array-like of shape [n_samples]
        """
        labels = column_or_1d(y, warn=True)
        classes, codes = _encode(labels, encode=True)
        self.classes_ = classes
        return codes

    def transform(self, y):
        """Encode `y` using the classes learned during fit.

        Parameters
        ----------
        y : array-like of shape [n_samples]
            Target values.

        Returns
        -------
        y : array-like of shape [n_samples]
        """
        check_is_fitted(self, 'classes_')
        labels = column_or_1d(y, warn=True)

        # An empty input encodes to an empty array.
        if _num_samples(labels) == 0:
            return np.array([])

        _classes, codes = _encode(labels, uniques=self.classes_, encode=True)
        return codes

    def inverse_transform(self, y):
        """Map integer codes back to the original labels.

        Parameters
        ----------
        y : numpy array of shape [n_samples]
            Target values.

        Returns
        -------
        y : numpy array of shape [n_samples]

        Raises
        ------
        ValueError
            If `y` contains a value outside ``0 .. len(classes_) - 1``.
        """
        check_is_fitted(self, 'classes_')
        codes = column_or_1d(y, warn=True)

        # An empty input decodes to an empty array.
        if _num_samples(codes) == 0:
            return np.array([])

        valid_codes = np.arange(len(self.classes_))
        unseen = np.setdiff1d(codes, valid_codes)
        if len(unseen):
            raise ValueError(
                "y contains previously unseen labels: %s" % str(unseen))
        return self.classes_[np.asarray(codes)]

    def _more_tags(self):
        # Estimator tag declaring that the input is a 1d array of labels.
        return {'X_types': ['1dlabels']}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment