Created
August 3, 2019 16:48
-
-
Save yoongkang/bfb9808691863072748a5cf95f1cc898 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import column_or_1d
from sklearn.utils.multiclass import type_of_target
from sklearn.utils.multiclass import unique_labels
from sklearn.utils.validation import _num_samples
from sklearn.utils.validation import check_array
from sklearn.utils.validation import check_is_fitted
def _encode_check_unknown(values, uniques, return_mask=False):
    """
    Helper function to check for unknowns in values to be encoded.

    Uses a pure python method (set arithmetic) for object dtype, and numpy
    set routines for all other dtypes.

    Parameters
    ----------
    values : array
        Values to check for unknowns.
    uniques : array
        Allowed unique values.
    return_mask : bool, default False
        If True, also return a boolean mask with one entry per element of
        `values` marking the entries that are present in `uniques`.

    Returns
    -------
    diff : list
        The unique values present in `values` and not in `uniques` (the
        unknown values).
    valid_mask : boolean array
        Additionally returned if ``return_mask=True``.
    """
    if values.dtype == object:
        known = set(uniques)
        diff = list(set(values) - known)
        if not return_mask:
            return diff
        if diff:
            valid_mask = np.array([val in known for val in values])
        else:
            # Nothing is unknown, so skip the per-element membership test.
            valid_mask = np.ones(len(values), dtype=bool)
        return diff, valid_mask

    # np.unique returns sorted, de-duplicated values, which is what allows
    # assume_unique=True below.
    unique_values = np.unique(values)
    diff = list(np.setdiff1d(unique_values, uniques, assume_unique=True))
    if not return_mask:
        return diff
    if diff:
        # np.isin replaces np.in1d, which is deprecated as of NumPy 2.0.
        valid_mask = np.isin(values, uniques)
    else:
        valid_mask = np.ones(len(values), dtype=bool)
    return diff, valid_mask
def _encode_numpy(values, uniques=None, encode=False, check_unknown=True):
    """Factorize / encode `values` with numpy routines.

    Only used by ``_encode``. When `uniques` is None the categories are
    derived from `values` with ``np.unique`` (which sorts them). When
    `uniques` is supplied it is returned as given, and the integer codes are
    computed with ``np.searchsorted``.

    Raises
    ------
    ValueError
        If ``encode`` and ``check_unknown`` are both true, `uniques` is
        given, and `values` holds entries that are not in `uniques`.
    """
    if uniques is None:
        if not encode:
            # np.unique returns the sorted unique values
            return np.unique(values)
        categories, codes = np.unique(values, return_inverse=True)
        return categories, codes

    if not encode:
        return uniques

    if check_unknown:
        unseen = _encode_check_unknown(values, uniques)
        if unseen:
            raise ValueError("y contains previously unseen labels: %s"
                             % str(unseen))
    codes = np.searchsorted(uniques, values)
    return uniques, codes
def _encode_python(values, uniques=None, encode=False):
    """Factorize / encode object-dtype `values` in pure python.

    Only used by ``_encode``. When `uniques` is None the categories are the
    sorted set of `values`. When ``encode`` is true, each value is replaced
    by the position of that value in `uniques`; a value that does not appear
    in `uniques` is encoded as -1 instead of raising.

    Returns
    -------
    uniques
        If ``encode=False``.
    (uniques, encoded)
        If ``encode=True``.
    """
    if uniques is None:
        ordered = sorted(set(values))
        uniques = np.array(ordered, dtype=values.dtype)

    if not encode:
        return uniques

    # category -> integer code; anything missing falls back to -1
    code_of = dict(zip(uniques, range(len(uniques))))
    codes = [code_of.get(item, -1) for item in values]
    return uniques, np.array(codes)
def _encode(values, uniques=None, encode=False, check_unknown=True):
    """Factorize (find uniques) and optionally encode `values`.

    Dispatches on dtype: object arrays go through ``_encode_python``, every
    other dtype goes through ``_encode_numpy``.

    The numpy path requires `uniques` to be sorted. This is assumed, not
    checked, so the caller must guarantee it for all non-object values.

    Parameters
    ----------
    values : array
        Values to factorize or encode.
    uniques : array, optional
        If passed, the categories are not inferred from `values` (for
        example because they were already determined during fit).
    encode : bool, default False
        If True, also encode the values into integer codes based on
        `uniques`.
    check_unknown : bool, default True
        Forwarded to ``_encode_numpy`` only. If True, values that are not in
        `uniques` raise an error there. It is not forwarded for object
        dtype; ``_encode_python`` in this file encodes values missing from
        `uniques` as -1 rather than raising.

    Returns
    -------
    uniques
        If ``encode=False``. The unique values are sorted if the `uniques`
        parameter was None (and thus inferred from the data).
    (uniques, encoded)
        If ``encode=True``.

    Raises
    ------
    TypeError
        If the object-dtype path raises TypeError; it is re-raised with the
        message "argument must be a string or number".
    """
    if values.dtype != object:
        return _encode_numpy(values, uniques, encode,
                             check_unknown=check_unknown)
    try:
        return _encode_python(values, uniques, encode)
    except TypeError:
        raise TypeError("argument must be a string or number")
class CustomLabelEncoder(LabelEncoder):
    """Encode labels as integer codes using this module's ``_encode`` helper.

    The public methods mirror ``sklearn.preprocessing.LabelEncoder``, but
    fitting and transforming are routed through the module-level ``_encode``
    function defined in this file.

    Notes
    -----
    For object-dtype input, ``transform`` encodes a label that is not in
    ``classes_`` as -1 (see ``_encode_python``). For any other dtype, an
    unseen label raises ``ValueError`` (see ``_encode_numpy``).

    ``inverse_transform`` only accepts codes in
    ``range(len(classes_))``, so a -1 produced by ``transform`` is rejected
    with ``ValueError``.

    Attributes
    ----------
    classes_ : array of shape (n_class,)
        Holds the label for each class.

    See also
    --------
    sklearn.preprocessing.LabelEncoder : the estimator this class extends.
    sklearn.preprocessing.OrdinalEncoder : encode categorical features
        using an ordinal encoding scheme.
    """

    def fit(self, y):
        """Learn the set of classes from `y`.

        Parameters
        ----------
        y : array-like of shape (n_samples,)
            Target values.

        Returns
        -------
        self : returns an instance of self.
        """
        labels = column_or_1d(y, warn=True)
        self.classes_ = _encode(labels)
        return self

    def fit_transform(self, y):
        """Learn the classes from `y` and return the encoded labels.

        Parameters
        ----------
        y : array-like of shape [n_samples]
            Target values.

        Returns
        -------
        y : array-like of shape [n_samples]
        """
        labels = column_or_1d(y, warn=True)
        classes, codes = _encode(labels, encode=True)
        self.classes_ = classes
        return codes

    def transform(self, y):
        """Encode `y` using the classes learned during fit.

        Parameters
        ----------
        y : array-like of shape [n_samples]
            Target values.

        Returns
        -------
        y : array-like of shape [n_samples]
        """
        check_is_fitted(self, 'classes_')
        labels = column_or_1d(y, warn=True)
        # an empty input is answered with an empty array
        if _num_samples(labels) == 0:
            return np.array([])

        return _encode(labels, uniques=self.classes_, encode=True)[1]

    def inverse_transform(self, y):
        """Map integer codes back to the original labels.

        Parameters
        ----------
        y : numpy array of shape [n_samples]
            Target values.

        Returns
        -------
        y : numpy array of shape [n_samples]

        Raises
        ------
        ValueError
            If `y` contains a code outside ``range(len(classes_))``.
        """
        check_is_fitted(self, 'classes_')
        codes = column_or_1d(y, warn=True)
        # an empty input is answered with an empty array
        if _num_samples(codes) == 0:
            return np.array([])

        valid_codes = np.arange(len(self.classes_))
        unknown = np.setdiff1d(codes, valid_codes)
        if len(unknown):
            raise ValueError(
                "y contains previously unseen labels: %s" % str(unknown))
        return self.classes_[np.asarray(codes)]

    def _more_tags(self):
        """Return the estimator tags specific to this class."""
        return {'X_types': ['1dlabels']}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment