yoongkang · August 3, 2019 16:48
diff --git a/custom_label_encoder.py b/custom_label_encoder.py
 import numpy as np
 from sklearn.preprocessing import LabelEncoder
 from sklearn.utils import column_or_1d
 from sklearn.utils.multiclass import type_of_target
 from sklearn.utils.multiclass import unique_labels
 from sklearn.utils.validation import _num_samples
 from sklearn.utils.validation import check_array
 from sklearn.utils.validation import check_is_fitted


 def _encode_check_unknown(values, uniques, return_mask=False):
    """Find the values that are absent from a set of allowed values.

    Object-dtype input is handled with Python sets; every other dtype is
    handled with NumPy set routines.

    Parameters
    ----------
    values : array
        Values to check for unknowns.
    uniques : array
        Allowed unique values.
    return_mask : bool, default False
        If True, also return a boolean mask with one entry per element of
        `values`, True where that element is one of `uniques`.

    Returns
    -------
    diff : list
        The unique values present in `values` and not in `uniques` (the
        unknown values).
    valid_mask : boolean array
        Additionally returned if ``return_mask=True``.
    """
    if values.dtype == object:
        uniques_set = set(uniques)
        diff = list(set(values) - uniques_set)
        if not return_mask:
            return diff
        if diff:
            valid_mask = np.array([val in uniques_set for val in values])
        else:
            # Nothing is unknown, so skip the per-element membership test.
            valid_mask = np.ones(len(values), dtype=bool)
        return diff, valid_mask

    unique_values = np.unique(values)
    # assume_unique=True: `unique_values` was just deduplicated and `uniques`
    # is taken to hold no repeats.
    diff = list(np.setdiff1d(unique_values, uniques, assume_unique=True))
    if not return_mask:
        return diff
    if diff:
        # np.isin is the supported replacement for the deprecated np.in1d.
        valid_mask = np.isin(values, uniques)
    else:
        valid_mask = np.ones(len(values), dtype=bool)
    return diff, valid_mask



 def _encode_numpy(values, uniques=None, encode=False, check_unknown=True):
    """NumPy-based factorize/encode used by ``_encode`` for non-object dtypes.

    When `uniques` is None they are derived from `values` with ``np.unique``
    (which returns them sorted). When `uniques` is given and ``encode`` is
    True, codes are computed with ``np.searchsorted``; if ``check_unknown``
    is True, values missing from `uniques` raise ``ValueError`` first.

    Returns `uniques` alone when ``encode`` is False, otherwise the pair
    ``(uniques, encoded)``.
    """
    if uniques is None:
        if not encode:
            # np.unique already returns the sorted distinct values
            return np.unique(values)
        classes, codes = np.unique(values, return_inverse=True)
        return classes, codes

    if not encode:
        return uniques

    if check_unknown:
        unseen = _encode_check_unknown(values, uniques)
        if unseen:
            raise ValueError("y contains previously unseen labels: %s"
                             % str(unseen))
    codes = np.searchsorted(uniques, values)
    return uniques, codes


 def _encode_python(values, uniques=None, encode=False):
    """Pure-Python factorize/encode used by ``_encode`` for object dtype.

    When `uniques` is None they are the sorted distinct entries of `values`,
    stored in an array of the same dtype. With ``encode=True`` each value is
    replaced by its position in `uniques`; a value that is not in `uniques`
    is encoded as -1.

    Returns `uniques` alone when ``encode`` is False, otherwise the pair
    ``(uniques, encoded)``.
    """
    if uniques is None:
        uniques = np.array(sorted(set(values)), dtype=values.dtype)
    if not encode:
        return uniques

    position_of = {}
    for position, label in enumerate(uniques):
        position_of[label] = position
    # -1 marks a value that has no entry in `uniques`
    codes = np.array([position_of.get(label, -1) for label in values])
    return uniques, codes


 def _encode(values, uniques=None, encode=False, check_unknown=True):
    """Factorize `values` (find uniques) and optionally encode them.

    Dispatches on dtype: object arrays are handled by ``_encode_python``,
    every other dtype by ``_encode_numpy``.

    The numpy path needs `uniques` to be sorted. This is assumed rather
    than checked, so the caller must guarantee it for non-object values.

    Parameters
    ----------
    values : array
        Values to factorize or encode.
    uniques : array, optional
        If passed, uniques are not determined from `values` (for example
        because they were already determined in fit).
    encode : bool, default False
        If True, also encode the values into integer codes based on
        `uniques`.
    check_unknown : bool, default True
        Forwarded to ``_encode_numpy`` only; the object-dtype path does not
        receive it.

    Returns
    -------
    uniques
        If ``encode=False``. The unique values are sorted if the `uniques`
        parameter was None (and thus inferred from the data).
    (uniques, encoded)
        If ``encode=True``.

    Raises
    ------
    TypeError
        If the object-dtype path raises ``TypeError``; it is re-raised with
        the message "argument must be a string or number".
    """
    if values.dtype != object:
        return _encode_numpy(values, uniques, encode,
                             check_unknown=check_unknown)
    try:
        return _encode_python(values, uniques, encode)
    except TypeError:
        raise TypeError("argument must be a string or number")


 class CustomLabelEncoder(LabelEncoder):
    """Encode labels with value between 0 and n_classes-1.

    Subclass of ``LabelEncoder`` whose ``fit``, ``fit_transform`` and
    ``transform`` go through this module's ``_encode`` helper.

    Attributes
    ----------
    classes_ : array of shape (n_class,)
        Holds the label for each class.

    Examples
    --------
    >>> le = CustomLabelEncoder()
    >>> le.fit([1, 2, 2, 6])
    CustomLabelEncoder()
    >>> le.classes_
    array([1, 2, 6])
    >>> le.transform([1, 1, 2, 6])
    array([0, 0, 1, 2]...)
    >>> le.inverse_transform([0, 0, 1, 2])
    array([1, 1, 2, 6])

    It can also be used to transform non-numerical labels (as long as they
    are hashable and comparable) to numerical labels.

    >>> le = CustomLabelEncoder()
    >>> le.fit(["paris", "paris", "tokyo", "amsterdam"])
    CustomLabelEncoder()
    >>> list(le.classes_)
    ['amsterdam', 'paris', 'tokyo']
    >>> le.transform(["tokyo", "tokyo", "paris"])
    array([2, 2, 1]...)
    >>> list(le.inverse_transform([2, 2, 1]))
    ['tokyo', 'tokyo', 'paris']

    See also
    --------
    sklearn.preprocessing.OrdinalEncoder : encode categorical features
        using an ordinal encoding scheme.
    """

    def fit(self, y):
        """Learn the classes present in `y`.

        Parameters
        ----------
        y : array-like of shape (n_samples,)
            Target values.

        Returns
        -------
        self : returns an instance of self.
        """
        labels = column_or_1d(y, warn=True)
        self.classes_ = _encode(labels)
        return self

    def fit_transform(self, y):
        """Learn the classes present in `y` and return `y` encoded.

        Parameters
        ----------
        y : array-like of shape [n_samples]
            Target values.

        Returns
        -------
        y : array-like of shape [n_samples]
        """
        labels = column_or_1d(y, warn=True)
        classes, codes = _encode(labels, encode=True)
        self.classes_ = classes
        return codes

    def transform(self, y):
        """Encode `y` using the classes learned during fitting.

        Parameters
        ----------
        y : array-like of shape [n_samples]
            Target values.

        Returns
        -------
        y : array-like of shape [n_samples]
        """
        check_is_fitted(self, 'classes_')
        labels = column_or_1d(y, warn=True)
        if _num_samples(labels) == 0:
            # an empty input encodes to an empty array
            return np.array([])

        _, codes = _encode(labels, uniques=self.classes_, encode=True)
        return codes

    def inverse_transform(self, y):
        """Map integer codes back to the original labels.

        Parameters
        ----------
        y : numpy array of shape [n_samples]
            Target values.

        Returns
        -------
        y : numpy array of shape [n_samples]

        Raises
        ------
        ValueError
            If `y` contains a code outside ``range(len(self.classes_))``.
        """
        check_is_fitted(self, 'classes_')
        codes = column_or_1d(y, warn=True)
        if _num_samples(codes) == 0:
            # an empty input decodes to an empty array
            return np.array([])

        valid_codes = np.arange(len(self.classes_))
        unseen = np.setdiff1d(codes, valid_codes)
        if len(unseen) > 0:
            raise ValueError(
                    "y contains previously unseen labels: %s" % str(unseen))
        return self.classes_[np.asarray(codes)]

    def _more_tags(self):
        """Return the estimator tags declared by this class."""
        return {'X_types': ['1dlabels']}
	from sklearn.preprocessing import LabelEncoder
	from sklearn.utils import column_or_1d

	from sklearn.utils.validation import check_array
	from sklearn.utils.validation import check_is_fitted
	from sklearn.utils.validation import _num_samples
	from sklearn.utils.multiclass import unique_labels
	from sklearn.utils.multiclass import type_of_target


	def _encode_check_unknown(values, uniques, return_mask=False):
	    """Find the values that are absent from a set of allowed values.

	    Object-dtype input is handled with Python sets; every other dtype is
	    handled with NumPy set routines.

	    Parameters
	    ----------
	    values : array
	        Values to check for unknowns.
	    uniques : array
	        Allowed unique values.
	    return_mask : bool, default False
	        If True, also return a boolean mask with one entry per element of
	        `values`, True where that element is one of `uniques`.

	    Returns
	    -------
	    diff : list
	        The unique values present in `values` and not in `uniques` (the
	        unknown values).
	    valid_mask : boolean array
	        Additionally returned if ``return_mask=True``.
	    """
	    if values.dtype == object:
	        uniques_set = set(uniques)
	        diff = list(set(values) - uniques_set)
	        if return_mask:
	            if diff:
	                valid_mask = np.array(
	                    [val in uniques_set for val in values])
	            else:
	                # Nothing is unknown: skip the per-element test.
	                valid_mask = np.ones(len(values), dtype=bool)
	            return diff, valid_mask
	        else:
	            return diff
	    else:
	        unique_values = np.unique(values)
	        diff = list(np.setdiff1d(unique_values, uniques,
	                                 assume_unique=True))
	        if return_mask:
	            if diff:
	                # np.isin replaces the deprecated np.in1d.
	                valid_mask = np.isin(values, uniques)
	            else:
	                valid_mask = np.ones(len(values), dtype=bool)
	            return diff, valid_mask
	        else:
	            return diff



	def _encode_numpy(values, uniques=None, encode=False, check_unknown=True):
	    """NumPy-based factorize/encode used by ``_encode`` for non-object dtypes.

	    When `uniques` is None they are derived from `values` with
	    ``np.unique`` (which returns them sorted). When `uniques` is given and
	    ``encode`` is True, codes are computed with ``np.searchsorted``; if
	    ``check_unknown`` is True, values missing from `uniques` raise
	    ``ValueError`` first.

	    Returns `uniques` alone when ``encode`` is False, otherwise the pair
	    ``(uniques, encoded)``.
	    """
	    if uniques is None:
	        if encode:
	            uniques, encoded = np.unique(values, return_inverse=True)
	            return uniques, encoded
	        else:
	            # unique sorts
	            return np.unique(values)
	    if encode:
	        if check_unknown:
	            diff = _encode_check_unknown(values, uniques)
	            if diff:
	                raise ValueError("y contains previously unseen labels: %s"
	                                 % str(diff))
	        encoded = np.searchsorted(uniques, values)
	        return uniques, encoded
	    else:
	        return uniques


	def _encode_python(values, uniques=None, encode=False):
	    """Pure-Python factorize/encode used by ``_encode`` for object dtype.

	    When `uniques` is None they are the sorted distinct entries of
	    `values`, stored in an array of the same dtype. With ``encode=True``
	    each value is replaced by its position in `uniques`; a value that is
	    not in `uniques` is encoded as -1.

	    Returns `uniques` alone when ``encode`` is False, otherwise the pair
	    ``(uniques, encoded)``.
	    """
	    if uniques is None:
	        uniques = sorted(set(values))
	        uniques = np.array(uniques, dtype=values.dtype)
	    if encode:
	        table = {val: i for i, val in enumerate(uniques)}
	        # -1 marks a value that has no entry in `uniques`
	        encoded = np.array([table.get(v, -1) for v in values])
	        return uniques, encoded
	    else:
	        return uniques


	def _encode(values, uniques=None, encode=False, check_unknown=True):
	    """Factorize `values` (find uniques) and optionally encode them.

	    Dispatches on dtype: object arrays are handled by ``_encode_python``,
	    every other dtype by ``_encode_numpy``.

	    The numpy path needs `uniques` to be sorted. This is assumed rather
	    than checked, so the caller must guarantee it for non-object values.

	    Parameters
	    ----------
	    values : array
	        Values to factorize or encode.
	    uniques : array, optional
	        If passed, uniques are not determined from `values` (for example
	        because they were already determined in fit).
	    encode : bool, default False
	        If True, also encode the values into integer codes based on
	        `uniques`.
	    check_unknown : bool, default True
	        Forwarded to ``_encode_numpy`` only; the object-dtype path does
	        not receive it.

	    Returns
	    -------
	    uniques
	        If ``encode=False``. The unique values are sorted if the `uniques`
	        parameter was None (and thus inferred from the data).
	    (uniques, encoded)
	        If ``encode=True``.

	    Raises
	    ------
	    TypeError
	        If the object-dtype path raises ``TypeError``; it is re-raised
	        with the message "argument must be a string or number".
	    """
	    if values.dtype == object:
	        try:
	            res = _encode_python(values, uniques, encode)
	        except TypeError:
	            raise TypeError("argument must be a string or number")
	        return res
	    else:
	        return _encode_numpy(values, uniques, encode,
	                             check_unknown=check_unknown)


	class CustomLabelEncoder(LabelEncoder):
	    """Encode labels with value between 0 and n_classes-1.

	    Subclass of ``LabelEncoder`` whose ``fit``, ``fit_transform`` and
	    ``transform`` go through this module's ``_encode`` helper.

	    Attributes
	    ----------
	    classes_ : array of shape (n_class,)
	        Holds the label for each class.

	    Examples
	    --------
	    >>> le = CustomLabelEncoder()
	    >>> le.fit([1, 2, 2, 6])
	    CustomLabelEncoder()
	    >>> le.classes_
	    array([1, 2, 6])
	    >>> le.transform([1, 1, 2, 6])
	    array([0, 0, 1, 2]...)
	    >>> le.inverse_transform([0, 0, 1, 2])
	    array([1, 1, 2, 6])

	    It can also be used to transform non-numerical labels (as long as
	    they are hashable and comparable) to numerical labels.

	    >>> le = CustomLabelEncoder()
	    >>> le.fit(["paris", "paris", "tokyo", "amsterdam"])
	    CustomLabelEncoder()
	    >>> list(le.classes_)
	    ['amsterdam', 'paris', 'tokyo']
	    >>> le.transform(["tokyo", "tokyo", "paris"])
	    array([2, 2, 1]...)
	    >>> list(le.inverse_transform([2, 2, 1]))
	    ['tokyo', 'tokyo', 'paris']

	    See also
	    --------
	    sklearn.preprocessing.OrdinalEncoder : encode categorical features
	        using an ordinal encoding scheme.
	    """

	    def fit(self, y):
	        """Fit label encoder

	        Parameters
	        ----------
	        y : array-like of shape (n_samples,)
	            Target values.

	        Returns
	        -------
	        self : returns an instance of self.
	        """
	        y = column_or_1d(y, warn=True)
	        self.classes_ = _encode(y)
	        return self

	    def fit_transform(self, y):
	        """Fit label encoder and return encoded labels

	        Parameters
	        ----------
	        y : array-like of shape [n_samples]
	            Target values.

	        Returns
	        -------
	        y : array-like of shape [n_samples]
	        """
	        y = column_or_1d(y, warn=True)
	        self.classes_, y = _encode(y, encode=True)
	        return y

	    def transform(self, y):
	        """Transform labels to normalized encoding.

	        Parameters
	        ----------
	        y : array-like of shape [n_samples]
	            Target values.

	        Returns
	        -------
	        y : array-like of shape [n_samples]
	        """
	        check_is_fitted(self, 'classes_')
	        y = column_or_1d(y, warn=True)
	        # transform of empty array is empty array
	        if _num_samples(y) == 0:
	            return np.array([])

	        _, y = _encode(y, uniques=self.classes_, encode=True)
	        return y

	    def inverse_transform(self, y):
	        """Transform labels back to original encoding.

	        Parameters
	        ----------
	        y : numpy array of shape [n_samples]
	            Target values.

	        Returns
	        -------
	        y : numpy array of shape [n_samples]

	        Raises
	        ------
	        ValueError
	            If `y` contains a code outside ``range(len(self.classes_))``.
	        """
	        check_is_fitted(self, 'classes_')
	        y = column_or_1d(y, warn=True)
	        # inverse transform of empty array is empty array
	        if _num_samples(y) == 0:
	            return np.array([])

	        diff = np.setdiff1d(y, np.arange(len(self.classes_)))
	        if len(diff):
	            raise ValueError(
	                    "y contains previously unseen labels: %s" % str(diff))
	        y = np.asarray(y)
	        return self.classes_[y]

	    def _more_tags(self):
	        """Return the estimator tags declared by this class."""
	        return {'X_types': ['1dlabels']}
No results found