Last active
May 11, 2018 11:47
-
-
Save ndaifallah/55123b85da7260527a8d169d5a40a97d to your computer and use it in GitHub Desktop.
This gist is for preprocessing delimited tag data. When a dataset column contains, for example, comma-separated ingredients for food recipes, you can use this class to binarize those tags into 0/1 indicator columns.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # I needed this in some unprepared datasets | |
| # Author Nasreddine DAIFALLAH | |
| import pandas as pn | |
class TagLabelEncoder:
    """Binarize delimiter-separated tag strings into 0/1 indicator rows.

    Each input string (e.g. ``"salt, sugar"``) is split on ``strip``, every
    piece is trimmed of surrounding whitespace and, when ``tolower`` is true,
    lowercased. ``fit`` learns the distinct tags; ``transform`` encodes each
    string as a row with one column per learned tag.

    Attributes set by the code:
        strip: the separator the strings are split on (despite the name, it
            is passed to ``str.split``, not ``str.strip``).
        tolower: whether tags are lowercased before being compared.
        vocabulary: list of distinct tags in first-seen order; only exists
            after ``fit`` has been called.
    """

    def __init__(self, strip=',', tolower=True):
        """Store the separator and the case-folding flag."""
        self.strip = strip
        self.tolower = tolower

    def _split_tags(self, text):
        """Split one raw string into trimmed, optionally lowercased tags."""
        tags = [piece.strip() for piece in text.split(self.strip)]
        if self.tolower:
            tags = [tag.lower() for tag in tags]
        return tags

    def fit(self, arr=None):
        """Learn the vocabulary from an iterable of tag strings.

        Args:
            arr: iterable of strings; ``None`` is treated as empty.
        """
        if arr is None:
            arr = []
        # A dict is used as an ordered set. Tags are normalized *before*
        # de-duplication so that "Salt" and "salt" share one column when
        # tolower is enabled.
        seen = {}
        for text in arr:
            for tag in self._split_tags(text):
                seen[tag] = True
        # Always a real list, regardless of tolower.
        self.vocabulary = list(seen)

    def transform(self, arr=None, to_dataframe=False):
        """Encode tag strings as rows of 0/1 flags, one column per tag.

        Args:
            arr: iterable of strings; ``None`` is treated as empty.
            to_dataframe: when true, delegate to ``to_dataframe`` and return
                its result instead of a list of lists.

        Returns:
            A list with one row per input string; ``row[i]`` is 1 when
            ``vocabulary[i]`` occurs in that string, else 0.

        Raises:
            KeyError: if a string contains a tag that is not in the
                vocabulary.
        """
        if to_dataframe:
            return self.to_dataframe(arr)
        if arr is None:
            arr = []
        column_of = {tag: i for i, tag in enumerate(self.vocabulary)}
        width = len(self.vocabulary)
        result = []
        for text in arr:
            row = [0] * width
            for tag in self._split_tags(text):
                row[column_of[tag]] = 1
            result.append(row)
        return result

    def to_dataframe(self, matrix=None):
        """Encode tag strings and return the result as a pandas DataFrame.

        Args:
            matrix: iterable of raw tag strings (the same input ``transform``
                takes, not an already-encoded matrix); ``None`` is treated
                as empty.

        Returns:
            A DataFrame with one column per vocabulary tag and one row per
            input string.
        """
        rows = self.transform(matrix)
        columns = {
            tag: [row[i] for row in rows]
            for i, tag in enumerate(self.vocabulary)
        }
        return pn.DataFrame.from_dict(columns)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment