Skip to content

Instantly share code, notes, and snippets.

@ndaifallah
Last active May 11, 2018 11:47
Show Gist options
  • Select an option

  • Save ndaifallah/55123b85da7260527a8d169d5a40a97d to your computer and use it in GitHub Desktop.

Select an option

Save ndaifallah/55123b85da7260527a8d169d5a40a97d to your computer and use it in GitHub Desktop.
This gist is a small preprocessing helper. When a dataset column contains delimiter-separated tags (for example, the ingredients of a food recipe), you can use this class to binarize those tags into one 0/1 indicator column per tag.
# I needed this in some unprepared datasets
# Author Nasreddine DAIFALLAH
import pandas as pn
class TagLabelEncoder:
    """Binarize delimiter-separated tag strings into 0/1 indicator rows.

    Each input item is a string such as ``"salt, pepper, sugar"``. ``fit``
    collects the distinct tags into ``self.vocabulary`` (a list, in order of
    first appearance) and ``transform`` turns each string into a row with one
    0/1 entry per vocabulary tag.

    Tags are stripped of surrounding whitespace and, when ``tolower`` is true,
    lower-cased *before* they are deduplicated, so ``"Salt"`` and ``"salt"``
    share a single column. Empty segments (for example the one produced by a
    trailing separator) are kept as the empty-string tag.

    Args:
        strip: The separator the tag strings are split on. (Despite the name
            it is not a set of characters to strip.)
        tolower: Whether tags are lower-cased before being compared.
    """

    def __init__(self, strip=',', tolower=True):
        self.strip = strip
        self.tolower = tolower

    def _normalize(self, tag):
        """Return *tag* in the canonical form used as a vocabulary key."""
        tag = tag.strip()
        return tag.lower() if self.tolower else tag

    def fit(self, arr=None):
        """Learn the vocabulary from an iterable of tag strings.

        Args:
            arr: Iterable of strings, each holding tags joined by the
                separator. ``None`` is treated as an empty iterable.
        """
        # ``is None`` rather than truthiness: array-like inputs such as a
        # pandas Series do not support a plain boolean test.
        if arr is None:
            arr = []
        # A dict is used as an insertion-ordered set so that columns keep the
        # order in which tags were first seen.
        seen = {}
        for text in arr:
            for tag in text.split(self.strip):
                # Normalize first, then deduplicate; doing it the other way
                # round leaves duplicate entries once case is folded.
                seen[self._normalize(tag)] = True
        self.vocabulary = list(seen)

    def transform(self, arr=None, to_dataframe=False):
        """Encode tag strings as rows of 0/1 indicators.

        Args:
            arr: Iterable of tag strings; ``None`` is treated as empty.
            to_dataframe: When true, return the result of
                ``to_dataframe(arr)`` instead of a list of lists.

        Returns:
            A list with one row per input string, each row holding one 0/1
            entry per vocabulary tag, or a DataFrame when ``to_dataframe`` is
            true.

        Raises:
            KeyError: If a string contains a tag that was not seen by ``fit``.
            AttributeError: If called before ``fit`` (no vocabulary yet).
        """
        if arr is None:
            arr = []
        if to_dataframe:
            return self.to_dataframe(arr)
        column_of = {tag: col for col, tag in enumerate(self.vocabulary)}
        width = len(self.vocabulary)
        result = []
        for text in arr:
            row = [0] * width
            for tag in text.split(self.strip):
                row[column_of[self._normalize(tag)]] = 1
            result.append(row)
        return result

    def to_dataframe(self, matrix=None):
        """Encode tag strings and return them as a pandas DataFrame.

        Args:
            matrix: Iterable of tag strings (the same input ``transform``
                takes); ``None`` is treated as empty.

        Returns:
            A DataFrame with one column per vocabulary tag and one row per
            input string.
        """
        rows = self.transform(matrix)
        columns = {
            tag: [row[col] for row in rows]
            for col, tag in enumerate(self.vocabulary)
        }
        return pn.DataFrame.from_dict(columns)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment