Skip to content

Instantly share code, notes, and snippets.

@kboghe
Last active September 24, 2020 14:01
Show Gist options
  • Select an option

  • Save kboghe/4fc69c6adffc84dd194e17f59753396e to your computer and use it in GitHub Desktop.

Select an option

Save kboghe/4fc69c6adffc84dd194e17f59753396e to your computer and use it in GitHub Desktop.
hierarchicalclus
from sklearn.cluster import AgglomerativeClustering
from tslearn.clustering import TimeSeriesKMeans
import pandas as pd
#writing function to summarize cluster membership across regions
def cluster_membership(cases=None, cluster=None, geo_info=None):
    """Summarize cluster membership as country shares per cluster.

    Parameters
    ----------
    cases : sequence of str
        Restaurant URLs, one per clustered observation.
    cluster : sequence of int
        Cluster label assigned to each URL (same order as ``cases``).
    geo_info : pandas.DataFrame, optional
        Lookup frame with at least ``'url'`` and ``'country'`` columns.
        Defaults to the module-level ``geos_locinfo_poptimes`` frame,
        preserving the original behavior.

    Returns
    -------
    pandas.DataFrame
        One row per cluster: the share of its members located in each of
        BE/DE/ES/FR/IT/NL, plus a ``'value'`` column with the cluster size.
    """
    if geo_info is None:
        # original behavior: read the module-level frame
        geo_info = geos_locinfo_poptimes
    url_clus = pd.DataFrame({'url': cases, 'cluster': cluster})
    # attach each URL's country (de-duplicated so the merge stays 1:1)
    url_clus = pd.merge(url_clus,
                        geo_info[['url', 'country']].drop_duplicates(),
                        how='left')
    # indicator column so the pivot produces membership counts
    url_clus['index'] = 1
    url_clus = url_clus.pivot(index=['url', 'cluster'],
                              columns='country', values='index')
    # absolute member counts per (cluster, country)
    url_clus = url_clus.groupby('cluster').sum()
    # total cluster size = sum of the country counts per cluster
    total_url_clus = url_clus.T.melt().groupby('cluster')['value'].sum()
    url_clus = pd.merge(url_clus, total_url_clus, how='left',
                        left_index=True, right_index=True)
    # convert counts to within-cluster shares
    countries = ['BE', 'DE', 'ES', 'FR', 'IT', 'NL']
    url_clus[countries] = url_clus[countries].div(url_clus['value'].values, axis=0)
    return url_clus
# ---- restaurant location x hour matrix (pivoting) -------------------------
# Mean "percentage busy" per (url, hour, day), restaurants only.
byhour_pop = (geos_locinfo_poptimes[geos_locinfo_poptimes['category_aggregated'] == 'restaurant']
              .groupby(['url', 'hour list', 'day list'])['percentage busy']
              .agg(['mean'])
              .reset_index())
# Drop rows where the popular-times scrape yielded no usable hour value.
byhour_pop = byhour_pop[~byhour_pop['hour list'].isin(
    ['day marked as closed', 'not enough location data available for this day'])]
# Sortable weekday index (monday=0 ... sunday=6).
byhour_pop["day order"] = byhour_pop["day list"].replace(
    {"monday": "0", "tuesday": "1", "wednesday": "2", "thursday": "3",
     "friday": "4", "saturday": "5", "sunday": "6"})
# BUG FIX: the original sorted on 'hours_adj', a column that does not exist in
# this frame (KeyError: byhour_pop only has url / hour list / day list / mean).
# Sort on the hour parsed as a number instead, so '2' < '12' orders correctly.
byhour_pop["hour order"] = pd.to_numeric(byhour_pop['hour list'], errors='coerce')
byhour_pop = (byhour_pop.sort_values(['url', 'day order', 'hour order'])
              .drop(['day order', 'hour order'], axis=1))
# One row per restaurant, one column per (hour, day) combination; hours with
# no observation become 0 so the clustering input is a dense matrix.
byhour_pop = byhour_pop.pivot(index=['url'], columns=['hour list', 'day list'], values='mean').reset_index()
byhour_pop = byhour_pop.fillna(0).set_index('url')
# ---- hierarchical clustering solutions, from 1 to 6 clusters ---------------
url_clus_total = pd.DataFrame(columns=['BE', 'DE', 'ES', 'FR', 'IT', 'NL', 'value', 'solution'])
# BUG FIX: the original zipped over axn.flatten(), but no figure/axes grid is
# created anywhere in this script (NameError) and the axis was never used in
# the loop body — presumably a leftover from a plotting variant of this code.
titles = ['1 cluster', '2 clusters', '3 clusters',
          '4 clusters', '5 clusters', '6 clusters']
for n_clusters, title in zip(range(1, 7), titles):
    # Ward linkage implies euclidean distance, so the (deprecated, later
    # removed) 'affinity' argument is omitted for scikit-learn >= 1.2 compat.
    model = AgglomerativeClustering(n_clusters=n_clusters, linkage='ward')
    prediction = model.fit_predict(byhour_pop)
    url_clus = cluster_membership(cases=byhour_pop.index, cluster=prediction)
    url_clus['solution'] = title  # scalar broadcasts to every row
    url_clus_total = pd.concat([url_clus_total, url_clus])
# writing cluster membership across countries to hard drive
url_clus_total.to_csv("hierarchical_clustering_popularity.csv", encoding='utf-8', sep=";")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment