Last active
September 24, 2020 14:01
-
-
Save kboghe/4fc69c6adffc84dd194e17f59753396e to your computer and use it in GitHub Desktop.
hierarchicalclus
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from sklearn.cluster import AgglomerativeClustering | |
| from tslearn.clustering import TimeSeriesKMeans | |
| import pandas as pd | |
| #writing function to summarize cluster membership across regions | |
def cluster_membership(cases=None, cluster=None):
    """Summarize cluster membership shares across countries.

    Parameters
    ----------
    cases : sequence
        Location identifiers (urls), one per clustered observation.
    cluster : sequence
        Cluster label for each entry of ``cases``.

    Returns
    -------
    pandas.DataFrame
        One row per cluster with the share of member locations per country
        (columns BE/DE/ES/FR/IT/NL) plus a 'value' column holding the
        cluster's total location count.

    Notes
    -----
    Reads the module-level ``geos_locinfo_poptimes`` frame to map each url
    to its country.
    """
    membership = pd.DataFrame({'url': cases, 'cluster': cluster})
    # Attach each url's country (one row per url).
    countries = geos_locinfo_poptimes[['url', 'country']].drop_duplicates()
    membership = pd.merge(membership, countries, how='left')
    # Indicator column so the pivot counts memberships.
    membership['index'] = 1
    membership = membership.pivot(index=['url', 'cluster'],
                                  columns='country', values='index')
    # Collapse urls: per-cluster counts per country.
    membership = membership.groupby('cluster').sum()
    # Per-cluster totals across all countries.
    totals = membership.T.melt().groupby('cluster')['value'].sum()
    membership = pd.merge(membership, totals, how='left',
                          left_index=True, right_index=True)
    # Convert counts into within-cluster shares.
    country_cols = ['BE', 'DE', 'ES', 'FR', 'IT', 'NL']
    membership[country_cols] = membership[country_cols].div(
        membership['value'].values, axis=0)
    return membership
# Subset the data to create a restaurant location x (hour, day) matrix (pivoting).
byhour_pop = (
    geos_locinfo_poptimes[geos_locinfo_poptimes['category_aggregated'] == 'restaurant']
    .groupby(['url', 'hour list', 'day list'])['percentage busy']
    .agg(['mean'])
    .reset_index()
)
# Drop rows that carry status markers instead of an hour value.
byhour_pop = byhour_pop[~byhour_pop['hour list'].isin(
    ['day marked as closed', 'not enough location data available for this day'])]
# Sortable weekday rank, monday..sunday.
byhour_pop["day order"] = byhour_pop["day list"].replace(
    {"monday": "0", "tuesday": "1", "wednesday": "2", "thursday": "3",
     "friday": "4", "saturday": "5", "sunday": "6"})
# BUG FIX: the original sorted on and dropped 'hours_adj', a column that is
# never created here (byhour_pop only holds url / hour list / day list /
# mean / day order), which raises KeyError. Sort on 'hour list' instead and
# drop only the helper column.
# NOTE(review): 'hour list' holds strings, so this sort is lexicographic
# ('10' < '2'); row order does not affect the pivot below — TODO confirm intent.
byhour_pop = (byhour_pop.sort_values(['url', 'day order', 'hour list'])
              .drop('day order', axis=1))
# One row per location, one column per (hour, day) combination.
byhour_pop = byhour_pop.pivot(index=['url'], columns=['hour list', 'day list'],
                              values='mean').reset_index()
# Missing combinations mean no recorded busyness -> treat as 0.
byhour_pop = byhour_pop.fillna(0).set_index('url')
# Calculate hierarchical clustering solutions, from 1 to 6 clusters, and
# collect the per-country membership shares of every solution.
solution_frames = []
titles = ['1 cluster', '2 clusters', '3 clusters',
          '4 clusters', '5 clusters', '6 clusters']
# NOTE(review): axn is a matplotlib axes grid created elsewhere in the file;
# only the zip length is used in this visible span — TODO confirm whether
# plotting code on `ax` was removed.
for n_clusters, ax, title in zip(range(1, 7), axn.flatten(), titles):
    model = AgglomerativeClustering(n_clusters=n_clusters,
                                    affinity='euclidean', linkage='ward')
    prediction = model.fit_predict(byhour_pop)
    url_clus = cluster_membership(cases=byhour_pop.index, cluster=prediction)
    url_clus['solution'] = title
    solution_frames.append(url_clus)
# Single concat instead of the original quadratic concat-inside-the-loop
# (which was also seeded with an empty typed DataFrame, a deprecated
# pattern); reindex keeps the original column order.
url_clus_total = pd.concat(solution_frames).reindex(
    columns=['BE', 'DE', 'ES', 'FR', 'IT', 'NL', 'value', 'solution'])
# Write cluster membership across countries to the hard drive.
url_clus_total.to_csv("hierarchical_clustering_popularity.csv",
                      encoding='utf-8', sep=";")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment