# Visit-duration analysis.
#
# Pipeline: load the visits table, flag implausibly short/long visits, keep
# only ids whose visits are mostly not "too fast", compute per-id and
# per-name duration statistics, collapse small names into a single
# "Другие" bucket, and draw one duration histogram per resulting group.
#
# NOTE(review): `id` presumably identifies a station and `name` the network
# it belongs to (judging by the variable names below); `time_spent` is
# presumably in seconds -- confirm against the dataset description.
import pandas as pd

# The file is tab-separated despite its .csv extension.
data = pd.read_csv("/datasets/visits.csv", sep="\t")

# Shift the parsed timestamps by +3 hours.
# NOTE(review): presumably converts UTC to a local UTC+3 zone -- confirm.
data['local_time'] = (
    pd.to_datetime(data['date_time'], yearfirst=True)
    + pd.Timedelta(hours=3)
)
# Lowercase 'h' is the non-deprecated hourly alias (uppercase 'H' is
# deprecated since pandas 2.2); older pandas versions accept it as well.
data['date_hour'] = data['local_time'].dt.round('1h')

# Flags for visits outside the accepted duration window [60, 1000].
data['too_fast'] = data['time_spent'] < 60
data['too_slow'] = data['time_spent'] > 1000

# pivot_table defaults to the mean, so for a boolean column this yields the
# share of too-fast visits per id.
too_fast_stat = data.pivot_table(index='id', values='too_fast')
# Keep ids where fewer than half of the visits are too fast.
good_ids = too_fast_stat.query('too_fast < 0.5')

good_data = data.query('id in @good_ids.index')
# Explicit copy: a `group_name` column is assigned to good_data further
# down, and writing into a filtered selection without copying triggers
# SettingWithCopyWarning on classic (non copy-on-write) pandas.
good_data = good_data.query('60 <= time_spent <= 1000').copy()

# Median visit duration per id, before and after filtering.
station_stat = data.pivot_table(
    index="id", values="time_spent", aggfunc="median"
)
good_station_stat = good_data.pivot_table(
    index="id", values="time_spent", aggfunc="median"
)

# Per-name statistics: mean over the raw data (pivot_table default) next to
# the median over the filtered data.
stat = data.pivot_table(index='name', values='time_spent')
good_stat = good_data.pivot_table(
    index='name', values='time_spent', aggfunc='median'
)
stat['good_time_spent'] = good_stat['time_spent']

# For every id: its name (first occurrence) and the number of filtered
# visits. The two-level column index produced by the aggfunc list is
# flattened to plain names.
id_name = good_data.pivot_table(
    index='id', values='name', aggfunc=['first', 'count']
)
id_name.columns = ['name', 'count']
station_stat_full = id_name.join(good_station_stat)

# Per name, over ids with more than 30 filtered visits: the median of the
# per-id medians and how many such ids there are.
good_stat2 = (
    station_stat_full
    .query('count > 30')
    .pivot_table(index='name', values='time_spent',
                 aggfunc=['median', 'count'])
)
good_stat2.columns = ['median_time', 'stations']

# Left join: names with no qualifying ids get NaN and are dropped by the
# `stations > 10` filter below.
final_stat = stat.join(good_stat2)
big_nets_stat = final_stat.query('stations > 10')

# Keep the name when it is one of the big ones, otherwise use the catch-all
# label 'Другие' (Russian for "Others").
station_stat_full['group_name'] = (
    station_stat_full['name']
    .where(station_stat_full['name'].isin(big_nets_stat.index), 'Другие')
)

stat_grouped = (
    station_stat_full
    .query('count > 30')
    .pivot_table(index='group_name', values='time_spent',
                 aggfunc=['median', 'count'])
)
stat_grouped.columns = ['time_spent', 'count']

# Same grouping applied to the individual filtered visits.
good_data['group_name'] = (
    good_data['name']
    .where(good_data['name'].isin(big_nets_stat.index), 'Другие')
)

# One 50-bin histogram of visit durations per group.
for group_name, group_data in good_data.groupby('group_name'):
    group_data.plot(kind='hist', y='time_spent', bins=50, title=group_name)