Last active
September 29, 2019 10:58
-
-
Save GitForMike/70e631dc4538fbd80aaaee223a78694c to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def equalsWhenOneCharRemoved(x, y):
    """Return True if removing exactly one character from the longer input yields the shorter.

    The arguments may be given in either order; the longer one is treated as
    the candidate to delete a character from. Inputs whose lengths do not
    differ by exactly one are never a match.

    Args:
        x: First sequence (string or any sliceable sequence with ``len``).
        y: Second sequence of the same kind.

    Returns:
        bool: True when the longer input equals the shorter one after a
        single character is dropped, otherwise False.
    """
    # Normalise so that x is always the longer of the two.
    if len(x) < len(y):
        x, y = y, x
    if len(x) - len(y) != 1:
        return False
    # zip stops at the end of y, the shorter input.
    for i, (long_char, short_char) in enumerate(zip(x, y)):
        if long_char != short_char:
            # The first mismatch must be the extra character in x; everything
            # after it has to line up with the remainder of y.
            return x[i + 1:] == y[i:]
    # y is a prefix of x, so the extra character is x's last one.
    return True
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import pandas as pd | |
| import csv | |
| import json | |
def _join_top_labels(scores):
    """Return the index labels tied for the maximum of *scores*, joined by ', '."""
    if scores.empty:
        return ""
    winners = scores[scores == scores.max()]
    # str() so that numeric product ids can be joined as well.
    return ", ".join(str(label) for label in winners.index)


def FindPopularProduct(fileName: str) -> None:
    """Print the most popular product(s) found in a purchase log.

    The file is read with ``csv.reader``; the first field of every row is
    decoded as a JSON object. The decoded objects are expected to carry the
    keys ``product_id``, ``user_id`` and ``quantity``.

    Two rankings are printed, each listing every product tied for first place
    in order of first appearance in the file:

    * by number of distinct ``user_id`` values per product, and
    * by summed ``quantity`` per product.

    Args:
        fileName: Path of the UTF-8 encoded input file.

    Raises:
        KeyError: If the decoded records lack one of the expected keys.
    """
    records = []
    # newline='' is what the csv module expects for files it parses itself.
    with open(fileName, 'r', encoding='utf-8', newline='') as f:
        for row in csv.reader(f):
            if not row:
                # csv.reader yields [] for blank lines; nothing to decode.
                continue
            records.append(json.loads(row[0]))

    # Build the frame once instead of appending row by row
    # (DataFrame.append no longer exists in pandas 2.x).
    df = pd.DataFrame(records)

    if df.empty:
        result1 = ""
        result2 = ""
    else:
        # sort=False keeps products in order of first appearance, which is
        # the order in which tied winners are reported.
        per_product = df.groupby('product_id', sort=False)
        result1 = _join_top_labels(per_product['user_id'].nunique())
        result2 = _join_top_labels(per_product['quantity'].sum())

    print("Most popular product(s) based on the number of purchasers: [ "+result1+" ]")
    print("Most popular product(s) based on the quantity of goods sold: [ "+result2+" ]")
if __name__ == "__main__":
    # Script entry point: analyse the sample purchase log shipped with the exercise.
    FindPopularProduct("SWE sample data - Q2 data.csv")
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import pandas as pd | |
| import datetime | |
def ReadDataFile(fileName: str) -> pd.DataFrame:
    """Load a CSV file and convert its ``ts`` column to datetimes.

    Args:
        fileName: Path of the CSV file to read. It must contain a ``ts``
            column that ``pandas.to_datetime`` can parse.

    Returns:
        The loaded DataFrame with ``ts`` converted by ``pandas.to_datetime``.

    Raises:
        KeyError: If the file has no ``ts`` column.
    """
    frame = pd.read_csv(fileName)
    frame['ts'] = pd.to_datetime(frame['ts'])
    return frame
def _busiest_sites_in_country(df, country_id):
    """Return (site_ids, count) for the sites in *country_id* tied for most unique users."""
    visits = df.loc[df['country_id'] == country_id]
    # sort=False keeps sites in order of first appearance for tie reporting.
    users_per_site = visits.groupby('site_id', sort=False)['user_id'].nunique()
    if users_per_site.empty:
        return [], 0
    best = users_per_site.max()
    return list(users_per_site[users_per_site == best].index), int(best)


def _frequent_visitors(df, start, end, min_visits):
    """Return (user_id, site_id, size) rows with more than *min_visits* visits between *start* and *end* inclusive."""
    window = df[df['ts'].between(start, end)]
    visit_counts = window.groupby(['user_id', 'site_id']).size().reset_index(name='size')
    return visit_counts[visit_counts['size'] > min_visits]


def _top_last_visit_sites(df, limit=3):
    """Return the *limit* sites that are most often a user's last visited site, as a site_id -> count Series."""
    last_visits = df.loc[df.groupby('user_id')['ts'].idxmax()]
    # nlargest keeps the first occurrence on ties, so the result is deterministic.
    return last_visits.groupby('site_id').size().nlargest(limit)


def _count_same_first_last_site(df):
    """Return the number of users whose first and last visits are to the same site."""
    ts_by_user = df.groupby('user_id')['ts']
    last_site = df.loc[ts_by_user.idxmax(), ['user_id', 'site_id']]
    last_site = last_site.rename(columns={'site_id': 'last_site_id'})
    first_site = df.loc[ts_by_user.idxmin(), ['user_id', 'site_id']]
    first_site = first_site.rename(columns={'site_id': 'first_site_id'})
    per_user = pd.merge(last_site, first_site, on='user_id')
    # Compare the two sites per user; grouping by the (first, last) pair would
    # instead count users who merely share a pair with somebody else.
    return int((per_user['first_site_id'] == per_user['last_site_id']).sum())


if __name__ == "__main__":
    df = ReadDataFile("SWE sample data - Q3 data.csv")

    # Sub problem 1: within country BDV, which site(s) have the most unique
    # users, and how many.
    top_sites, top_count = _busiest_sites_in_country(df, "BDV")
    print(", ".join(str(site) for site in top_sites) + ", " + str(top_count))

    # Sub problem 2: (user, site) pairs with more than 10 visits between
    # 2019-02-03 00:00:00 and 2019-02-04 23:59:59.
    frequent = _frequent_visitors(
        df,
        datetime.datetime(2019, 2, 3, 0, 0, 0),
        datetime.datetime(2019, 2, 4, 23, 59, 59),
        10,
    )
    for user_id, site_id, size in zip(frequent['user_id'], frequent['site_id'], frequent['size']):
        print(f"({user_id}, {site_id}, {size})")

    # Sub problem 3: top 3 sites by number of users whose last visit was there.
    for site_id, size in _top_last_visit_sites(df, 3).items():
        print(f"({site_id}, {size})")

    # Sub problem 4: number of users whose first and last visited site match.
    print(_count_same_first_last_site(df))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment