Skip to content

Instantly share code, notes, and snippets.

@GitForMike
Last active September 29, 2019 10:58
Show Gist options
  • Select an option

  • Save GitForMike/70e631dc4538fbd80aaaee223a78694c to your computer and use it in GitHub Desktop.

Select an option

Save GitForMike/70e631dc4538fbd80aaaee223a78694c to your computer and use it in GitHub Desktop.
def equalsWhenOneCharRemoved(x, y):
    """Return True if removing exactly one character from one string yields the other.

    The arguments may be given in either order; the longer one is treated as
    the string a character is removed from.

    Args:
        x: First string.
        y: Second string.

    Returns:
        True when the lengths differ by exactly one and deleting a single
        character from the longer string produces the shorter one,
        otherwise False.
    """
    # Normalise so that ``x`` is always the longer candidate.
    if len(x) < len(y):
        x, y = y, x

    if len(x) - len(y) != 1:
        return False

    for i, (long_char, short_char) in enumerate(zip(x, y)):
        if long_char != short_char:
            # The first mismatch has to be the extra character in ``x``;
            # skipping it must make the two remainders line up exactly.
            return x[i + 1:] == y[i:]

    # No mismatch across the shared length: the extra character is the
    # last one of ``x``.
    return True
import pandas as pd
import csv
import json
def _join_top_labels(totals):
    """Return the index labels tied for the largest value in ``totals``, joined by ", "."""
    if totals.empty:
        return ""
    winners = totals[totals == totals.max()].index
    return ", ".join(str(label) for label in winners)


def FindPopularProduct(fileName):
    """Print the most popular product(s) found in a purchase log.

    The file is read with ``csv.reader``; the first field of every row is
    parsed as a JSON object that must provide the keys ``product_id``,
    ``user_id`` and ``quantity``. Blank rows are skipped.

    Two lines are printed:
      * the product(s) bought by the largest number of distinct ``user_id``s
      * the product(s) with the largest summed ``quantity``

    Tied products are all listed, separated by ", ", in the order in which
    they first appear in the file.

    Args:
        fileName: Path of the UTF-8 encoded input file.

    Returns:
        None. The results are written to standard output.
    """
    # ``newline=''`` is what the csv module expects so that quoted fields
    # containing line breaks are handled correctly.
    with open(fileName, 'r', encoding='utf-8', newline='') as f:
        records = [json.loads(row[0]) for row in csv.reader(f) if row]

    if records:
        # Build the frame once instead of appending row by row:
        # DataFrame.append was quadratic and no longer exists in pandas 2.x.
        # ``sort=False`` keeps products in first-appearance order for ties.
        by_product = pd.DataFrame(records).groupby('product_id', sort=False)
        result1 = _join_top_labels(by_product['user_id'].nunique())
        result2 = _join_top_labels(by_product['quantity'].sum())
    else:
        result1 = result2 = ""

    print("Most popular product(s) based on the number of purchasers: [ "+result1+" ]")
    print("Most popular product(s) based on the quantity of goods sold: [ "+result2+" ]")
if __name__ == "__main__":
    # Script entry point: analyse the Q2 sample purchase log.
    FindPopularProduct(
        "SWE sample data - Q2 data.csv"
    )
import pandas as pd
import datetime
def ReadDataFile(fileName):
    """Load a CSV file into a DataFrame with its ``ts`` column parsed as datetimes.

    Args:
        fileName: Path of the CSV file; it must contain a ``ts`` column.

    Returns:
        The loaded DataFrame, with ``ts`` converted by ``pd.to_datetime``.
    """
    raw = pd.read_csv(fileName)
    # ``assign`` replaces the existing ``ts`` column in place (same position).
    return raw.assign(ts=pd.to_datetime(raw['ts']))
if __name__ == "__main__":
    df = ReadDataFile("SWE sample data - Q3 data.csv")

    # Sub problem 1
    # Restrict to rows whose country_id is "BDV", count the distinct user_id
    # values per site_id and print the site(s) with the highest count,
    # followed by that count. Tied sites are listed in first-appearance order.
    bdv_rows = df.loc[df['country_id'] == "BDV"]
    users_per_site = bdv_rows.groupby('site_id', sort=False)['user_id'].nunique()
    if users_per_site.empty:
        top_sites, top_count = "", 0
    else:
        top_count = users_per_site.max()
        top_sites = ", ".join(
            str(site) for site in users_per_site[users_per_site == top_count].index
        )
    print(top_sites + ", " + str(top_count))

    # Sub problem 2
    # Keep visits made on 2019-02-03 or 2019-02-04, then print every
    # (user_id, site_id) pair that occurs more than 10 times in that window.
    # A half-open interval is used so that timestamps with a fractional
    # second inside 23:59:59 of the last day are not dropped.
    window_start = datetime.datetime(2019, 2, 3)
    window_end = datetime.datetime(2019, 2, 5)  # exclusive upper bound
    in_window = df[(df['ts'] >= window_start) & (df['ts'] < window_end)]
    visit_counts = in_window.groupby(['user_id', 'site_id']).size().reset_index(name='size')
    frequent = visit_counts[visit_counts['size'] > 10]
    for user_id, site_id, visits in frequent[['user_id', 'site_id', 'size']].itertuples(
            index=False, name=None):
        print("({}, {}, {})".format(user_id, site_id, visits))

    # Sub problem 3
    # Take each user's last visit (row with the largest ts), count those
    # last visits per site_id and print the three sites with the most.
    last_visits = df.loc[df.groupby('user_id')['ts'].idxmax()]
    last_visit_counts = last_visits.groupby('site_id').size()
    # mergesort is stable, so equally ranked sites keep a deterministic order.
    top_three = last_visit_counts.sort_values(ascending=False, kind='mergesort').head(3)
    for site_id, users in top_three.items():
        print("({}, {})".format(site_id, users))

    # Sub problem 4
    # Pair each user's first visited site (smallest ts) with their last
    # visited site (largest ts) and print how many users have the same site
    # for both. Comparing the two columns per user is required here:
    # grouping by the (first, last) pair and keeping groups larger than one
    # would instead count users who merely share a pair with someone else.
    first_visits = df.loc[df.groupby('user_id')['ts'].idxmin()]
    first_and_last = pd.merge(
        last_visits[['user_id', 'site_id']].rename(columns={'site_id': 'last_site_id'}),
        first_visits[['user_id', 'site_id']].rename(columns={'site_id': 'first_site_id'}),
        on='user_id',
    )
    same_site = first_and_last['first_site_id'] == first_and_last['last_site_id']
    print(int(same_site.sum()))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment