GitForMike · September 29, 2019 10:58
diff --git a/moloco_problem1 b/moloco_problem1
 def equalsWhenOneCharRemoved(x, y):
    """Return True when deleting exactly one element from the longer of
    x and y makes it equal to the other one.

    The arguments may be passed in either order. Inputs whose lengths do
    not differ by exactly one yield False.
    """
    # Work with the longer sequence first, regardless of argument order.
    longer, shorter = (y, x) if len(x) < len(y) else (x, y)
    if len(longer) - len(shorter) != 1:
        return False
    for pos, (a, b) in enumerate(zip(longer, shorter)):
        if a != b:
            # The first mismatch must be the extra element: skipping it in
            # the longer sequence has to line up the remainders.
            return longer[pos + 1:] == shorter[pos:]
    # No mismatch in the shared prefix: the extra element is the last one.
    return True
diff --git a/moloco_problem2 b/moloco_problem2
 import pandas as pd
 import csv
 import json

 def _joinTopProducts(scores):
    """Return the index labels of `scores` that hit its maximum, joined by ", "."""
    best = scores[scores == scores.max()]
    return ", ".join(str(label) for label in best.index)

 def FindPopularProduct(fileName):
    """Print the most popular product(s) found in a purchase log.

    The file is read with csv.reader; the first cell of every row is parsed
    as a JSON object that must provide 'product_id', 'user_id' and
    'quantity'. Two lines are printed:

    * the product(s) bought by the largest number of distinct users, and
    * the product(s) with the largest total quantity.

    Ties are listed in order of first appearance, separated by ", ".
    Blank rows are skipped, and an empty file prints empty result lists.

    Args:
        fileName: path of the UTF-8 encoded CSV file to analyse.
    """
    # newline='' is what the csv module expects so quoted cells may contain
    # line breaks; the context manager closes the file even if parsing fails.
    with open(fileName, 'r', encoding='utf-8', newline='') as f:
        records = [json.loads(row[0]) for row in csv.reader(f) if row]

    # Build the frame once instead of appending row by row
    # (DataFrame.append is gone in pandas 2.0 and was quadratic).
    df = pd.DataFrame(records)

    if df.empty:
        result1 = result2 = ""
    else:
        # sort=False keeps products in order of first appearance.
        byProduct = df.groupby('product_id', sort=False)
        result1 = _joinTopProducts(byProduct['user_id'].nunique())
        result2 = _joinTopProducts(byProduct['quantity'].sum())

    print("Most popular product(s) based on the number of purchasers: [ "+result1+" ]")
    print("Most popular product(s) based on the quantity of goods sold: [ "+result2+" ]")

 if __name__ == "__main__":
    # Script entry point: analyse the bundled Q2 sample data file.
    FindPopularProduct(fileName="SWE sample data - Q2 data.csv")
diff --git a/moloco_problem3 b/moloco_problem3
 import pandas as pd
 import datetime

 def ReadDataFile(fileName):
    """Load the CSV at fileName and return it as a DataFrame whose 'ts'
    column has been converted with pd.to_datetime."""
    raw = pd.read_csv(fileName)
    # assign() replaces the existing 'ts' column in place (same position).
    return raw.assign(ts=pd.to_datetime(raw['ts']))

 if __name__ == "__main__":
    df = ReadDataFile("SWE sample data - Q3 data.csv")

    # Sub problem 1
    # Among the rows whose country_id is "BDV", count the distinct user_id
    # values per site_id and print the site(s) with the highest count
    # (ties joined by ", ") followed by that count.
    bdvVisits = df.loc[df['country_id'] == "BDV"]
    usersPerSite = bdvVisits.groupby('site_id', sort=False)['user_id'].nunique()
    if usersPerSite.empty:
        topSites, topCount = '', 0
    else:
        topCount = usersPerSite.max()
        topSites = ", ".join(
            str(site) for site in usersPerSite[usersPerSite == topCount].index)
    print(topSites + ", " + str(topCount))

    # Sub problem 2
    # Keep the rows whose 'ts' falls on 2019-02-03 or 2019-02-04 and print
    # every (user_id, site_id) pair that occurs more than 10 times.
    # The window is half-open so timestamps with fractional seconds in the
    # last second of 2019-02-04 are not dropped.
    windowStart = datetime.datetime(2019, 2, 3)
    windowEnd = datetime.datetime(2019, 2, 5)  # exclusive upper bound
    inWindow = df[(df['ts'] >= windowStart) & (df['ts'] < windowEnd)]
    pairCounts = inWindow.groupby(['user_id', 'site_id']).size().reset_index(name='size')
    frequentPairs = pairCounts[pairCounts['size'] > 10]
    for user_id, site_id, size in frequentPairs.itertuples(index=False, name=None):
        print("({}, {}, {})".format(user_id, site_id, size))

    # Sub problem 3
    # Take each user's row with the largest 'ts' (their last visit), count
    # those rows per site_id and print the three sites with the most.
    lastVisits = df.loc[df.groupby('user_id')['ts'].idxmax()]
    lastVisitCounts = lastVisits.groupby('site_id').size().reset_index(name='size')
    lastVisitCounts = lastVisitCounts.sort_values(["size"], ascending=[False])
    for site_id, size in lastVisitCounts.head(3).itertuples(index=False, name=None):
        print("({}, {})".format(site_id, size))

    # Sub problem 4
    # Pair each user's first-visit site (smallest 'ts') with their last-visit
    # site (largest 'ts') and print how many users have the same site for
    # both. Comparing the two columns row by row counts exactly those users;
    # grouping by the (first, last) pair would instead count users who merely
    # share a pair with somebody else.
    firstVisits = df.loc[df.groupby('user_id')['ts'].idxmin()]
    lastSites = lastVisits[['user_id', 'site_id']].rename(columns={'site_id': 'last_site_id'})
    firstSites = firstVisits[['user_id', 'site_id']].rename(columns={'site_id': 'first_site_id'})
    firstAndLast = pd.merge(lastSites, firstSites, on='user_id')
    print((firstAndLast['first_site_id'] == firstAndLast['last_site_id']).sum())
	def equalsWhenOneCharRemoved(x, y):
	    """Return True when deleting exactly one element from the longer of
	    x and y makes it equal to the other one.

	    The arguments may be passed in either order. Inputs whose lengths do
	    not differ by exactly one yield False.
	    """
	    # Make x the longer sequence regardless of argument order.
	    if len(x) < len(y):
	        x, y = y, x
	    if len(x) - len(y) != 1:
	        return False
	    for i, (a, b) in enumerate(zip(x, y)):
	        if a != b:
	            # The first mismatch must be the extra element: skipping it
	            # in x has to line up the remainders.
	            return x[i + 1:] == y[i:]
	    # No mismatch in the shared prefix: the extra element is x's last one.
	    return True
	import pandas as pd
	import csv
	import json

	def _joinTopProducts(scores):
	    """Return the index labels of `scores` that hit its maximum, joined by ", "."""
	    best = scores[scores == scores.max()]
	    return ", ".join(str(label) for label in best.index)

	def FindPopularProduct(fileName):
	    """Print the most popular product(s) found in a purchase log.

	    The file is read with csv.reader; the first cell of every row is parsed
	    as a JSON object that must provide 'product_id', 'user_id' and
	    'quantity'. Two lines are printed:

	    * the product(s) bought by the largest number of distinct users, and
	    * the product(s) with the largest total quantity.

	    Ties are listed in order of first appearance, separated by ", ".
	    Blank rows are skipped, and an empty file prints empty result lists.

	    Args:
	        fileName: path of the UTF-8 encoded CSV file to analyse.
	    """
	    # newline='' is what the csv module expects so quoted cells may contain
	    # line breaks; the context manager closes the file even if parsing fails.
	    with open(fileName, 'r', encoding='utf-8', newline='') as f:
	        records = [json.loads(row[0]) for row in csv.reader(f) if row]

	    # Build the frame once instead of appending row by row
	    # (DataFrame.append is gone in pandas 2.0 and was quadratic).
	    df = pd.DataFrame(records)

	    if df.empty:
	        result1 = result2 = ""
	    else:
	        # sort=False keeps products in order of first appearance.
	        byProduct = df.groupby('product_id', sort=False)
	        result1 = _joinTopProducts(byProduct['user_id'].nunique())
	        result2 = _joinTopProducts(byProduct['quantity'].sum())

	    print("Most popular product(s) based on the number of purchasers: [ "+result1+" ]")
	    print("Most popular product(s) based on the quantity of goods sold: [ "+result2+" ]")

	if __name__ == "__main__":
	    # Script entry point: analyse the bundled Q2 sample data file.
	    FindPopularProduct("SWE sample data - Q2 data.csv")
	import pandas as pd
	import datetime

	def ReadDataFile(fileName):
	    """Load the CSV at fileName and return it as a DataFrame whose 'ts'
	    column has been converted with pd.to_datetime."""
	    df = pd.read_csv(fileName)
	    df['ts'] = pd.to_datetime(df['ts'])
	    return df

	if __name__ == "__main__":
	    df = ReadDataFile("SWE sample data - Q3 data.csv")

	    # Sub problem 1
	    # Among the rows whose country_id is "BDV", count the distinct user_id
	    # values per site_id and print the site(s) with the highest count
	    # (ties joined by ", ") followed by that count.
	    bdvVisits = df.loc[df['country_id'] == "BDV"]
	    usersPerSite = bdvVisits.groupby('site_id', sort=False)['user_id'].nunique()
	    if usersPerSite.empty:
	        topSites, topCount = '', 0
	    else:
	        topCount = usersPerSite.max()
	        topSites = ", ".join(
	            str(site) for site in usersPerSite[usersPerSite == topCount].index)
	    print(topSites + ", " + str(topCount))

	    # Sub problem 2
	    # Keep the rows whose 'ts' falls on 2019-02-03 or 2019-02-04 and print
	    # every (user_id, site_id) pair that occurs more than 10 times.
	    # The window is half-open so timestamps with fractional seconds in the
	    # last second of 2019-02-04 are not dropped.
	    windowStart = datetime.datetime(2019, 2, 3)
	    windowEnd = datetime.datetime(2019, 2, 5)  # exclusive upper bound
	    inWindow = df[(df['ts'] >= windowStart) & (df['ts'] < windowEnd)]
	    pairCounts = inWindow.groupby(['user_id', 'site_id']).size().reset_index(name='size')
	    frequentPairs = pairCounts[pairCounts['size'] > 10]
	    for user_id, site_id, size in frequentPairs.itertuples(index=False, name=None):
	        print("({}, {}, {})".format(user_id, site_id, size))

	    # Sub problem 3
	    # Take each user's row with the largest 'ts' (their last visit), count
	    # those rows per site_id and print the three sites with the most.
	    lastVisits = df.loc[df.groupby('user_id')['ts'].idxmax()]
	    lastVisitCounts = lastVisits.groupby('site_id').size().reset_index(name='size')
	    lastVisitCounts = lastVisitCounts.sort_values(["size"], ascending=[False])
	    for site_id, size in lastVisitCounts.head(3).itertuples(index=False, name=None):
	        print("({}, {})".format(site_id, size))

	    # Sub problem 4
	    # Pair each user's first-visit site (smallest 'ts') with their last-visit
	    # site (largest 'ts') and print how many users have the same site for
	    # both. Comparing the two columns row by row counts exactly those users;
	    # grouping by the (first, last) pair would instead count users who merely
	    # share a pair with somebody else.
	    firstVisits = df.loc[df.groupby('user_id')['ts'].idxmin()]
	    lastSites = lastVisits[['user_id', 'site_id']].rename(columns={'site_id': 'last_site_id'})
	    firstSites = firstVisits[['user_id', 'site_id']].rename(columns={'site_id': 'first_site_id'})
	    firstAndLast = pd.merge(lastSites, firstSites, on='user_id')
	    print((firstAndLast['first_site_id'] == firstAndLast['last_site_id']).sum())