Skip to content

Instantly share code, notes, and snippets.

@slb350
Created January 5, 2015 22:41
Show Gist options
  • Select an option

  • Save slb350/60d19ced713096b88f8a to your computer and use it in GitHub Desktop.

Select an option

Save slb350/60d19ced713096b88f8a to your computer and use it in GitHub Desktop.
excel diffs with pandas
import pandas as pd
import numpy as np
# Define the diff function to show the changes in each field
def report_diff(x):
return x[0] if x[0] == x[1] else '{} ---> {}'.format(*x)
# Read in the two files but call the data old and new and create columns to track
old = pd.read_excel('sample-address-old.xlsx', 'Sheet1', na_values=['NA'])
new = pd.read_excel('sample-address-new.xlsx', 'Sheet1', na_values=['NA'])
old['version'] = "old"
new['version'] = "new"
#Join all the data together and ignore indexes so it all gets added
full_set = pd.concat([old,new],ignore_index=True)
# Let's see what changes in the main columns we care about
changes = full_set.drop_duplicates(subset=["account number","name","street","city","state","postal code"],take_last=True)
#We want to know where the duplicate account numbers are, that means there have been changes
dupe_accts = changes.set_index('account number').index.get_duplicates()
#Get all the duplicate rows
dupes = changes[changes["account number"].isin(dupe_accts)]
#Pull out the old and new data into separate dataframes
change_new = dupes[(dupes["version"] == "new")]
change_old = dupes[(dupes["version"] == "old")]
#Drop the temp columns - we don't need them now
change_new = change_new.drop(['version'], axis=1)
change_old = change_old.drop(['version'], axis=1)
#Index on the account numbers
change_new.set_index('account number',inplace=True)
change_old.set_index('account number',inplace=True)
#Now we can diff because we have two data sets of the same size with the same index
diff_panel = pd.Panel(dict(df1=change_old,df2=change_new))
diff_output = diff_panel.apply(report_diff, axis=0)
#Diff'ing is done, we need to get a list of removed items
#Flag all duplicated account numbers
changes['duplicate']=changes["account number"].isin(dupe_accts)
#Identify non-duplicated items that are in the old version and did not show in the new version
removed_accounts = changes[(changes["duplicate"] == False) & (changes["version"] == "old")]
# We have the old and diff, we need to figure out which ones are new
#Drop duplicates but keep the first item instead of the last
new_account_set = full_set.drop_duplicates(subset=["account number","name","street","city","state","postal code"],take_last=False)
#Identify dupes in this new dataframe
new_account_set['duplicate']=new_account_set["account number"].isin(dupe_accts)
#Identify added accounts
added_accounts = new_account_set[(new_account_set["duplicate"] == False) & (new_account_set["version"] == "new")]
#Save the changes to excel but only include the columns we care about
writer = pd.ExcelWriter("my-diff-2.xlsx")
diff_output.to_excel(writer,"changed")
removed_accounts.to_excel(writer,"removed",index=False,columns=["account number",
"name","street","city","state","postal code"])
added_accounts.to_excel(writer,"added",index=False,columns=["account number",
"name","street","city","state","postal code"])
writer.save()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment