Created
January 5, 2015 22:41
-
-
Save slb350/60d19ced713096b88f8a to your computer and use it in GitHub Desktop.
excel diffs with pandas
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import pandas as pd | |
| import numpy as np | |
| # Define the diff function to show the changes in each field | |
| def report_diff(x): | |
| return x[0] if x[0] == x[1] else '{} ---> {}'.format(*x) | |
| # Read in the two files but call the data old and new and create columns to track | |
| old = pd.read_excel('sample-address-old.xlsx', 'Sheet1', na_values=['NA']) | |
| new = pd.read_excel('sample-address-new.xlsx', 'Sheet1', na_values=['NA']) | |
| old['version'] = "old" | |
| new['version'] = "new" | |
| #Join all the data together and ignore indexes so it all gets added | |
| full_set = pd.concat([old,new],ignore_index=True) | |
| # Let's see what changes in the main columns we care about | |
| changes = full_set.drop_duplicates(subset=["account number","name","street","city","state","postal code"],take_last=True) | |
| #We want to know where the duplicate account numbers are, that means there have been changes | |
| dupe_accts = changes.set_index('account number').index.get_duplicates() | |
| #Get all the duplicate rows | |
| dupes = changes[changes["account number"].isin(dupe_accts)] | |
| #Pull out the old and new data into separate dataframes | |
| change_new = dupes[(dupes["version"] == "new")] | |
| change_old = dupes[(dupes["version"] == "old")] | |
| #Drop the temp columns - we don't need them now | |
| change_new = change_new.drop(['version'], axis=1) | |
| change_old = change_old.drop(['version'], axis=1) | |
| #Index on the account numbers | |
| change_new.set_index('account number',inplace=True) | |
| change_old.set_index('account number',inplace=True) | |
| #Now we can diff because we have two data sets of the same size with the same index | |
| diff_panel = pd.Panel(dict(df1=change_old,df2=change_new)) | |
| diff_output = diff_panel.apply(report_diff, axis=0) | |
| #Diff'ing is done, we need to get a list of removed items | |
| #Flag all duplicated account numbers | |
| changes['duplicate']=changes["account number"].isin(dupe_accts) | |
| #Identify non-duplicated items that are in the old version and did not show in the new version | |
| removed_accounts = changes[(changes["duplicate"] == False) & (changes["version"] == "old")] | |
| # We have the old and diff, we need to figure out which ones are new | |
| #Drop duplicates but keep the first item instead of the last | |
| new_account_set = full_set.drop_duplicates(subset=["account number","name","street","city","state","postal code"],take_last=False) | |
| #Identify dupes in this new dataframe | |
| new_account_set['duplicate']=new_account_set["account number"].isin(dupe_accts) | |
| #Identify added accounts | |
| added_accounts = new_account_set[(new_account_set["duplicate"] == False) & (new_account_set["version"] == "new")] | |
| #Save the changes to excel but only include the columns we care about | |
| writer = pd.ExcelWriter("my-diff-2.xlsx") | |
| diff_output.to_excel(writer,"changed") | |
| removed_accounts.to_excel(writer,"removed",index=False,columns=["account number", | |
| "name","street","city","state","postal code"]) | |
| added_accounts.to_excel(writer,"added",index=False,columns=["account number", | |
| "name","street","city","state","postal code"]) | |
| writer.save() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment