Last active
December 11, 2025 15:38
-
-
Save sanzoghenzo/73275613c592331180a24cb2ddfd5bcb to your computer and use it in GitHub Desktop.
Revisions
-
sanzoghenzo revised this gist
Feb 16, 2021 . 1 changed file with 15 additions and 12 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -12,19 +12,21 @@ def report_diff(x): """Function to use with groupby.apply to highlight value changes.""" return x[0] if x[0] == x[1] or pd.isna(x).all() else f'{x[0]} ---> {x[1]}' def strip(x): """Function to use with applymap to strip whitespaces from a dataframe.""" return x.strip() if isinstance(x, str) else x def diff_pd(old_df, new_df, idx_col): """ Identify differences between two pandas DataFrames using a key column. Key column is assumed to have a unique row identifier, i.e. no duplicates. Args: old_df (pd.DataFrame): first dataframe new_df (pd.DataFrame): second dataframe @@ -43,7 +45,7 @@ def diff_pd(old_df, new_df, idx_col): else: removed_keys = np.setdiff1d(old_keys, new_keys) added_keys = np.setdiff1d(new_keys, old_keys) # populate the output data with non empty dataframes out_data = {} removed = old_df.loc[removed_keys] if not removed.empty: @@ -98,13 +100,14 @@ def compare_excel( else: print("No differences spotted") def build_parser(): cfg = argparse.ArgumentParser( description="Compares two Excel sheets and outputs the differences " "to a separate Excel file." ) cfg.add_argument("path1", help="Fist Excel file") cfg.add_argument("path2", help="Second Excel file") cfg.add_argument("sheetname", help="Name of the sheet to compare.") cfg.add_argument( "key_column", @@ -115,8 +118,8 @@ def build_parser(): ) cfg.add_argument("-o", "--output-path", default="compared.xlsx", help="Path of the comparison results") cfg.add_argument("--skiprows", help='Excel row containing the table headers', type=int, action='append', default=None) return cfg -
sanzoghenzo revised this gist
Feb 16, 2021 . 2 changed files with 25 additions and 9 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -43,10 +43,14 @@ def diff_pd(old_df, new_df, idx_col): else: removed_keys = np.setdiff1d(old_keys, new_keys) added_keys = np.setdiff1d(new_keys, old_keys) # populate the output data with non empty ddataframes out_data = {} removed = old_df.loc[removed_keys] if not removed.empty: out_data["removed"] = removed added = new_df.loc[added_keys] if not added.empty: out_data["added"] = added # focusing on common data of both dataframes common_keys = np.intersect1d(old_keys, new_keys, assume_unique=True) common_columns = np.intersect1d( @@ -73,7 +77,9 @@ def diff_pd(old_df, new_df, idx_col): # using report_diff to merge the changes in a single cell with "-->" df_changed = df_all_changes.groupby(level=0, axis=1).apply( lambda frame: frame.apply(report_diff, axis=1)) # add changed dataframe to output data only if non empty if not df_changed.empty: out_data['changed'] = df_changed return out_data @@ -84,11 +90,13 @@ def compare_excel( old_df = pd.read_excel(path1, sheet_name=sheet_name, **kwargs) new_df = pd.read_excel(path2, sheet_name=sheet_name, **kwargs) diff = diff_pd(old_df, new_df, index_col_name) if diff: with pd.ExcelWriter(out_path) as writer: for sname, data in diff.items(): data.to_excel(writer, sheet_name=sname) print(f"Differences saved in {out_path}") else: print("No differences spotted") def build_parser(): cfg = argparse.ArgumentParser( This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -99,6 +99,13 @@ def test_multiindex_excel(): compare.compare_excel(xlsx_1, xlsx_2, diff_io, "Sheet1", ["ID", "Flavour ID"]) diff = pd.read_excel(diff_io, sheet_name=None) run_assertion(diff) def test_no_diffs(): df_1, _ = sample_multiindex_dfs() diff = compare.diff_pd(df_1, df_1, ["ID", "Flavour ID"]) assert not diff print("OK.") if __name__ == '__main__': @@ -107,3 +114,4 @@ def test_multiindex_excel(): test_single_index() test_single_index_excel() test_parser() test_no_diff() -
sanzoghenzo revised this gist
Jan 9, 2021 . 1 changed file with 2 additions and 1 deletion.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -86,7 +86,8 @@ def compare_excel( diff = diff_pd(old_df, new_df, index_col_name) with pd.ExcelWriter(out_path) as writer: for sname, data in diff.items(): if not data.empty: data.to_excel(writer, sheet_name=sname) print(f"Differences saved in {out_path}") def build_parser(): -
sanzoghenzo revised this gist
Feb 23, 2020 . 2 changed files with 49 additions and 19 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -35,6 +35,8 @@ def diff_pd(old_df, new_df, idx_col): old_df = old_df.set_index(idx_col) new_df = new_df.set_index(idx_col) # get the added and removed rows old_keys = old_df.index new_keys = new_df.index if isinstance(old_keys, pd.MultiIndex): removed_keys = old_keys.difference(new_keys) added_keys = new_keys.difference(old_keys) @@ -87,7 +89,6 @@ def compare_excel( data.to_excel(writer, sheet_name=sname) print(f"Differences saved in {out_path}") def build_parser(): cfg = argparse.ArgumentParser( description="Compares two excel files and outputs the differences " @@ -118,4 +119,4 @@ def main(): if __name__ == '__main__': main() This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -24,29 +24,26 @@ def build_excel_stream(df, sheetname): return output def sample_xlsx(df_1, df_2): xlsx_1 = build_excel_stream(df_1, "Sheet1") xlsx_2 = build_excel_stream(df_2, "Sheet1") return xlsx_1, xlsx_2 def sample_dfs(): df_1 = pd.DataFrame({ "ID": [123456, 654321, 543219, 432198, 765432], "Name": ["Lemonade", "Cola", "Orange", "Fruit Punch", "Tobacco"], "Flavour Description": ["Fuzzy", "Fuzzy", "Fuzzy", "Fuzzy", "Smoky"], }) df_2 = pd.DataFrame({ "ID": [123456, 654321, 543219, 432198, 876543], "Name": ["Lemonade", "Cola", "Orange", "Fruit Punch", "Soda"], "Flavour Description": ["Fuzzy", "Bubbly", "Fuzzy", "Fuzzy", "Sugary"], }) return df_1, df_2 def run_assertion(diff): changed = diff["changed"] assert len(changed) == 1 @@ -60,14 +57,44 @@ def run_assertion(diff): print("OK.") def test_single_index(): df_1, df_2 = sample_dfs() diff = compare.diff_pd(df_1, df_2, ["ID"]) run_assertion(diff) def test_single_index_excel(): xlsx_1, xlsx_2 = sample_xlsx(*sample_dfs()) diff_io = io.BytesIO() compare.compare_excel(xlsx_1, xlsx_2, diff_io, "Sheet1", "ID") diff = pd.read_excel(diff_io, sheet_name=None) run_assertion(diff) def sample_multiindex_dfs(): df_1 = pd.DataFrame({ "ID": [123456, 123456, 654321, 543219, 432198, 765432], "Name": ["Lemonade", "Lemonade", "Cola", "Orange", "Fruit Punch", "Tobacco"], "Flavour ID": [1, 2, None, None, None, None], "Flavour Description": ["Fuzzy", "Fuzzy", "Fuzzy", "Fuzzy", "Fuzzy", "Smoky"], }) df_2 = pd.DataFrame({ "ID": [123456, 123456, 654321, 543219, 432198, 876543], "Name": ["Lemonade", "Lemonade", "Cola", "Orange", "Fruit Punch", "Soda"], "Flavour ID": [1, 2, None, None, None, None], "Flavour Description": ["Fuzzy", "Bubbly", "Fuzzy", "Fuzzy", "Fuzzy", "Sugary"], }) return df_1, df_2 def test_multiindex(): df_1, df_2 = sample_multiindex_dfs() diff = compare.diff_pd(df_1, df_2, ["ID", "Flavour ID"]) run_assertion(diff) def test_multiindex_excel(): xlsx_1, xlsx_2 = sample_xlsx(*sample_multiindex_dfs()) diff_io = io.BytesIO() compare.compare_excel(xlsx_1, xlsx_2, diff_io, "Sheet1", ["ID", "Flavour ID"]) diff = pd.read_excel(diff_io, sheet_name=None) @@ -77,4 +104,6 @@ def test_multiindex_excel(): if __name__ == '__main__': test_multiindex() test_multiindex_excel() test_single_index() test_single_index_excel() test_parser() -
sanzoghenzo revised this gist
Feb 6, 2020 . 2 changed files with 33 additions and 23 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -35,10 +35,12 @@ def diff_pd(old_df, new_df, idx_col): old_df = old_df.set_index(idx_col) new_df = new_df.set_index(idx_col) # get the added and removed rows if isinstance(old_keys, pd.MultiIndex): removed_keys = old_keys.difference(new_keys) added_keys = new_keys.difference(old_keys) else: removed_keys = np.setdiff1d(old_keys, new_keys) added_keys = np.setdiff1d(new_keys, old_keys) out_data = { 'removed': old_df.loc[removed_keys], 'added': new_df.loc[added_keys] This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -1,11 +1,12 @@ """Excel compare test suite.""" import io import pandas as pd import compare def test_parser(): cfg = compare.build_parser() opt = cfg.parse_args(["test1.xlsx", "test2.xlsx", "Sheet 1", "Col1", "Col2", "-o", "output.xlsx"]) assert opt.path1 == "test1.xlsx" assert opt.path2 == "test2.xlsx" @@ -25,16 +26,16 @@ def build_excel_stream(df, sheetname): def sample_dfs(): df_1 = pd.DataFrame({ "ID": [123456, 123456, 654321, 543219, 432198, 765432], "Name": ["Lemonade", "Lemonade", "Cola", "Orange", "Fruit Punch", "Tobacco"], "Flavour ID": [1, 2, None, None, None, None], "Flavour Description": ["Fuzzy", "Fuzzy", "Fuzzy", "Fuzzy", "Fuzzy", "Smoky"], }) df_2 = pd.DataFrame({ "ID": [123456, 123456, 654321, 543219, 432198, 876543], "Name": ["Lemonade", "Lemonade", "Cola", "Orange", "Fruit Punch", "Soda"], "Flavour ID": [1, 2, None, None, None, None], "Flavour Description": ["Fuzzy", "Bubbly", "Fuzzy", "Fuzzy", "Fuzzy", "Sugary"], }) return df_1, df_2 @@ -46,27 +47,34 @@ def sample_xlsx(): return xlsx_1, xlsx_2 def run_assertion(diff): changed = diff["changed"] assert len(changed) == 1 assert changed.iloc[0]["Flavour Description"] == "Fuzzy ---> Bubbly" added = diff["added"] assert len(added) == 1 assert added.iloc[0]["Flavour Description"] == "Sugary" removed = diff["removed"] assert len(removed) == 1 assert removed.iloc[0]["Flavour Description"] == "Smoky" print("OK.") def test_multiindex(): df_1, df_2 = sample_dfs() diff = compare.diff_pd(df_1, df_2, ["ID", "Flavour ID"]) run_assertion(diff) def test_multiindex_excel(): xlsx_1, xlsx_2 = sample_xlsx() diff_io = io.BytesIO() compare.compare_excel(xlsx_1, xlsx_2, diff_io, "Sheet1", ["ID", "Flavour ID"]) diff = pd.read_excel(diff_io, sheet_name=None) run_assertion(diff) if __name__ == '__main__': test_multiindex() test_multiindex_excel() test_parser() -
sanzoghenzo revised this gist
Feb 5, 2020 . 2 changed files with 21 additions and 7 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -11,9 +11,6 @@ import numpy as np def report_diff(x): """Function to use with groupby.apply to highlihgt value changes.""" return x[0] if x[0] == x[1] or pd.isna(x).all() else f'{x[0]} ---> {x[1]}' @@ -89,7 +86,7 @@ def compare_excel( print(f"Differences saved in {out_path}") def build_parser(): cfg = argparse.ArgumentParser( description="Compares two excel files and outputs the differences " "in another excel file." @@ -98,16 +95,21 @@ def main(): cfg.add_argument("path2", help="Second excel file") cfg.add_argument("sheetname", help="Name of the sheet to compare.") cfg.add_argument( "key_column", help="Name of the column(s) with unique row identifier. It has to be " "the actual text of the first row, not the excel notation." "Use multiple times to create a composite index.", nargs="+", ) cfg.add_argument("-o", "--output-path", default="compared.xlsx", help="Path of the comparison results") cfg.add_argument("--skiprows", help='number of rows to skip', type=int, action='append', default=None) return cfg def main(): cfg = build_parser() opt = cfg.parse_args() compare_excel(opt.path1, opt.path2, opt.output_path, opt.sheetname, opt.key_column, skiprows=opt.skiprows) This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -4,6 +4,17 @@ import compare def test_parser(): cfg = build_parser() opt = cfg.parse_args(["test1.xlsx", "test2.xlsx", "Sheet 1", "Col1", "Col2", "-o", "output.xlsx"]) assert opt.path1 == "test1.xlsx" assert opt.path2 == "test2.xlsx" assert opt.output_path == "output.xlsx" assert opt.sheetname == "Sheet 1" assert opt.key_column == ["Col1", "Col2"] assert opt.skiprows is None def build_excel_stream(df, sheetname): """Create an excel workbook as a file-like object.""" output = io.BytesIO() @@ -56,5 +67,6 @@ def test_multiindex_excel(): if __name__ == '__main__': test_parser() test_multiindex() test_multiindex_excel() -
sanzoghenzo revised this gist
Feb 5, 2020 . 2 changed files with 72 additions and 9 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -1,10 +1,7 @@ """ Compare two Excel sheets Inspired by https://pbpython.com/excel-diff-pandas-update.html For the documentation, download this file and type: python compare.py --help """ @@ -29,14 +26,12 @@ def strip(x): def diff_pd(old_df, new_df, idx_col): """Identify differences between two pandas DataFrames using a key column. Key column is assumed to have only unique data (like a database unique id index) Args: old_df (pd.DataFrame): first dataframe new_df (pd.DataFrame): second dataframe idx_col (str|list(str)): column name(s) of the index, needs to be present in both DataFrames """ # setting the column name as index for fast operations @@ -63,11 +58,17 @@ def diff_pd(old_df, new_df, idx_col): common_data = pd.concat( [old_common.reset_index(), new_common.reset_index()], sort=True ) changed_keys = common_data.drop_duplicates(keep=False)[idx_col] if isinstance(changed_keys, pd.Series): changed_keys = changed_keys.unique() else: changed_keys = changed_keys.drop_duplicates().set_index(idx_col).index # combining the changed rows via multi level columns df_all_changes = pd.concat( [old_common.loc[changed_keys], new_common.loc[changed_keys]], axis='columns', keys=['old', 'new'] ).swaplevel(axis='columns') # using report_diff to merge the changes in a single cell with "-->" df_changed = df_all_changes.groupby(level=0, axis=1).apply( lambda frame: frame.apply(report_diff, axis=1)) @@ -100,6 +101,8 @@ def main(): "key-column", help="Name of the column with unique row identifier. It has to be " "the actual text of the first row, not the excel notation." "Use multiple times to create a composite index.", action="append", ) cfg.add_argument("-o", "--output-path", default="compared.xlsx", help="Path of the comparison results") @@ -111,4 +114,4 @@ def main(): if __name__ == '__main__': main() This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,60 @@ """Excel compare test suite.""" import io import compare def build_excel_stream(df, sheetname): """Create an excel workbook as a file-like object.""" output = io.BytesIO() with pd.ExcelWriter(output, engine="xlsxwriter") as writer: df.to_excel(writer, sheet_name=sheetname, index=False) return output def sample_dfs(): df_1 = pd.DataFrame({ "ID": [123456, 123456, 654321, 543219, 432198], "Name": ["Lemonade", "Lemonade", "Cola", "Orange", "Fruit Punch"], "Flavour ID": [1, 2, None, None, None], "Flavour Description": ["Fuzzy", "Fuzzy", "Fuzzy", "Fuzzy", "Fuzzy"], }) df_2 = pd.DataFrame({ "ID": [123456, 123456, 654321, 543219, 432198], "Name": ["Lemonade", "Lemonade", "Cola", "Orange", "Fruit Punch"], "Flavour ID": [1, 2, None, None, None], "Flavour Description": ["Fuzzy", "Bubbly", "Fuzzy", "Fuzzy", "Fuzzy"], }) return df_1, df_2 def sample_xlsx(): df_1, df_2 = sample_dfs() xlsx_1 = build_excel_stream(df_1, "Sheet1") xlsx_2 = build_excel_stream(df_2, "Sheet1") return xlsx_1, xlsx_2 def test_multiindex(): df_1, df_2 = sample_dfs() diff = diff_pd(df_1, df_2, ["ID", "Flavour ID"]) changed = diff["changed"] assert len(changed) == 1 assert changed.iloc[0]["Flavour Description"] == "Fuzzy ---> Bubbly" print("OK.") def test_multiindex_excel(): xlsx_1, xlsx_2 = sample_xlsx() diff_io = io.BytesIO() compare_excel(xlsx_1, xlsx_2, diff_io, "Sheet1", ["ID", "Flavour ID"]) diff = pd.read_excel(diff_io, sheet_name=None) changed = diff["changed"] assert len(changed) == 1 assert changed.iloc[0]["Flavour Description"] == "Fuzzy ---> Bubbly" print("OK.") if __name__ == '__main__': test_multiindex() test_multiindex_excel() -
sanzoghenzo revised this gist
Jan 3, 2020 . 1 changed file with 39 additions and 21 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -1,7 +1,11 @@ """ Compare two Excel sheets Inspired by https://pbpython.com/excel-diff-pandas-update.html For the documentation, download this file and type: python compare.py --help """ import argparse @@ -10,25 +14,30 @@ import numpy as np # TODO: GUI to select which data to keep for merging def report_diff(x): """Function to use with groupby.apply to highlihgt value changes.""" return x[0] if x[0] == x[1] or pd.isna(x).all() else f'{x[0]} ---> {x[1]}' def strip(x): """Function to use with applymap to strip whitespaces in a dataframe.""" return x.strip() if isinstance(x, str) else x def diff_pd(old_df, new_df, idx_col): """Identify differences between two pandas DataFrames using a key column. Key column is assumed to have only unique data (like a database unique id index) Args: old_df (pd.DataFrame): first dataframe new_df (pd.DataFrame): second dataframe idx_col (str): column name of the index, needs to be present in both DataFrames """ # setting the column name as index for fast operations old_df = old_df.set_index(idx_col) @@ -44,17 +53,21 @@ def diff_pd(old_df, new_df, idx_col): } # focusing on common data of both dataframes common_keys = np.intersect1d(old_keys, new_keys, assume_unique=True) common_columns = np.intersect1d( old_df.columns, new_df.columns, assume_unique=True ) new_common = new_df.loc[common_keys, common_columns].applymap(strip) old_common = old_df.loc[common_keys, common_columns].applymap(strip) # get the changed rows keys by dropping identical rows # (indexes are ignored, so we'll reset them) common_data = pd.concat( [old_common.reset_index(), new_common.reset_index()], sort=True ) changed_keys = common_data.drop_duplicates(keep=False)[idx_col].unique() # combining the changed rows via multi level columns df_all_changes = pd.concat( [old_common.loc[changed_keys], new_common.loc[changed_keys]], axis='columns', keys=['old', 'new']).swaplevel(axis='columns') # using report_diff to merge the changes in a single cell with "-->" df_changed = df_all_changes.groupby(level=0, axis=1).apply( lambda frame: frame.apply(report_diff, axis=1)) @@ -63,7 +76,9 @@ def diff_pd(old_df, new_df, idx_col): return out_data def compare_excel( path1, path2, out_path, sheet_name, index_col_name, **kwargs ): old_df = pd.read_excel(path1, sheet_name=sheet_name, **kwargs) new_df = pd.read_excel(path2, sheet_name=sheet_name, **kwargs) diff = diff_pd(old_df, new_df, index_col_name) @@ -75,21 +90,24 @@ def compare_excel(path1, path2, out_path, sheet_name, index_col_name=None, **kwa def main(): cfg = argparse.ArgumentParser( description="Compares two excel files and outputs the differences " "in another excel file." ) cfg.add_argument("path1", help="Fist excel file") cfg.add_argument("path2", help="Second excel file") cfg.add_argument("sheetname", help="Name of the sheet to compare.") cfg.add_argument( "key-column", help="Name of the column with unique row identifier. It has to be " "the actual text of the first row, not the excel notation." ) cfg.add_argument("-o", "--output-path", default="compared.xlsx", help="Path of the comparison results") cfg.add_argument("--skiprows", help='number of rows to skip', type=int, action='append', default=None) opt = cfg.parse_args() compare_excel(opt.path1, opt.path2, opt.output_path, opt.sheetname, opt.key_column, skiprows=opt.skiprows) if __name__ == '__main__': -
sanzoghenzo revised this gist
Oct 11, 2019 . 1 changed file with 9 additions and 3 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -10,11 +10,16 @@ import numpy as np def report_diff(x): """Helper function to use with groupby.apply to highlihgt the velue changes.""" return x[0] if x[0] == x[1] or pd.isna(x).all() else f'{x[0]} ---> {x[1]}' def strip(x): """Helper function to use with applymap to strip all the whitespaces in the dataframe.""" return x.strip() if isinstance(x, str) else x def diff_pd(old_df, new_df, idx_col): """Identify differences between two pandas DataFrames using a key column. @@ -39,8 +44,9 @@ def diff_pd(old_df, new_df, idx_col): } # focusing on common data of both dataframes common_keys = np.intersect1d(old_keys, new_keys, assume_unique=True) common_columns = np.intersect1d(old_df.columns, new_df.columns, assume_unique=True) new_common = new_df.loc[common_keys, common_columns].applymap(strip) old_common = old_df.loc[common_keys, common_columns].applymap(strip) # get the changed rows keys by dropping identical rows # (indexes are ignored, so we'll reset them) common_data = pd.concat([old_common.reset_index(), new_common.reset_index()]) -
sanzoghenzo revised this gist
Oct 11, 2019 . 1 changed file with 2 additions and 1 deletion.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -47,7 +47,8 @@ def diff_pd(old_df, new_df, idx_col): changed_keys = common_data.drop_duplicates(keep=False)[idx_col].unique() # combining the changed rows via multi level columns df_all_changes = pd.concat([old_common.loc[changed_keys], new_common.loc[changed_keys]], axis='columns', keys=['old', 'new']) df_all_changes = df_all_changes.swaplevel(axis='columns')[new_common.columns] # using report_diff to merge the changes in a single cell with "-->" df_changed = df_all_changes.groupby(level=0, axis=1).apply( lambda frame: frame.apply(report_diff, axis=1)) -
sanzoghenzo created this gist
Oct 10, 2019 .There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,89 @@ """ Compare Excel Sheeets Inspired by https://pbpython.com/excel-diff-pandas-update.html """ import argparse import pandas as pd import numpy as np # Define the diff function to show the changes in each field def report_diff(x): return x[0] if x[0] == x[1] or pd.isna(x).all() else f'{x[0]} ---> {x[1]}' def diff_pd(old_df, new_df, idx_col): """Identify differences between two pandas DataFrames using a key column. Key column is assumed to have only unique data (like a database unique id index) Args: old_df (pd.DataFrame): first dataframe new_df (pd.DataFrame): second dataframe idx_col (str): column name of the index, needs to be present in both DataFrames """ # setting the column name as index for fast operations old_df = old_df.set_index(idx_col) new_df = new_df.set_index(idx_col) # get the added and removed rows old_keys = old_df.index new_keys = new_df.index removed_keys = np.setdiff1d(old_keys, new_keys) added_keys = np.setdiff1d(new_keys, old_keys) out_data = { 'removed': old_df.loc[removed_keys], 'added': new_df.loc[added_keys] } # focusing on common data of both dataframes common_keys = np.intersect1d(old_keys, new_keys, assume_unique=True) old_common = old_df.loc[common_keys] new_common = new_df.loc[common_keys] # get the changed rows keys by dropping identical rows # (indexes are ignored, so we'll reset them) common_data = pd.concat([old_common.reset_index(), new_common.reset_index()]) changed_keys = common_data.drop_duplicates(keep=False)[idx_col].unique() # combining the changed rows via multi level columns df_all_changes = pd.concat([old_common.loc[changed_keys], new_common.loc[changed_keys]], axis='columns', keys=['old', 'new']).swaplevel(axis='columns') # using report_diff to merge the changes in a single cell with "-->" df_changed = df_all_changes.groupby(level=0, axis=1).apply( lambda frame: frame.apply(report_diff, axis=1)) out_data['changed'] = df_changed return out_data def compare_excel(path1, path2, out_path, sheet_name, index_col_name=None, **kwargs): old_df = pd.read_excel(path1, sheet_name=sheet_name, **kwargs) new_df = pd.read_excel(path2, sheet_name=sheet_name, **kwargs) diff = diff_pd(old_df, new_df, index_col_name) with pd.ExcelWriter(out_path) as writer: for sname, data in diff.items(): data.to_excel(writer, sheet_name=sname) print(f"Differences saved in {out_path}") def main(): cfg = argparse.ArgumentParser( description="Compares two excel files and outputs the differences in another excel file. " "A column name can be specified as the unique row identifier." ) cfg.add_argument("path1", help="Fist excel file") cfg.add_argument("path2", help="Second excel file") cfg.add_argument("sheetname", help="Name of the sheet to compare.") cfg.add_argument("-c", "--index-column", help="Name of the column with unique row identifier", required=True) cfg.add_argument("-o", "--output-path", help="Path of the comparison results", default="compared.xlsx") cfg.add_argument("--skiprows", help='number of rows to skip', type=int, action='append', default=None) opt = cfg.parse_args() compare_excel(opt.path1, opt.path2, opt.output_path, opt.sheetname, opt.index_column, skiprows=opt.skiprows) if __name__ == '__main__': main()