sanzoghenzo · December 11, 2025 15:38 · Feb 16, 2021 · Feb 16, 2021 · Jan 9, 2021 · Feb 23, 2020
diff --git a/compare.py b/compare.py
@@ -12,19 +12,21 @@
 
 
 def report_diff(x):
-    """Function to use with groupby.apply to highlihgt value changes."""
+    """Function to use with groupby.apply to highlight value changes."""
     return x[0] if x[0] == x[1] or pd.isna(x).all() else f'{x[0]} ---> {x[1]}'
 
 
 def strip(x):
-    """Function to use with applymap to strip whitespaces in a dataframe."""
+    """Function to use with applymap to strip whitespaces from a dataframe."""
     return x.strip() if isinstance(x, str) else x
 
 
 def diff_pd(old_df, new_df, idx_col):
-    """Identify differences between two pandas DataFrames using a key column.
-    Key column is assumed to have only unique data
-    (like a database unique id index)
+    """
+    Identify differences between two pandas DataFrames using a key column.
+    
+    Key column is assumed to have a unique row identifier, i.e. no duplicates.
+    
     Args:
         old_df (pd.DataFrame): first dataframe
         new_df (pd.DataFrame): second dataframe
@@ -43,7 +45,7 @@ def diff_pd(old_df, new_df, idx_col):
     else:
         removed_keys = np.setdiff1d(old_keys, new_keys)
         added_keys = np.setdiff1d(new_keys, old_keys)
-    # populate the output data with non empty ddataframes
+    # populate the output data with non empty dataframes
     out_data = {}
     removed = old_df.loc[removed_keys]
     if not removed.empty:
@@ -98,13 +100,14 @@ def compare_excel(
     else:
         print("No differences spotted")
 
+
 def build_parser():
     cfg = argparse.ArgumentParser(
-        description="Compares two excel files and outputs the differences "
-                    "in another excel file."
+        description="Compares two Excel sheets and outputs the differences "
+                    "to a separate Excel file."
     )
-    cfg.add_argument("path1", help="Fist excel file")
-    cfg.add_argument("path2", help="Second excel file")
+    cfg.add_argument("path1", help="Fist Excel file")
+    cfg.add_argument("path2", help="Second Excel file")
     cfg.add_argument("sheetname", help="Name of the sheet to compare.")
     cfg.add_argument(
         "key_column",
@@ -115,8 +118,8 @@ def build_parser():
     )
     cfg.add_argument("-o", "--output-path", default="compared.xlsx",
                      help="Path of the comparison results")
-    cfg.add_argument("--skiprows", help='number of rows to skip', type=int,
-                     action='append', default=None)
+    cfg.add_argument("--skiprows", help='Excel row containing the table headers',
+                     type=int, action='append', default=None)
     return cfg
 
 

diff --git a/compare.py b/compare.py
@@ -43,10 +43,14 @@ def diff_pd(old_df, new_df, idx_col):
     else:
         removed_keys = np.setdiff1d(old_keys, new_keys)
         added_keys = np.setdiff1d(new_keys, old_keys)
-    out_data = {
-        'removed': old_df.loc[removed_keys],
-        'added': new_df.loc[added_keys]
-    }
+    # populate the output data with non empty ddataframes
+    out_data = {}
+    removed = old_df.loc[removed_keys]
+    if not removed.empty:
+        out_data["removed"] = removed
+    added = new_df.loc[added_keys]
+    if not added.empty:
+        out_data["added"] = added
     # focusing on common data of both dataframes
     common_keys = np.intersect1d(old_keys, new_keys, assume_unique=True)
     common_columns = np.intersect1d(
@@ -73,7 +77,9 @@ def diff_pd(old_df, new_df, idx_col):
     # using report_diff to merge the changes in a single cell with "-->"
     df_changed = df_all_changes.groupby(level=0, axis=1).apply(
         lambda frame: frame.apply(report_diff, axis=1))
-    out_data['changed'] = df_changed
+    # add changed dataframe to output data only if non empty
+    if not df_changed.empty:
+        out_data['changed'] = df_changed
 
     return out_data
 
@@ -84,11 +90,13 @@ def compare_excel(
     old_df = pd.read_excel(path1, sheet_name=sheet_name, **kwargs)
     new_df = pd.read_excel(path2, sheet_name=sheet_name, **kwargs)
     diff = diff_pd(old_df, new_df, index_col_name)
-    with pd.ExcelWriter(out_path) as writer:
-        for sname, data in diff.items():
-            if not data.empty:
+    if diff:
+        with pd.ExcelWriter(out_path) as writer:
+            for sname, data in diff.items():
                 data.to_excel(writer, sheet_name=sname)
-    print(f"Differences saved in {out_path}")
+        print(f"Differences saved in {out_path}")
+    else:
+        print("No differences spotted")
 
 def build_parser():
     cfg = argparse.ArgumentParser(

diff --git a/test_compare.py b/test_compare.py
@@ -99,6 +99,13 @@ def test_multiindex_excel():
     compare.compare_excel(xlsx_1, xlsx_2, diff_io, "Sheet1", ["ID", "Flavour ID"])
     diff = pd.read_excel(diff_io, sheet_name=None)
     run_assertion(diff)
+
+
+def test_no_diffs():
+    df_1, _ = sample_multiindex_dfs()
+    diff = compare.diff_pd(df_1, df_1, ["ID", "Flavour ID"])
+    assert not diff
+    print("OK.")
 
 
 if __name__ == '__main__':
@@ -107,3 +114,4 @@ def test_multiindex_excel():
     test_single_index()
     test_single_index_excel()
     test_parser()
+    test_no_diff()
diff --git a/compare.py b/compare.py
@@ -86,7 +86,8 @@ def compare_excel(
     diff = diff_pd(old_df, new_df, index_col_name)
     with pd.ExcelWriter(out_path) as writer:
         for sname, data in diff.items():
-            data.to_excel(writer, sheet_name=sname)
+            if not data.empty:
+                data.to_excel(writer, sheet_name=sname)
     print(f"Differences saved in {out_path}")
 
 def build_parser():

diff --git a/compare.py b/compare.py
@@ -35,6 +35,8 @@ def diff_pd(old_df, new_df, idx_col):
     old_df = old_df.set_index(idx_col)
     new_df = new_df.set_index(idx_col)
     # get the added and removed rows
+    old_keys = old_df.index
+    new_keys = new_df.index
     if isinstance(old_keys, pd.MultiIndex):
         removed_keys = old_keys.difference(new_keys)
         added_keys = new_keys.difference(old_keys)
@@ -87,7 +89,6 @@ def compare_excel(
             data.to_excel(writer, sheet_name=sname)
     print(f"Differences saved in {out_path}")
 
-
 def build_parser():
     cfg = argparse.ArgumentParser(
         description="Compares two excel files and outputs the differences "
@@ -118,4 +119,4 @@ def main():
 
 
 if __name__ == '__main__':
-    main()
+    main()
diff --git a/test_compare.py b/test_compare.py
@@ -24,29 +24,26 @@ def build_excel_stream(df, sheetname):
     return output
 
 
+def sample_xlsx(df_1, df_2):
+    xlsx_1 = build_excel_stream(df_1, "Sheet1")
+    xlsx_2 = build_excel_stream(df_2, "Sheet1")
+    return xlsx_1, xlsx_2
+
+
 def sample_dfs():
     df_1 = pd.DataFrame({
-        "ID": [123456, 123456, 654321, 543219, 432198, 765432],
-        "Name": ["Lemonade", "Lemonade", "Cola", "Orange", "Fruit Punch", "Tobacco"],
-        "Flavour ID": [1, 2, None, None, None, None],
-        "Flavour Description": ["Fuzzy", "Fuzzy", "Fuzzy", "Fuzzy", "Fuzzy", "Smoky"],
+        "ID": [123456, 654321, 543219, 432198, 765432],
+        "Name": ["Lemonade", "Cola", "Orange", "Fruit Punch", "Tobacco"],
+        "Flavour Description": ["Fuzzy", "Fuzzy", "Fuzzy", "Fuzzy", "Smoky"],
     })
     df_2 = pd.DataFrame({
-        "ID": [123456, 123456, 654321, 543219, 432198, 876543],
-        "Name": ["Lemonade", "Lemonade", "Cola", "Orange", "Fruit Punch", "Soda"],
-        "Flavour ID": [1, 2, None, None, None, None],
-        "Flavour Description": ["Fuzzy", "Bubbly", "Fuzzy", "Fuzzy", "Fuzzy", "Sugary"],
+        "ID": [123456, 654321, 543219, 432198, 876543],
+        "Name": ["Lemonade", "Cola", "Orange", "Fruit Punch", "Soda"],
+        "Flavour Description": ["Fuzzy", "Bubbly", "Fuzzy", "Fuzzy", "Sugary"],
     })
     return df_1, df_2
 
 
-def sample_xlsx():
-    df_1, df_2 = sample_dfs()
-    xlsx_1 = build_excel_stream(df_1, "Sheet1")
-    xlsx_2 = build_excel_stream(df_2, "Sheet1")
-    return xlsx_1, xlsx_2
-
-
 def run_assertion(diff):
     changed = diff["changed"]
     assert len(changed) == 1
@@ -60,14 +57,44 @@ def run_assertion(diff):
     print("OK.")
 
 
-def test_multiindex():
+def test_single_index():
     df_1, df_2 = sample_dfs()
+    diff = compare.diff_pd(df_1, df_2, ["ID"])
+    run_assertion(diff)
+
+
+def test_single_index_excel():
+    xlsx_1, xlsx_2 = sample_xlsx(*sample_dfs())
+    diff_io = io.BytesIO()
+    compare.compare_excel(xlsx_1, xlsx_2, diff_io, "Sheet1", "ID")
+    diff = pd.read_excel(diff_io, sheet_name=None)
+    run_assertion(diff)
+
+
+def sample_multiindex_dfs():
+    df_1 = pd.DataFrame({
+        "ID": [123456, 123456, 654321, 543219, 432198, 765432],
+        "Name": ["Lemonade", "Lemonade", "Cola", "Orange", "Fruit Punch", "Tobacco"],
+        "Flavour ID": [1, 2, None, None, None, None],
+        "Flavour Description": ["Fuzzy", "Fuzzy", "Fuzzy", "Fuzzy", "Fuzzy", "Smoky"],
+    })
+    df_2 = pd.DataFrame({
+        "ID": [123456, 123456, 654321, 543219, 432198, 876543],
+        "Name": ["Lemonade", "Lemonade", "Cola", "Orange", "Fruit Punch", "Soda"],
+        "Flavour ID": [1, 2, None, None, None, None],
+        "Flavour Description": ["Fuzzy", "Bubbly", "Fuzzy", "Fuzzy", "Fuzzy", "Sugary"],
+    })
+    return df_1, df_2
+
+
+def test_multiindex():
+    df_1, df_2 = sample_multiindex_dfs()
     diff = compare.diff_pd(df_1, df_2, ["ID", "Flavour ID"])
     run_assertion(diff)
 
 
 def test_multiindex_excel():
-    xlsx_1, xlsx_2 = sample_xlsx()
+    xlsx_1, xlsx_2 = sample_xlsx(*sample_multiindex_dfs())
     diff_io = io.BytesIO()
     compare.compare_excel(xlsx_1, xlsx_2, diff_io, "Sheet1", ["ID", "Flavour ID"])
     diff = pd.read_excel(diff_io, sheet_name=None)
@@ -77,4 +104,6 @@ def test_multiindex_excel():
 if __name__ == '__main__':
     test_multiindex()
     test_multiindex_excel()
+    test_single_index()
+    test_single_index_excel()
     test_parser()
diff --git a/compare.py b/compare.py
@@ -35,10 +35,12 @@ def diff_pd(old_df, new_df, idx_col):
     old_df = old_df.set_index(idx_col)
     new_df = new_df.set_index(idx_col)
     # get the added and removed rows
-    old_keys = old_df.index
-    new_keys = new_df.index
-    removed_keys = np.setdiff1d(old_keys, new_keys)
-    added_keys = np.setdiff1d(new_keys, old_keys)
+    if isinstance(old_keys, pd.MultiIndex):
+        removed_keys = old_keys.difference(new_keys)
+        added_keys = new_keys.difference(old_keys)
+    else:
+        removed_keys = np.setdiff1d(old_keys, new_keys)
+        added_keys = np.setdiff1d(new_keys, old_keys)
     out_data = {
         'removed': old_df.loc[removed_keys],
         'added': new_df.loc[added_keys]

diff --git a/test_compare.py b/test_compare.py
@@ -1,11 +1,12 @@
 """Excel compare test suite."""
 import io
 
-import compare
+import pandas as pd
 
+import compare
 
 def test_parser():
-    cfg = build_parser()
+    cfg = compare.build_parser()
     opt = cfg.parse_args(["test1.xlsx", "test2.xlsx", "Sheet 1", "Col1", "Col2", "-o", "output.xlsx"])
     assert opt.path1 == "test1.xlsx"
     assert opt.path2 == "test2.xlsx"
@@ -25,16 +26,16 @@ def build_excel_stream(df, sheetname):
 
 def sample_dfs():
     df_1 = pd.DataFrame({
-        "ID": [123456, 123456, 654321, 543219, 432198],
-        "Name": ["Lemonade", "Lemonade", "Cola", "Orange", "Fruit Punch"],
-        "Flavour ID": [1, 2, None, None, None],
-        "Flavour Description": ["Fuzzy", "Fuzzy", "Fuzzy", "Fuzzy", "Fuzzy"],
+        "ID": [123456, 123456, 654321, 543219, 432198, 765432],
+        "Name": ["Lemonade", "Lemonade", "Cola", "Orange", "Fruit Punch", "Tobacco"],
+        "Flavour ID": [1, 2, None, None, None, None],
+        "Flavour Description": ["Fuzzy", "Fuzzy", "Fuzzy", "Fuzzy", "Fuzzy", "Smoky"],
     })
     df_2 = pd.DataFrame({
-        "ID": [123456, 123456, 654321, 543219, 432198],
-        "Name": ["Lemonade", "Lemonade", "Cola", "Orange", "Fruit Punch"],
-        "Flavour ID": [1, 2, None, None, None],
-        "Flavour Description": ["Fuzzy", "Bubbly", "Fuzzy", "Fuzzy", "Fuzzy"],
+        "ID": [123456, 123456, 654321, 543219, 432198, 876543],
+        "Name": ["Lemonade", "Lemonade", "Cola", "Orange", "Fruit Punch", "Soda"],
+        "Flavour ID": [1, 2, None, None, None, None],
+        "Flavour Description": ["Fuzzy", "Bubbly", "Fuzzy", "Fuzzy", "Fuzzy", "Sugary"],
     })
     return df_1, df_2
 
@@ -46,27 +47,34 @@ def sample_xlsx():
     return xlsx_1, xlsx_2
 
 
-def test_multiindex():
-    df_1, df_2 = sample_dfs()
-    diff = diff_pd(df_1, df_2, ["ID", "Flavour ID"])
+def run_assertion(diff):
     changed = diff["changed"]
     assert len(changed) == 1
     assert changed.iloc[0]["Flavour Description"] == "Fuzzy ---> Bubbly"
+    added = diff["added"]
+    assert len(added) == 1
+    assert added.iloc[0]["Flavour Description"] == "Sugary"
+    removed = diff["removed"]
+    assert len(removed) == 1
+    assert removed.iloc[0]["Flavour Description"] == "Smoky"
     print("OK.")
 
 
+def test_multiindex():
+    df_1, df_2 = sample_dfs()
+    diff = compare.diff_pd(df_1, df_2, ["ID", "Flavour ID"])
+    run_assertion(diff)
+
+
 def test_multiindex_excel():
     xlsx_1, xlsx_2 = sample_xlsx()
     diff_io = io.BytesIO()
-    compare_excel(xlsx_1, xlsx_2, diff_io, "Sheet1", ["ID", "Flavour ID"])
+    compare.compare_excel(xlsx_1, xlsx_2, diff_io, "Sheet1", ["ID", "Flavour ID"])
     diff = pd.read_excel(diff_io, sheet_name=None)
-    changed = diff["changed"]
-    assert len(changed) == 1
-    assert changed.iloc[0]["Flavour Description"] == "Fuzzy ---> Bubbly"
-    print("OK.")
+    run_assertion(diff)
 
 
 if __name__ == '__main__':
-    test_parser()
     test_multiindex()
     test_multiindex_excel()
+    test_parser()
diff --git a/compare.py b/compare.py
@@ -11,9 +11,6 @@
 import numpy as np
 
 
-# TODO: GUI to select which data to keep for merging
-
-
 def report_diff(x):
     """Function to use with groupby.apply to highlihgt value changes."""
     return x[0] if x[0] == x[1] or pd.isna(x).all() else f'{x[0]} ---> {x[1]}'
@@ -89,7 +86,7 @@ def compare_excel(
     print(f"Differences saved in {out_path}")
 
 
-def main():
+def build_parser():
     cfg = argparse.ArgumentParser(
         description="Compares two excel files and outputs the differences "
                     "in another excel file."
@@ -98,16 +95,21 @@ def main():
     cfg.add_argument("path2", help="Second excel file")
     cfg.add_argument("sheetname", help="Name of the sheet to compare.")
     cfg.add_argument(
-        "key-column",
-        help="Name of the column with unique row identifier. It has to be "
+        "key_column",
+        help="Name of the column(s) with unique row identifier. It has to be "
              "the actual text of the first row, not the excel notation."
              "Use multiple times to create a composite index.",
-        action="append",
+        nargs="+",
     )
     cfg.add_argument("-o", "--output-path", default="compared.xlsx",
                      help="Path of the comparison results")
     cfg.add_argument("--skiprows", help='number of rows to skip', type=int,
                      action='append', default=None)
+    return cfg
+
+
+def main():
+    cfg = build_parser()
     opt = cfg.parse_args()
     compare_excel(opt.path1, opt.path2, opt.output_path, opt.sheetname,
                   opt.key_column, skiprows=opt.skiprows)

diff --git a/test_compare.py b/test_compare.py
@@ -4,6 +4,17 @@
 import compare
 
 
+def test_parser():
+    cfg = build_parser()
+    opt = cfg.parse_args(["test1.xlsx", "test2.xlsx", "Sheet 1", "Col1", "Col2", "-o", "output.xlsx"])
+    assert opt.path1 == "test1.xlsx"
+    assert opt.path2 == "test2.xlsx"
+    assert opt.output_path ==  "output.xlsx"
+    assert opt.sheetname == "Sheet 1"
+    assert opt.key_column == ["Col1", "Col2"]
+    assert opt.skiprows is None
+
+
 def build_excel_stream(df, sheetname):
     """Create an excel workbook as a file-like object."""
     output = io.BytesIO()
@@ -56,5 +67,6 @@ def test_multiindex_excel():
 
 
 if __name__ == '__main__':
+    test_parser()
     test_multiindex()
     test_multiindex_excel()
diff --git a/compare.py b/compare.py
@@ -1,10 +1,7 @@
 """
 Compare two Excel sheets
-
 Inspired by https://pbpython.com/excel-diff-pandas-update.html
-
 For the documentation, download this file and type:
-
 python compare.py --help
 """
 
@@ -29,14 +26,12 @@ def strip(x):
 
 def diff_pd(old_df, new_df, idx_col):
     """Identify differences between two pandas DataFrames using a key column.
-
     Key column is assumed to have only unique data
     (like a database unique id index)
-
     Args:
         old_df (pd.DataFrame): first dataframe
         new_df (pd.DataFrame): second dataframe
-        idx_col (str): column name of the index,
+        idx_col (str|list(str)): column name(s) of the index,
           needs to be present in both DataFrames
     """
     # setting the column name as index for fast operations
@@ -63,11 +58,17 @@ def diff_pd(old_df, new_df, idx_col):
     common_data = pd.concat(
         [old_common.reset_index(), new_common.reset_index()], sort=True
     )
-    changed_keys = common_data.drop_duplicates(keep=False)[idx_col].unique()
+    changed_keys = common_data.drop_duplicates(keep=False)[idx_col]
+    if isinstance(changed_keys, pd.Series):
+        changed_keys = changed_keys.unique()
+    else:
+        changed_keys = changed_keys.drop_duplicates().set_index(idx_col).index
     # combining the changed rows via multi level columns
     df_all_changes = pd.concat(
         [old_common.loc[changed_keys], new_common.loc[changed_keys]],
-        axis='columns', keys=['old', 'new']).swaplevel(axis='columns')
+        axis='columns',
+        keys=['old', 'new']
+    ).swaplevel(axis='columns')
     # using report_diff to merge the changes in a single cell with "-->"
     df_changed = df_all_changes.groupby(level=0, axis=1).apply(
         lambda frame: frame.apply(report_diff, axis=1))
@@ -100,6 +101,8 @@ def main():
         "key-column",
         help="Name of the column with unique row identifier. It has to be "
              "the actual text of the first row, not the excel notation."
+             "Use multiple times to create a composite index.",
+        action="append",
     )
     cfg.add_argument("-o", "--output-path", default="compared.xlsx",
                      help="Path of the comparison results")
@@ -111,4 +114,4 @@ def main():
 
 
 if __name__ == '__main__':
-    main()
+    main()
diff --git a/test_compare.py b/test_compare.py
@@ -0,0 +1,60 @@
+"""Excel compare test suite."""
+import io
+
+import compare
+
+
+def build_excel_stream(df, sheetname):
+    """Create an excel workbook as a file-like object."""
+    output = io.BytesIO()
+    with pd.ExcelWriter(output, engine="xlsxwriter") as writer:
+        df.to_excel(writer, sheet_name=sheetname, index=False)
+    return output
+
+
+def sample_dfs():
+    df_1 = pd.DataFrame({
+        "ID": [123456, 123456, 654321, 543219, 432198],
+        "Name": ["Lemonade", "Lemonade", "Cola", "Orange", "Fruit Punch"],
+        "Flavour ID": [1, 2, None, None, None],
+        "Flavour Description": ["Fuzzy", "Fuzzy", "Fuzzy", "Fuzzy", "Fuzzy"],
+    })
+    df_2 = pd.DataFrame({
+        "ID": [123456, 123456, 654321, 543219, 432198],
+        "Name": ["Lemonade", "Lemonade", "Cola", "Orange", "Fruit Punch"],
+        "Flavour ID": [1, 2, None, None, None],
+        "Flavour Description": ["Fuzzy", "Bubbly", "Fuzzy", "Fuzzy", "Fuzzy"],
+    })
+    return df_1, df_2
+
+
+def sample_xlsx():
+    df_1, df_2 = sample_dfs()
+    xlsx_1 = build_excel_stream(df_1, "Sheet1")
+    xlsx_2 = build_excel_stream(df_2, "Sheet1")
+    return xlsx_1, xlsx_2
+
+
+def test_multiindex():
+    df_1, df_2 = sample_dfs()
+    diff = diff_pd(df_1, df_2, ["ID", "Flavour ID"])
+    changed = diff["changed"]
+    assert len(changed) == 1
+    assert changed.iloc[0]["Flavour Description"] == "Fuzzy ---> Bubbly"
+    print("OK.")
+
+
+def test_multiindex_excel():
+    xlsx_1, xlsx_2 = sample_xlsx()
+    diff_io = io.BytesIO()
+    compare_excel(xlsx_1, xlsx_2, diff_io, "Sheet1", ["ID", "Flavour ID"])
+    diff = pd.read_excel(diff_io, sheet_name=None)
+    changed = diff["changed"]
+    assert len(changed) == 1
+    assert changed.iloc[0]["Flavour Description"] == "Fuzzy ---> Bubbly"
+    print("OK.")
+
+
+if __name__ == '__main__':
+    test_multiindex()
+    test_multiindex_excel()
diff --git a/compare.py b/compare.py
@@ -1,7 +1,11 @@
 """
-Compare Excel Sheeets
+Compare two Excel sheets
 
 Inspired by https://pbpython.com/excel-diff-pandas-update.html
+
+For the documentation, download this file and type:
+
+python compare.py --help
 """
 
 import argparse
@@ -10,25 +14,30 @@
 import numpy as np
 
 
+# TODO: GUI to select which data to keep for merging
+
+
 def report_diff(x):
-    """Helper function to use with groupby.apply to highlihgt the velue changes."""
+    """Function to use with groupby.apply to highlihgt value changes."""
     return x[0] if x[0] == x[1] or pd.isna(x).all() else f'{x[0]} ---> {x[1]}'
 
 
 def strip(x):
-    """Helper function to use with applymap to strip all the whitespaces in the dataframe."""
+    """Function to use with applymap to strip whitespaces in a dataframe."""
     return x.strip() if isinstance(x, str) else x
 
 
 def diff_pd(old_df, new_df, idx_col):
     """Identify differences between two pandas DataFrames using a key column.
 
-    Key column is assumed to have only unique data (like a database unique id index)
+    Key column is assumed to have only unique data
+    (like a database unique id index)
 
     Args:
         old_df (pd.DataFrame): first dataframe
         new_df (pd.DataFrame): second dataframe
-        idx_col (str): column name of the index, needs to be present in both DataFrames
+        idx_col (str): column name of the index,
+          needs to be present in both DataFrames
     """
     # setting the column name as index for fast operations
     old_df = old_df.set_index(idx_col)
@@ -44,17 +53,21 @@ def diff_pd(old_df, new_df, idx_col):
     }
     # focusing on common data of both dataframes
     common_keys = np.intersect1d(old_keys, new_keys, assume_unique=True)
-    common_columns = np.intersect1d(old_df.columns, new_df.columns, assume_unique=True)
+    common_columns = np.intersect1d(
+        old_df.columns, new_df.columns, assume_unique=True
+    )
     new_common = new_df.loc[common_keys, common_columns].applymap(strip)
     old_common = old_df.loc[common_keys, common_columns].applymap(strip)
     # get the changed rows keys by dropping identical rows
     # (indexes are ignored, so we'll reset them)
-    common_data = pd.concat([old_common.reset_index(), new_common.reset_index()])
+    common_data = pd.concat(
+        [old_common.reset_index(), new_common.reset_index()], sort=True
+    )
     changed_keys = common_data.drop_duplicates(keep=False)[idx_col].unique()
     # combining the changed rows via multi level columns
-    df_all_changes = pd.concat([old_common.loc[changed_keys], new_common.loc[changed_keys]],
-                               axis='columns', keys=['old', 'new'])
-    df_all_changes = df_all_changes.swaplevel(axis='columns')[new_common.columns]
+    df_all_changes = pd.concat(
+        [old_common.loc[changed_keys], new_common.loc[changed_keys]],
+        axis='columns', keys=['old', 'new']).swaplevel(axis='columns')
     # using report_diff to merge the changes in a single cell with "-->"
     df_changed = df_all_changes.groupby(level=0, axis=1).apply(
         lambda frame: frame.apply(report_diff, axis=1))
@@ -63,7 +76,9 @@ def diff_pd(old_df, new_df, idx_col):
     return out_data
 
 
-def compare_excel(path1, path2, out_path, sheet_name, index_col_name=None, **kwargs):
+def compare_excel(
+        path1, path2, out_path, sheet_name, index_col_name, **kwargs
+):
     old_df = pd.read_excel(path1, sheet_name=sheet_name, **kwargs)
     new_df = pd.read_excel(path2, sheet_name=sheet_name, **kwargs)
     diff = diff_pd(old_df, new_df, index_col_name)
@@ -75,21 +90,24 @@ def compare_excel(path1, path2, out_path, sheet_name, index_col_name=None, **kwa
 
 def main():
     cfg = argparse.ArgumentParser(
-        description="Compares two excel files and outputs the differences in another excel file. "
-                    "A column name can be specified as the unique row identifier."
+        description="Compares two excel files and outputs the differences "
+                    "in another excel file."
     )
     cfg.add_argument("path1", help="Fist excel file")
     cfg.add_argument("path2", help="Second excel file")
     cfg.add_argument("sheetname", help="Name of the sheet to compare.")
-    cfg.add_argument("-c", "--index-column", help="Name of the column with unique row identifier",
-                     required=True)
-    cfg.add_argument("-o", "--output-path", help="Path of the comparison results",
-                     default="compared.xlsx")
-    cfg.add_argument("--skiprows", help='number of rows to skip', type=int, action='append',
-                     default=None)
+    cfg.add_argument(
+        "key-column",
+        help="Name of the column with unique row identifier. It has to be "
+             "the actual text of the first row, not the excel notation."
+    )
+    cfg.add_argument("-o", "--output-path", default="compared.xlsx",
+                     help="Path of the comparison results")
+    cfg.add_argument("--skiprows", help='number of rows to skip', type=int,
+                     action='append', default=None)
     opt = cfg.parse_args()
-    compare_excel(opt.path1, opt.path2, opt.output_path, opt.sheetname, opt.index_column,
-                  skiprows=opt.skiprows)
+    compare_excel(opt.path1, opt.path2, opt.output_path, opt.sheetname,
+                  opt.key_column, skiprows=opt.skiprows)
 
 
 if __name__ == '__main__':

diff --git a/compare.py b/compare.py
@@ -10,11 +10,16 @@
 import numpy as np
 
 
-# Define the diff function to show the changes in each field
 def report_diff(x):
+    """Helper function to use with groupby.apply to highlihgt the velue changes."""
     return x[0] if x[0] == x[1] or pd.isna(x).all() else f'{x[0]} ---> {x[1]}'
 
 
+def strip(x):
+    """Helper function to use with applymap to strip all the whitespaces in the dataframe."""
+    return x.strip() if isinstance(x, str) else x
+
+
 def diff_pd(old_df, new_df, idx_col):
     """Identify differences between two pandas DataFrames using a key column.
 
@@ -39,8 +44,9 @@ def diff_pd(old_df, new_df, idx_col):
     }
     # focusing on common data of both dataframes
     common_keys = np.intersect1d(old_keys, new_keys, assume_unique=True)
-    old_common = old_df.loc[common_keys]
-    new_common = new_df.loc[common_keys]
+    common_columns = np.intersect1d(old_df.columns, new_df.columns, assume_unique=True)
+    new_common = new_df.loc[common_keys, common_columns].applymap(strip)
+    old_common = old_df.loc[common_keys, common_columns].applymap(strip)
     # get the changed rows keys by dropping identical rows
     # (indexes are ignored, so we'll reset them)
     common_data = pd.concat([old_common.reset_index(), new_common.reset_index()])

diff --git a/compare.py b/compare.py
@@ -47,7 +47,8 @@ def diff_pd(old_df, new_df, idx_col):
     changed_keys = common_data.drop_duplicates(keep=False)[idx_col].unique()
     # combining the changed rows via multi level columns
     df_all_changes = pd.concat([old_common.loc[changed_keys], new_common.loc[changed_keys]],
-                               axis='columns', keys=['old', 'new']).swaplevel(axis='columns')
+                               axis='columns', keys=['old', 'new'])
+    df_all_changes = df_all_changes.swaplevel(axis='columns')[new_common.columns]
     # using report_diff to merge the changes in a single cell with "-->"
     df_changed = df_all_changes.groupby(level=0, axis=1).apply(
         lambda frame: frame.apply(report_diff, axis=1))

diff --git a/compare.py b/compare.py
@@ -0,0 +1,89 @@
+"""
+Compare Excel Sheeets
+
+Inspired by https://pbpython.com/excel-diff-pandas-update.html
+"""
+
+import argparse
+
+import pandas as pd
+import numpy as np
+
+
+# Define the diff function to show the changes in each field
+def report_diff(x):
+    return x[0] if x[0] == x[1] or pd.isna(x).all() else f'{x[0]} ---> {x[1]}'
+
+
+def diff_pd(old_df, new_df, idx_col):
+    """Identify differences between two pandas DataFrames using a key column.
+
+    Key column is assumed to have only unique data (like a database unique id index)
+
+    Args:
+        old_df (pd.DataFrame): first dataframe
+        new_df (pd.DataFrame): second dataframe
+        idx_col (str): column name of the index, needs to be present in both DataFrames
+    """
+    # setting the column name as index for fast operations
+    old_df = old_df.set_index(idx_col)
+    new_df = new_df.set_index(idx_col)
+    # get the added and removed rows
+    old_keys = old_df.index
+    new_keys = new_df.index
+    removed_keys = np.setdiff1d(old_keys, new_keys)
+    added_keys = np.setdiff1d(new_keys, old_keys)
+    out_data = {
+        'removed': old_df.loc[removed_keys],
+        'added': new_df.loc[added_keys]
+    }
+    # focusing on common data of both dataframes
+    common_keys = np.intersect1d(old_keys, new_keys, assume_unique=True)
+    old_common = old_df.loc[common_keys]
+    new_common = new_df.loc[common_keys]
+    # get the changed rows keys by dropping identical rows
+    # (indexes are ignored, so we'll reset them)
+    common_data = pd.concat([old_common.reset_index(), new_common.reset_index()])
+    changed_keys = common_data.drop_duplicates(keep=False)[idx_col].unique()
+    # combining the changed rows via multi level columns
+    df_all_changes = pd.concat([old_common.loc[changed_keys], new_common.loc[changed_keys]],
+                               axis='columns', keys=['old', 'new']).swaplevel(axis='columns')
+    # using report_diff to merge the changes in a single cell with "-->"
+    df_changed = df_all_changes.groupby(level=0, axis=1).apply(
+        lambda frame: frame.apply(report_diff, axis=1))
+    out_data['changed'] = df_changed
+
+    return out_data
+
+
+def compare_excel(path1, path2, out_path, sheet_name, index_col_name=None, **kwargs):
+    old_df = pd.read_excel(path1, sheet_name=sheet_name, **kwargs)
+    new_df = pd.read_excel(path2, sheet_name=sheet_name, **kwargs)
+    diff = diff_pd(old_df, new_df, index_col_name)
+    with pd.ExcelWriter(out_path) as writer:
+        for sname, data in diff.items():
+            data.to_excel(writer, sheet_name=sname)
+    print(f"Differences saved in {out_path}")
+
+
+def main():
+    cfg = argparse.ArgumentParser(
+        description="Compares two excel files and outputs the differences in another excel file. "
+                    "A column name can be specified as the unique row identifier."
+    )
+    cfg.add_argument("path1", help="Fist excel file")
+    cfg.add_argument("path2", help="Second excel file")
+    cfg.add_argument("sheetname", help="Name of the sheet to compare.")
+    cfg.add_argument("-c", "--index-column", help="Name of the column with unique row identifier",
+                     required=True)
+    cfg.add_argument("-o", "--output-path", help="Path of the comparison results",
+                     default="compared.xlsx")
+    cfg.add_argument("--skiprows", help='number of rows to skip', type=int, action='append',
+                     default=None)
+    opt = cfg.parse_args()
+    compare_excel(opt.path1, opt.path2, opt.output_path, opt.sheetname, opt.index_column,
+                  skiprows=opt.skiprows)
+
+
+if __name__ == '__main__':
+    main()
No results found