Skip to content

Instantly share code, notes, and snippets.

@sanzoghenzo
Last active December 11, 2025 15:38
Show Gist options
  • Select an option

  • Save sanzoghenzo/73275613c592331180a24cb2ddfd5bcb to your computer and use it in GitHub Desktop.

Select an option

Save sanzoghenzo/73275613c592331180a24cb2ddfd5bcb to your computer and use it in GitHub Desktop.

Revisions

  1. sanzoghenzo revised this gist Feb 16, 2021. 1 changed file with 15 additions and 12 deletions.
    27 changes: 15 additions & 12 deletions compare.py
    Original file line number Diff line number Diff line change
    @@ -12,19 +12,21 @@


    def report_diff(x):
    """Function to use with groupby.apply to highlihgt value changes."""
    """Function to use with groupby.apply to highlight value changes."""
    return x[0] if x[0] == x[1] or pd.isna(x).all() else f'{x[0]} ---> {x[1]}'


    def strip(x):
    """Function to use with applymap to strip whitespaces in a dataframe."""
    """Function to use with applymap to strip whitespaces from a dataframe."""
    return x.strip() if isinstance(x, str) else x


    def diff_pd(old_df, new_df, idx_col):
    """Identify differences between two pandas DataFrames using a key column.
    Key column is assumed to have only unique data
    (like a database unique id index)
    """
    Identify differences between two pandas DataFrames using a key column.
    Key column is assumed to have a unique row identifier, i.e. no duplicates.
    Args:
    old_df (pd.DataFrame): first dataframe
    new_df (pd.DataFrame): second dataframe
    @@ -43,7 +45,7 @@ def diff_pd(old_df, new_df, idx_col):
    else:
    removed_keys = np.setdiff1d(old_keys, new_keys)
    added_keys = np.setdiff1d(new_keys, old_keys)
    # populate the output data with non empty ddataframes
    # populate the output data with non empty dataframes
    out_data = {}
    removed = old_df.loc[removed_keys]
    if not removed.empty:
    @@ -98,13 +100,14 @@ def compare_excel(
    else:
    print("No differences spotted")


    def build_parser():
    cfg = argparse.ArgumentParser(
    description="Compares two excel files and outputs the differences "
    "in another excel file."
    description="Compares two Excel sheets and outputs the differences "
    "to a separate Excel file."
    )
    cfg.add_argument("path1", help="Fist excel file")
    cfg.add_argument("path2", help="Second excel file")
    cfg.add_argument("path1", help="Fist Excel file")
    cfg.add_argument("path2", help="Second Excel file")
    cfg.add_argument("sheetname", help="Name of the sheet to compare.")
    cfg.add_argument(
    "key_column",
    @@ -115,8 +118,8 @@ def build_parser():
    )
    cfg.add_argument("-o", "--output-path", default="compared.xlsx",
    help="Path of the comparison results")
    cfg.add_argument("--skiprows", help='number of rows to skip', type=int,
    action='append', default=None)
    cfg.add_argument("--skiprows", help='Excel row containing the table headers',
    type=int, action='append', default=None)
    return cfg


  2. sanzoghenzo revised this gist Feb 16, 2021. 2 changed files with 25 additions and 9 deletions.
    26 changes: 17 additions & 9 deletions compare.py
    Original file line number Diff line number Diff line change
    @@ -43,10 +43,14 @@ def diff_pd(old_df, new_df, idx_col):
    else:
    removed_keys = np.setdiff1d(old_keys, new_keys)
    added_keys = np.setdiff1d(new_keys, old_keys)
    out_data = {
    'removed': old_df.loc[removed_keys],
    'added': new_df.loc[added_keys]
    }
    # populate the output data with non empty ddataframes
    out_data = {}
    removed = old_df.loc[removed_keys]
    if not removed.empty:
    out_data["removed"] = removed
    added = new_df.loc[added_keys]
    if not added.empty:
    out_data["added"] = added
    # focusing on common data of both dataframes
    common_keys = np.intersect1d(old_keys, new_keys, assume_unique=True)
    common_columns = np.intersect1d(
    @@ -73,7 +77,9 @@ def diff_pd(old_df, new_df, idx_col):
    # using report_diff to merge the changes in a single cell with "-->"
    df_changed = df_all_changes.groupby(level=0, axis=1).apply(
    lambda frame: frame.apply(report_diff, axis=1))
    out_data['changed'] = df_changed
    # add changed dataframe to output data only if non empty
    if not df_changed.empty:
    out_data['changed'] = df_changed

    return out_data

    @@ -84,11 +90,13 @@ def compare_excel(
    old_df = pd.read_excel(path1, sheet_name=sheet_name, **kwargs)
    new_df = pd.read_excel(path2, sheet_name=sheet_name, **kwargs)
    diff = diff_pd(old_df, new_df, index_col_name)
    with pd.ExcelWriter(out_path) as writer:
    for sname, data in diff.items():
    if not data.empty:
    if diff:
    with pd.ExcelWriter(out_path) as writer:
    for sname, data in diff.items():
    data.to_excel(writer, sheet_name=sname)
    print(f"Differences saved in {out_path}")
    print(f"Differences saved in {out_path}")
    else:
    print("No differences spotted")

    def build_parser():
    cfg = argparse.ArgumentParser(
    8 changes: 8 additions & 0 deletions test_compare.py
    Original file line number Diff line number Diff line change
    @@ -99,6 +99,13 @@ def test_multiindex_excel():
    compare.compare_excel(xlsx_1, xlsx_2, diff_io, "Sheet1", ["ID", "Flavour ID"])
    diff = pd.read_excel(diff_io, sheet_name=None)
    run_assertion(diff)


    def test_no_diffs():
    df_1, _ = sample_multiindex_dfs()
    diff = compare.diff_pd(df_1, df_1, ["ID", "Flavour ID"])
    assert not diff
    print("OK.")


    if __name__ == '__main__':
    @@ -107,3 +114,4 @@ def test_multiindex_excel():
    test_single_index()
    test_single_index_excel()
    test_parser()
    test_no_diff()
  3. sanzoghenzo revised this gist Jan 9, 2021. 1 changed file with 2 additions and 1 deletion.
    3 changes: 2 additions & 1 deletion compare.py
    Original file line number Diff line number Diff line change
    @@ -86,7 +86,8 @@ def compare_excel(
    diff = diff_pd(old_df, new_df, index_col_name)
    with pd.ExcelWriter(out_path) as writer:
    for sname, data in diff.items():
    data.to_excel(writer, sheet_name=sname)
    if not data.empty:
    data.to_excel(writer, sheet_name=sname)
    print(f"Differences saved in {out_path}")

    def build_parser():
  4. sanzoghenzo revised this gist Feb 23, 2020. 2 changed files with 49 additions and 19 deletions.
    5 changes: 3 additions & 2 deletions compare.py
    Original file line number Diff line number Diff line change
    @@ -35,6 +35,8 @@ def diff_pd(old_df, new_df, idx_col):
    old_df = old_df.set_index(idx_col)
    new_df = new_df.set_index(idx_col)
    # get the added and removed rows
    old_keys = old_df.index
    new_keys = new_df.index
    if isinstance(old_keys, pd.MultiIndex):
    removed_keys = old_keys.difference(new_keys)
    added_keys = new_keys.difference(old_keys)
    @@ -87,7 +89,6 @@ def compare_excel(
    data.to_excel(writer, sheet_name=sname)
    print(f"Differences saved in {out_path}")


    def build_parser():
    cfg = argparse.ArgumentParser(
    description="Compares two excel files and outputs the differences "
    @@ -118,4 +119,4 @@ def main():


    if __name__ == '__main__':
    main()
    main()
    63 changes: 46 additions & 17 deletions test_compare.py
    Original file line number Diff line number Diff line change
    @@ -24,29 +24,26 @@ def build_excel_stream(df, sheetname):
    return output


    def sample_xlsx(df_1, df_2):
    xlsx_1 = build_excel_stream(df_1, "Sheet1")
    xlsx_2 = build_excel_stream(df_2, "Sheet1")
    return xlsx_1, xlsx_2


    def sample_dfs():
    df_1 = pd.DataFrame({
    "ID": [123456, 123456, 654321, 543219, 432198, 765432],
    "Name": ["Lemonade", "Lemonade", "Cola", "Orange", "Fruit Punch", "Tobacco"],
    "Flavour ID": [1, 2, None, None, None, None],
    "Flavour Description": ["Fuzzy", "Fuzzy", "Fuzzy", "Fuzzy", "Fuzzy", "Smoky"],
    "ID": [123456, 654321, 543219, 432198, 765432],
    "Name": ["Lemonade", "Cola", "Orange", "Fruit Punch", "Tobacco"],
    "Flavour Description": ["Fuzzy", "Fuzzy", "Fuzzy", "Fuzzy", "Smoky"],
    })
    df_2 = pd.DataFrame({
    "ID": [123456, 123456, 654321, 543219, 432198, 876543],
    "Name": ["Lemonade", "Lemonade", "Cola", "Orange", "Fruit Punch", "Soda"],
    "Flavour ID": [1, 2, None, None, None, None],
    "Flavour Description": ["Fuzzy", "Bubbly", "Fuzzy", "Fuzzy", "Fuzzy", "Sugary"],
    "ID": [123456, 654321, 543219, 432198, 876543],
    "Name": ["Lemonade", "Cola", "Orange", "Fruit Punch", "Soda"],
    "Flavour Description": ["Fuzzy", "Bubbly", "Fuzzy", "Fuzzy", "Sugary"],
    })
    return df_1, df_2


    def sample_xlsx():
    df_1, df_2 = sample_dfs()
    xlsx_1 = build_excel_stream(df_1, "Sheet1")
    xlsx_2 = build_excel_stream(df_2, "Sheet1")
    return xlsx_1, xlsx_2


    def run_assertion(diff):
    changed = diff["changed"]
    assert len(changed) == 1
    @@ -60,14 +57,44 @@ def run_assertion(diff):
    print("OK.")


    def test_multiindex():
    def test_single_index():
    df_1, df_2 = sample_dfs()
    diff = compare.diff_pd(df_1, df_2, ["ID"])
    run_assertion(diff)


    def test_single_index_excel():
    xlsx_1, xlsx_2 = sample_xlsx(*sample_dfs())
    diff_io = io.BytesIO()
    compare.compare_excel(xlsx_1, xlsx_2, diff_io, "Sheet1", "ID")
    diff = pd.read_excel(diff_io, sheet_name=None)
    run_assertion(diff)


    def sample_multiindex_dfs():
    df_1 = pd.DataFrame({
    "ID": [123456, 123456, 654321, 543219, 432198, 765432],
    "Name": ["Lemonade", "Lemonade", "Cola", "Orange", "Fruit Punch", "Tobacco"],
    "Flavour ID": [1, 2, None, None, None, None],
    "Flavour Description": ["Fuzzy", "Fuzzy", "Fuzzy", "Fuzzy", "Fuzzy", "Smoky"],
    })
    df_2 = pd.DataFrame({
    "ID": [123456, 123456, 654321, 543219, 432198, 876543],
    "Name": ["Lemonade", "Lemonade", "Cola", "Orange", "Fruit Punch", "Soda"],
    "Flavour ID": [1, 2, None, None, None, None],
    "Flavour Description": ["Fuzzy", "Bubbly", "Fuzzy", "Fuzzy", "Fuzzy", "Sugary"],
    })
    return df_1, df_2


    def test_multiindex():
    df_1, df_2 = sample_multiindex_dfs()
    diff = compare.diff_pd(df_1, df_2, ["ID", "Flavour ID"])
    run_assertion(diff)


    def test_multiindex_excel():
    xlsx_1, xlsx_2 = sample_xlsx()
    xlsx_1, xlsx_2 = sample_xlsx(*sample_multiindex_dfs())
    diff_io = io.BytesIO()
    compare.compare_excel(xlsx_1, xlsx_2, diff_io, "Sheet1", ["ID", "Flavour ID"])
    diff = pd.read_excel(diff_io, sheet_name=None)
    @@ -77,4 +104,6 @@ def test_multiindex_excel():
    if __name__ == '__main__':
    test_multiindex()
    test_multiindex_excel()
    test_single_index()
    test_single_index_excel()
    test_parser()
  5. sanzoghenzo revised this gist Feb 6, 2020. 2 changed files with 33 additions and 23 deletions.
    10 changes: 6 additions & 4 deletions compare.py
    Original file line number Diff line number Diff line change
    @@ -35,10 +35,12 @@ def diff_pd(old_df, new_df, idx_col):
    old_df = old_df.set_index(idx_col)
    new_df = new_df.set_index(idx_col)
    # get the added and removed rows
    old_keys = old_df.index
    new_keys = new_df.index
    removed_keys = np.setdiff1d(old_keys, new_keys)
    added_keys = np.setdiff1d(new_keys, old_keys)
    if isinstance(old_keys, pd.MultiIndex):
    removed_keys = old_keys.difference(new_keys)
    added_keys = new_keys.difference(old_keys)
    else:
    removed_keys = np.setdiff1d(old_keys, new_keys)
    added_keys = np.setdiff1d(new_keys, old_keys)
    out_data = {
    'removed': old_df.loc[removed_keys],
    'added': new_df.loc[added_keys]
    46 changes: 27 additions & 19 deletions test_compare.py
    Original file line number Diff line number Diff line change
    @@ -1,11 +1,12 @@
    """Excel compare test suite."""
    import io

    import compare
    import pandas as pd

    import compare

    def test_parser():
    cfg = build_parser()
    cfg = compare.build_parser()
    opt = cfg.parse_args(["test1.xlsx", "test2.xlsx", "Sheet 1", "Col1", "Col2", "-o", "output.xlsx"])
    assert opt.path1 == "test1.xlsx"
    assert opt.path2 == "test2.xlsx"
    @@ -25,16 +26,16 @@ def build_excel_stream(df, sheetname):

    def sample_dfs():
    df_1 = pd.DataFrame({
    "ID": [123456, 123456, 654321, 543219, 432198],
    "Name": ["Lemonade", "Lemonade", "Cola", "Orange", "Fruit Punch"],
    "Flavour ID": [1, 2, None, None, None],
    "Flavour Description": ["Fuzzy", "Fuzzy", "Fuzzy", "Fuzzy", "Fuzzy"],
    "ID": [123456, 123456, 654321, 543219, 432198, 765432],
    "Name": ["Lemonade", "Lemonade", "Cola", "Orange", "Fruit Punch", "Tobacco"],
    "Flavour ID": [1, 2, None, None, None, None],
    "Flavour Description": ["Fuzzy", "Fuzzy", "Fuzzy", "Fuzzy", "Fuzzy", "Smoky"],
    })
    df_2 = pd.DataFrame({
    "ID": [123456, 123456, 654321, 543219, 432198],
    "Name": ["Lemonade", "Lemonade", "Cola", "Orange", "Fruit Punch"],
    "Flavour ID": [1, 2, None, None, None],
    "Flavour Description": ["Fuzzy", "Bubbly", "Fuzzy", "Fuzzy", "Fuzzy"],
    "ID": [123456, 123456, 654321, 543219, 432198, 876543],
    "Name": ["Lemonade", "Lemonade", "Cola", "Orange", "Fruit Punch", "Soda"],
    "Flavour ID": [1, 2, None, None, None, None],
    "Flavour Description": ["Fuzzy", "Bubbly", "Fuzzy", "Fuzzy", "Fuzzy", "Sugary"],
    })
    return df_1, df_2

    @@ -46,27 +47,34 @@ def sample_xlsx():
    return xlsx_1, xlsx_2


    def test_multiindex():
    df_1, df_2 = sample_dfs()
    diff = diff_pd(df_1, df_2, ["ID", "Flavour ID"])
    def run_assertion(diff):
    changed = diff["changed"]
    assert len(changed) == 1
    assert changed.iloc[0]["Flavour Description"] == "Fuzzy ---> Bubbly"
    added = diff["added"]
    assert len(added) == 1
    assert added.iloc[0]["Flavour Description"] == "Sugary"
    removed = diff["removed"]
    assert len(removed) == 1
    assert removed.iloc[0]["Flavour Description"] == "Smoky"
    print("OK.")


    def test_multiindex():
    df_1, df_2 = sample_dfs()
    diff = compare.diff_pd(df_1, df_2, ["ID", "Flavour ID"])
    run_assertion(diff)


    def test_multiindex_excel():
    xlsx_1, xlsx_2 = sample_xlsx()
    diff_io = io.BytesIO()
    compare_excel(xlsx_1, xlsx_2, diff_io, "Sheet1", ["ID", "Flavour ID"])
    compare.compare_excel(xlsx_1, xlsx_2, diff_io, "Sheet1", ["ID", "Flavour ID"])
    diff = pd.read_excel(diff_io, sheet_name=None)
    changed = diff["changed"]
    assert len(changed) == 1
    assert changed.iloc[0]["Flavour Description"] == "Fuzzy ---> Bubbly"
    print("OK.")
    run_assertion(diff)


    if __name__ == '__main__':
    test_parser()
    test_multiindex()
    test_multiindex_excel()
    test_parser()
  6. sanzoghenzo revised this gist Feb 5, 2020. 2 changed files with 21 additions and 7 deletions.
    16 changes: 9 additions & 7 deletions compare.py
    Original file line number Diff line number Diff line change
    @@ -11,9 +11,6 @@
    import numpy as np


    # TODO: GUI to select which data to keep for merging


    def report_diff(x):
    """Function to use with groupby.apply to highlihgt value changes."""
    return x[0] if x[0] == x[1] or pd.isna(x).all() else f'{x[0]} ---> {x[1]}'
    @@ -89,7 +86,7 @@ def compare_excel(
    print(f"Differences saved in {out_path}")


    def main():
    def build_parser():
    cfg = argparse.ArgumentParser(
    description="Compares two excel files and outputs the differences "
    "in another excel file."
    @@ -98,16 +95,21 @@ def main():
    cfg.add_argument("path2", help="Second excel file")
    cfg.add_argument("sheetname", help="Name of the sheet to compare.")
    cfg.add_argument(
    "key-column",
    help="Name of the column with unique row identifier. It has to be "
    "key_column",
    help="Name of the column(s) with unique row identifier. It has to be "
    "the actual text of the first row, not the excel notation."
    "Use multiple times to create a composite index.",
    action="append",
    nargs="+",
    )
    cfg.add_argument("-o", "--output-path", default="compared.xlsx",
    help="Path of the comparison results")
    cfg.add_argument("--skiprows", help='number of rows to skip', type=int,
    action='append', default=None)
    return cfg


    def main():
    cfg = build_parser()
    opt = cfg.parse_args()
    compare_excel(opt.path1, opt.path2, opt.output_path, opt.sheetname,
    opt.key_column, skiprows=opt.skiprows)
    12 changes: 12 additions & 0 deletions test_compare.py
    Original file line number Diff line number Diff line change
    @@ -4,6 +4,17 @@
    import compare


    def test_parser():
    cfg = build_parser()
    opt = cfg.parse_args(["test1.xlsx", "test2.xlsx", "Sheet 1", "Col1", "Col2", "-o", "output.xlsx"])
    assert opt.path1 == "test1.xlsx"
    assert opt.path2 == "test2.xlsx"
    assert opt.output_path == "output.xlsx"
    assert opt.sheetname == "Sheet 1"
    assert opt.key_column == ["Col1", "Col2"]
    assert opt.skiprows is None


    def build_excel_stream(df, sheetname):
    """Create an excel workbook as a file-like object."""
    output = io.BytesIO()
    @@ -56,5 +67,6 @@ def test_multiindex_excel():


    if __name__ == '__main__':
    test_parser()
    test_multiindex()
    test_multiindex_excel()
  7. sanzoghenzo revised this gist Feb 5, 2020. 2 changed files with 72 additions and 9 deletions.
    21 changes: 12 additions & 9 deletions compare.py
    Original file line number Diff line number Diff line change
    @@ -1,10 +1,7 @@
    """
    Compare two Excel sheets
    Inspired by https://pbpython.com/excel-diff-pandas-update.html
    For the documentation, download this file and type:
    python compare.py --help
    """

    @@ -29,14 +26,12 @@ def strip(x):

    def diff_pd(old_df, new_df, idx_col):
    """Identify differences between two pandas DataFrames using a key column.
    Key column is assumed to have only unique data
    (like a database unique id index)
    Args:
    old_df (pd.DataFrame): first dataframe
    new_df (pd.DataFrame): second dataframe
    idx_col (str): column name of the index,
    idx_col (str|list(str)): column name(s) of the index,
    needs to be present in both DataFrames
    """
    # setting the column name as index for fast operations
    @@ -63,11 +58,17 @@ def diff_pd(old_df, new_df, idx_col):
    common_data = pd.concat(
    [old_common.reset_index(), new_common.reset_index()], sort=True
    )
    changed_keys = common_data.drop_duplicates(keep=False)[idx_col].unique()
    changed_keys = common_data.drop_duplicates(keep=False)[idx_col]
    if isinstance(changed_keys, pd.Series):
    changed_keys = changed_keys.unique()
    else:
    changed_keys = changed_keys.drop_duplicates().set_index(idx_col).index
    # combining the changed rows via multi level columns
    df_all_changes = pd.concat(
    [old_common.loc[changed_keys], new_common.loc[changed_keys]],
    axis='columns', keys=['old', 'new']).swaplevel(axis='columns')
    axis='columns',
    keys=['old', 'new']
    ).swaplevel(axis='columns')
    # using report_diff to merge the changes in a single cell with "-->"
    df_changed = df_all_changes.groupby(level=0, axis=1).apply(
    lambda frame: frame.apply(report_diff, axis=1))
    @@ -100,6 +101,8 @@ def main():
    "key-column",
    help="Name of the column with unique row identifier. It has to be "
    "the actual text of the first row, not the excel notation."
    "Use multiple times to create a composite index.",
    action="append",
    )
    cfg.add_argument("-o", "--output-path", default="compared.xlsx",
    help="Path of the comparison results")
    @@ -111,4 +114,4 @@ def main():


    if __name__ == '__main__':
    main()
    main()
    60 changes: 60 additions & 0 deletions test_compare.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,60 @@
    """Excel compare test suite."""
    import io

    import compare


    def build_excel_stream(df, sheetname):
    """Create an excel workbook as a file-like object."""
    output = io.BytesIO()
    with pd.ExcelWriter(output, engine="xlsxwriter") as writer:
    df.to_excel(writer, sheet_name=sheetname, index=False)
    return output


    def sample_dfs():
    df_1 = pd.DataFrame({
    "ID": [123456, 123456, 654321, 543219, 432198],
    "Name": ["Lemonade", "Lemonade", "Cola", "Orange", "Fruit Punch"],
    "Flavour ID": [1, 2, None, None, None],
    "Flavour Description": ["Fuzzy", "Fuzzy", "Fuzzy", "Fuzzy", "Fuzzy"],
    })
    df_2 = pd.DataFrame({
    "ID": [123456, 123456, 654321, 543219, 432198],
    "Name": ["Lemonade", "Lemonade", "Cola", "Orange", "Fruit Punch"],
    "Flavour ID": [1, 2, None, None, None],
    "Flavour Description": ["Fuzzy", "Bubbly", "Fuzzy", "Fuzzy", "Fuzzy"],
    })
    return df_1, df_2


    def sample_xlsx():
    df_1, df_2 = sample_dfs()
    xlsx_1 = build_excel_stream(df_1, "Sheet1")
    xlsx_2 = build_excel_stream(df_2, "Sheet1")
    return xlsx_1, xlsx_2


    def test_multiindex():
    df_1, df_2 = sample_dfs()
    diff = diff_pd(df_1, df_2, ["ID", "Flavour ID"])
    changed = diff["changed"]
    assert len(changed) == 1
    assert changed.iloc[0]["Flavour Description"] == "Fuzzy ---> Bubbly"
    print("OK.")


    def test_multiindex_excel():
    xlsx_1, xlsx_2 = sample_xlsx()
    diff_io = io.BytesIO()
    compare_excel(xlsx_1, xlsx_2, diff_io, "Sheet1", ["ID", "Flavour ID"])
    diff = pd.read_excel(diff_io, sheet_name=None)
    changed = diff["changed"]
    assert len(changed) == 1
    assert changed.iloc[0]["Flavour Description"] == "Fuzzy ---> Bubbly"
    print("OK.")


    if __name__ == '__main__':
    test_multiindex()
    test_multiindex_excel()
  8. sanzoghenzo revised this gist Jan 3, 2020. 1 changed file with 39 additions and 21 deletions.
    60 changes: 39 additions & 21 deletions compare.py
    Original file line number Diff line number Diff line change
    @@ -1,7 +1,11 @@
    """
    Compare Excel Sheeets
    Compare two Excel sheets
    Inspired by https://pbpython.com/excel-diff-pandas-update.html
    For the documentation, download this file and type:
    python compare.py --help
    """

    import argparse
    @@ -10,25 +14,30 @@
    import numpy as np


    # TODO: GUI to select which data to keep for merging


    def report_diff(x):
    """Helper function to use with groupby.apply to highlihgt the velue changes."""
    """Function to use with groupby.apply to highlihgt value changes."""
    return x[0] if x[0] == x[1] or pd.isna(x).all() else f'{x[0]} ---> {x[1]}'


    def strip(x):
    """Helper function to use with applymap to strip all the whitespaces in the dataframe."""
    """Function to use with applymap to strip whitespaces in a dataframe."""
    return x.strip() if isinstance(x, str) else x


    def diff_pd(old_df, new_df, idx_col):
    """Identify differences between two pandas DataFrames using a key column.
    Key column is assumed to have only unique data (like a database unique id index)
    Key column is assumed to have only unique data
    (like a database unique id index)
    Args:
    old_df (pd.DataFrame): first dataframe
    new_df (pd.DataFrame): second dataframe
    idx_col (str): column name of the index, needs to be present in both DataFrames
    idx_col (str): column name of the index,
    needs to be present in both DataFrames
    """
    # setting the column name as index for fast operations
    old_df = old_df.set_index(idx_col)
    @@ -44,17 +53,21 @@ def diff_pd(old_df, new_df, idx_col):
    }
    # focusing on common data of both dataframes
    common_keys = np.intersect1d(old_keys, new_keys, assume_unique=True)
    common_columns = np.intersect1d(old_df.columns, new_df.columns, assume_unique=True)
    common_columns = np.intersect1d(
    old_df.columns, new_df.columns, assume_unique=True
    )
    new_common = new_df.loc[common_keys, common_columns].applymap(strip)
    old_common = old_df.loc[common_keys, common_columns].applymap(strip)
    # get the changed rows keys by dropping identical rows
    # (indexes are ignored, so we'll reset them)
    common_data = pd.concat([old_common.reset_index(), new_common.reset_index()])
    common_data = pd.concat(
    [old_common.reset_index(), new_common.reset_index()], sort=True
    )
    changed_keys = common_data.drop_duplicates(keep=False)[idx_col].unique()
    # combining the changed rows via multi level columns
    df_all_changes = pd.concat([old_common.loc[changed_keys], new_common.loc[changed_keys]],
    axis='columns', keys=['old', 'new'])
    df_all_changes = df_all_changes.swaplevel(axis='columns')[new_common.columns]
    df_all_changes = pd.concat(
    [old_common.loc[changed_keys], new_common.loc[changed_keys]],
    axis='columns', keys=['old', 'new']).swaplevel(axis='columns')
    # using report_diff to merge the changes in a single cell with "-->"
    df_changed = df_all_changes.groupby(level=0, axis=1).apply(
    lambda frame: frame.apply(report_diff, axis=1))
    @@ -63,7 +76,9 @@ def diff_pd(old_df, new_df, idx_col):
    return out_data


    def compare_excel(path1, path2, out_path, sheet_name, index_col_name=None, **kwargs):
    def compare_excel(
    path1, path2, out_path, sheet_name, index_col_name, **kwargs
    ):
    old_df = pd.read_excel(path1, sheet_name=sheet_name, **kwargs)
    new_df = pd.read_excel(path2, sheet_name=sheet_name, **kwargs)
    diff = diff_pd(old_df, new_df, index_col_name)
    @@ -75,21 +90,24 @@ def compare_excel(path1, path2, out_path, sheet_name, index_col_name=None, **kwa

    def main():
    cfg = argparse.ArgumentParser(
    description="Compares two excel files and outputs the differences in another excel file. "
    "A column name can be specified as the unique row identifier."
    description="Compares two excel files and outputs the differences "
    "in another excel file."
    )
    cfg.add_argument("path1", help="Fist excel file")
    cfg.add_argument("path2", help="Second excel file")
    cfg.add_argument("sheetname", help="Name of the sheet to compare.")
    cfg.add_argument("-c", "--index-column", help="Name of the column with unique row identifier",
    required=True)
    cfg.add_argument("-o", "--output-path", help="Path of the comparison results",
    default="compared.xlsx")
    cfg.add_argument("--skiprows", help='number of rows to skip', type=int, action='append',
    default=None)
    cfg.add_argument(
    "key-column",
    help="Name of the column with unique row identifier. It has to be "
    "the actual text of the first row, not the excel notation."
    )
    cfg.add_argument("-o", "--output-path", default="compared.xlsx",
    help="Path of the comparison results")
    cfg.add_argument("--skiprows", help='number of rows to skip', type=int,
    action='append', default=None)
    opt = cfg.parse_args()
    compare_excel(opt.path1, opt.path2, opt.output_path, opt.sheetname, opt.index_column,
    skiprows=opt.skiprows)
    compare_excel(opt.path1, opt.path2, opt.output_path, opt.sheetname,
    opt.key_column, skiprows=opt.skiprows)


    if __name__ == '__main__':
  9. sanzoghenzo revised this gist Oct 11, 2019. 1 changed file with 9 additions and 3 deletions.
    12 changes: 9 additions & 3 deletions compare.py
    Original file line number Diff line number Diff line change
    @@ -10,11 +10,16 @@
    import numpy as np


    # Define the diff function to show the changes in each field
    def report_diff(x):
    """Helper function to use with groupby.apply to highlihgt the velue changes."""
    return x[0] if x[0] == x[1] or pd.isna(x).all() else f'{x[0]} ---> {x[1]}'


    def strip(x):
    """Helper function to use with applymap to strip all the whitespaces in the dataframe."""
    return x.strip() if isinstance(x, str) else x


    def diff_pd(old_df, new_df, idx_col):
    """Identify differences between two pandas DataFrames using a key column.
    @@ -39,8 +44,9 @@ def diff_pd(old_df, new_df, idx_col):
    }
    # focusing on common data of both dataframes
    common_keys = np.intersect1d(old_keys, new_keys, assume_unique=True)
    old_common = old_df.loc[common_keys]
    new_common = new_df.loc[common_keys]
    common_columns = np.intersect1d(old_df.columns, new_df.columns, assume_unique=True)
    new_common = new_df.loc[common_keys, common_columns].applymap(strip)
    old_common = old_df.loc[common_keys, common_columns].applymap(strip)
    # get the changed rows keys by dropping identical rows
    # (indexes are ignored, so we'll reset them)
    common_data = pd.concat([old_common.reset_index(), new_common.reset_index()])
  10. sanzoghenzo revised this gist Oct 11, 2019. 1 changed file with 2 additions and 1 deletion.
    3 changes: 2 additions & 1 deletion compare.py
    Original file line number Diff line number Diff line change
    @@ -47,7 +47,8 @@ def diff_pd(old_df, new_df, idx_col):
    changed_keys = common_data.drop_duplicates(keep=False)[idx_col].unique()
    # combining the changed rows via multi level columns
    df_all_changes = pd.concat([old_common.loc[changed_keys], new_common.loc[changed_keys]],
    axis='columns', keys=['old', 'new']).swaplevel(axis='columns')
    axis='columns', keys=['old', 'new'])
    df_all_changes = df_all_changes.swaplevel(axis='columns')[new_common.columns]
    # using report_diff to merge the changes in a single cell with "-->"
    df_changed = df_all_changes.groupby(level=0, axis=1).apply(
    lambda frame: frame.apply(report_diff, axis=1))
  11. sanzoghenzo created this gist Oct 10, 2019.
    89 changes: 89 additions & 0 deletions compare.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,89 @@
    """
    Compare Excel Sheeets
    Inspired by https://pbpython.com/excel-diff-pandas-update.html
    """

    import argparse

    import pandas as pd
    import numpy as np


    # Define the diff function to show the changes in each field
    def report_diff(x):
    return x[0] if x[0] == x[1] or pd.isna(x).all() else f'{x[0]} ---> {x[1]}'


    def diff_pd(old_df, new_df, idx_col):
    """Identify differences between two pandas DataFrames using a key column.
    Key column is assumed to have only unique data (like a database unique id index)
    Args:
    old_df (pd.DataFrame): first dataframe
    new_df (pd.DataFrame): second dataframe
    idx_col (str): column name of the index, needs to be present in both DataFrames
    """
    # setting the column name as index for fast operations
    old_df = old_df.set_index(idx_col)
    new_df = new_df.set_index(idx_col)
    # get the added and removed rows
    old_keys = old_df.index
    new_keys = new_df.index
    removed_keys = np.setdiff1d(old_keys, new_keys)
    added_keys = np.setdiff1d(new_keys, old_keys)
    out_data = {
    'removed': old_df.loc[removed_keys],
    'added': new_df.loc[added_keys]
    }
    # focusing on common data of both dataframes
    common_keys = np.intersect1d(old_keys, new_keys, assume_unique=True)
    old_common = old_df.loc[common_keys]
    new_common = new_df.loc[common_keys]
    # get the changed rows keys by dropping identical rows
    # (indexes are ignored, so we'll reset them)
    common_data = pd.concat([old_common.reset_index(), new_common.reset_index()])
    changed_keys = common_data.drop_duplicates(keep=False)[idx_col].unique()
    # combining the changed rows via multi level columns
    df_all_changes = pd.concat([old_common.loc[changed_keys], new_common.loc[changed_keys]],
    axis='columns', keys=['old', 'new']).swaplevel(axis='columns')
    # using report_diff to merge the changes in a single cell with "-->"
    df_changed = df_all_changes.groupby(level=0, axis=1).apply(
    lambda frame: frame.apply(report_diff, axis=1))
    out_data['changed'] = df_changed

    return out_data


    def compare_excel(path1, path2, out_path, sheet_name, index_col_name=None, **kwargs):
    old_df = pd.read_excel(path1, sheet_name=sheet_name, **kwargs)
    new_df = pd.read_excel(path2, sheet_name=sheet_name, **kwargs)
    diff = diff_pd(old_df, new_df, index_col_name)
    with pd.ExcelWriter(out_path) as writer:
    for sname, data in diff.items():
    data.to_excel(writer, sheet_name=sname)
    print(f"Differences saved in {out_path}")


    def main():
    cfg = argparse.ArgumentParser(
    description="Compares two excel files and outputs the differences in another excel file. "
    "A column name can be specified as the unique row identifier."
    )
    cfg.add_argument("path1", help="Fist excel file")
    cfg.add_argument("path2", help="Second excel file")
    cfg.add_argument("sheetname", help="Name of the sheet to compare.")
    cfg.add_argument("-c", "--index-column", help="Name of the column with unique row identifier",
    required=True)
    cfg.add_argument("-o", "--output-path", help="Path of the comparison results",
    default="compared.xlsx")
    cfg.add_argument("--skiprows", help='number of rows to skip', type=int, action='append',
    default=None)
    opt = cfg.parse_args()
    compare_excel(opt.path1, opt.path2, opt.output_path, opt.sheetname, opt.index_column,
    skiprows=opt.skiprows)


    if __name__ == '__main__':
    main()