Last active
December 10, 2023 00:53
-
-
Save bencharb/13249225206dd2b2979942d98efe87cd to your computer and use it in GitHub Desktop.
Revisions
-
bencharb revised this gist
Nov 30, 2016. 1 changed file with 14 additions and 162 deletions. There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -124,90 +124,6 @@ def is_true(val): return unicode(val).lower() in ['true', 'yes'] def validate_line(l): return unicode(l[0]).isnumeric() and '/' in l.split()[0] @@ -230,81 +146,19 @@ def validate_data(data): raise Exception('Invalid lines %s' % '\n'.join(invalids)) return True def line_to_dict(line): parts = re.findall(r'(\d+/\d+/\d+)\s(.*)\s(.*\d+)$', line) if parts: date, des, amt = parts[0] return {'Date':date, 'Description':des, 'Amount':amt} boa_rx = re.compile('\d\d\/\d\d\/\d\d(?:.(?!\nTotal)(?!\d\d\/\d\d\/\d\d)(?!\n\d\d\/\d\d\/\d\d)|\n)+\.\d\d', re.IGNORECASE) def parse_boa_dump(stringdata): records = boa_rx.findall(stringdata) records = [r.replace('\n', ' ') for r in records] return records class TransformedStatement(object): @@ -313,6 +167,7 @@ def __init__(self, dump_path, account='unknown'): self.dump_path = dump_path self.account = account def write(self, out_path, columns, mode='a'): fullpath = os.path.abspath(out_path) @@ -326,12 +181,9 @@ def add_account(items, account_name): d['Account'] = account_name # remove line endings file_data = fin.read() parsed_lines = parse_boa_dump(file_data) joined_dict = map(line_to_dict, parsed_lines) add_account(joined_dict, self.account) writer.writerows(joined_dict) -
bencharb created this gist
Nov 15, 2016. There are no files selected for viewing
'''Transform Bank of America PDF statement text into CSV tables.

Usage:
    python parse_boa_statements.py test/joint_dump.txt test/pers_dump.txt --output test/combined.csv

Input files are plain text copy/pasted from BofA PDF statements, e.g.:

    Deposits and other additions
    Date Description Amount
    11/30/15 Online Banking transfer from CHK 9999 Confirmation# 0986430911 200.00
    12/07/15 MISTEREMPLOYER, DES:PAYROLL ID:00099999000X INDN:MISTER, NAME
    ID:000299229294 PPD
    918.57
    Total deposits and other additions $2,037.14
    Checks
    Date Check # Amount Date Check # Amount
    10/14/14 833 -183.00 10/24/14 835 -61.00
    Total checks -$253.95

Example output (CSV):

    Date,Description,Amount,Account
    11/30/15 ,Online Banking transfer from CHK 9999 Confirmation# 0986430911,200.00,testaccount
    10/14/14 ,833,-183.0,testaccount
'''

import os
import re
import csv
import shutil


def is_true(val):
    """Return True when str(val) reads as an affirmative ('true'/'yes')."""
    return str(val).lower() in ('true', 'yes')


# One check record: date, check number (optional '*'), signed amount.
# NOTE: the amount is matched with a single decimal digit (-333.00 -> '-333.0')
# and amounts with thousands separators (e.g. -2,090.00) do not match at all;
# both quirks are pinned by the self-tests below, so they are preserved.
CHECK_RECORD_RX = re.compile(r'\d+/\d+/\d+\s\d+\*?\s-?\d+\.\d\s?')

# One transaction: 'date description amount' (amount = last space-separated
# token ending in a digit).
TRANSACTION_RX = re.compile(r'(\d+/\d+/\d+\s)(.*)\s(.*\d+)$')


def split_check_records(*lines):
    """Yield individual check records found in *lines*.

    Accepts one multi-line string, one iterable of lines, or lines as
    varargs.
    """
    if len(lines) == 1:
        lines = lines[0]
    if isinstance(lines, str):
        lines = lines.split('\n')
    for line in lines:
        for record in CHECK_RECORD_RX.findall(line):
            yield record


def test_split_check_records():
    checklines = ['Date Check # Amount Date Check # Amount\n',
                  '06/29/15 440 -333.00 07/23/15 447* -65.00\n',
                  '07/21/15 441 -183.00 07/27/15 449* -20.00\n',
                  '07/15/15 443* -15.00\n']
    result = list(split_check_records(*checklines))
    expected = ['06/29/15 440 -333.0', '07/23/15 447* -65.0',
                '07/21/15 441 -183.0', '07/27/15 449* -20.0',
                '07/15/15 443* -15.0']
    assert result == expected

test_split_check_records()


# Template rows used only for their lengths when sniffing check rows.
_CHECK_MULTI_TEMPLATE = '05/12/15 1417 -120000.00 05/19/15 1422 -750000.00'
_CHECK_SINGLE_TEMPLATE = '05/12/15 1417 -120000.00'


def is_check_line(l):
    """Heuristically decide whether *l* is a row from the 'Checks' table.

    A check row is short and holds either two records side by side (four
    '/' characters from two dates) or one record (two '/', and a numeric
    check number as its second word).
    """
    if len(l) <= len(_CHECK_MULTI_TEMPLATE) + 2 and l.count('/') == 4:
        return True
    words = l.split()
    # BUGFIX: the original tested the *template's* second word
    # ('1417'.isnumeric(), always True) instead of the candidate line's,
    # so short transaction lines were misclassified as check rows and
    # later dropped when the check-record regex failed to match them.
    return (len(l) <= len(_CHECK_SINGLE_TEMPLATE) + 2
            and l.count('/') == 2
            and len(words) > 1
            and words[1].split('*')[0].isnumeric())


def test_is_check_line():
    assert is_check_line('05/12/15 1417 -120000.00 05/19/15 1422 -750000.00')
    assert is_check_line('05/12/15 1417 -120000.00')
    assert is_check_line('05/12/15 1417* -120000.00 05/19/15 1422* -750000.00')
    assert is_check_line('05/12/15 1417* -120000.00')

test_is_check_line()


def should_join_line(l, min_line_len=18):
    """Return True when *l* is a continuation fragment to glue onto the
    preceding transaction line.

    Lines carrying a date, 'Date ...' headers, and check rows stand on
    their own; everything else is a continuation.  *min_line_len* is kept
    for interface compatibility only: the original's length check sat
    after an unconditional ``return True`` (dead code) and was removed.
    """
    if l.count('/') >= 2:       # carries at least one date
        return False
    if l.startswith('Date'):    # column header
        return False
    if is_check_line(l):        # check row
        return False
    return True


def join_split_lines(multiline_data):
    """Re-join transaction records that the PDF dump split across lines.

    Walks the lines in reverse, buffering continuation fragments until a
    self-standing line is found, then glues the buffer after it.  Accepts
    a string or any iterable of lines; returns a list of lines.
    """
    if isinstance(multiline_data, str):
        lines = multiline_data.split('\n')
    else:
        lines = list(multiline_data)  # may be a one-shot iterator
    if not lines:
        return []
    # Half the longest line; forwarded to should_join_line (unused there).
    min_line_len = max(len(l) for l in lines) // 2
    joined_lines = []
    staging_line = None
    for line in reversed(lines):
        if should_join_line(line, min_line_len=min_line_len):
            if staging_line:
                staging_line = ' '.join([line, staging_line])
            else:
                staging_line = line
        else:
            if staging_line:
                line = ' '.join([line, staging_line])
                staging_line = None
            joined_lines.append(line)
    if staging_line:
        joined_lines.append(staging_line)
    return list(reversed(joined_lines))


def validate_line(l):
    """A valid record starts with a digit and its first word is a date."""
    # bool(l) guards the empty string (original raised IndexError on '').
    return bool(l) and l[0].isnumeric() and '/' in l.split()[0]


def validate_header_line(l):
    """A valid header is short and starts with 'Date'."""
    ok_len = len(l) < len('Date Check # Amount Date Check # Amount') + 10
    return ok_len and l.startswith('Date')


def validate_data(data):
    """Raise if any line is neither a record nor a header; else return True."""
    if isinstance(data, str):
        working_data = data.split('\n')
    else:
        working_data = data
    invalids = [line for line in working_data
                if not validate_line(line) and not validate_header_line(line)]
    if invalids:
        raise Exception('Invalid lines %s' % '\n'.join(invalids))
    return True


def join_transaction_lines(data, validate=True):
    """Join split records, optionally validating the result."""
    joined = join_split_lines(data)
    if validate:
        validate_data(joined)
    return joined


def split_check_rows(*lines):
    """Normalize the 'Checks' table: one record per row, one header."""
    if len(lines) == 1 and isinstance(lines[0], str):
        lines = lines[0].split('\n')
    more_joined = []
    for line in lines:
        if is_check_line(line):
            more_joined.extend(split_check_records(line))
        else:
            if line.count('Date Check #') > 1:
                # Keep only the first of the side-by-side headers.  Searching
                # in line[1:] yields the second occurrence's index minus one,
                # which also trims the separating space before it.
                second_index = line[1:].index('Date Check #')
                line = line[:second_index]
            more_joined.append(line)
    return more_joined


def test_split_check_rows():
    expected = ['Date Check # Amount', '05/12/15 417 -12.0',
                '05/19/15 422 -75.0', '05/04/15 419* -228.0',
                '05/01/15 420 -275.0', '05/11/15 421 -50.0']
    data = '''Date Check # Amount Date Check # Amount
05/12/15 417 -12.00 05/19/15 422 -75.00
05/04/15 419* -228.00 05/18/15 423 -2,090.00
05/01/15 420 -275.00 05/26/15 426* -1,000.00
05/11/15 421 -50.00'''
    result = split_check_rows(data)
    assert result == expected

test_split_check_rows()


# Boilerplate statement lines that carry no transaction data.
JUNK_PREFIXES = ('Total', 'To help', 'continued on', 'in Online Banking',
                 'Your checking', 'BENJAMIN ', 'Deposits and',
                 'Withdrawals and other ', 'Page', 'Checks', 'Deposits',
                 '* There is')


def filter_line(l):
    """Return False for statement boilerplate lines, True otherwise."""
    return not l.startswith(JUNK_PREFIXES)


def format_transaction(t):
    """Return *t* as 'date<TAB>description<TAB>amount', or None if unparseable."""
    trans = TRANSACTION_RX.findall(t)
    if not trans:
        return None
    return '\t'.join(trans[0])


def parse_dump(*lines_of_data):
    """Yield {'Date','Description','Amount'} dicts parsed from raw dump lines."""
    lines = [l for l in lines_of_data if filter_line(l)]
    joined = join_transaction_lines(lines)
    joined = split_check_rows(*joined)
    for line in joined:
        parts = TRANSACTION_RX.findall(line)
        if not parts:
            continue
        date, description, amount = parts[0]
        yield {'Date': date, 'Description': description, 'Amount': amount}


class TransformedStatement(object):
    """One account's raw statement dump and how to append it to a CSV."""

    def __init__(self, dump_path, account='unknown'):
        super(TransformedStatement, self).__init__()
        self.dump_path = dump_path  # path to the raw text dump
        self.account = account      # account label written into each row

    def write(self, out_path, columns, mode='a'):
        """Parse the dump and append its rows (tagged with the account) to
        *out_path*.  Returns the number of rows written."""
        # newline='' is the csv-module requirement for output files.
        with open(out_path, mode, newline='') as fout:
            writer = csv.DictWriter(fout, columns)
            with open(self.dump_path, 'r') as fin:
                # BUGFIX: the original tested d[:-1] == '\n' (almost never
                # true), so trailing newlines survived and multi-line
                # records failed the $-anchored transaction regex.
                data = [d[:-1] if d.endswith('\n') else d for d in fin]
            joined_dict = list(parse_dump(*data))
            for row in joined_dict:
                row['Account'] = self.account
            writer.writerows(joined_dict)
        return len(joined_dict)


def transform_dump(transformed_statements, columns=None,
                   out_path='transformed_statements.csv',
                   override_existing_file=False):
    """Write the header and every statement to a temp CSV, then move it into
    place.  Returns the list of per-statement row counts.

    Raises Exception if *out_path* exists and override is off, TypeError if
    a non-TransformedStatement is passed.
    """
    out_path = os.path.abspath(out_path)
    if not override_existing_file and os.path.exists(out_path):
        raise Exception('File exists: %s' % out_path)
    if isinstance(transformed_statements, TransformedStatement):
        transformed_statements = [transformed_statements]
    else:
        for ts in transformed_statements:
            if not isinstance(ts, TransformedStatement):
                raise TypeError('%s not of type TransformedStatement' % ts)
    out_path_temp = out_path + '.temp'
    # Write only the header, closing the file before the statements append,
    # so buffered header bytes cannot interleave with appended rows.
    with open(out_path_temp, 'w', newline='') as fout:
        csv.DictWriter(fout, columns).writeheader()
    rowcounts = [stmt.write(out_path_temp, columns, mode='a')
                 for stmt in transformed_statements]
    shutil.move(out_path_temp, out_path)
    return rowcounts


def main(files, output_file, columns=None, override_existing_file=True):
    """Transform each dump file (account name = file basename) into one CSV."""
    basenames = [os.path.basename(os.path.splitext(f)[0]) for f in files]
    stmts = [TransformedStatement(path, account=act)
             for path, act in zip(files, basenames)]
    return transform_dump(stmts, columns=columns, out_path=output_file,
                          override_existing_file=override_existing_file)


if __name__ == '__main__':
    import sys
    import argparse
    import textwrap

    description = ('Parse copy/pasted text from BofA statements. \n'
                   'Prerequisites: \n(1) Download BofA pdfs. \n'
                   '(2) For each PDF, keep only pages with tables on them; '
                   'trash all other pages without tables. \n'
                   '(3) For each PDF, copy/paste PDF text content from each '
                   'statement into one text file. Just Cmd+a/Ctl+a > copy and '
                   'paste into a file for the account name, such as "checking6443.txt"')
    description += ('\nExample: \npython parse_boa_statements.py '
                    'test/joint_dump.txt test/pers_dump.txt --output test/combined.csv\n\n')
    description = '\n'.join('\n'.join(textwrap.wrap(line, 100))
                            for line in description.split('\n'))
    # BUGFIX: description was passed positionally, i.e. as prog=.
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument('file', nargs='*',
                        help='Paths to the files you created for each account. '
                             'Each file should contain content for only one account. '
                             'Generate the content for each file by copy/pasting the '
                             'text from the PDFs into one file.')
    parser.add_argument('--output', help='Output path',
                        default='combined_statements.csv')
    # NOTE: default=True makes this store_true flag a no-op; kept as-is for
    # backward compatibility with existing invocations.
    parser.add_argument('--override_output',
                        help='Override existing output file if it exists',
                        action='store_true', default=True)
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(1)
    args = parser.parse_args()
    files = args.file
    out_path = args.output
    override_existing_file = is_true(args.override_output)
    columns = ('Date', 'Description', 'Amount', 'Account',)
    rowcounts = main(files, out_path, columns=columns,
                     override_existing_file=override_existing_file)
    if not rowcounts:
        message = 'No rows transformed to file %s' % out_path
    else:
        message = '%d rows from %d files transformed to file %s' % (
            sum(rowcounts), len(rowcounts), out_path,)
    print(message)