Last active
December 10, 2023 00:53
-
-
Save bencharb/13249225206dd2b2979942d98efe87cd to your computer and use it in GitHub Desktop.
Revisions
-
bencharb revised this gist
Nov 30, 2016. 1 changed file with 14 additions and 162 deletions. There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -124,90 +124,6 @@ def is_true(val): return unicode(val).lower() in ['true', 'yes'] def validate_line(l): return unicode(l[0]).isnumeric() and '/' in l.split()[0] @@ -230,81 +146,19 @@ def validate_data(data): raise Exception('Invalid lines %s' % '\n'.join(invalids)) return True def line_to_dict(line): parts = re.findall(r'(\d+/\d+/\d+)\s(.*)\s(.*\d+)$', line) if parts: date, des, amt = parts[0] return {'Date':date, 'Description':des, 'Amount':amt} boa_rx = re.compile('\d\d\/\d\d\/\d\d(?:.(?!\nTotal)(?!\d\d\/\d\d\/\d\d)(?!\n\d\d\/\d\d\/\d\d)|\n)+\.\d\d', re.IGNORECASE) def parse_boa_dump(stringdata): records = boa_rx.findall(stringdata) records = [r.replace('\n', ' ') for r in records] return records class TransformedStatement(object): @@ -313,6 +167,7 @@ def __init__(self, dump_path, account='unknown'): self.dump_path = dump_path self.account = account def write(self, out_path, columns, mode='a'): fullpath = os.path.abspath(out_path) @@ -326,12 +181,9 @@ def add_account(items, account_name): d['Account'] = account_name # remove line endings file_data = fin.read() parsed_lines = parse_boa_dump(file_data) joined_dict = map(line_to_dict, parsed_lines) add_account(joined_dict, self.account) writer.writerows(joined_dict) -
bencharb created this gist
Nov 15, 2016. There are no files selected for viewing
'''Transform Bank of America PDF statement text into CSV tables.

Usage:
    python parse_boa_statements.py test/joint_dump.txt test/pers_dump.txt --output test/combined.csv

Input files are plain text copy/pasted from BofA PDF statements, e.g.:

    Deposits and other additions
    Date Description Amount
    11/30/15 Online Banking transfer from CHK 9999 Confirmation# 0986430911 200.00
    12/07/15 MISTEREMPLOYER, DES:PAYROLL ID:00099999000X INDN:MISTER, NAME
    ID:000299229294 PPD
    918.57
    Total deposits and other additions $2,037.14
    Checks
    Date Check # Amount Date Check # Amount
    10/14/14 833 -183.00 10/24/14 835 -61.00
    Total checks -$253.95

Example output (CSV):

    Date,Description,Amount,Account
    11/30/15 ,Online Banking transfer from CHK 9999 Confirmation# 0986430911,200.00,testaccount
    10/14/14 ,833,-183.0,testaccount
'''

import os
import re
import csv
import shutil


def is_true(val):
    """Return True when str(val) reads as an affirmative ('true'/'yes')."""
    return str(val).lower() in ('true', 'yes')


# One check record: date, check number (optional '*'), signed amount.
# NOTE: the amount is matched with a single decimal digit (-333.00 -> '-333.0')
# and amounts with thousands separators (e.g. -2,090.00) do not match at all;
# both quirks are pinned by the self-tests below, so they are preserved.
CHECK_RECORD_RX = re.compile(r'\d+/\d+/\d+\s\d+\*?\s-?\d+\.\d\s?')

# One transaction: 'date description amount' (amount = last space-separated
# token ending in a digit).
TRANSACTION_RX = re.compile(r'(\d+/\d+/\d+\s)(.*)\s(.*\d+)$')


def split_check_records(*lines):
    """Yield individual check records found in *lines*.

    Accepts one multi-line string, one iterable of lines, or lines as
    varargs.
    """
    if len(lines) == 1:
        lines = lines[0]
    if isinstance(lines, str):
        lines = lines.split('\n')
    for line in lines:
        for record in CHECK_RECORD_RX.findall(line):
            yield record


def test_split_check_records():
    checklines = ['Date Check # Amount Date Check # Amount\n',
                  '06/29/15 440 -333.00 07/23/15 447* -65.00\n',
                  '07/21/15 441 -183.00 07/27/15 449* -20.00\n',
                  '07/15/15 443* -15.00\n']
    result = list(split_check_records(*checklines))
    expected = ['06/29/15 440 -333.0', '07/23/15 447* -65.0',
                '07/21/15 441 -183.0', '07/27/15 449* -20.0',
                '07/15/15 443* -15.0']
    assert result == expected

test_split_check_records()


# Template rows used only for their lengths when sniffing check rows.
_CHECK_MULTI_TEMPLATE = '05/12/15 1417 -120000.00 05/19/15 1422 -750000.00'
_CHECK_SINGLE_TEMPLATE = '05/12/15 1417 -120000.00'


def is_check_line(l):
    """Heuristically decide whether *l* is a row from the 'Checks' table.

    A check row is short and holds either two records side by side (four
    '/' characters from two dates) or one record (two '/', and a numeric
    check number as its second word).
    """
    if len(l) <= len(_CHECK_MULTI_TEMPLATE) + 2 and l.count('/') == 4:
        return True
    words = l.split()
    # BUGFIX: the original tested the *template's* second word
    # ('1417'.isnumeric(), always True) instead of the candidate line's,
    # so short transaction lines were misclassified as check rows and
    # later dropped when the check-record regex failed to match them.
    return (len(l) <= len(_CHECK_SINGLE_TEMPLATE) + 2
            and l.count('/') == 2
            and len(words) > 1
            and words[1].split('*')[0].isnumeric())


def test_is_check_line():
    assert is_check_line('05/12/15 1417 -120000.00 05/19/15 1422 -750000.00')
    assert is_check_line('05/12/15 1417 -120000.00')
    assert is_check_line('05/12/15 1417* -120000.00 05/19/15 1422* -750000.00')
    assert is_check_line('05/12/15 1417* -120000.00')

test_is_check_line()


def should_join_line(l, min_line_len=18):
    """Return True when *l* is a continuation fragment to glue onto the
    preceding transaction line.

    Lines carrying a date, 'Date ...' headers, and check rows stand on
    their own; everything else is a continuation.  *min_line_len* is kept
    for interface compatibility only: the original's length check sat
    after an unconditional ``return True`` (dead code) and was removed.
    """
    if l.count('/') >= 2:       # carries at least one date
        return False
    if l.startswith('Date'):    # column header
        return False
    if is_check_line(l):        # check row
        return False
    return True


def join_split_lines(multiline_data):
    """Re-join transaction records that the PDF dump split across lines.

    Walks the lines in reverse, buffering continuation fragments until a
    self-standing line is found, then glues the buffer after it.  Accepts
    a string or any iterable of lines; returns a list of lines.
    """
    if isinstance(multiline_data, str):
        lines = multiline_data.split('\n')
    else:
        lines = list(multiline_data)  # may be a one-shot iterator
    if not lines:
        return []
    # Half the longest line; forwarded to should_join_line (unused there).
    min_line_len = max(len(l) for l in lines) // 2
    joined_lines = []
    staging_line = None
    for line in reversed(lines):
        if should_join_line(line, min_line_len=min_line_len):
            if staging_line:
                staging_line = ' '.join([line, staging_line])
            else:
                staging_line = line
        else:
            if staging_line:
                line = ' '.join([line, staging_line])
                staging_line = None
            joined_lines.append(line)
    if staging_line:
        joined_lines.append(staging_line)
    return list(reversed(joined_lines))


def validate_line(l):
    """A valid record starts with a digit and its first word is a date."""
    # bool(l) guards the empty string (original raised IndexError on '').
    return bool(l) and l[0].isnumeric() and '/' in l.split()[0]


def validate_header_line(l):
    """A valid header is short and starts with 'Date'."""
    ok_len = len(l) < len('Date Check # Amount Date Check # Amount') + 10
    return ok_len and l.startswith('Date')


def validate_data(data):
    """Raise if any line is neither a record nor a header; else return True."""
    if isinstance(data, str):
        working_data = data.split('\n')
    else:
        working_data = data
    invalids = [line for line in working_data
                if not validate_line(line) and not validate_header_line(line)]
    if invalids:
        raise Exception('Invalid lines %s' % '\n'.join(invalids))
    return True


def join_transaction_lines(data, validate=True):
    """Join split records, optionally validating the result."""
    joined = join_split_lines(data)
    if validate:
        validate_data(joined)
    return joined


def split_check_rows(*lines):
    """Normalize the 'Checks' table: one record per row, one header."""
    if len(lines) == 1 and isinstance(lines[0], str):
        lines = lines[0].split('\n')
    more_joined = []
    for line in lines:
        if is_check_line(line):
            more_joined.extend(split_check_records(line))
        else:
            if line.count('Date Check #') > 1:
                # Keep only the first of the side-by-side headers.  Searching
                # in line[1:] yields the second occurrence's index minus one,
                # which also trims the separating space before it.
                second_index = line[1:].index('Date Check #')
                line = line[:second_index]
            more_joined.append(line)
    return more_joined


def test_split_check_rows():
    expected = ['Date Check # Amount', '05/12/15 417 -12.0',
                '05/19/15 422 -75.0', '05/04/15 419* -228.0',
                '05/01/15 420 -275.0', '05/11/15 421 -50.0']
    data = '''Date Check # Amount Date Check # Amount
05/12/15 417 -12.00 05/19/15 422 -75.00
05/04/15 419* -228.00 05/18/15 423 -2,090.00
05/01/15 420 -275.00 05/26/15 426* -1,000.00
05/11/15 421 -50.00'''
    result = split_check_rows(data)
    assert result == expected

test_split_check_rows()


# Boilerplate statement lines that carry no transaction data.
JUNK_PREFIXES = ('Total', 'To help', 'continued on', 'in Online Banking',
                 'Your checking', 'BENJAMIN ', 'Deposits and',
                 'Withdrawals and other ', 'Page', 'Checks', 'Deposits',
                 '* There is')


def filter_line(l):
    """Return False for statement boilerplate lines, True otherwise."""
    return not l.startswith(JUNK_PREFIXES)


def format_transaction(t):
    """Return *t* as 'date<TAB>description<TAB>amount', or None if unparseable."""
    trans = TRANSACTION_RX.findall(t)
    if not trans:
        return None
    return '\t'.join(trans[0])


def parse_dump(*lines_of_data):
    """Yield {'Date','Description','Amount'} dicts parsed from raw dump lines."""
    lines = [l for l in lines_of_data if filter_line(l)]
    joined = join_transaction_lines(lines)
    joined = split_check_rows(*joined)
    for line in joined:
        parts = TRANSACTION_RX.findall(line)
        if not parts:
            continue
        date, description, amount = parts[0]
        yield {'Date': date, 'Description': description, 'Amount': amount}


class TransformedStatement(object):
    """One account's raw statement dump and how to append it to a CSV."""

    def __init__(self, dump_path, account='unknown'):
        super(TransformedStatement, self).__init__()
        self.dump_path = dump_path  # path to the raw text dump
        self.account = account      # account label written into each row

    def write(self, out_path, columns, mode='a'):
        """Parse the dump and append its rows (tagged with the account) to
        *out_path*.  Returns the number of rows written."""
        # newline='' is the csv-module requirement for output files.
        with open(out_path, mode, newline='') as fout:
            writer = csv.DictWriter(fout, columns)
            with open(self.dump_path, 'r') as fin:
                # BUGFIX: the original tested d[:-1] == '\n' (almost never
                # true), so trailing newlines survived and multi-line
                # records failed the $-anchored transaction regex.
                data = [d[:-1] if d.endswith('\n') else d for d in fin]
            joined_dict = list(parse_dump(*data))
            for row in joined_dict:
                row['Account'] = self.account
            writer.writerows(joined_dict)
        return len(joined_dict)


def transform_dump(transformed_statements, columns=None,
                   out_path='transformed_statements.csv',
                   override_existing_file=False):
    """Write the header and every statement to a temp CSV, then move it into
    place.  Returns the list of per-statement row counts.

    Raises Exception if *out_path* exists and override is off, TypeError if
    a non-TransformedStatement is passed.
    """
    out_path = os.path.abspath(out_path)
    if not override_existing_file and os.path.exists(out_path):
        raise Exception('File exists: %s' % out_path)
    if isinstance(transformed_statements, TransformedStatement):
        transformed_statements = [transformed_statements]
    else:
        for ts in transformed_statements:
            if not isinstance(ts, TransformedStatement):
                raise TypeError('%s not of type TransformedStatement' % ts)
    out_path_temp = out_path + '.temp'
    # Write only the header, closing the file before the statements append,
    # so buffered header bytes cannot interleave with appended rows.
    with open(out_path_temp, 'w', newline='') as fout:
        csv.DictWriter(fout, columns).writeheader()
    rowcounts = [stmt.write(out_path_temp, columns, mode='a')
                 for stmt in transformed_statements]
    shutil.move(out_path_temp, out_path)
    return rowcounts


def main(files, output_file, columns=None, override_existing_file=True):
    """Transform each dump file (account name = file basename) into one CSV."""
    basenames = [os.path.basename(os.path.splitext(f)[0]) for f in files]
    stmts = [TransformedStatement(path, account=act)
             for path, act in zip(files, basenames)]
    return transform_dump(stmts, columns=columns, out_path=output_file,
                          override_existing_file=override_existing_file)


if __name__ == '__main__':
    import sys
    import argparse
    import textwrap

    description = ('Parse copy/pasted text from BofA statements. \n'
                   'Prerequisites: \n(1) Download BofA pdfs. \n'
                   '(2) For each PDF, keep only pages with tables on them; '
                   'trash all other pages without tables. \n'
                   '(3) For each PDF, copy/paste PDF text content from each '
                   'statement into one text file. Just Cmd+a/Ctl+a > copy and '
                   'paste into a file for the account name, such as "checking6443.txt"')
    description += ('\nExample: \npython parse_boa_statements.py '
                    'test/joint_dump.txt test/pers_dump.txt --output test/combined.csv\n\n')
    description = '\n'.join('\n'.join(textwrap.wrap(line, 100))
                            for line in description.split('\n'))
    # BUGFIX: description was passed positionally, i.e. as prog=.
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument('file', nargs='*',
                        help='Paths to the files you created for each account. '
                             'Each file should contain content for only one account. '
                             'Generate the content for each file by copy/pasting the '
                             'text from the PDFs into one file.')
    parser.add_argument('--output', help='Output path',
                        default='combined_statements.csv')
    # NOTE: default=True makes this store_true flag a no-op; kept as-is for
    # backward compatibility with existing invocations.
    parser.add_argument('--override_output',
                        help='Override existing output file if it exists',
                        action='store_true', default=True)
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(1)
    args = parser.parse_args()
    files = args.file
    out_path = args.output
    override_existing_file = is_true(args.override_output)
    columns = ('Date', 'Description', 'Amount', 'Account',)
    rowcounts = main(files, out_path, columns=columns,
                     override_existing_file=override_existing_file)
    if not rowcounts:
        message = 'No rows transformed to file %s' % out_path
    else:
        message = '%d rows from %d files transformed to file %s' % (
            sum(rowcounts), len(rowcounts), out_path,)
    print(message)