Skip to content

Instantly share code, notes, and snippets.

@bencharb
Last active December 10, 2023 00:53
Show Gist options
  • Select an option

  • Save bencharb/13249225206dd2b2979942d98efe87cd to your computer and use it in GitHub Desktop.

Select an option

Save bencharb/13249225206dd2b2979942d98efe87cd to your computer and use it in GitHub Desktop.

Revisions

  1. bencharb revised this gist Nov 30, 2016. 1 changed file with 14 additions and 162 deletions.
    176 changes: 14 additions & 162 deletions parse_boa_statements.py
    Original file line number Diff line number Diff line change
    @@ -124,90 +124,6 @@
    def is_true(val):
    return unicode(val).lower() in ['true', 'yes']

    def split_check_records(*lines):
    if len(lines) == 1:
    lines = lines[0]
    if isinstance(lines, basestring):
    lines = lines.split('\n')
    for l in lines:
    rx = r'\d+/\d+/\d+\s\d+\*?\s-?\d+\.\d\s?'
    found = re.findall(rx, l)
    if found:
    for f in found:
    yield f

    def test_split_check_records():
    checklines = ['Date Check # Amount Date Check # Amount\n',
    '06/29/15 440 -333.00 07/23/15 447* -65.00\n',
    '07/21/15 441 -183.00 07/27/15 449* -20.00\n',
    '07/15/15 443* -15.00\n']
    result = list(split_check_records(*checklines))
    expected = ['06/29/15 440 -333.0',
    '07/23/15 447* -65.0',
    '07/21/15 441 -183.0',
    '07/27/15 449* -20.0',
    '07/15/15 443* -15.0']
    assert result == expected
    test_split_check_records()

    def is_check_line(l):
    multi_line = '05/12/15 1417 -120000.00 05/19/15 1422 -750000.00'
    is_multi_line = len(l) <= len(multi_line)+2 and l.count('/') == 4
    if is_multi_line:
    return True
    single_line = '05/12/15 1417 -120000.00'
    second_word = single_line.split()[1]
    is_single_line = len(l) <= len(single_line)+2 and l.count('/') == 2 and unicode(second_word.split('*')[0]).isnumeric()
    return is_single_line

    def test_is_check_line():
    assert is_check_line('05/12/15 1417 -120000.00 05/19/15 1422 -750000.00')
    assert is_check_line('05/12/15 1417 -120000.00')
    assert is_check_line('05/12/15 1417* -120000.00 05/19/15 1422* -750000.00')
    assert is_check_line('05/12/15 1417* -120000.00')

    test_is_check_line()

    def should_join_line(l, min_line_len=18):
    # not checks or long lines
    has_any_date = l.count('/') >= 2
    if has_any_date:
    return False
    is_header = l.startswith('Date')
    if is_header:
    return False
    is_check = is_check_line(l)
    if is_check:
    return False
    return True
    short_line = len(l) <= min_line_len
    return short_line

    def join_split_lines(multiline_data):
    if isinstance(multiline_data, basestring):
    lines = multiline_data.split('\n')
    else:
    lines = multiline_data
    min_line_len = sorted([len(l) for l in lines])[-1]/2
    joined_lines = []
    rev_lines = reversed(lines)
    staging_line = None
    for line in rev_lines:
    should_join = should_join_line(line, min_line_len=min_line_len)
    if should_join:
    if staging_line:
    staging_line = ' '.join([line,staging_line])
    else:
    staging_line = line
    else:
    if staging_line:
    line = ' '.join([line,staging_line])
    staging_line = None
    joined_lines.append(line)
    if staging_line:
    joined_lines.append(staging_line)
    return list(reversed(joined_lines))

    def validate_line(l):
    return unicode(l[0]).isnumeric() and '/' in l.split()[0]

    @@ -230,81 +146,19 @@ def validate_data(data):
    raise Exception('Invalid lines %s' % '\n'.join(invalids))
    return True

    def join_transaction_lines(data, validate=True):
    joined = join_split_lines(data)
    if validate:
    validate_data(joined)
    return joined

    def split_check_rows(*lines):
    if len(lines) == 1 and isinstance(lines[0], basestring):
    lines = lines[0].split('\n')
    more_joined = []
    for line in lines:
    if is_check_line(line):
    records = list(split_check_records(line))
    more_joined.extend(records)
    else:
    if line.count('Date Check #') > 1:
    second_index = line[1:].index('Date Check #')
    line = line[:second_index]
    more_joined.append(line)
    return more_joined

    def test_split_check_rows():
    expected = ['Date Check # Amount',
    '05/12/15 417 -12.0',
    '05/19/15 422 -75.0',
    '05/04/15 419* -228.0',
    '05/01/15 420 -275.0',
    '05/11/15 421 -50.0']

    data = '''Date Check # Amount Date Check # Amount
    05/12/15 417 -12.00 05/19/15 422 -75.00
    05/04/15 419* -228.00 05/18/15 423 -2,090.00
    05/01/15 420 -275.00 05/26/15 426* -1,000.00
    05/11/15 421 -50.00'''
    result = split_check_rows(data)
    assert result == expected

    test_split_check_rows()

    def filter_line(l):
    junk_words = ['Total',
    'To help',
    'continued on',
    'in Online Banking',
    'Your checking',
    'BENJAMIN ',
    'Deposits and',
    'Withdrawals and other ',
    'Page',
    'Checks',
    'Deposits',
    '* There is']
    word = None
    for word in junk_words:
    if l.startswith(word):
    return False
    return True

    def format_transaction(t):
    trans = re.findall(r'(\d+/\d+/\d+\s)(.*)\s(.*\d+)$', t)
    if not trans:
    return
    return '\t'.join(trans[0])

    def parse_dump(*lines_of_data):
    lines = filter(filter_line, lines_of_data)
    joined = join_transaction_lines(lines)
    joined = split_check_rows(*joined)
    for line in joined:
    parts = re.findall(r'(\d+/\d+/\d+\s)(.*)\s(.*\d+)$', line)
    if not parts:
    continue
    def line_to_dict(line):
    parts = re.findall(r'(\d+/\d+/\d+)\s(.*)\s(.*\d+)$', line)
    if parts:
    date, des, amt = parts[0]
    yield {'Date':date, 'Description':des, 'Amount':amt}
    return {'Date':date, 'Description':des, 'Amount':amt}

    boa_rx = re.compile('\d\d\/\d\d\/\d\d(?:.(?!\nTotal)(?!\d\d\/\d\d\/\d\d)(?!\n\d\d\/\d\d\/\d\d)|\n)+\.\d\d', re.IGNORECASE)

    def parse_boa_dump(stringdata):
    records = boa_rx.findall(stringdata)
    records = [r.replace('\n', ' ') for r in records]
    return records

    class TransformedStatement(object):

    @@ -313,6 +167,7 @@ def __init__(self, dump_path, account='unknown'):
    self.dump_path = dump_path
    self.account = account


    def write(self, out_path, columns, mode='a'):
    fullpath = os.path.abspath(out_path)

    @@ -326,12 +181,9 @@ def add_account(items, account_name):
    d['Account'] = account_name

    # remove line endings
    for d in fin.readlines():
    if d[:-1] == '\n':
    data.append(d[:-1])
    else:
    data.append(d)
    joined_dict = list(parse_dump(*data))
    file_data = fin.read()
    parsed_lines = parse_boa_dump(file_data)
    joined_dict = map(line_to_dict, parsed_lines)
    add_account(joined_dict, self.account)

    writer.writerows(joined_dict)
  2. bencharb created this gist Nov 15, 2016.
    414 changes: 414 additions & 0 deletions parse_boa_statements.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,414 @@
    '''
    Transform Bank of America PDF text into tables.
    Example:
    python parse_boa_statements.py test/joint_dump.txt test/pers_dump.txt --output test/combined.csv
    Example of original PDF text (copy/pasted from PDF):
    MISTER NAME | Account # 999 99999 9999 | November 24, 2015 to December 24, 2015
    Your checking account
    Page 3 of 4
    Deposits and other additions
    Date Description Amount
    11/30/15 Online Banking transfer from CHK 9999 Confirmation# 0986430911 200.00
    12/07/15 MISTEREMPLOYER, DES:PAYROLL ID:00099999000X INDN:MISTER, NAME
    ID:000299229294 PPD
    918.57
    12/22/15 MISTEREMPLOYER, DES:PAYROLL ID: 00099999000X INDN:MISTER, NAME
    ID:000299229294 PPD
    918.57
    Total deposits and other additions $2,037.14
    Withdrawals and other subtractions
    Date Description Amount
    11/30/15 BKOFAMERICA ATM 11/28 #0000000232 WITHDRWL THE MALL ACITY ST -120.00
    12/07/15 Online Banking transfer to CHK 9999 Confirmation# 09810423895 -500.00
    Total withdrawals and other subtractions -$3,120.00
    To help you BALANCE YOUR CHECKING ACCOUNT, visit bankofamerica.com/statementbalance or the Statements and Documents tab
    in Online Banking for a printable version of the How to Balance Your Account Worksheet.
    Checks
    Date Check # Amount Date Check # Amount
    10/14/14 833 -183.00 10/24/14 835 -61.00
    10/09/14 833 -9.95
    Total checks -$253.95
    Total # of checks 3
    Note your Ending Balance already reflects the subtraction of Service Fees.
    To help you BALANCE YOUR CHECKING ACCOUNT, visit bankofamerica.com/statementbalance or the Statements and Documents tab
    in Online Banking for a printable version of the How to Balance Your Account Worksheet.
    Example output:
    Date,Description,Amount,Account
    11/30/15 ,Online Banking transfer from CHK 9999 Confirmation# 0986430911,200.00,testaccount
    11/30/15 ,BKOFAMERICA ATM 11/28 #0000000232 WITHDRWL THE MALL ACITY ST,-120.00,testaccount
    10/14/14 ,833,-183.0,testaccount
    10/24/14 ,835,-61.0,testaccount'
    '''
    import re
    import csv
    import shutil

    # Meh, cant test now, hard to assert equal.
    # def test_transform():
    # test_data = '''MISTER NAME | Account # 999 99999 9999 | November 24, 2015 to December 24, 2015
    # Your checking account
    # Page 3 of 4
    # Deposits and other additions
    # Date Description Amount
    # 11/30/15 Online Banking transfer from CHK 9999 Confirmation# 0986430911 200.00
    # 12/07/15 MISTEREMPLOYER, DES:PAYROLL ID:00099999000X INDN:MISTER, NAME
    # ID:000299229294 PPD
    # 918.57
    # 12/22/15 MISTEREMPLOYER, DES:PAYROLL ID: 00099999000X INDN:MISTER, NAME
    # ID:000299229294 PPD
    # 918.57
    # Total deposits and other additions $2,037.14
    # Withdrawals and other subtractions
    # Date Description Amount
    # 11/30/15 BKOFAMERICA ATM 11/28 #0000000232 WITHDRWL THE MALL ACITY ST -120.00
    # 12/07/15 Online Banking transfer to CHK 9999 Confirmation# 09810423895 -500.00
    # Total withdrawals and other subtractions -$3,120.00
    # To help you BALANCE YOUR CHECKING ACCOUNT, visit bankofamerica.com/statementbalance or the Statements and Documents tab
    # in Online Banking for a printable version of the How to Balance Your Account Worksheet.

    # Checks
    # Date Check # Amount Date Check # Amount
    # 10/14/14 833 -183.00 10/24/14 835 -61.00
    # 10/09/14 833 -9.95
    # Total checks -$253.95
    # Total # of checks 3
    # Note your Ending Balance already reflects the subtraction of Service Fees.
    # To help you BALANCE YOUR CHECKING ACCOUNT, visit bankofamerica.com/statementbalance or the Statements and Documents tab
    # in Online Banking for a printable version of the How to Balance Your Account Worksheet.'''
    # expected = '''Date,Description,Amount,Account
    # 11/30/15 ,Online Banking transfer from CHK 9999 Confirmation# 0986430911,200.00,testaccount
    # 11/30/15 ,BKOFAMERICA ATM 11/28 #0000000232 WITHDRWL THE MALL ALEXANDRIA VA,-120.00,testaccount
    # 10/14/14 ,833,-183.0,testaccount
    # 10/24/14 ,835,-61.0,testaccount'''


    # # import tempfile

    # # tmp_path = os.path.join(tempfile.mkdtemp(),'testaccount')
    # # tmp_path_output = tmp_path+'_final_output.csv'
    # # tmp_path_output_expected = tmp_path+'_final_output_expected.csv'

    # # with open(tmp_path, 'w') as fout:
    # # fout.write(test_data)
    # # columns = ('Date', 'Description', 'Amount', 'Account',)

    # # rowcounts = main([tmp_path], tmp_path_output, columns=columns)
    # # with open(tmp_path_output_expected, 'w') as fout:
    # # writer = csv.DictWriter(fout, columns)
    # # writer.writeheader()
    # # writer.writerows(expected.split('\n'))

    # # with open(tmp_path_output, 'r') as fin:
    # # orig_reader = csv.reader(fin)
    # # orig_data = [l for l in orig_reader]

    # # with open(tmp_path_output_expected, 'r') as fin:
    # # expected_reader = csv.reader(fin)
    # # expected_data = [l for l in expected_reader]

    # # print 'expected_data', expected_data
    # # print 'orig_data', orig_data
    # # assert expected_data == orig_data
    # # # try:

    # # # except AssertionError:
    # # # print 'tmp_path_output_expected', tmp_path_output_expected
    # # # print 'tmp_path_output', tmp_path_output
    # # # raise


    def is_true(val):
    return unicode(val).lower() in ['true', 'yes']

    def split_check_records(*lines):
    if len(lines) == 1:
    lines = lines[0]
    if isinstance(lines, basestring):
    lines = lines.split('\n')
    for l in lines:
    rx = r'\d+/\d+/\d+\s\d+\*?\s-?\d+\.\d\s?'
    found = re.findall(rx, l)
    if found:
    for f in found:
    yield f

    def test_split_check_records():
    checklines = ['Date Check # Amount Date Check # Amount\n',
    '06/29/15 440 -333.00 07/23/15 447* -65.00\n',
    '07/21/15 441 -183.00 07/27/15 449* -20.00\n',
    '07/15/15 443* -15.00\n']
    result = list(split_check_records(*checklines))
    expected = ['06/29/15 440 -333.0',
    '07/23/15 447* -65.0',
    '07/21/15 441 -183.0',
    '07/27/15 449* -20.0',
    '07/15/15 443* -15.0']
    assert result == expected
    test_split_check_records()

    def is_check_line(l):
    multi_line = '05/12/15 1417 -120000.00 05/19/15 1422 -750000.00'
    is_multi_line = len(l) <= len(multi_line)+2 and l.count('/') == 4
    if is_multi_line:
    return True
    single_line = '05/12/15 1417 -120000.00'
    second_word = single_line.split()[1]
    is_single_line = len(l) <= len(single_line)+2 and l.count('/') == 2 and unicode(second_word.split('*')[0]).isnumeric()
    return is_single_line

    def test_is_check_line():
    assert is_check_line('05/12/15 1417 -120000.00 05/19/15 1422 -750000.00')
    assert is_check_line('05/12/15 1417 -120000.00')
    assert is_check_line('05/12/15 1417* -120000.00 05/19/15 1422* -750000.00')
    assert is_check_line('05/12/15 1417* -120000.00')

    test_is_check_line()

    def should_join_line(l, min_line_len=18):
    # not checks or long lines
    has_any_date = l.count('/') >= 2
    if has_any_date:
    return False
    is_header = l.startswith('Date')
    if is_header:
    return False
    is_check = is_check_line(l)
    if is_check:
    return False
    return True
    short_line = len(l) <= min_line_len
    return short_line

    def join_split_lines(multiline_data):
    if isinstance(multiline_data, basestring):
    lines = multiline_data.split('\n')
    else:
    lines = multiline_data
    min_line_len = sorted([len(l) for l in lines])[-1]/2
    joined_lines = []
    rev_lines = reversed(lines)
    staging_line = None
    for line in rev_lines:
    should_join = should_join_line(line, min_line_len=min_line_len)
    if should_join:
    if staging_line:
    staging_line = ' '.join([line,staging_line])
    else:
    staging_line = line
    else:
    if staging_line:
    line = ' '.join([line,staging_line])
    staging_line = None
    joined_lines.append(line)
    if staging_line:
    joined_lines.append(staging_line)
    return list(reversed(joined_lines))

    def validate_line(l):
    return unicode(l[0]).isnumeric() and '/' in l.split()[0]

    def validate_header_line(l):
    ok_len = len(l) < len('Date Check # Amount Date Check # Amount') + 10
    valid = ok_len and l.startswith('Date')
    return valid

    def validate_data(data):
    if isinstance(data, basestring):
    working_data = data.split('\n')
    else:
    working_data = data
    invalids = []
    for line in working_data:
    if not validate_line(line):
    if not validate_header_line(line):
    invalids.append(line)
    if invalids:
    raise Exception('Invalid lines %s' % '\n'.join(invalids))
    return True

    def join_transaction_lines(data, validate=True):
    joined = join_split_lines(data)
    if validate:
    validate_data(joined)
    return joined

    def split_check_rows(*lines):
    if len(lines) == 1 and isinstance(lines[0], basestring):
    lines = lines[0].split('\n')
    more_joined = []
    for line in lines:
    if is_check_line(line):
    records = list(split_check_records(line))
    more_joined.extend(records)
    else:
    if line.count('Date Check #') > 1:
    second_index = line[1:].index('Date Check #')
    line = line[:second_index]
    more_joined.append(line)
    return more_joined

    def test_split_check_rows():
    expected = ['Date Check # Amount',
    '05/12/15 417 -12.0',
    '05/19/15 422 -75.0',
    '05/04/15 419* -228.0',
    '05/01/15 420 -275.0',
    '05/11/15 421 -50.0']

    data = '''Date Check # Amount Date Check # Amount
    05/12/15 417 -12.00 05/19/15 422 -75.00
    05/04/15 419* -228.00 05/18/15 423 -2,090.00
    05/01/15 420 -275.00 05/26/15 426* -1,000.00
    05/11/15 421 -50.00'''
    result = split_check_rows(data)
    assert result == expected

    test_split_check_rows()

    def filter_line(l):
    junk_words = ['Total',
    'To help',
    'continued on',
    'in Online Banking',
    'Your checking',
    'BENJAMIN ',
    'Deposits and',
    'Withdrawals and other ',
    'Page',
    'Checks',
    'Deposits',
    '* There is']
    word = None
    for word in junk_words:
    if l.startswith(word):
    return False
    return True

    def format_transaction(t):
    trans = re.findall(r'(\d+/\d+/\d+\s)(.*)\s(.*\d+)$', t)
    if not trans:
    return
    return '\t'.join(trans[0])

    def parse_dump(*lines_of_data):
    lines = filter(filter_line, lines_of_data)
    joined = join_transaction_lines(lines)
    joined = split_check_rows(*joined)
    for line in joined:
    parts = re.findall(r'(\d+/\d+/\d+\s)(.*)\s(.*\d+)$', line)
    if not parts:
    continue
    date, des, amt = parts[0]
    yield {'Date':date, 'Description':des, 'Amount':amt}


    class TransformedStatement(object):

    def __init__(self, dump_path, account='unknown'):
    super(TransformedStatement, self).__init__()
    self.dump_path = dump_path
    self.account = account

    def write(self, out_path, columns, mode='a'):
    fullpath = os.path.abspath(out_path)

    with open(out_path, mode) as fout:
    writer = csv.DictWriter(fout, columns)

    with open(self.dump_path, 'r') as fin:
    data = []
    def add_account(items, account_name):
    for d in items:
    d['Account'] = account_name

    # remove line endings
    for d in fin.readlines():
    if d[:-1] == '\n':
    data.append(d[:-1])
    else:
    data.append(d)
    joined_dict = list(parse_dump(*data))
    add_account(joined_dict, self.account)

    writer.writerows(joined_dict)
    return len(joined_dict)

    def transform_dump(transformed_statements,
    columns=None,
    out_path='transformed_statements.csv',
    override_existing_file=False):
    out_path = os.path.abspath(out_path)
    exists = os.path.exists(out_path)
    if not override_existing_file and exists:
    raise Exception('File exists: %s' % out_path)

    if isinstance(transformed_statements, TransformedStatement):
    transformed_statements = [transformed_statements]
    else:
    for ts in transformed_statements:
    if not isinstance(ts, TransformedStatement):
    raise TypeError('%s not of type TransformedStatement' % ts)

    out_path_original = out_path
    out_path_temp = out_path+'.temp'

    with open(out_path_temp, 'w') as fout:
    writer = csv.DictWriter(fout, columns)
    writer.writeheader()

    rowcounts = []
    for stmt in transformed_statements:
    rowcounts.append(stmt.write(out_path_temp, columns, mode='a'))

    shutil.move(out_path_temp, out_path)
    return rowcounts

    def main(files, output_file, columns=None, override_existing_file=True):

    basenames = [os.path.basename(os.path.splitext(f)[0]) for f in files]
    source_paths = zip(files,basenames)
    stmts = [TransformedStatement(path, account=act) for path, act in source_paths]
    rowcounts = transform_dump(stmts, columns=columns, out_path=output_file, override_existing_file=override_existing_file)
    return rowcounts


    if __name__ == '__main__':
    import sys
    import argparse
    import os

    description='Parse copy/pasted text from BofA statements. \nPrerequisites: \n(1) Download BofA pdfs. \n(2) For each PDF, keep only pages with tables on them; trash all other pages without tables. \n(3) For each PDF, copy/paste PDF text content from each statement into one text file. Just Cmd+a/Ctl+a > copy and paste into a file for the account name, such as "checking6443.txt"'

    description+='\nExample: \npython parse_boa_statements.py test/joint_dump.txt test/pers_dump.txt --output test/combined.csv\n\n'

    import textwrap
    description = '\n'.join(['\n'.join(textwrap.wrap(line, 100)) for line in description.split('\n')])

    parser = argparse.ArgumentParser(description)
    parser.add_argument('file', nargs='*', help='Separate multiple files with a comma. Paths to the files you created for each account. Each file should contain content for only one account. Generate the content for each file by copy/pasting the text from the PDFs into one file. Detailed description elsewhere in this help file.')
    parser.add_argument('--output', help='Output path', default='combined_statements.csv')
    parser.add_argument('--override_output', help='Override existing output file if it exists', action='store_true', default=True)
    if len(sys.argv)==1:
    parser.print_help()
    sys.exit(1)

    args = parser.parse_args()

    # files = args.files.split(',')
    files = args.file #.split(',')
    out_path = args.output
    override_existing_file = is_true(args.override_output)
    columns = ('Date', 'Description', 'Amount', 'Account',)

    rowcounts = main(files, out_path, columns=columns, override_existing_file=override_existing_file)

    if not rowcounts:
    message = 'No rows transformed to file %s' % out_path
    else:
    message = '%d rows from %d files transformed to file %s' % (sum(rowcounts),len(rowcounts), out_path,)

    print message