Skip to content

Instantly share code, notes, and snippets.

@ccwang002
Created February 13, 2019 14:29
Show Gist options
  • Select an option

  • Save ccwang002/c6fca09bf010bbc20060c194a56697b7 to your computer and use it in GitHub Desktop.

Select an option

Save ccwang002/c6fca09bf010bbc20060c194a56697b7 to your computer and use it in GitHub Desktop.

Revisions

  1. ccwang002 created this gist Feb 13, 2019.
    96 changes: 96 additions & 0 deletions gen_barcode.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,96 @@
    from itertools import combinations, product


    def gen_pos_sets_to_sub(barcode, max_sub=1):
    """
    Generate all the possible position combinations (sets) within
    the given maximal number of substitutions.
    Examples:
    >>> list(gen_pos_sets_to_sub('ATG', 1))
    [(0,), (1,), (2,)]
    >>> list(gen_pos_sets_to_sub('ATG', 2))
    [(0,), (1,), (2,), (0, 1), (0, 2), (1, 2)]
    """
    assert max_sub >= 0
    all_pos = range(len(barcode))

    for num_sub in range(1, max_sub + 1):
    yield from combinations(all_pos, num_sub)


    def sub_barcode(barcode, pos_set):
    """
    Generate all possible barcode sequences with the substitutions
    at the given set of indices.
    Examples:
    >>> list(sub_barcode('AT', (0,)))
    ['TT', 'CT', 'GT']
    >>> list(sub_barcode('AT', (0, 1)))
    ['TA', 'TC', 'TG', 'CA', 'CC', 'CG', 'GA', 'GC', 'GG']
    """
    # Convert the barcode string into a list of nts
    barcode = list(barcode)

    # Make the possible nucleotide per position
    possible_nts_per_pos = [
    # Exclude the original nucleotide
    [nt for nt in 'ATCG' if nt != barcode[pos]]
    for pos in pos_set
    ]

    # The first for loop assign the new nucleotide
    # for each position to be replaced
    for new_nts in product(*possible_nts_per_pos):
    # Copy the original barcode (list of nts)
    barcode_with_sub = barcode[:]
    # Replace the barcode sequence with the new nucleotides
    # at the given corresponding positions
    for ix, nt in zip(pos_set, new_nts):
    barcode_with_sub[ix] = nt
    # Combine the new barcode into a single string
    yield ''.join(barcode_with_sub)


    def gen_all_barcodes_with_sub(barcode, max_sub=1):
    """
    Generate all possible barcode sequences within the given number of
    substitutions.
    Examples:
    >>> list(gen_all_barcodes_with_sub('AT', 1))
    ['TT', 'CT', 'GT', 'AA', 'AC', 'AG']
    A more realistic example using a longer barcode:
    >>> barcode = 'ATATCGATCG'; len(barcode)
    10
    Count the number of all possible barcodes with <= 2 substitutions
    >>> len(list(gen_all_barcodes_with_sub(barcode, 2)))
    435
    Calculate the theoretical number of barcodes
    >>> 10 * (10 - 1) // 2 * ((4 - 1)**2) + len(barcode) * (4 - 1)
    435
    >>> len(list(gen_all_barcodes_with_sub(barcode, len(barcode)))) == 4 ** len(barcode) - 1
    True
    """
    barcode = barcode.upper()
    assert all(nt in 'ATCG' for nt in barcode)

    for pos_set in gen_pos_sets_to_sub(barcode, max_sub):
    yield from sub_barcode(barcode, pos_set)



    if __name__ == '__main__':
    # Run all the examples
    import doctest
    doctest.testmod(verbose=True)