from itertools import combinations, product def gen_pos_sets_to_sub(barcode, max_sub=1): """ Generate all the possible position combinations (sets) within the given maximal number of substitutions. Examples: >>> list(gen_pos_sets_to_sub('ATG', 1)) [(0,), (1,), (2,)] >>> list(gen_pos_sets_to_sub('ATG', 2)) [(0,), (1,), (2,), (0, 1), (0, 2), (1, 2)] """ assert max_sub >= 0 all_pos = range(len(barcode)) for num_sub in range(1, max_sub + 1): yield from combinations(all_pos, num_sub) def sub_barcode(barcode, pos_set): """ Generate all possible barcode sequences with the substitutions at the given set of indices. Examples: >>> list(sub_barcode('AT', (0,))) ['TT', 'CT', 'GT'] >>> list(sub_barcode('AT', (0, 1))) ['TA', 'TC', 'TG', 'CA', 'CC', 'CG', 'GA', 'GC', 'GG'] """ # Convert the barcode string into a list of nts barcode = list(barcode) # Make the possible nucleotide per position possible_nts_per_pos = [ # Exclude the original nucleotide [nt for nt in 'ATCG' if nt != barcode[pos]] for pos in pos_set ] # The first for loop assign the new nucleotide # for each position to be replaced for new_nts in product(*possible_nts_per_pos): # Copy the original barcode (list of nts) barcode_with_sub = barcode[:] # Replace the barcode sequence with the new nucleotides # at the given corresponding positions for ix, nt in zip(pos_set, new_nts): barcode_with_sub[ix] = nt # Combine the new barcode into a single string yield ''.join(barcode_with_sub) def gen_all_barcodes_with_sub(barcode, max_sub=1): """ Generate all possible barcode sequences within the given number of substitutions. Examples: >>> list(gen_all_barcodes_with_sub('AT', 1)) ['TT', 'CT', 'GT', 'AA', 'AC', 'AG'] A more realistic example using a longer barcode: >>> barcode = 'ATATCGATCG'; len(barcode) 10 Count the number of all possible barcodes with <= 2 substitutions >>> len(list(gen_all_barcodes_with_sub(barcode, 2))) 435 Calculate the theoretical number of barcodes >>> 10 * (10 - 1) // 2 * ((4 - 1)**2) + len(barcode) * (4 - 1) 435 >>> len(list(gen_all_barcodes_with_sub(barcode, len(barcode)))) == 4 ** len(barcode) - 1 True """ barcode = barcode.upper() assert all(nt in 'ATCG' for nt in barcode) for pos_set in gen_pos_sets_to_sub(barcode, max_sub): yield from sub_barcode(barcode, pos_set) if __name__ == '__main__': # Run all the examples import doctest doctest.testmod(verbose=True)