Created
February 13, 2019 14:29
-
-
Save ccwang002/c6fca09bf010bbc20060c194a56697b7 to your computer and use it in GitHub Desktop.
Revisions
-
ccwang002 created this gist
Feb 13, 2019 .There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,96 @@ from itertools import combinations, product def gen_pos_sets_to_sub(barcode, max_sub=1): """ Generate all the possible position combinations (sets) within the given maximal number of substitutions. Examples: >>> list(gen_pos_sets_to_sub('ATG', 1)) [(0,), (1,), (2,)] >>> list(gen_pos_sets_to_sub('ATG', 2)) [(0,), (1,), (2,), (0, 1), (0, 2), (1, 2)] """ assert max_sub >= 0 all_pos = range(len(barcode)) for num_sub in range(1, max_sub + 1): yield from combinations(all_pos, num_sub) def sub_barcode(barcode, pos_set): """ Generate all possible barcode sequences with the substitutions at the given set of indices. Examples: >>> list(sub_barcode('AT', (0,))) ['TT', 'CT', 'GT'] >>> list(sub_barcode('AT', (0, 1))) ['TA', 'TC', 'TG', 'CA', 'CC', 'CG', 'GA', 'GC', 'GG'] """ # Convert the barcode string into a list of nts barcode = list(barcode) # Make the possible nucleotide per position possible_nts_per_pos = [ # Exclude the original nucleotide [nt for nt in 'ATCG' if nt != barcode[pos]] for pos in pos_set ] # The first for loop assign the new nucleotide # for each position to be replaced for new_nts in product(*possible_nts_per_pos): # Copy the original barcode (list of nts) barcode_with_sub = barcode[:] # Replace the barcode sequence with the new nucleotides # at the given corresponding positions for ix, nt in zip(pos_set, new_nts): barcode_with_sub[ix] = nt # Combine the new barcode into a single string yield ''.join(barcode_with_sub) def gen_all_barcodes_with_sub(barcode, max_sub=1): """ Generate all possible barcode sequences within the given number of substitutions. Examples: >>> list(gen_all_barcodes_with_sub('AT', 1)) ['TT', 'CT', 'GT', 'AA', 'AC', 'AG'] A more realistic example using a longer barcode: >>> barcode = 'ATATCGATCG'; len(barcode) 10 Count the number of all possible barcodes with <= 2 substitutions >>> len(list(gen_all_barcodes_with_sub(barcode, 2))) 435 Calculate the theoretical number of barcodes >>> 10 * (10 - 1) // 2 * ((4 - 1)**2) + len(barcode) * (4 - 1) 435 >>> len(list(gen_all_barcodes_with_sub(barcode, len(barcode)))) == 4 ** len(barcode) - 1 True """ barcode = barcode.upper() assert all(nt in 'ATCG' for nt in barcode) for pos_set in gen_pos_sets_to_sub(barcode, max_sub): yield from sub_barcode(barcode, pos_set) if __name__ == '__main__': # Run all the examples import doctest doctest.testmod(verbose=True)