-
-
Save mkuhn/6339927 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| //SMILES, InChI Regex , by lo sauer - lsauer.com | |
| //Here's a PREG version for SMILES validation (JavaScript); the {6,} quantifier plus the leading character means at least 7 characters are required: | |
| var x = "OC[C@@H](O1)[C@@H](O)[C@H](O)[C@@H]2[C@@H]1c3c(O)c(OC)c(O)cc3C(=O)O2" | |
| x.trim().match(/^([^J][a-z0-9@+\-\[\]\(\)\\\/%=#$]{6,})$/ig)[0] | |
| >"OC[C@@H](O1)[C@@H](O)[C@H](O)[C@@H]2[C@@H]1c3c(O)c(OC)c(O)cc3C(=O)O2" | |
| //for the most frequent organic molecules (restricts the letters to the listed element characters) | |
| x.trim().match(/^([^J][0-9BCOHNSOPrIFla@+\-\[\]\(\)\\\/%=#$]{6,})$/ig) | |
| //generic Perl RegEx (no minimum length beyond the leading character plus one more): | |
| /^([^J][A-Za-z0-9@+\-\[\]\(\)\\\/%=#$]+)$/ | |
| //Note: The only letter not appearing on the Periodic Table is the letter "J"; [^J] only rejects a leading J, the a-z / A-Za-z classes above still accept J in later positions (NOTE(review): Q also seems to be absent from current element symbols - confirm) | |
| //Annotated | |
| x.trim().match(/^([^J][0-9BCOHNSOPrIFla@+\-\[\]\(\)\\\/%=#$]{6,})$/ig) | |
| //if you need a carbon count (the first attempt yields only false: unary + applies to 'c' before ==, and +'c' is NaN; the second attempt turns each comparison into 0/1 with a bitwise OR 0): | |
| x.toLowerCase().split('').map(function(v,k){return +'c'==v;}) | |
| >[false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, | |
| false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, | |
| false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, | |
| false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, | |
| false, false, false, false] | |
| x.toLowerCase().split('').map(function(v,k){return 'c'==v|0;}) | |
| >[0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, | |
| 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0] | |
/**
 * Installs Array.prototype.sum unless the prototype already owns a `sum` property.
 *
 * sum() folds all elements together with `+`, starting from 0.
 * @returns {number} the total of the elements; 0 for an empty array
 */
if(!Array.prototype.hasOwnProperty('sum'))
{
    Array.prototype.sum = function (){
        // seed with 0: reduce() without an initial value throws on an empty array
        return this.reduce(function(a,b){return a+b}, 0);
    };
}
| x.toLowerCase().split('').map(function(v,k){return 'c'==v|0;}).sum() | |
| >14 | |
| Array.prototype.atomCount = function(t){ return this.map(function(v,k){return t==v|0;}).reduce(function(a,b){return a+b}) }; |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// International Chemical Identifier (InChI) Regex, by lo sauer - lsauer.com
// Morphine InChI:
var x="InChI=1S/C17H19NO3/c1-18-7-6-17-10-3-5-13(20)16(17)21-15-12(19)4-2-9(14(15)17)8-11(10)18/h2-5,10-11,13,16,19-20H,6-8H2,1H3/t10-,11+,13-,16-,17-/m0/s1";
// applying an organic character-subset
// match() returns null when nothing matches, so instead of reading .length the result is coerced with !!.. (same as Boolean(..))
!!x.trim().match(/^((InChI=)?[^J][0-9BCOHNSOPrIFla+\-\(\)\\\/,pqbtmsih]{6,})$/ig);
// >true
//generic (also coerced with !!, otherwise the expression yields the match array rather than true):
!!x.trim().match(/^((InChI=)?[^J][0-9a-z+\-\(\)\\\/,]+)$/ig);
// >true
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| // Simplified Molecular Input Line Entry Specification (SMILES) Regex annotated, by lo sauer - lsauer.com | |
| /^( //starting with | |
| [^J] //first character must not be J, which is the only letter absent from the periodic table (only this first position is checked) | |
| [0-9 // number for cyclic connection quantifiers and charge; e.g. [Co+3] or [Co+++] | |
| BCOHNSOPrIFla //"organic subset": B, C, N, O, P, S, F, Cl, Br, I -> do not require [] in SMILES; additionally: H and Na (the letters r, l, a complete Br, Cl, Na; fluorine is written F) | |
| @ // Stereocenter configuration descriptor, usually tetrahedral carbon e.g. L-Ala N[C@@H](C)C(=O)O vs. D-Ala N[C@H](C)C(=O)O | |
| % // unique labels: C12 is a carbon holding the ring closure labels 1 and 2; C%12 holds the unique two-digit label 12 | |
| +\- // +- the charge sign | |
| \[\] // [] is used to delineate atoms e.g. [Co+++] | |
| \(\) // () branching-descriptors of the sparse-tree (rings are broken in SMILES), e.g. CCC(=O)O for propionic acid | |
| \\\/ // /\ for configuration around double bonds e.g. F/C=C/F... trans-difluoroethene, | |
| = // double bonds e.g. O=C=O (carbon dioxide) | |
| # // triple bonds e.g. C#N (hydrogen cyanide,) | |
| $ // quadruple bonds e.g. [Ga-]$[As+] (gallium arsenide) | |
| ]{6,} // at least 6 of these characters, i.e. total length >= 7 including the leading character | |
| )$ | |
| /ig //ending with, search global, case-insensitive | |
| /** | |
| * SMARTS | |
| */ | |
| // SMARTS...Smiles arbitrary target specification has commas in addition to the SMILES character-set | |
| // bonds are defined as: '-' (single), '=' (double), '#' (triple), ':' (aromatic) and '~' (any) | |
| // logic operators: | |
| // OR operator ',' | |
| // AND operator '&' (lower priority ';') | |
| // NOT operator '!' | |
| // more information: http://en.wikipedia.org/wiki/Smiles_arbitrary_target_specification | |
| //REGEX | |
// organic-subset SMARTS expression; ';' (low-priority AND) and ':' (aromatic bond) are part of the
// class, and the i flag lets the lowercase aromatic atoms used in the examples pass
/^([^J][0-9BCOHNSOPrIFla@+\-\[\]\(\)\\\/%=#$,.~;:&!]{6,})$/i;
//examples
// definitions of hydrogen bond donors and acceptors used to apply Lipinski's Rule of Five:
var x="[N,n,O;!H0]";
var y="[#7,#8;!H0]";
!!x.trim().match(/^([^J][0-9BCOHNSOPrIFla@+\-\[\]\(\)\\\/%=#$,.~;:&!]{6,})$/ig);
// >true
!!y.trim().match(/^([^J][0-9BCOHNSOPrIFla@+\-\[\]\(\)\\\/%=#$,.~;:&!]{6,})$/ig);
// >true
// definition of aliphatic amines, likely to protonate at physiological pH:
var x="[$([NH2][CX4]),$([NH]([CX4])[CX4]),$([NX3]([CX4])([CX4])[CX4])]";
// the letter X is not in the organic-subset class above, so this pattern is checked with the generic letter class
!!x.trim().match(/^([^J][A-Za-z0-9@+\-\[\]\(\)\\\/%=#$,.~;:&!]+)$/);
// >true
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment