Skip to content

Instantly share code, notes, and snippets.

@mkuhn
Forked from lsauer/InChi.js
Last active December 21, 2015 17:19
Show Gist options
  • Select an option

  • Save mkuhn/6339927 to your computer and use it in GitHub Desktop.

Select an option

Save mkuhn/6339927 to your computer and use it in GitHub Desktop.

Revisions

  1. mkuhn revised this gist Aug 26, 2013. 1 changed file with 11 additions and 0 deletions.
    11 changes: 11 additions & 0 deletions _smiles_inchi_annotated.js
    Original file line number Diff line number Diff line change
    @@ -1,10 +1,21 @@
    //SMILES, Inchi Regex , by lo sauer - lsauer.com
    //Here's a PREG version for SMILES validation (JavaScript) beyond a length of 5:
    var x = "OC[C@@H](O1)[C@@H](O)[C@H](O)[C@@H]2[C@@H]1c3c(O)c(OC)c(O)cc3C(=O)O2"
    var y = "Not=a=SMILES=String"

    x.trim().match(/^([^J][a-z0-9@+\-\[\]\(\)\\\/%=#$]{6,})$/ig)[0]
    >"OC[C@@H](O1)[C@@H](O)[C@H](O)[C@@H]2[C@@H]1c3c(O)c(OC)c(O)cc3C(=O)O2"

    y.trim().match(/^([^J][a-z0-9@+\-\[\]\(\)\\\/%=#$]{6,})$/ig)[0]
    >"Not=a=SMILES=String"

    // Explicitly match every element symbol (plus the SMILES punctuation/digit set)
    // instead of accepting any letter. Matching is case-insensitive (/i) and anchored,
    // so match() returns null as soon as the string contains a token outside this list.
    x.trim().match(/^(A[cglmrstu]|B[aehikr]?|C[adeflmorsu]?|D[bsy]|E[rsu]|F[emr]?|G[ade]|H[efgos]?|I[nr]?|Kr?|L[airu]|M[dgnot]|N[abdeiop]?|Os?|P[abdmortu]?|R[abefghnu]|S[bcegimnr]?|T[abcehilm]|U|Uu.|V|W|Xe|Yb?|Z[nr]|[0-9@+\-\[\]\(\)\\\/%=#$]){6,}$/ig)
    >"OC[C@@H](O1)[C@@H](O)[C@H](O)[C@@H]2[C@@H]1c3c(O)c(OC)c(O)cc3C(=O)O2"

    // Same explicit element-symbol pattern applied to the non-SMILES example string:
    // the anchored pattern cannot consume the whole string, so match() returns null.
    y.trim().match(/^(A[cglmrstu]|B[aehikr]?|C[adeflmorsu]?|D[bsy]|E[rsu]|F[emr]?|G[ade]|H[efgos]?|I[nr]?|Kr?|L[airu]|M[dgnot]|N[abdeiop]?|Os?|P[abdmortu]?|R[abefghnu]|S[bcegimnr]?|T[abcehilm]|U|Uu.|V|W|Xe|Yb?|Z[nr]|[0-9@+\-\[\]\(\)\\\/%=#$]){6,}$/ig)
    > null

    //for the most frequent organic molecules
    x.trim().match(/^([^J][0-9BCOHNSOPrIFla@+\-\[\]\(\)\\\/%=#$]{6,})$/ig)

  2. @lsauer lsauer revised this gist Oct 26, 2011. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion _smiles_inchi_annotated.js
    Original file line number Diff line number Diff line change
    @@ -27,7 +27,7 @@ x.toLowerCase().split('').map(function(v,k){return 'c'==v|0;})
    0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0]
    if(!Array.prototype.hasOwnProperty('sum'))
    {
    function (){return this.reduce(function(a,b){return a+b})}
    Array.prototype.sum = function (){return this.reduce(function(a,b){return a+b})}
    }

    x.toLowerCase().split('').map(function(v,k){return 'c'==v|0;}).sum()
  3. @lsauer lsauer revised this gist Oct 25, 2011. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion InChiKey.js
    Original file line number Diff line number Diff line change
    @@ -2,7 +2,7 @@
    // The InChIKey, or hashed InChI, is a fixed length (25 character) condensed digital representation
    // of the InChI, which tries to be unique but is not human-comprehensible. It uses a BASE26 alphabet (Hexavigesimal)!
    // The last character of an InChIKey is computed from the rest of the InChIKey
    // The InChIKey specification facilitates web searches for chemical compounds, owing to unique referencing
    // The InChIKey specification facilitates web searches for chemical compounds, owing to sufficiently unique referencing
    // of compounds with a concise key, which is problematic with the full-length InChI (e.g. GET url limit is 1600 chars)

    // From the official documents ( http://chemdata.nist.gov/InChI/inchi-hash.pdf ):
  4. @lsauer lsauer revised this gist Oct 25, 2011. 1 changed file with 4 additions and 0 deletions.
    4 changes: 4 additions & 0 deletions InChiKey.js
    Original file line number Diff line number Diff line change
    @@ -50,3 +50,7 @@ var x = 'RYGMFSIKBFXOCR-UHFFFAOYSA-N'
    27===x.length && '-'===x[14] && '-'===x[25]
    && !!x.match(/^([0-9A-Z\-]+)$/)
    >true


    //Collisions do occur:
    //see http://www.chemconnector.com/2011/09/01/an-inchikey-collision-is-discovered-and-not-based-on-stereochemistry/
  5. @lsauer lsauer renamed this gist Oct 25, 2011. 1 changed file with 0 additions and 0 deletions.
    File renamed without changes.
  6. @lsauer lsauer revised this gist Oct 25, 2011. 1 changed file with 52 additions and 0 deletions.
    52 changes: 52 additions & 0 deletions InChiKey
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,52 @@
    // International Chemical Identifier KEY Regex, by lo sauer - lsauer.com
    // The InChIKey, or hashed InChI, is a fixed length (25 character) condensed digital representation
    // of the InChI, which tries to be unique but is not human-comprehensible. It uses a BASE26 alphabet (Hexavigesimal)!
    // The last character of an InChIKey is computed from the rest of the InChIKey
    // The InChIKey specification facilitates web searches for chemical compounds, owing to unique referencing
    // of compounds with a concise key, which is problematic with the full-length InChI (e.g. GET url limit is 1600 chars)

    // From the official documents ( http://chemdata.nist.gov/InChI/inchi-hash.pdf ):
    // "The InChIKey is a character signature based on a hash code of the InChI string. Also, this hash
    // may serve as a checksum for verifying InChI, for example, after transmission over a network."

    // InChIKey has four (4) distinct components: a 14-character hash of the basic (Mobile-H)
    // InChI layer (without /p segment accounting for added or removed protons); a 8-character
    // hash of the remaining layers; a 1 character is a flag indicating selected features (e.g.
    // presence of fixed-H layer); a 1 character is a “check” character. The overall length of
    // InChIKey is fixed at 25 characters, including separator:
    // AAAAAAAAAAAAAA-BBBBBBBBCD

    // This is significantly shorter than a typical InChI string (for example, the average length
    // of InChI string for Pubchem collection is 146 characters).
    // --------------------------------
    // InChIKey layout is as follows:
    // --------------------------------
    // AAAAAAAAAAAAAA
    // First block (14 letters)
    // Encodes molecular skeleton (connectivity)
    // BBBBBBBB
    // Second block (8 letters)
    // Encodes proton positions (tautomers), stereochemistry, isotopomers, reconnected layer
    // C
    // Flag character
    // Indicates InChI version, presence of a fixed-H layer, isotopes, and stereochemical
    // information.
    // D
    // Check character, obtained from all symbols except delimiters, i.e. from
    // AAAAAAAAAAAAAABBBBBBBBC

    // All symbols except the delimiter (a dash, that is, a minus) are uppercase English letters
    // representing a “base-26” encoding.
    // see also:http://en.wikipedia.org/wiki/Hexavigesimal

    //InChiKey v1.2 length: 14-10-1
    //InChIKey v1.2 for morphine is BQJCRHHNABKAKU-KBQPJGBKSA-N
    var x = 'BQJCRHHNABKAKU-KBQPJGBKSA-N'
    25===x.length && '-'===x[14]
    && !!x.match(/^([0-9A-Z\-]+)$/)
    >false
    //enzyme ligand Copper - InchiKey: RYGMFSIKBFXOCR-UHFFFAOYSA-N
    var x = 'RYGMFSIKBFXOCR-UHFFFAOYSA-N'
    27===x.length && '-'===x[14] && '-'===x[25]
    && !!x.match(/^([0-9A-Z\-]+)$/)
    >true
  7. @lsauer lsauer revised this gist Oct 25, 2011. 3 changed files with 45 additions and 4 deletions.
    11 changes: 11 additions & 0 deletions InChi.js
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,11 @@
    // International Chemical Identifier Regex, by lo sauer - lsauer.com
    // Morphine InchI:
    var x="InChI=1S/C17H19NO3/c1-18-7-6-17-10-3-5-13(20)16(17)21-15-12(19)4-2-9(14(15)17)8-11(10)18/h2-5,10-11,13,16,19-20H,6-8H2,1H3/t10-,11+,13-,16-,17-/m0/s1"

    // applying an organic character-subset
    // we could check for the length property, but in case of 0 matches 'null' is returned -> hence !!.. \ generally equal to Boolean(..)
    !!x.trim().match(/^((InChI=)?[^J][0-9BCOHNSOPrIFla+\-\(\)\\\/,pqbtmsih]{6,})$/ig)
    >true
    //generic:
    x.trim().match(/^((InChI=)?[^J][0-9a-z+\-\(\)\\\/,]+)$/ig)
    >true
    6 changes: 4 additions & 2 deletions _smiles_inchi_annotated.js
    Original file line number Diff line number Diff line change
    @@ -1,6 +1,6 @@
    //SMILES, Inchi Regex , by lo sauer - lsauer.com
    //Here's a PREG version for SMILES validation (JavaScript) beyond a length of 5:
    x = "OC[C@@H](O1)[C@@H](O)[C@H](O)[C@@H]2[C@@H]1c3c(O)c(OC)c(O)cc3C(=O)O2"
    var x = "OC[C@@H](O1)[C@@H](O)[C@H](O)[C@@H]2[C@@H]1c3c(O)c(OC)c(O)cc3C(=O)O2"

    x.trim().match(/^([^J][a-z0-9@+\-\[\]\(\)\\\/%=#$]{6,})$/ig)[0]
    >"OC[C@@H](O1)[C@@H](O)[C@H](O)[C@@H]2[C@@H]1c3c(O)c(OC)c(O)cc3C(=O)O2"
    @@ -31,4 +31,6 @@ if(!Array.prototype.hasOwnProperty('sum'))
    }

    x.toLowerCase().split('').map(function(v,k){return 'c'==v|0;}).sum()
    >14
    >14

    /**
     * Counts the elements of this array that are loosely equal (==) to `t`,
     * e.g. the number of 'c' characters in a SMILES string that was split('') first.
     *
     * @param {*} t - value to count; compared with == to stay compatible with existing callers
     * @returns {number} number of matching elements; 0 for an empty array
     */
    Array.prototype.atomCount = function(t){
      // Seed the reduction with 0: reduce() without an initial value throws a
      // TypeError on an empty array, which made atomCount() crash for [] / ''.split('').
      return this.reduce(function(total, v){
        // (t == v) | 0 coerces the boolean comparison result to 0 or 1.
        return total + (t == v | 0);
      }, 0);
    };
    32 changes: 30 additions & 2 deletions smiles_inchi_annotated.js
    Original file line number Diff line number Diff line change
    @@ -1,4 +1,4 @@
    //SMILES Regex annotated, by lo sauer - lsauer.com
    // Simplified Molecular Input Line Entry Specification (SMILES) Regex annotated, by lo sauer - lsauer.com
    /^( //starting with
    [^J] //first character must not be J, which is the only letter absent from the periodic table
    [0-9 // number for cyclic connection quantifiers and charge; e.g. [Co+3] or [Co+++]
    @@ -14,4 +14,32 @@ BCOHNSOPrIFla //"organic subset": B, C, N, O, P, S, F, Cl, Br, I -> do not requ
    $ // quadruple bonds e.g. [Ga-]$[As+] (gallium arsenide)
    ]{6,} // at least 6 more characters after the first one, i.e. total length must be >= 7
    )$
    /ig //ending with, search global, case-insensitive
    /ig //ending with, search global, case-insensitive

    /**
    * SMARTS
    */
    // SMARTS...Smiles arbitrary target specification has commas in addition to the SMILES character-set
    // bonds are defined as: '-' (single), '=' (double), '#' (triple), ':' (aromatic) and '~' (any)
    // logic operators:
    // OR operator ','
    // AND operator '&' (lower priority ';')
    // NOT operator '!'
    // more information: http://en.wikipedia.org/wiki/Smiles_arbitrary_target_specification

    //REGEX
    /^([^J][0-9BCOHNSOPrIFla@+\-\[\]\(\)\\\/%=#$,.~&!]{6,})$/


    //examples
    // definitions of hydrogen bond donors and acceptors used to apply Lipinski's Rule of Five:
    var x="[N,n,O;!H0]"
    var y="[#7,#8;!H0]"

    !!x.trim().match(/^([^J][0-9BCOHNSOPrIFla@+\-\[\]\(\)\\\/%=#$,.~;&!]{6,})$/ig)
    >true
    y.trim().match(/^([^J][0-9BCOHNSOPrIFla@+\-\[\]\(\)\\\/%=#$,.~;&!]{6,})$/ig)
    >true
    // definition of aliphatic amines, likely to protonate at physiological pH:
    var x="[$([NH2][CX4]),$([NH]([CX4])[CX4]),$[NX3]([CX4])([CX4])[CX4])]"
    >true
  8. @lsauer lsauer revised this gist Oct 25, 2011. 1 changed file with 7 additions and 2 deletions.
    9 changes: 7 additions & 2 deletions _smiles_inchi_annotated.js
    Original file line number Diff line number Diff line change
    @@ -17,9 +17,14 @@ x.trim().match(/^([^J][0-9BCOHNSOPrIFla@+\-\[\]\(\)\\\/%=#$]{6,})$/ig)

    //if you need a carbon count:
    x.toLowerCase().split('').map(function(v,k){return +'c'==v;})
    >[false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false]
    >[false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false,
    false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false,
    false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false,
    false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false,
    false, false, false, false]
    x.toLowerCase().split('').map(function(v,k){return 'c'==v|0;})
    >[0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0]
    >[0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
    0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0]
    if(!Array.prototype.hasOwnProperty('sum'))
    {
    function (){return this.reduce(function(a,b){return a+b})}
  9. @lsauer lsauer revised this gist Oct 25, 2011. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion _smiles_inchi_annotated.js
    Original file line number Diff line number Diff line change
    @@ -1,5 +1,5 @@
    //SMILES, Inchi Regex , by lo sauer - lsauer.com
    Here's a PREG version for SMILES validation (JavaScript) beyond a length of 5:
    //Here's a PREG version for SMILES validation (JavaScript) beyond a length of 5:
    x = "OC[C@@H](O1)[C@@H](O)[C@H](O)[C@@H]2[C@@H]1c3c(O)c(OC)c(O)cc3C(=O)O2"

    x.trim().match(/^([^J][a-z0-9@+\-\[\]\(\)\\\/%=#$]{6,})$/ig)[0]
  10. @lsauer lsauer created this gist Oct 25, 2011.
    29 changes: 29 additions & 0 deletions _smiles_inchi_annotated.js
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,29 @@
    //SMILES, Inchi Regex , by lo sauer - lsauer.com
    Here's a PREG version for SMILES validation (JavaScript) beyond a length of 5:
    x = "OC[C@@H](O1)[C@@H](O)[C@H](O)[C@@H]2[C@@H]1c3c(O)c(OC)c(O)cc3C(=O)O2"

    x.trim().match(/^([^J][a-z0-9@+\-\[\]\(\)\\\/%=#$]{6,})$/ig)[0]
    >"OC[C@@H](O1)[C@@H](O)[C@H](O)[C@@H]2[C@@H]1c3c(O)c(OC)c(O)cc3C(=O)O2"

    //for the most frequent organic molecules
    x.trim().match(/^([^J][0-9BCOHNSOPrIFla@+\-\[\]\(\)\\\/%=#$]{6,})$/ig)

    //generic Perl RegEx:
    /^([^J][A-Za-z0-9@+\-\[\]\(\)\\\/%=#$]+)$/

    //Note: The only letter not appearing on the Periodic Table is the letter "J"
    //Annotated
    x.trim().match(/^([^J][0-9BCOHNSOPrIFla@+\-\[\]\(\)\\\/%=#$]{6,})$/ig)

    //if you need a carbon count:
    x.toLowerCase().split('').map(function(v,k){return +'c'==v;})
    >[false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false]
    x.toLowerCase().split('').map(function(v,k){return 'c'==v|0;})
    >[0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0]
    if(!Array.prototype.hasOwnProperty('sum'))
    {
    function (){return this.reduce(function(a,b){return a+b})}
    }

    x.toLowerCase().split('').map(function(v,k){return 'c'==v|0;}).sum()
    >14
    17 changes: 17 additions & 0 deletions smiles_inchi_annotated.js
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,17 @@
    //SMILES Regex annotated, by lo sauer - lsauer.com
    /^( //starting with
    [^J] //first character must not be J, which is the only letter absent from the periodic table
    [0-9 // number for cyclic connection quantifiers and charge; e.g. [Co+3] or [Co+++]
    BCOHNSOPrIFla //"organic subset": B, C, N, O, P, S, F, Cl, Br, I -> do not require [] in SMILES; additionally: Na, Cl, Fl
    @ // Stereocenter configuration descriptor, usually tetrahedral carbon e.g. L-Ala N[C@@H](C)C(=O)O vs. D-Ala N[C@H](C)C(=O)O
    % // unique labels: C12 is a carbon holding the ring closure labels 1 and 2; C%12 holds the unique label 12
    +\- // +- the charge sign
    \[\] // [] is used to delineate atoms e.g. [Co+++]
    \(\) // () branching-descriptors of the sparse-tree (rings are broken in SMILES), e.g. CCC(=O)O for propionic acid
    \\\/ // /\ for configuration around double bonds e.g. F/C=C/F... trans-difluoroethene,
    = // double bonds e.g. O=C=O (carbon dioxide)
    # // triple bonds e.g. C#N (hydrogen cyanide,)
    $ // quadruple bonds e.g. [Ga-]$[As+] (gallium arsenide)
    ]{6,} // at least 6 more characters after the first one, i.e. total length must be >= 7
    )$
    /ig //ending with, search global, case-insensitive