Skip to content

Instantly share code, notes, and snippets.

@Oldes
Last active May 13, 2020 10:05
Show Gist options
  • Select an option

  • Save Oldes/01fb15df1e7d2abd04ed83ebdaedb0cb to your computer and use it in GitHub Desktop.

Select an option

Save Oldes/01fb15df1e7d2abd04ed83ebdaedb0cb to your computer and use it in GitHub Desktop.

Revisions

  1. Oldes revised this gist May 13, 2020. 1 changed file with 2 additions and 2 deletions.
    4 changes: 2 additions & 2 deletions html-entities.red
    Original file line number Diff line number Diff line change
    @@ -3,7 +3,7 @@ Red [
    Purpose: "To decode HTML entities in a text"
    Author: "Oldes"
    Date: 12-May-2020
    -Version: 1.0.1
    +Version: 1.0.2
    License: MIT
    Usage: [
    "Test: ♠ & ¢ <a> and Δδ ¾" =
    @@ -280,7 +280,7 @@ context [
    out: make string! length? val
    parse val [
    any [
    -s: some any-except-& e: ( append out copy/part s e )
    +s: some any-except-& e: ( append/part out s e )
    | #"&" [
    #"#" copy char 1 4 digits #";" (
    append out to char! to integer! char
  2. Oldes revised this gist May 12, 2020. 1 changed file with 6 additions and 6 deletions.
    12 changes: 6 additions & 6 deletions html-entities.red
    Original file line number Diff line number Diff line change
    @@ -3,11 +3,11 @@ Red [
    Purpose: "To decode HTML entities in a text"
    Author: "Oldes"
    Date: 12-May-2020
    -Version: 1.0.0
    +Version: 1.0.1
    License: MIT
    Usage: [
    -"Test: ♠ & ¢ <a> and Δδ" =
    -decode-html-entities {Test: &spades; & &#162; &lt;a&gt;&#32;and &Delta;&delta;}
    +"Test: ♠ & ¢ <a> and Δδ ¾" =
    +decode-html-entities {Test: &spades; & &#162; &lt;a&gt;&#32;and &Delta;&delta; &frac34;}
    ]
    TODO: {Encoder?}
    ]
    @@ -269,8 +269,8 @@ html-entities: #(
    )
    context [
    any-except-&: complement charset "&"
    -chars: charset [#"a" - #"z" #"A" - #"Z"]
    -digits: charset [#"0" - #"9"]
    +alphanum: charset [#"0" - #"9" #"a" - #"z" #"A" - #"Z"]
    +digits: charset [#"0" - #"9"]

    set 'decode-html-entities func [
    {Creates a new string with possible HTML entities converted to chars}
    @@ -285,7 +285,7 @@ context [
    #"#" copy char 1 4 digits #";" (
    append out to char! to integer! char
    )
    -| s: copy char 1 10 chars #";" e: (
    +| s: copy char 1 10 alphanum #";" e: (
    char: select/case html-entities char
    unless char [ char: #"&" e: :s ]
    append out char
  3. Oldes created this gist May 12, 2020.
    299 changes: 299 additions & 0 deletions html-entities.red
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,299 @@
    Red [
    Title: "HTML entities"
    Purpose: "To decode HTML entities in a text"
    Author: "Oldes"
    Date: 12-May-2020
    Version: 1.0.0
    License: MIT
    Usage: [
    "Test: ♠ & ¢ <a> and Δδ" =
    decode-html-entities {Test: &spades; & &#162; &lt;a&gt;&#32;and &Delta;&delta;}
    ]
    TODO: {Encoder?}
    ]

    ;-- Lookup table used by the entity decoder: maps an HTML entity name
    ;-- (the text between "&" and ";", case-sensitive) to the character it
    ;-- represents. Trailing comments give the decimal code point and a short
    ;-- description of each character.
    html-entities: #(
        ;@@ https://eastmanreference.com/list-of-html-entity-names-and-numbers
        ;-- Punctuation, programming, and other common symbols
        "lt"       #"^(3C)"   ; 60 Open tag
        "gt"       #"^(3E)"   ; 62 Close tag
        "quot"     #"^(22)"   ; 34 Double quote
        "apos"     #"^(27)"   ; 39 Apostrophe / single quote
        "amp"      #"^(26)"   ; 38 Ampersand
        "nbsp"     #"^(A0)"   ; 160 Space (non-breaking)
        "brvbar"   #"^(A6)"   ; 166 Broken bar
        "iexcl"    #"^(A1)"   ; 161 Upside down exclamation mark
        "iquest"   #"^(BF)"   ; 191 Upside down question mark
        "sect"     #"^(A7)"   ; 167 Section symbol
        "uml"      #"^(A8)"   ; 168 Umlaut
        "ordf"     #"^(AA)"   ; 170 Feminine ordinal indicator
        "ordm"     #"^(BA)"   ; 186 Masculine ordinal indicator
        "laquo"    #"^(AB)"   ; 171 Open double angles
        "raquo"    #"^(BB)"   ; 187 Close double angles
        "not"      #"^(AC)"   ; 172 Not sign
        "shy"      #"^(AD)"   ; 173 Soft hyphen
        "macr"     #"^(AF)"   ; 175 Overline
        "acute"    #"^(B4)"   ; 180 Acute accent
        "para"     #"^(B6)"   ; 182 Pilcrow (paragraph)
        "middot"   #"^(B7)"   ; 183 Georgian comma
        "cedil"    #"^(B8)"   ; 184 Cedilla
        ;-- Math symbols
        "minus"    #"^(2212)" ; 8722 Minus sign (subtraction)
        "times"    #"^(D7)"   ; 215 Multiplication sign
        "divide"   #"^(F7)"   ; 247 Division sign
        "plusmn"   #"^(B1)"   ; 177 Plus / minus
        "le"       #"^(2264)" ; 8804 Less or equal
        "ge"       #"^(2265)" ; 8805 Greater or equal
        "sup1"     #"^(B9)"   ; 185 Superscript 1
        "sup2"     #"^(B2)"   ; 178 Superscript 2
        "sup3"     #"^(B3)"   ; 179 Superscript 3
        "frac14"   #"^(BC)"   ; 188 1/4
        "frac12"   #"^(BD)"   ; 189 1/2
        "frac34"   #"^(BE)"   ; 190 3/4
        "forall"   #"^(2200)" ; 8704 For all
        "part"     #"^(2202)" ; 8706 Part
        "exist"    #"^(2203)" ; 8707 Exist
        "empty"    #"^(2205)" ; 8709 Empty
        "nabla"    #"^(2207)" ; 8711 Nabla
        "isin"     #"^(2208)" ; 8712 Is in
        "notin"    #"^(2209)" ; 8713 Not in
        "ni"       #"^(220B)" ; 8715 Ni
        "prod"     #"^(220F)" ; 8719 Product
        "sum"      #"^(2211)" ; 8721 Sum
        "lowast"   #"^(2217)" ; 8727 Asterisk (Lowast)
        "radic"    #"^(221A)" ; 8730 Square root
        "prop"     #"^(221D)" ; 8733 Proportional to
        "infin"    #"^(221E)" ; 8734 Infinity
        "ang"      #"^(2220)" ; 8736 Angle
        "and"      #"^(2227)" ; 8743 And
        "or"       #"^(2228)" ; 8744 Or
        "cap"      #"^(2229)" ; 8745 Cap
        "cup"      #"^(222A)" ; 8746 Cup
        "int"      #"^(222B)" ; 8747 Integral
        "there4"   #"^(2234)" ; 8756 Therefore
        "sim"      #"^(223C)" ; 8764 Similar to
        "cong"     #"^(2245)" ; 8773 Congruent to
        "asymp"    #"^(2248)" ; 8776 Almost equal
        "ne"       #"^(2260)" ; 8800 Not equal
        "equiv"    #"^(2261)" ; 8801 Equivalent
        "sub"      #"^(2282)" ; 8834 Subset of
        "sup"      #"^(2283)" ; 8835 Superset of
        "nsub"     #"^(2284)" ; 8836 Not subset of
        "sube"     #"^(2286)" ; 8838 Subset or equal
        "supe"     #"^(2287)" ; 8839 Superset or equal
        "oplus"    #"^(2295)" ; 8853 Circled plus
        "otimes"   #"^(2297)" ; 8855 Circled times
        ;-- Unit of measure symbols
        "deg"      #"^(B0)"   ; 176 Degrees
        "micro"    #"^(B5)"   ; 181 Micro
        ;-- Copyright, registered, trademark
        "copy"     #"^(A9)"   ; 169 Copyright
        "reg"      #"^(AE)"   ; 174 Registered trademark
        "trade"    #"^(2122)" ; 8482 Trademark
        ;-- Currency symbols
        "curren"   #"^(A4)"   ; 164 Currency sign
        "cent"     #"^(A2)"   ; 162 Cents
        "pound"    #"^(A3)"   ; 163 British pounds
        "euro"     #"^(20AC)" ; 8364 Euro
        "yen"      #"^(A5)"   ; 165 Yen
        ;-- Greek alphabet
        "Alpha"    #"^(391)"  ; 913 UPPERCASE ALPHA
        "alpha"    #"^(3B1)"  ; 945 lowercase alpha
        "Beta"     #"^(392)"  ; 914 UPPERCASE BETA
        "beta"     #"^(3B2)"  ; 946 lowercase beta
        "Gamma"    #"^(393)"  ; 915 UPPERCASE GAMMA
        "gamma"    #"^(3B3)"  ; 947 lowercase gamma
        "Delta"    #"^(394)"  ; 916 UPPERCASE DELTA
        "delta"    #"^(3B4)"  ; 948 lowercase delta
        "Epsilon"  #"^(395)"  ; 917 UPPERCASE EPSILON
        "epsilon"  #"^(3B5)"  ; 949 lowercase epsilon
        "Zeta"     #"^(396)"  ; 918 UPPERCASE ZETA
        "zeta"     #"^(3B6)"  ; 950 lowercase zeta
        "Eta"      #"^(397)"  ; 919 UPPERCASE ETA
        "eta"      #"^(3B7)"  ; 951 lowercase eta
        "Theta"    #"^(398)"  ; 920 UPPERCASE THETA
        "theta"    #"^(3B8)"  ; 952 lowercase theta
        "thetasym" #"^(3D1)"  ; 977 alternate lowercase theta
        "Iota"     #"^(399)"  ; 921 UPPERCASE IOTA
        "iota"     #"^(3B9)"  ; 953 lowercase iota
        "Kappa"    #"^(39A)"  ; 922 UPPERCASE KAPPA
        "kappa"    #"^(3BA)"  ; 954 lowercase kappa
        "Lambda"   #"^(39B)"  ; 923 UPPERCASE LAMBDA
        "lambda"   #"^(3BB)"  ; 955 lowercase lambda
        "Mu"       #"^(39C)"  ; 924 UPPERCASE MU
        "mu"       #"^(3BC)"  ; 956 lowercase mu
        "Nu"       #"^(39D)"  ; 925 UPPERCASE NU
        "nu"       #"^(3BD)"  ; 957 lowercase nu
        "Xi"       #"^(39E)"  ; 926 UPPERCASE XI
        "xi"       #"^(3BE)"  ; 958 lowercase xi
        "Omicron"  #"^(39F)"  ; 927 UPPERCASE OMICRON
        "omicron"  #"^(3BF)"  ; 959 lowercase omicron
        "Pi"       #"^(3A0)"  ; 928 UPPERCASE PI
        "pi"       #"^(3C0)"  ; 960 lowercase pi
        "piv"      #"^(3D6)"  ; 982 alternative lowercase pi
        "Rho"      #"^(3A1)"  ; 929 UPPERCASE RHO
        "rho"      #"^(3C1)"  ; 961 lowercase rho
        "Sigma"    #"^(3A3)"  ; 931 UPPERCASE SIGMA
        "sigma"    #"^(3C3)"  ; 963 lowercase sigma
        "sigmaf"   #"^(3C2)"  ; 962 final form lowercase sigma
        "Tau"      #"^(3A4)"  ; 932 UPPERCASE TAU
        "tau"      #"^(3C4)"  ; 964 lowercase tau
        "Upsilon"  #"^(3A5)"  ; 933 UPPERCASE UPSILON
        "upsilon"  #"^(3C5)"  ; 965 lowercase upsilon
        "upsih"    #"^(3D2)"  ; 978 alternative lowercase upsilon
        "Phi"      #"^(3A6)"  ; 934 UPPERCASE PHI
        "phi"      #"^(3C6)"  ; 966 lowercase phi
        "Chi"      #"^(3A7)"  ; 935 UPPERCASE CHI
        "chi"      #"^(3C7)"  ; 967 lowercase chi
        "Psi"      #"^(3A8)"  ; 936 UPPERCASE PSI
        "psi"      #"^(3C8)"  ; 968 lowercase psi
        "Omega"    #"^(3A9)"  ; 937 UPPERCASE OMEGA
        "omega"    #"^(3C9)"  ; 969 lowercase omega
        ;-- Arrows
        "larr"     #"^(2190)" ; 8592 Left arrow
        "uarr"     #"^(2191)" ; 8593 Up arrow
        "rarr"     #"^(2192)" ; 8594 Right arrow
        "darr"     #"^(2193)" ; 8595 Down arrow
        "harr"     #"^(2194)" ; 8596 Left & right arrow
        "crarr"    #"^(21B5)" ; 8629 Carriage return arrow
        ;-- Spade, club, heart, diamond
        "spades"   #"^(2660)" ; 9824 Spade
        "clubs"    #"^(2663)" ; 9827 Club
        "hearts"   #"^(2665)" ; 9829 Heart
        "diams"    #"^(2666)" ; 9830 Diamond
        ;-- Accented letters
        "Agrave"   #"^(C0)"   ; 192 CAPITAL A GRAVE ACCENT
        "agrave"   #"^(E0)"   ; 224 lowercase a grave accent
        "Aacute"   #"^(C1)"   ; 193 CAPITAL A ACUTE ACCENT
        "aacute"   #"^(E1)"   ; 225 lowercase a acute accent
        "Acirc"    #"^(C2)"   ; 194 CAPITAL A CIRCUMFLEX ACCENT
        "acirc"    #"^(E2)"   ; 226 lowercase a circumflex accent
        "Atilde"   #"^(C3)"   ; 195 CAPITAL A TILDE ACCENT
        "atilde"   #"^(E3)"   ; 227 lowercase a tilde accent
        "Auml"     #"^(C4)"   ; 196 CAPITAL A UMLAUT ACCENT
        "auml"     #"^(E4)"   ; 228 lowercase a umlaut accent
        "Aring"    #"^(C5)"   ; 197 CAPITAL A RING ABOVE ACCENT
        "aring"    #"^(E5)"   ; 229 lowercase a ring accent
        "AElig"    #"^(C6)"   ; 198 CAPITAL AE
        "aelig"    #"^(E6)"   ; 230 lowercase ae
        "Ccedil"   #"^(C7)"   ; 199 CAPITAL C CEDILLA ACCENT
        "ccedil"   #"^(E7)"   ; 231 lowercase c cedilla accent
        "Egrave"   #"^(C8)"   ; 200 CAPITAL E GRAVE ACCENT
        "egrave"   #"^(E8)"   ; 232 lowercase e grave accent
        "Eacute"   #"^(C9)"   ; 201 CAPITAL E ACUTE ACCENT
        "eacute"   #"^(E9)"   ; 233 lowercase e acute accent
        "Ecirc"    #"^(CA)"   ; 202 CAPITAL E CIRCUMFLEX ACCENT
        "ecirc"    #"^(EA)"   ; 234 lowercase e circumflex accent
        "Euml"     #"^(CB)"   ; 203 CAPITAL E UMLAUT ACCENT
        "euml"     #"^(EB)"   ; 235 lowercase e umlaut accent
        "Igrave"   #"^(CC)"   ; 204 CAPITAL I GRAVE ACCENT
        "igrave"   #"^(EC)"   ; 236 lowercase i grave accent
        "Iacute"   #"^(CD)"   ; 205 CAPITAL I ACUTE ACCENT
        "iacute"   #"^(ED)"   ; 237 lowercase i acute accent
        "Icirc"    #"^(CE)"   ; 206 CAPITAL I CIRCUMFLEX ACCENT
        "icirc"    #"^(EE)"   ; 238 lowercase i circumflex accent
        "Iuml"     #"^(CF)"   ; 207 CAPITAL I UMLAUT ACCENT
        "iuml"     #"^(EF)"   ; 239 lowercase i umlaut accent
        "ETH"      #"^(D0)"   ; 208 CAPITAL ICELANDIC ETH
        "eth"      #"^(F0)"   ; 240 lowercase Icelandic eth
        "Ntilde"   #"^(D1)"   ; 209 CAPITAL N TILDE ACCENT
        "ntilde"   #"^(F1)"   ; 241 lowercase n tilde accent
        "Ograve"   #"^(D2)"   ; 210 CAPITAL O GRAVE ACCENT
        "ograve"   #"^(F2)"   ; 242 lowercase o grave accent
        "Oacute"   #"^(D3)"   ; 211 CAPITAL O ACUTE ACCENT
        "oacute"   #"^(F3)"   ; 243 lowercase o acute accent
        "Ocirc"    #"^(D4)"   ; 212 CAPITAL O CIRCUMFLEX ACCENT
        "ocirc"    #"^(F4)"   ; 244 lowercase o circumflex accent
        "Otilde"   #"^(D5)"   ; 213 CAPITAL O TILDE ACCENT
        "otilde"   #"^(F5)"   ; 245 lowercase o tilde accent
        "Ouml"     #"^(D6)"   ; 214 CAPITAL O UMLAUT ACCENT
        "ouml"     #"^(F6)"   ; 246 lowercase o umlaut accent
        "Oslash"   #"^(D8)"   ; 216 CAPITAL O SLASH ACCENT
        "oslash"   #"^(F8)"   ; 248 lowercase o slash
        "Ugrave"   #"^(D9)"   ; 217 CAPITAL U GRAVE ACCENT
        "ugrave"   #"^(F9)"   ; 249 lowercase u grave accent
        "Uacute"   #"^(DA)"   ; 218 CAPITAL U ACUTE ACCENT
        "uacute"   #"^(FA)"   ; 250 lowercase u acute accent
        "Ucirc"    #"^(DB)"   ; 219 CAPITAL U CIRCUMFLEX ACCENT
        "ucirc"    #"^(FB)"   ; 251 lowercase u circumflex accent
        "Uuml"     #"^(DC)"   ; 220 CAPITAL U UMLAUT
        "uuml"     #"^(FC)"   ; 252 lowercase u umlaut accent
        "Yacute"   #"^(DD)"   ; 221 CAPITAL Y ACUTE ACCENT
        "yacute"   #"^(FD)"   ; 253 lowercase y acute accent
        "yuml"     #"^(FF)"   ; 255 lowercase y umlaut accent
        "THORN"    #"^(DE)"   ; 222 CAPITAL ICELANDIC THORN
        "thorn"    #"^(FE)"   ; 254 lowercase Icelandic thorn
        "szlig"    #"^(DF)"   ; 223 lowercase German sharp s
        ;-- Miscellaneous
        "bull"     #"^(2022)" ; 8226 Bullet
        "hellip"   #"^(2026)" ; 8230 Horizontal ellipsis
        "fnof"     #"^(192)"  ; 402 lowercase Latin f with hook
        "perp"     #"^(22A5)" ; 8869 Perpendicular
        "sdot"     #"^(22C5)" ; 8901 Dot operator
        "OElig"    #"^(152)"  ; 338 UPPERCASE LATIN OE LIGATURE
        "oelig"    #"^(153)"  ; 339 lowercase Latin oe ligature
        "Scaron"   #"^(160)"  ; 352 UPPERCASE S WITH CARON
        "scaron"   #"^(161)"  ; 353 lowercase s with caron
        "Yuml"     #"^(178)"  ; 376 CAPITAL Y WITH DIAERESIS
        "circ"     #"^(2C6)"  ; 710 Circumflex accent
        "tilde"    #"^(2DC)"  ; 732 Tilde (different from the tilde my keyboard generates)
        "ndash"    #"^(2013)" ; 8211 En dash
        "mdash"    #"^(2014)" ; 8212 Em dash
        "lsquo"    #"^(2018)" ; 8216 Left single quotation mark
        "rsquo"    #"^(2019)" ; 8217 Right single quotation mark
        "sbquo"    #"^(201A)" ; 8218 Single low-9 quotation mark
        "ldquo"    #"^(201C)" ; 8220 Left double quotation mark
        "rdquo"    #"^(201D)" ; 8221 Right double quotation mark
        "bdquo"    #"^(201E)" ; 8222 Double low-9 quotation mark
        "dagger"   #"^(2020)" ; 8224 Dagger
        "Dagger"   #"^(2021)" ; 8225 Double dagger
        "permil"   #"^(2030)" ; 8240 Per mille
        "prime"    #"^(2032)" ; 8242 Minutes (Degrees)
        "Prime"    #"^(2033)" ; 8243 Seconds (Degrees)
        "lsaquo"   #"^(2039)" ; 8249 Single left angle quotation
        "rsaquo"   #"^(203A)" ; 8250 Single right angle quotation
        "oline"    #"^(203E)" ; 8254 Overline
        "lceil"    #"^(2308)" ; 8968 Left ceiling
        "rceil"    #"^(2309)" ; 8969 Right ceiling
        "lfloor"   #"^(230A)" ; 8970 Left floor
        "rfloor"   #"^(230B)" ; 8971 Right floor
        "loz"      #"^(25CA)" ; 9674 Lozenge
        "ensp"     #"^(2002)" ; 8194 En space
        "emsp"     #"^(2003)" ; 8195 Em space
        "thinsp"   #"^(2009)" ; 8201 Thin space
        "zwnj"     #"^(200C)" ; 8204 Zero width non-joiner
        "zwj"      #"^(200D)" ; 8205 Zero width joiner
        "lrm"      #"^(200E)" ; 8206 Left-to-right mark
        "rlm"      #"^(200F)" ; 8207 Right-to-left mark
    )
    ;-- Anonymous context keeping the parse charsets private; only
    ;-- `decode-html-entities` is exported (via `set`) to the global context.
    context [
        any-except-&: complement charset "&"
        ;-- Entity names may contain digits (frac34, sup1, there4, ...), so the
        ;-- name charset has to accept letters AND digits.
        alphanum:     charset [#"0" - #"9" #"a" - #"z" #"A" - #"Z"]
        digits:       charset [#"0" - #"9"]

        set 'decode-html-entities func [
            {Creates a new string with possible HTML entities converted to chars}
            val [string!] {Input string}
            /local out s e char
        ][
            ;-- Decoding never grows the text here (each entity becomes one
            ;-- char), so the input length is a sufficient preallocation.
            out: make string! length? val
            parse val [
                any [
                    ;-- Plain text run: append it directly, no temporary copy
                    s: some any-except-& e: (append/part out s e)
                    | #"&" [
                        ;-- Decimal numeric entity of 1 to 4 digits, e.g. &#162;
                        #"#" copy char 1 4 digits #";" (
                            append out to char! to integer! char
                        )
                        ;-- Named entity of 1 to 10 alphanumeric chars, e.g. &frac34;
                        | s: copy char 1 10 alphanum #";" e: (
                            char: select/case html-entities char
                            ;-- Unknown name: emit the "&" literally and rewind
                            ;-- to just after it, so the name is re-read as text
                            unless char [char: #"&" e: :s]
                            append out char
                        ) :e
                        ;-- Lone ampersand that does not start an entity
                        | (append out #"&")
                    ]
                ]
            ]
            out
        ]
    ]