* Output will be the generated regular expression.
 *
 * Get a current copy of emoji-data.txt from http://www.unicode.org/Public/emoji/latest/emoji-data.txt
 */

// The script needs exactly one thing: the path to an existing emoji-data.txt.
// Diagnostics go to STDERR because STDOUT carries the generated expression and
// is typically redirected or captured by the caller.
if (!isset($argv[1]) || !file_exists($argv[1])) {
    fwrite(STDERR, "Usage: build-hashtag-regexp.php <emoji-data.txt>\n");
    fwrite(STDERR, "\n");
    fwrite(STDERR, "Prints the generated PHP compatible Regular Expression to STDOUT.\n");
    fwrite(STDERR, "\n");
    fwrite(STDERR, "Grab emoji-data.txt from http://www.unicode.org/Public/emoji/latest/emoji-data.txt\n");
    exit(1);
}

$emojiFilename = $argv[1];

// file() returns false (e.g. for a directory or an unreadable file) even when
// file_exists() succeeded, so the result has to be checked before iterating.
$emojiFile = file($emojiFilename);
if ($emojiFile === false) {
    fwrite(STDERR, "Unable to read " . $emojiFilename . "\n");
    exit(1);
}

// Maps a property class name (second field of a data line) to the list of
// regular expression fragments built from that class's code points.
$emojiClasses = array();

foreach ($emojiFile as $line) {
    // Everything from the first '#' to the end of the line is a comment.
    $pos = strpos($line, '#');
    if ($pos !== false) {
        $line = substr($line, 0, $pos);
    }

    $line = trim($line);
    if ($line === '') {
        continue;
    }

    // A data line is "<code point or range> ; <class>"; anything else is skipped.
    $fields = explode(';', $line);
    if (count($fields) != 2) {
        continue;
    }

    $range = strtoupper(trim($fields[0]));
    $class = trim($fields[1]);

    if (!isset($emojiClasses[$class])) {
        $emojiClasses[$class] = array();
    }

    // "XXXX" is a single code point, "XXXX..YYYY" is an inclusive range.
    $range = explode('..', $range);
    if (count($range) == 1) {
        $emojiClasses[$class][] = '\\x{' . $range[0] . '}';
    } else {
        $emojiClasses[$class][] = '[\\x{' . $range[0] . '}-\\x{' . $range[1] . '}]';
    }
}

// Template written with \p{Class} placeholders; each placeholder is replaced
// below by an alternation of the fragments collected for that class.
$emojiRegexp = '(?:\\p{Emoji_Modifier_Base}\\p{Emoji_Modifier}?|\\p{Emoji_Presentation}|\\p{Emoji}\\x{FE0F}?)';

foreach ($emojiClasses as $class => $components) {
    $emojiRegexp = str_replace(
        '\\p{' . $class . '}',
        '(?:' . implode('|', $components) . ')',
        $emojiRegexp
    );
}

// Any placeholder still present means the data file had no entries for that
// class. The output is left untouched, but the problem is reported on STDERR
// instead of passing silently.
if (preg_match_all('/\\\\p\\{(\\w+)\\}/', $emojiRegexp, $unresolved) > 0) {
    foreach (array_unique($unresolved[1]) as $missingClass) {
        fwrite(STDERR, "Warning: no data found for class " . $missingClass . "; placeholder left unexpanded.\n");
    }
}

print $emojiRegexp;