Created
January 12, 2016 20:51
-
-
Save bitterbit/af658d6083e54c2eb1fb to your computer and use it in GitHub Desktop.
Detect different text languages
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| <?php | |
/**
 * Naive language detector.
 *
 * Each configured language owns a small list of tokens (frequent words or
 * characters). A text is scored against every language by counting token
 * occurrences, and the language with the highest score is reported.
 */
class MY_Lang_detector {
    const MAX_WORDS_TO_CHECK = 50;

    /**
     * Configured languages, keyed by the code that detect_language() returns.
     *
     * Declared explicitly (the constructor used to create it dynamically,
     * which is deprecated since PHP 8.2). Kept public because a dynamically
     * created property was publicly readable.
     *
     * @var Language[]
     */
    public $languages = array();

    public function __construct()
    {
        $this->languages = $this->getLangs();
    }

    /**
     * Builds the table of supported languages.
     *
     * All tokens must be lowercase, because detect_language() lowercases the
     * text before matching.
     *
     * @return Language[] map of language code => Language
     */
    private function getLangs()
    {
        return array(
            'en' => new Language(
                // 'i' is lowercase on purpose: an uppercase 'I' can never match lowercased text.
                array('the', 'be', 'to', 'of', 'and', 'a', 'in', 'that', 'have', 'i', 'it', 'for', 'not', 'on', 'with', 'he', 'as', 'you', 'do', 'at'),
                Language::MODE_INCLUDE_WORD
            ),
            'de' => new Language(
                array('der', 'die', 'und', 'in', 'den', 'von', 'zu', 'das', 'mit', 'sich', 'des', 'auf', 'für', 'ist', 'im', 'dem', 'nicht', 'ein', 'eine'),
                Language::MODE_INCLUDE_WORD
            ),
            'kr' => new Language(
                array('것', '하다', '있다', '수', '하다', '나', '없다', '않다', '사람', '우리', '그', '아니다', '보다', '거', '보다', '같다', '주다', '대하다', '가다', '년', '한', '말', '일', '이', '당신에게','필요한','건','많은','리뷰가','아니에요중요한','건','내','피부에','맞는','제품과비슷한','사람의','평가죠피부테스트와','사용제품','두','개로가치를','경험하세요'),
                Language::MODE_INCLUDE_CHAR
            ),
            'he' => new Language(
                array('א','ב','ג','ד','ה','ו','ז','ח','ט','י','כ','ל','מ','נ','ס','ע','פ','צ','ק','ר','ש','ת'),
                Language::MODE_INCLUDE_CHAR
            )
        );
    }

    /**
     * Guesses the language of $text.
     *
     * @param string $text    text to classify (expected to be UTF-8)
     * @param mixed  $default value returned when no configured language
     *                        matches the text at all
     * @return mixed the code of the best-scoring language, or $default when
     *               every language scored zero. On a tie the language that
     *               comes first in getLangs() wins.
     */
    public function detect_language($text, $default)
    {
        // Multibyte-aware lowercasing: strtolower() works on bytes and does
        // not lowercase non-ASCII letters such as 'Ü'.
        $text = mb_strtolower((string) $text, 'UTF-8');

        $counter = array();
        foreach ($this->languages as $lang_name => $lang) {
            $counter[$lang_name] = $lang->match_count($text);
        }

        // max() rejects an empty array, so bail out before calling it.
        if (count($counter) === 0) {
            return $default;
        }

        $max = max($counter);

        // Nothing matched for any language: there is no winner to report.
        // (The previous "count($maxs) == 0" guard could never fire, because
        // max() always yields a value that is present in the array.)
        if ($max <= 0) {
            return $default;
        }

        // The old runner-up/margin check returned the top scorer on both of
        // its branches, so it is dropped without changing the result.
        $maxs = array_keys($counter, $max, true);

        return $maxs[0];
    }
}
/**
 * A language described by a list of characteristic tokens, plus the rule
 * used to look those tokens up in a text.
 */
class Language {
    /** @var array tokens (words or characters) typical for the language */
    private $words;

    /** @var int one of the MODE_* constants */
    private $mode;

    /** Upper bound on how many tokens of the list are evaluated. */
    const MAX_WORDS_TO_CHECK = 50;

    /** Tokens must appear as whole words. */
    const MODE_INCLUDE_WORD = 1;

    /** Tokens are counted wherever they appear, even inside other words. */
    const MODE_INCLUDE_CHAR = 2;

    /**
     * @param array $words tokens to look for; matching is case-sensitive
     * @param int   $mode  MODE_INCLUDE_WORD (default) or MODE_INCLUDE_CHAR
     */
    public function __construct($words, $mode=self::MODE_INCLUDE_WORD)
    {
        $this->words = $words;
        $this->mode = $mode;
    }

    /**
     * Counts the occurrences of the configured tokens in $text.
     *
     * Only the first MAX_WORDS_TO_CHECK tokens are evaluated.
     *
     * @param string $text text to scan
     * @return int total number of token occurrences
     */
    public function match_count($text)
    {
        $counter = 0;

        // array_slice + foreach works for any array shape, whereas indexing
        // $this->words[$i] breaks on string or sparse keys.
        foreach (array_slice($this->words, 0, self::MAX_WORDS_TO_CHECK) as $word) {
            $counter += $this->get_matches($text, $word);
        }

        return $counter;
    }

    /**
     * Counts the occurrences of a single token according to the mode.
     *
     * @param string $text text to scan
     * @param string $word token to look for
     * @return int number of occurrences; 0 for an empty token or unknown mode
     */
    private function get_matches($text, $word)
    {
        $text = (string) $text;
        $word = (string) $word;

        // An empty needle is an error for mb_substr_count() and would match
        // everywhere as a regex, so it never counts.
        if ($word === '') {
            return 0;
        }

        if ($this->mode == self::MODE_INCLUDE_WORD) {
            // Whole-word match: the token may not touch a letter or digit on
            // either side. Unlike searching for ' word ', this also finds
            // words at the start/end of the text, next to punctuation, and
            // adjacent repetitions that share a single space.
            $pattern = '/(?<![\p{L}\p{N}])' . preg_quote($word, '/') . '(?![\p{L}\p{N}])/u';
            $found = preg_match_all($pattern, $text, $unused);

            if ($found !== false) {
                return $found;
            }

            // preg_match_all() fails on e.g. invalid UTF-8; fall back to the
            // plain space-delimited count on a padded copy of the text.
            return mb_substr_count(' ' . $text . ' ', ' ' . $word . ' ');
        }

        if ($this->mode == self::MODE_INCLUDE_CHAR) {
            return mb_substr_count($text, $word);
        }

        return 0;
    }
}
// Demo: run the detector over one sample per supported language and dump
// each result. Every entry is a (text, default) pair. Note that 'n\a' is
// single-quoted, so it is the literal three characters n, backslash, a.
$a = new MY_Lang_detector();

$samples = array(
    array('You need is not a lot of reviews importantlys assessment of the product with similar people fit my skin test and your experience using the product two values', 'en'),
    array('Bei einem Terroranschlag in Istanbul sind auch mindestens acht Deutsche getötet worden. Neun weitere Bundesbürger wurden zum Teil schwer verletzt. Der Selbstmordattentäter soll der Terrormiliz IS angehört haben. Lesen Sie alle Information', 'en'),
    array('딱 1주일 다이어트 습관 : 요요없는 건강한 다이어트', 'n\a'),
    array('גביע המדינה בכדורגל: מכבי פ"ת הדיחה את הפועל קריית שמונה', 'n\a'),
);

foreach ($samples as $sample) {
    var_dump($a->detect_language($sample[0], $sample[1]));
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment