pip install 'konoha[SentenceTokenizer]'
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def convert_hex_char(string):
    """Decode backslash escapes (e.g. ``"\\x41"``) embedded in *string*.

    ``unicode-escape`` interprets each ``\\xNN`` escape as a Latin-1 code
    point. That is correct for ASCII, but escaped UTF-8 byte sequences of
    Japanese text (e.g. ``"\\xe3\\x81\\x82"``) come out as mojibake. To fix
    that, round-trip the result through Latin-1 back into UTF-8; if the
    round-trip is not possible, fall back to the plain decode.

    :param string: text containing literal ``\\xNN`` / ``\\uXXXX`` escapes
    :return: the decoded string
    """
    decoded_string = bytes(string, 'utf-8').decode('unicode-escape')
    try:
        # Re-pack the Latin-1 code points into raw bytes and decode them
        # as UTF-8, so multi-byte characters are reassembled correctly.
        return decoded_string.encode('latin-1').decode('utf-8')
    except (UnicodeEncodeError, UnicodeDecodeError):
        # Not representable as a Latin-1/UTF-8 round-trip: keep the
        # original behavior.
        return decoded_string
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from typing import Iterable | |
| from typing import Callable | |
| from typing import Text | |
| from MeCab import Tagger | |
| def _get_tagger() -> Tagger: | |
| opts = getenv('MECAB_OPTS', '-d /usr/local/Cellar/mecab-ipadic/2.7.0-20070801/lib/mecab/dic/ipadic') | |
| tagger = Tagger(opts) | |
| # for some reason the first request to the tagger doesn't produce output |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Reference:
# https://advertools.readthedocs.io/en/master/advertools.stopwords.html
# Install with: pip install advertools
import advertools as adv

# Listing the keys shows which languages are supported.
adv.stopwords.keys()

# Look up the Japanese stopword collection.
stopwords = adv.stopwords['japanese']
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from fugashi import GenericTagger as Tagger | |
| tagger = Tagger('-r /dev/null -d /usr/local/Cellar/mecab-ipadic/2.7.0-20070801/lib/mecab/dic/ipadic') | |
| def tokenize_lemmatize(text, remove_stopwords = True, lemmatize = False): | |
| tokens = tagger.parseToNodeList(text) | |
| if remove_stopwords: | |
| tokens = filter(lambda token : not is_stopword(token.surface.strip()), tokens) | |
| if lemmatize: |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from MeCab import Tagger


def _get_tagger() -> Tagger:
    """Build and pre-warm a MeCab ``Tagger``.

    The dictionary options come from the ``MECAB_OPTS`` environment
    variable, defaulting to a Homebrew mecab-ipadic install path.

    :return: a ready-to-use ``Tagger`` instance
    """
    # Fix: `getenv` was used without being imported anywhere in this
    # snippet, which raises NameError at call time.
    from os import getenv

    opts = getenv('MECAB_OPTS', '-d /usr/local/Cellar/mecab-ipadic/2.7.0-20070801/lib/mecab/dic/ipadic')
    tagger = Tagger(opts)
    # For some reason the first request to the tagger doesn't produce
    # output, so pre-warm it here once to avoid serving daft results later.
    parsed = tagger.parseToNode('サザエさんは走った')
    while parsed:
        parsed = parsed.next
    # Fix: the `-> Tagger` annotation promises a tagger, but the visible
    # snippet never returned it, so callers would receive None.
    return tagger
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Export the variables defined in .env into the current environment.
# Fix: the original test was `[ ! -f .env ]`, which only ran the export
# when the file did NOT exist (so `cat .env` always failed). The guard
# must fire when the file IS present.
if [ -f .env ]
then
  export $(xargs < .env)
fi
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## Create the virtual environment
conda create -n 'environment_name'
## Activate the virtual environment
conda activate 'environment_name'
## Make sure that ipykernel is installed
pip install --user ipykernel
## Add the new virtual environment to Jupyter
# Fix: the final step announced by the comment above was missing —
# this registers the environment as a selectable Jupyter kernel.
python -m ipykernel install --user --name='environment_name'
pip install spacy
python -m spacy download ja_core_news_sm
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import unicodedata

jText = "あ 32"
# Fix: use NFKC rather than NFKD. Both fold compatibility characters
# (full-width ASCII, etc.), but NFKD *decomposes* voiced kana into a base
# character plus a combining dakuten (ガ -> カ + ゛), which breaks
# dictionary-based Japanese tokenizers such as MeCab. NFKC composes the
# result back into single code points and is the form recommended for
# Japanese NLP preprocessing.
jTextNormal = unicodedata.normalize('NFKC', jText)
Newer | Older