Created
August 21, 2019 04:14
-
-
Save suyash/b77334af071113606b26db4aaefd5154 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "nbformat": 4, | |
| "nbformat_minor": 0, | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Python 3", | |
| "language": "python", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.6.8" | |
| }, | |
| "colab": { | |
| "name": "spm2.ipynb", | |
| "version": "0.3.2", | |
| "provenance": [], | |
| "collapsed_sections": [] | |
| } | |
| }, | |
| "cells": [ | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "2hEa2g4yX0Y5", | |
| "colab_type": "code", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 680 | |
| }, | |
| "outputId": "a84e4aaa-5392-4b29-a2d2-c882562d44dd" | |
| }, | |
| "source": [ | |
| "!pip install tensorflow==2.0.0b1 sentencepiece tf_sentencepiece" | |
| ], | |
| "execution_count": 1, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "text": [ | |
| "Collecting tensorflow==2.0.0b1\n", | |
| "\u001b[?25l Downloading https://files.pythonhosted.org/packages/29/6c/2c9a5c4d095c63c2fb37d20def0e4f92685f7aee9243d6aae25862694fd1/tensorflow-2.0.0b1-cp36-cp36m-manylinux1_x86_64.whl (87.9MB)\n", | |
| "\u001b[K |████████████████████████████████| 87.9MB 346kB/s \n", | |
| "\u001b[?25hCollecting sentencepiece\n", | |
| "\u001b[?25l Downloading https://files.pythonhosted.org/packages/14/3d/efb655a670b98f62ec32d66954e1109f403db4d937c50d779a75b9763a29/sentencepiece-0.1.83-cp36-cp36m-manylinux1_x86_64.whl (1.0MB)\n", | |
| "\u001b[K |████████████████████████████████| 1.0MB 35.0MB/s \n", | |
| "\u001b[?25hCollecting tf_sentencepiece\n", | |
| "\u001b[?25l Downloading https://files.pythonhosted.org/packages/dc/2c/20800032089a9271757921f3adc1f2c7ec2d294ec9fa07b3115fab9d27c2/tf_sentencepiece-0.1.83-py2.py3-none-manylinux1_x86_64.whl (2.7MB)\n", | |
| "\u001b[K |████████████████████████████████| 2.7MB 36.8MB/s \n", | |
| "\u001b[?25hRequirement already satisfied: gast>=0.2.0 in /usr/local/lib/python3.6/dist-packages (from tensorflow==2.0.0b1) (0.2.2)\n", | |
| "Requirement already satisfied: wrapt>=1.11.1 in /usr/local/lib/python3.6/dist-packages (from tensorflow==2.0.0b1) (1.11.2)\n", | |
| "Collecting tb-nightly<1.14.0a20190604,>=1.14.0a20190603 (from tensorflow==2.0.0b1)\n", | |
| "\u001b[?25l Downloading https://files.pythonhosted.org/packages/a4/96/571b875cd81dda9d5dfa1422a4f9d749e67c0a8d4f4f0b33a4e5f5f35e27/tb_nightly-1.14.0a20190603-py3-none-any.whl (3.1MB)\n", | |
| "\u001b[K |████████████████████████████████| 3.1MB 38.1MB/s \n", | |
| "\u001b[?25hRequirement already satisfied: astor>=0.6.0 in /usr/local/lib/python3.6/dist-packages (from tensorflow==2.0.0b1) (0.8.0)\n", | |
| "Requirement already satisfied: protobuf>=3.6.1 in /usr/local/lib/python3.6/dist-packages (from tensorflow==2.0.0b1) (3.7.1)\n", | |
| "Requirement already satisfied: numpy<2.0,>=1.14.5 in /usr/local/lib/python3.6/dist-packages (from tensorflow==2.0.0b1) (1.16.4)\n", | |
| "Requirement already satisfied: termcolor>=1.1.0 in /usr/local/lib/python3.6/dist-packages (from tensorflow==2.0.0b1) (1.1.0)\n", | |
| "Requirement already satisfied: keras-preprocessing>=1.0.5 in /usr/local/lib/python3.6/dist-packages (from tensorflow==2.0.0b1) (1.1.0)\n", | |
| "Collecting tf-estimator-nightly<1.14.0.dev2019060502,>=1.14.0.dev2019060501 (from tensorflow==2.0.0b1)\n", | |
| "\u001b[?25l Downloading https://files.pythonhosted.org/packages/32/dd/99c47dd007dcf10d63fd895611b063732646f23059c618a373e85019eb0e/tf_estimator_nightly-1.14.0.dev2019060501-py2.py3-none-any.whl (496kB)\n", | |
| "\u001b[K |████████████████████████████████| 501kB 31.9MB/s \n", | |
| "\u001b[?25hRequirement already satisfied: wheel>=0.26 in /usr/local/lib/python3.6/dist-packages (from tensorflow==2.0.0b1) (0.33.4)\n", | |
| "Requirement already satisfied: six>=1.10.0 in /usr/local/lib/python3.6/dist-packages (from tensorflow==2.0.0b1) (1.12.0)\n", | |
| "Requirement already satisfied: grpcio>=1.8.6 in /usr/local/lib/python3.6/dist-packages (from tensorflow==2.0.0b1) (1.15.0)\n", | |
| "Requirement already satisfied: absl-py>=0.7.0 in /usr/local/lib/python3.6/dist-packages (from tensorflow==2.0.0b1) (0.7.1)\n", | |
| "Requirement already satisfied: google-pasta>=0.1.6 in /usr/local/lib/python3.6/dist-packages (from tensorflow==2.0.0b1) (0.1.7)\n", | |
| "Requirement already satisfied: keras-applications>=1.0.6 in /usr/local/lib/python3.6/dist-packages (from tensorflow==2.0.0b1) (1.0.8)\n", | |
| "Requirement already satisfied: markdown>=2.6.8 in /usr/local/lib/python3.6/dist-packages (from tb-nightly<1.14.0a20190604,>=1.14.0a20190603->tensorflow==2.0.0b1) (3.1.1)\n", | |
| "Requirement already satisfied: werkzeug>=0.11.15 in /usr/local/lib/python3.6/dist-packages (from tb-nightly<1.14.0a20190604,>=1.14.0a20190603->tensorflow==2.0.0b1) (0.15.5)\n", | |
| "Requirement already satisfied: setuptools>=41.0.0 in /usr/local/lib/python3.6/dist-packages (from tb-nightly<1.14.0a20190604,>=1.14.0a20190603->tensorflow==2.0.0b1) (41.0.1)\n", | |
| "Requirement already satisfied: h5py in /usr/local/lib/python3.6/dist-packages (from keras-applications>=1.0.6->tensorflow==2.0.0b1) (2.8.0)\n", | |
| "Installing collected packages: tb-nightly, tf-estimator-nightly, tensorflow, sentencepiece, tf-sentencepiece\n", | |
| " Found existing installation: tensorflow 1.14.0\n", | |
| " Uninstalling tensorflow-1.14.0:\n", | |
| " Successfully uninstalled tensorflow-1.14.0\n", | |
| "Successfully installed sentencepiece-0.1.83 tb-nightly-1.14.0a20190603 tensorflow-2.0.0b1 tf-estimator-nightly-1.14.0.dev2019060501 tf-sentencepiece-0.1.83\n" | |
| ], | |
| "name": "stdout" | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "mndedWQYX0ZE", | |
| "colab_type": "code", | |
| "colab": {} | |
| }, | |
| "source": [ | |
| "import sentencepiece as spm\n", | |
| "import tensorflow as tf\n", | |
| "import tf_sentencepiece as tfs" | |
| ], | |
| "execution_count": 0, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "X8UBiY3CX0ZL", | |
| "colab_type": "code", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 208 | |
| }, | |
| "outputId": "a8796557-7450-49a0-fbb5-902a031d8c90" | |
| }, | |
| "source": [ | |
| "!wget https://raw.githubusercontent.com/google/sentencepiece/master/data/botchan.txt" | |
| ], | |
| "execution_count": 3, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "text": [ | |
| "--2019-08-21 04:12:26-- https://raw.githubusercontent.com/google/sentencepiece/master/data/botchan.txt\n", | |
| "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...\n", | |
| "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.\n", | |
| "HTTP request sent, awaiting response... 200 OK\n", | |
| "Length: 278779 (272K) [text/plain]\n", | |
| "Saving to: ‘botchan.txt’\n", | |
| "\n", | |
| "\rbotchan.txt 0%[ ] 0 --.-KB/s \rbotchan.txt 100%[===================>] 272.25K --.-KB/s in 0.03s \n", | |
| "\n", | |
| "2019-08-21 04:12:27 (9.21 MB/s) - ‘botchan.txt’ saved [278779/278779]\n", | |
| "\n" | |
| ], | |
| "name": "stdout" | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "gmBlKDIQX0ZS", | |
| "colab_type": "code", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 34 | |
| }, | |
| "outputId": "a0732ecd-7f5d-4439-b3ca-946d740487be" | |
| }, | |
| "source": [ | |
| "# Train on botchan.txt with a vocabulary of 1200 pieces; the model prefix is m, and later cells load m.model.\n", | |
| "spm.SentencePieceTrainer.train('--model_prefix=m --input=botchan.txt --vocab_size=1200')" | |
| ], | |
| "execution_count": 4, | |
| "outputs": [ | |
| { | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": [ | |
| "True" | |
| ] | |
| }, | |
| "metadata": { | |
| "tags": [] | |
| }, | |
| "execution_count": 4 | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "id": "AQ3Z5PgNX0ZW", | |
| "colab_type": "text" | |
| }, | |
| "source": [ | |
| "### Get piece size" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "y40NpIjuX0ZY", | |
| "colab_type": "code", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 34 | |
| }, | |
| "outputId": "20011571-aa58-4220-c5bd-db9e7f39a4ea" | |
| }, | |
| "source": [ | |
| "# Vocabulary size of the trained model, returned as a scalar int32 tensor (1200, matching --vocab_size).\n", | |
| "size = tfs.piece_size(model_file='m.model')\n", | |
| "size" | |
| ], | |
| "execution_count": 5, | |
| "outputs": [ | |
| { | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": [ | |
| "<tf.Tensor: id=0, shape=(), dtype=int32, numpy=1200>" | |
| ] | |
| }, | |
| "metadata": { | |
| "tags": [] | |
| }, | |
| "execution_count": 5 | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "id": "v35YSLUxX0Zd", | |
| "colab_type": "text" | |
| }, | |
| "source": [ | |
| "### id_to_piece and piece_to_id (scalar)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "ubx0qO1nX0Ze", | |
| "colab_type": "code", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 34 | |
| }, | |
| "outputId": "2d3004d9-2b1c-4604-a5af-171a1fddeb11" | |
| }, | |
| "source": [ | |
| "# Scalar case: look up the piece string for a single id.\n", | |
| "input_ids = tf.constant(100, dtype=tf.int32)\n", | |
| "pieces = tfs.id_to_piece(input_ids, model_file='m.model')\n", | |
| "pieces" | |
| ], | |
| "execution_count": 6, | |
| "outputs": [ | |
| { | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": [ | |
| "<tf.Tensor: id=3, shape=(), dtype=string, numpy=b'll'>" | |
| ] | |
| }, | |
| "metadata": { | |
| "tags": [] | |
| }, | |
| "execution_count": 6 | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "MSf54BtcX0Zk", | |
| "colab_type": "code", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 34 | |
| }, | |
| "outputId": "d46a62e9-7311-4a25-ff83-37f5f16f0e33" | |
| }, | |
| "source": [ | |
| "# Round trip: the piece from the previous cell maps back to the id it came from.\n", | |
| "tfs.piece_to_id(pieces, model_file='m.model')" | |
| ], | |
| "execution_count": 7, | |
| "outputs": [ | |
| { | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": [ | |
| "<tf.Tensor: id=5, shape=(), dtype=int32, numpy=100>" | |
| ] | |
| }, | |
| "metadata": { | |
| "tags": [] | |
| }, | |
| "execution_count": 7 | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "id": "C8lw1u6BX0Zr", | |
| "colab_type": "text" | |
| }, | |
| "source": [ | |
| "### id_to_piece and piece_to_id (1D)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "IgB4-Kx8X0Zt", | |
| "colab_type": "code", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 69 | |
| }, | |
| "outputId": "b8ce5df7-de3f-458c-e9f9-24679dcdafa5" | |
| }, | |
| "source": [ | |
| "# 1-D case: in this model ids 0, 1 and 2 are the special pieces <unk>, <s> and </s>.\n", | |
| "input_ids = tf.constant([0,1,2,3,4,5], dtype=tf.int32)\n", | |
| "pieces = tfs.id_to_piece(input_ids, model_file='m.model')\n", | |
| "pieces" | |
| ], | |
| "execution_count": 8, | |
| "outputs": [ | |
| { | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": [ | |
| "<tf.Tensor: id=8, shape=(6,), dtype=string, numpy=\n", | |
| "array([b'<unk>', b'<s>', b'</s>', b',', b'.', b'\\xe2\\x96\\x81the'],\n", | |
| " dtype=object)>" | |
| ] | |
| }, | |
| "metadata": { | |
| "tags": [] | |
| }, | |
| "execution_count": 8 | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "TWHri41YX0Z0", | |
| "colab_type": "code", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 34 | |
| }, | |
| "outputId": "b54b7643-34c3-4218-f14b-7e635f428ac2" | |
| }, | |
| "source": [ | |
| "# Round trip for the 1-D tensor: the original ids come back with the same shape.\n", | |
| "ids = tfs.piece_to_id(pieces, model_file='m.model')\n", | |
| "ids" | |
| ], | |
| "execution_count": 9, | |
| "outputs": [ | |
| { | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": [ | |
| "<tf.Tensor: id=10, shape=(6,), dtype=int32, numpy=array([0, 1, 2, 3, 4, 5], dtype=int32)>" | |
| ] | |
| }, | |
| "metadata": { | |
| "tags": [] | |
| }, | |
| "execution_count": 9 | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "id": "ZiexeLJFX0Z5", | |
| "colab_type": "text" | |
| }, | |
| "source": [ | |
| "### id_to_piece and piece_to_id (2D)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "KjvvkQ8UX0Z8", | |
| "colab_type": "code", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 86 | |
| }, | |
| "outputId": "7cfdd4ab-6bad-4df8-8d09-73cb9cfa6a30" | |
| }, | |
| "source": [ | |
| "# 2-D case: the lookup is applied element by element and the (2, 5) shape is kept.\n", | |
| "input_ids = tf.constant([[0,1,2,3,4],[5,6,7,8,9]], dtype=tf.int32)\n", | |
| "pieces = tfs.id_to_piece(input_ids, model_file='m.model')\n", | |
| "pieces" | |
| ], | |
| "execution_count": 10, | |
| "outputs": [ | |
| { | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": [ | |
| "<tf.Tensor: id=13, shape=(2, 5), dtype=string, numpy=\n", | |
| "array([[b'<unk>', b'<s>', b'</s>', b',', b'.'],\n", | |
| " [b'\\xe2\\x96\\x81the', b's', b'\\xe2\\x96\\x81I', b'\\xe2\\x96\\x81',\n", | |
| " b'\\xe2\\x96\\x81to']], dtype=object)>" | |
| ] | |
| }, | |
| "metadata": { | |
| "tags": [] | |
| }, | |
| "execution_count": 10 | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "F9twViRVX0aA", | |
| "colab_type": "code", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 69 | |
| }, | |
| "outputId": "488e3a11-cb55-42d3-e812-0ee917d4c401" | |
| }, | |
| "source": [ | |
| "# Round trip for the 2-D tensor: the original ids come back with the same shape.\n", | |
| "ids = tfs.piece_to_id(pieces, model_file='m.model')\n", | |
| "ids" | |
| ], | |
| "execution_count": 11, | |
| "outputs": [ | |
| { | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": [ | |
| "<tf.Tensor: id=15, shape=(2, 5), dtype=int32, numpy=\n", | |
| "array([[0, 1, 2, 3, 4],\n", | |
| " [5, 6, 7, 8, 9]], dtype=int32)>" | |
| ] | |
| }, | |
| "metadata": { | |
| "tags": [] | |
| }, | |
| "execution_count": 11 | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "id": "7HPssPr_X0aF", | |
| "colab_type": "text" | |
| }, | |
| "source": [ | |
| "### Passing the serialized model (model_proto) instead of a file path" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "GuphTn1sX0aH", | |
| "colab_type": "code", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 34 | |
| }, | |
| "outputId": "744a139d-ad32-4087-c38f-707de7c9dd3f" | |
| }, | |
| "source": [ | |
| "proto = tf.io.gfile.GFile('m.model', 'rb').read()\n", | |
| "tfs.piece_size(model_proto=proto)" | |
| ], | |
| "execution_count": 12, | |
| "outputs": [ | |
| { | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": [ | |
| "<tf.Tensor: id=17, shape=(), dtype=int32, numpy=1200>" | |
| ] | |
| }, | |
| "metadata": { | |
| "tags": [] | |
| }, | |
| "execution_count": 12 | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "id": "S96BAnfMX0aL", | |
| "colab_type": "text" | |
| }, | |
| "source": [ | |
| "### is_unknown and is_control" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "vIJ2q9GsX0aO", | |
| "colab_type": "code", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 52 | |
| }, | |
| "outputId": "bcb7a5d5-712f-46e8-ebeb-315d25f68d12" | |
| }, | |
| "source": [ | |
| "# Per-id boolean flags: in this model id 0 is the unknown piece and ids 1 and 2 are control pieces.\n", | |
| "input_ids = tf.constant([0,1,2,3,4,5], dtype=tf.int32)\n", | |
| "is_unknown = tfs.is_unknown(input_ids, model_file='m.model')\n", | |
| "is_control = tfs.is_control(input_ids, model_file='m.model')\n", | |
| "is_unknown, is_control" | |
| ], | |
| "execution_count": 13, | |
| "outputs": [ | |
| { | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": [ | |
| "(<tf.Tensor: id=20, shape=(6,), dtype=bool, numpy=array([ True, False, False, False, False, False])>,\n", | |
| " <tf.Tensor: id=21, shape=(6,), dtype=bool, numpy=array([False, True, True, False, False, False])>)" | |
| ] | |
| }, | |
| "metadata": { | |
| "tags": [] | |
| }, | |
| "execution_count": 13 | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "id": "N-HeR_hYX0aS", | |
| "colab_type": "text" | |
| }, | |
| "source": [ | |
| "### encode, encode_sparse, decode" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "jgQtpZRfX0aU", | |
| "colab_type": "code", | |
| "colab": {} | |
| }, | |
| "source": [ | |
| "# Sentences of different lengths, so the encoded batch has rows of different true length.\n", | |
| "input_text = ['hello world.', 'I have a dog.', 'I have an apple.', 'this is a problem that we have to solve', 'Suyash is a good boy']" | |
| ], | |
| "execution_count": 0, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "f7c9-05ZX0aX", | |
| "colab_type": "code", | |
| "colab": {} | |
| }, | |
| "source": [ | |
| "model_proto = tf.io.gfile.GFile('m.model', 'rb').read()" | |
| ], | |
| "execution_count": 0, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "uopMpoumX0ab", | |
| "colab_type": "code", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 156 | |
| }, | |
| "outputId": "345ab5cd-4c0b-41b4-b8d9-aa80944474b7" | |
| }, | |
| "source": [ | |
| "# encode returns a dense id matrix, padded with 0 up to the longest row, plus the true length of each row.\n", | |
| "ids, seq_len = tfs.encode(input_text, model_proto=model_proto)\n", | |
| "ids, seq_len" | |
| ], | |
| "execution_count": 16, | |
| "outputs": [ | |
| { | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": [ | |
| "(<tf.Tensor: id=27, shape=(5, 13), dtype=int32, numpy=\n", | |
| " array([[ 35, 100, 22, 940, 4, 0, 0, 0, 0, 0, 0, 0, 0],\n", | |
| " [ 7, 68, 10, 85, 46, 4, 0, 0, 0, 0, 0, 0, 0],\n", | |
| " [ 7, 68, 154, 10, 37, 37, 78, 4, 0, 0, 0, 0, 0],\n", | |
| " [ 56, 42, 10, 223, 339, 30, 28, 112, 68, 9, 63, 44, 143],\n", | |
| " [210, 54, 31, 439, 42, 10, 281, 316, 31, 0, 0, 0, 0]],\n", | |
| " dtype=int32)>,\n", | |
| " <tf.Tensor: id=28, shape=(5,), dtype=int32, numpy=array([ 5, 6, 8, 13, 9], dtype=int32)>)" | |
| ] | |
| }, | |
| "metadata": { | |
| "tags": [] | |
| }, | |
| "execution_count": 16 | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "zwMD1qpUX0af", | |
| "colab_type": "code", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 139 | |
| }, | |
| "outputId": "122fbe20-24ad-4785-e0f8-e50c9f0b0bea" | |
| }, | |
| "source": [ | |
| "# encode_sparse returns a sparse result; densifying it gives the same padded matrix that encode produced.\n", | |
| "sparse_ids = tfs.encode_sparse(input_text, model_proto=model_proto)\n", | |
| "tf.sparse.to_dense(sparse_ids)" | |
| ], | |
| "execution_count": 17, | |
| "outputs": [ | |
| { | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": [ | |
| "<tf.Tensor: id=38, shape=(5, 13), dtype=int32, numpy=\n", | |
| "array([[ 35, 100, 22, 940, 4, 0, 0, 0, 0, 0, 0, 0, 0],\n", | |
| " [ 7, 68, 10, 85, 46, 4, 0, 0, 0, 0, 0, 0, 0],\n", | |
| " [ 7, 68, 154, 10, 37, 37, 78, 4, 0, 0, 0, 0, 0],\n", | |
| " [ 56, 42, 10, 223, 339, 30, 28, 112, 68, 9, 63, 44, 143],\n", | |
| " [210, 54, 31, 439, 42, 10, 281, 316, 31, 0, 0, 0, 0]],\n", | |
| " dtype=int32)>" | |
| ] | |
| }, | |
| "metadata": { | |
| "tags": [] | |
| }, | |
| "execution_count": 17 | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "wEC5G5y5X0ai", | |
| "colab_type": "code", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 86 | |
| }, | |
| "outputId": "f86ff06c-0139-43bc-ae27-1ae76409768b" | |
| }, | |
| "source": [ | |
| "# decode takes the padded ids together with the per-row lengths and recovers the original strings.\n", | |
| "tfs.decode(ids, seq_len, model_proto=model_proto)" | |
| ], | |
| "execution_count": 18, | |
| "outputs": [ | |
| { | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": [ | |
| "<tf.Tensor: id=40, shape=(5,), dtype=string, numpy=\n", | |
| "array([b'hello world.', b'I have a dog.', b'I have an apple.',\n", | |
| " b'this is a problem that we have to solve',\n", | |
| " b'Suyash is a good boy'], dtype=object)>" | |
| ] | |
| }, | |
| "metadata": { | |
| "tags": [] | |
| }, | |
| "execution_count": 18 | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "2rUKDjKbYHH8", | |
| "colab_type": "code", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 312 | |
| }, | |
| "outputId": "681aae8d-ae38-4d1c-a163-33e836b24d4c" | |
| }, | |
| "source": [ | |
| "# Pieces for the encoded batch. The padding id 0 is also the <unk> id, so padded positions show up as <unk>.\n", | |
| "tfs.id_to_piece(ids, model_proto=model_proto)" | |
| ], | |
| "execution_count": 19, | |
| "outputs": [ | |
| { | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": [ | |
| "<tf.Tensor: id=42, shape=(5, 13), dtype=string, numpy=\n", | |
| "array([[b'\\xe2\\x96\\x81he', b'll', b'o', b'\\xe2\\x96\\x81world', b'.',\n", | |
| " b'<unk>', b'<unk>', b'<unk>', b'<unk>', b'<unk>', b'<unk>',\n", | |
| " b'<unk>', b'<unk>'],\n", | |
| " [b'\\xe2\\x96\\x81I', b'\\xe2\\x96\\x81have', b'\\xe2\\x96\\x81a',\n", | |
| " b'\\xe2\\x96\\x81do', b'g', b'.', b'<unk>', b'<unk>', b'<unk>',\n", | |
| " b'<unk>', b'<unk>', b'<unk>', b'<unk>'],\n", | |
| " [b'\\xe2\\x96\\x81I', b'\\xe2\\x96\\x81have', b'\\xe2\\x96\\x81an',\n", | |
| " b'\\xe2\\x96\\x81a', b'p', b'p', b'le', b'.', b'<unk>', b'<unk>',\n", | |
| " b'<unk>', b'<unk>', b'<unk>'],\n", | |
| " [b'\\xe2\\x96\\x81this', b'\\xe2\\x96\\x81is', b'\\xe2\\x96\\x81a',\n", | |
| " b'\\xe2\\x96\\x81pro', b'ble', b'm', b'\\xe2\\x96\\x81that',\n", | |
| " b'\\xe2\\x96\\x81we', b'\\xe2\\x96\\x81have', b'\\xe2\\x96\\x81to',\n", | |
| " b'\\xe2\\x96\\x81so', b'l', b've'],\n", | |
| " [b'\\xe2\\x96\\x81S', b'u', b'y', b'ash', b'\\xe2\\x96\\x81is',\n", | |
| " b'\\xe2\\x96\\x81a', b'\\xe2\\x96\\x81good', b'\\xe2\\x96\\x81bo', b'y',\n", | |
| " b'<unk>', b'<unk>', b'<unk>', b'<unk>']], dtype=object)>" | |
| ] | |
| }, | |
| "metadata": { | |
| "tags": [] | |
| }, | |
| "execution_count": 19 | |
| } | |
| ] | |
| } | |
| ] | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment