suyash · August 21, 2019 04:14
diff --git a/spm.ipynb b/spm.ipynb
 {
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.6.8"
    },
    "colab": {
      "name": "spm2.ipynb",
      "version": "0.3.2",
      "provenance": [],
      "collapsed_sections": []
    }
  },
  "cells": [
    {
      "cell_type": "code",
      "metadata": {
        "id": "2hEa2g4yX0Y5",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 680
        },
        "outputId": "a84e4aaa-5392-4b29-a2d2-c882562d44dd"
      },
      "source": [
        "# Pin every package (not only tensorflow) so a fresh runtime resolves the same,\n",
        "# mutually compatible versions this notebook was last executed with.\n",
        "!pip install tensorflow==2.0.0b1 sentencepiece==0.1.83 tf_sentencepiece==0.1.83"
      ],
      "execution_count": 1,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Collecting tensorflow==2.0.0b1\n",
            "\u001b[?25l  Downloading https://files.pythonhosted.org/packages/29/6c/2c9a5c4d095c63c2fb37d20def0e4f92685f7aee9243d6aae25862694fd1/tensorflow-2.0.0b1-cp36-cp36m-manylinux1_x86_64.whl (87.9MB)\n",
            "\u001b[K     |████████████████████████████████| 87.9MB 346kB/s \n",
            "\u001b[?25hCollecting sentencepiece\n",
            "\u001b[?25l  Downloading https://files.pythonhosted.org/packages/14/3d/efb655a670b98f62ec32d66954e1109f403db4d937c50d779a75b9763a29/sentencepiece-0.1.83-cp36-cp36m-manylinux1_x86_64.whl (1.0MB)\n",
            "\u001b[K     |████████████████████████████████| 1.0MB 35.0MB/s \n",
            "\u001b[?25hCollecting tf_sentencepiece\n",
            "\u001b[?25l  Downloading https://files.pythonhosted.org/packages/dc/2c/20800032089a9271757921f3adc1f2c7ec2d294ec9fa07b3115fab9d27c2/tf_sentencepiece-0.1.83-py2.py3-none-manylinux1_x86_64.whl (2.7MB)\n",
            "\u001b[K     |████████████████████████████████| 2.7MB 36.8MB/s \n",
            "\u001b[?25hRequirement already satisfied: gast>=0.2.0 in /usr/local/lib/python3.6/dist-packages (from tensorflow==2.0.0b1) (0.2.2)\n",
            "Requirement already satisfied: wrapt>=1.11.1 in /usr/local/lib/python3.6/dist-packages (from tensorflow==2.0.0b1) (1.11.2)\n",
            "Collecting tb-nightly<1.14.0a20190604,>=1.14.0a20190603 (from tensorflow==2.0.0b1)\n",
            "\u001b[?25l  Downloading https://files.pythonhosted.org/packages/a4/96/571b875cd81dda9d5dfa1422a4f9d749e67c0a8d4f4f0b33a4e5f5f35e27/tb_nightly-1.14.0a20190603-py3-none-any.whl (3.1MB)\n",
            "\u001b[K     |████████████████████████████████| 3.1MB 38.1MB/s \n",
            "\u001b[?25hRequirement already satisfied: astor>=0.6.0 in /usr/local/lib/python3.6/dist-packages (from tensorflow==2.0.0b1) (0.8.0)\n",
            "Requirement already satisfied: protobuf>=3.6.1 in /usr/local/lib/python3.6/dist-packages (from tensorflow==2.0.0b1) (3.7.1)\n",
            "Requirement already satisfied: numpy<2.0,>=1.14.5 in /usr/local/lib/python3.6/dist-packages (from tensorflow==2.0.0b1) (1.16.4)\n",
            "Requirement already satisfied: termcolor>=1.1.0 in /usr/local/lib/python3.6/dist-packages (from tensorflow==2.0.0b1) (1.1.0)\n",
            "Requirement already satisfied: keras-preprocessing>=1.0.5 in /usr/local/lib/python3.6/dist-packages (from tensorflow==2.0.0b1) (1.1.0)\n",
            "Collecting tf-estimator-nightly<1.14.0.dev2019060502,>=1.14.0.dev2019060501 (from tensorflow==2.0.0b1)\n",
            "\u001b[?25l  Downloading https://files.pythonhosted.org/packages/32/dd/99c47dd007dcf10d63fd895611b063732646f23059c618a373e85019eb0e/tf_estimator_nightly-1.14.0.dev2019060501-py2.py3-none-any.whl (496kB)\n",
            "\u001b[K     |████████████████████████████████| 501kB 31.9MB/s \n",
            "\u001b[?25hRequirement already satisfied: wheel>=0.26 in /usr/local/lib/python3.6/dist-packages (from tensorflow==2.0.0b1) (0.33.4)\n",
            "Requirement already satisfied: six>=1.10.0 in /usr/local/lib/python3.6/dist-packages (from tensorflow==2.0.0b1) (1.12.0)\n",
            "Requirement already satisfied: grpcio>=1.8.6 in /usr/local/lib/python3.6/dist-packages (from tensorflow==2.0.0b1) (1.15.0)\n",
            "Requirement already satisfied: absl-py>=0.7.0 in /usr/local/lib/python3.6/dist-packages (from tensorflow==2.0.0b1) (0.7.1)\n",
            "Requirement already satisfied: google-pasta>=0.1.6 in /usr/local/lib/python3.6/dist-packages (from tensorflow==2.0.0b1) (0.1.7)\n",
            "Requirement already satisfied: keras-applications>=1.0.6 in /usr/local/lib/python3.6/dist-packages (from tensorflow==2.0.0b1) (1.0.8)\n",
            "Requirement already satisfied: markdown>=2.6.8 in /usr/local/lib/python3.6/dist-packages (from tb-nightly<1.14.0a20190604,>=1.14.0a20190603->tensorflow==2.0.0b1) (3.1.1)\n",
            "Requirement already satisfied: werkzeug>=0.11.15 in /usr/local/lib/python3.6/dist-packages (from tb-nightly<1.14.0a20190604,>=1.14.0a20190603->tensorflow==2.0.0b1) (0.15.5)\n",
            "Requirement already satisfied: setuptools>=41.0.0 in /usr/local/lib/python3.6/dist-packages (from tb-nightly<1.14.0a20190604,>=1.14.0a20190603->tensorflow==2.0.0b1) (41.0.1)\n",
            "Requirement already satisfied: h5py in /usr/local/lib/python3.6/dist-packages (from keras-applications>=1.0.6->tensorflow==2.0.0b1) (2.8.0)\n",
            "Installing collected packages: tb-nightly, tf-estimator-nightly, tensorflow, sentencepiece, tf-sentencepiece\n",
            "  Found existing installation: tensorflow 1.14.0\n",
            "    Uninstalling tensorflow-1.14.0:\n",
            "      Successfully uninstalled tensorflow-1.14.0\n",
            "Successfully installed sentencepiece-0.1.83 tb-nightly-1.14.0a20190603 tensorflow-2.0.0b1 tf-estimator-nightly-1.14.0.dev2019060501 tf-sentencepiece-0.1.83\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "mndedWQYX0ZE",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "import sentencepiece as spm\n",
        "import tensorflow as tf\n",
        "import tf_sentencepiece as tfs"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "X8UBiY3CX0ZL",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 208
        },
        "outputId": "a8796557-7450-49a0-fbb5-902a031d8c90"
      },
      "source": [
        "# -nc (no-clobber): re-running this cell keeps the existing botchan.txt\n",
        "# instead of saving a duplicate copy as botchan.txt.1.\n",
        "!wget -nc https://raw.githubusercontent.com/google/sentencepiece/master/data/botchan.txt"
      ],
      "execution_count": 3,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "--2019-08-21 04:12:26--  https://raw.githubusercontent.com/google/sentencepiece/master/data/botchan.txt\n",
            "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...\n",
            "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.\n",
            "HTTP request sent, awaiting response... 200 OK\n",
            "Length: 278779 (272K) [text/plain]\n",
            "Saving to: ‘botchan.txt’\n",
            "\n",
            "\rbotchan.txt           0%[                    ]       0  --.-KB/s               \rbotchan.txt         100%[===================>] 272.25K  --.-KB/s    in 0.03s   \n",
            "\n",
            "2019-08-21 04:12:27 (9.21 MB/s) - ‘botchan.txt’ saved [278779/278779]\n",
            "\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "gmBlKDIQX0ZS",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 34
        },
        "outputId": "a0732ecd-7f5d-4439-b3ca-946d740487be"
      },
      "source": [
        "spm.SentencePieceTrainer.train('--model_prefix=m --input=botchan.txt --vocab_size=1200')"
      ],
      "execution_count": 4,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "True"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 4
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "AQ3Z5PgNX0ZW",
        "colab_type": "text"
      },
      "source": [
        "### Get piece size"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "y40NpIjuX0ZY",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 34
        },
        "outputId": "20011571-aa58-4220-c5bd-db9e7f39a4ea"
      },
      "source": [
        "size = tfs.piece_size(model_file='m.model')\n",
        "size"
      ],
      "execution_count": 5,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "<tf.Tensor: id=0, shape=(), dtype=int32, numpy=1200>"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 5
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "v35YSLUxX0Zd",
        "colab_type": "text"
      },
      "source": [
        "### id_to_piece and piece_to_id (constant)"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "ubx0qO1nX0Ze",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 34
        },
        "outputId": "2d3004d9-2b1c-4604-a5af-171a1fddeb11"
      },
      "source": [
        "input_ids = tf.constant(100, dtype=tf.int32)\n",
        "pieces = tfs.id_to_piece(input_ids, model_file='m.model')\n",
        "pieces"
      ],
      "execution_count": 6,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "<tf.Tensor: id=3, shape=(), dtype=string, numpy=b'll'>"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 6
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "MSf54BtcX0Zk",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 34
        },
        "outputId": "d46a62e9-7311-4a25-ff83-37f5f16f0e33"
      },
      "source": [
        "tfs.piece_to_id(pieces, model_file='m.model')"
      ],
      "execution_count": 7,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "<tf.Tensor: id=5, shape=(), dtype=int32, numpy=100>"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 7
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "C8lw1u6BX0Zr",
        "colab_type": "text"
      },
      "source": [
        "### id_to_piece and piece_to_id (1D)"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "IgB4-Kx8X0Zt",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 69
        },
        "outputId": "b8ce5df7-de3f-458c-e9f9-24679dcdafa5"
      },
      "source": [
        "input_ids = tf.constant([0,1,2,3,4,5], dtype=tf.int32)\n",
        "pieces = tfs.id_to_piece(input_ids, model_file='m.model')\n",
        "pieces"
      ],
      "execution_count": 8,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "<tf.Tensor: id=8, shape=(6,), dtype=string, numpy=\n",
              "array([b'<unk>', b'<s>', b'</s>', b',', b'.', b'\\xe2\\x96\\x81the'],\n",
              "      dtype=object)>"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 8
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "TWHri41YX0Z0",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 34
        },
        "outputId": "b54b7643-34c3-4218-f14b-7e635f428ac2"
      },
      "source": [
        "ids = tfs.piece_to_id(pieces, model_file='m.model')\n",
        "ids"
      ],
      "execution_count": 9,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "<tf.Tensor: id=10, shape=(6,), dtype=int32, numpy=array([0, 1, 2, 3, 4, 5], dtype=int32)>"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 9
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "ZiexeLJFX0Z5",
        "colab_type": "text"
      },
      "source": [
        "### id_to_piece and piece_to_id (2D)"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "KjvvkQ8UX0Z8",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 86
        },
        "outputId": "7cfdd4ab-6bad-4df8-8d09-73cb9cfa6a30"
      },
      "source": [
        "input_ids = tf.constant([[0,1,2,3,4],[5,6,7,8,9]], dtype=tf.int32)\n",
        "pieces = tfs.id_to_piece(input_ids, model_file='m.model')\n",
        "pieces"
      ],
      "execution_count": 10,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "<tf.Tensor: id=13, shape=(2, 5), dtype=string, numpy=\n",
              "array([[b'<unk>', b'<s>', b'</s>', b',', b'.'],\n",
              "       [b'\\xe2\\x96\\x81the', b's', b'\\xe2\\x96\\x81I', b'\\xe2\\x96\\x81',\n",
              "        b'\\xe2\\x96\\x81to']], dtype=object)>"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 10
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "F9twViRVX0aA",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 69
        },
        "outputId": "488e3a11-cb55-42d3-e812-0ee917d4c401"
      },
      "source": [
        "ids = tfs.piece_to_id(pieces, model_file='m.model')\n",
        "ids"
      ],
      "execution_count": 11,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "<tf.Tensor: id=15, shape=(2, 5), dtype=int32, numpy=\n",
              "array([[0, 1, 2, 3, 4],\n",
              "       [5, 6, 7, 8, 9]], dtype=int32)>"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 11
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "7HPssPr_X0aF",
        "colab_type": "text"
      },
      "source": [
        "### proto"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "GuphTn1sX0aH",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 34
        },
        "outputId": "744a139d-ad32-4087-c38f-707de7c9dd3f"
      },
      "source": [
        "# Read the serialized model inside a context manager so the file handle\n",
        "# is closed as soon as the bytes are loaded.\n",
        "with tf.io.gfile.GFile('m.model', 'rb') as model_fp:\n",
        "    proto = model_fp.read()\n",
        "tfs.piece_size(model_proto=proto)"
      ],
      "execution_count": 12,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "<tf.Tensor: id=17, shape=(), dtype=int32, numpy=1200>"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 12
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "S96BAnfMX0aL",
        "colab_type": "text"
      },
      "source": [
        "### is_unknown and is_control"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "vIJ2q9GsX0aO",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 52
        },
        "outputId": "bcb7a5d5-712f-46e8-ebeb-315d25f68d12"
      },
      "source": [
        "input_ids = tf.constant([0,1,2,3,4,5], dtype=tf.int32)\n",
        "is_unknown = tfs.is_unknown(input_ids, model_file='m.model')\n",
        "is_control = tfs.is_control(input_ids, model_file='m.model')\n",
        "is_unknown, is_control"
      ],
      "execution_count": 13,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "(<tf.Tensor: id=20, shape=(6,), dtype=bool, numpy=array([ True, False, False, False, False, False])>,\n",
              " <tf.Tensor: id=21, shape=(6,), dtype=bool, numpy=array([False,  True,  True, False, False, False])>)"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 13
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "N-HeR_hYX0aS",
        "colab_type": "text"
      },
      "source": [
        "### encode, encode_sparse, decode"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "jgQtpZRfX0aU",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "input_text = ['hello world.', 'I have a dog.', 'I have an apple.', 'this is a problem that we have to solve', 'Suyash is a good boy']"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "f7c9-05ZX0aX",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "# Load the serialized SentencePiece model once; the context manager\n",
        "# closes the file handle instead of leaving it open.\n",
        "with tf.io.gfile.GFile('m.model', 'rb') as model_fp:\n",
        "    model_proto = model_fp.read()"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "uopMpoumX0ab",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 156
        },
        "outputId": "345ab5cd-4c0b-41b4-b8d9-aa80944474b7"
      },
      "source": [
        "ids, seq_len = tfs.encode(input_text, model_proto=model_proto)\n",
        "ids, seq_len"
      ],
      "execution_count": 16,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "(<tf.Tensor: id=27, shape=(5, 13), dtype=int32, numpy=\n",
              " array([[ 35, 100,  22, 940,   4,   0,   0,   0,   0,   0,   0,   0,   0],\n",
              "        [  7,  68,  10,  85,  46,   4,   0,   0,   0,   0,   0,   0,   0],\n",
              "        [  7,  68, 154,  10,  37,  37,  78,   4,   0,   0,   0,   0,   0],\n",
              "        [ 56,  42,  10, 223, 339,  30,  28, 112,  68,   9,  63,  44, 143],\n",
              "        [210,  54,  31, 439,  42,  10, 281, 316,  31,   0,   0,   0,   0]],\n",
              "       dtype=int32)>,\n",
              " <tf.Tensor: id=28, shape=(5,), dtype=int32, numpy=array([ 5,  6,  8, 13,  9], dtype=int32)>)"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 16
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "zwMD1qpUX0af",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 139
        },
        "outputId": "122fbe20-24ad-4785-e0f8-e50c9f0b0bea"
      },
      "source": [
        "sparse_ids = tfs.encode_sparse(input_text, model_proto=model_proto)\n",
        "tf.sparse.to_dense(sparse_ids)"
      ],
      "execution_count": 17,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "<tf.Tensor: id=38, shape=(5, 13), dtype=int32, numpy=\n",
              "array([[ 35, 100,  22, 940,   4,   0,   0,   0,   0,   0,   0,   0,   0],\n",
              "       [  7,  68,  10,  85,  46,   4,   0,   0,   0,   0,   0,   0,   0],\n",
              "       [  7,  68, 154,  10,  37,  37,  78,   4,   0,   0,   0,   0,   0],\n",
              "       [ 56,  42,  10, 223, 339,  30,  28, 112,  68,   9,  63,  44, 143],\n",
              "       [210,  54,  31, 439,  42,  10, 281, 316,  31,   0,   0,   0,   0]],\n",
              "      dtype=int32)>"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 17
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "wEC5G5y5X0ai",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 86
        },
        "outputId": "f86ff06c-0139-43bc-ae27-1ae76409768b"
      },
      "source": [
        "tfs.decode(ids, seq_len, model_proto=model_proto)"
      ],
      "execution_count": 18,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "<tf.Tensor: id=40, shape=(5,), dtype=string, numpy=\n",
              "array([b'hello world.', b'I have a dog.', b'I have an apple.',\n",
              "       b'this is a problem that we have to solve',\n",
              "       b'Suyash is a good boy'], dtype=object)>"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 18
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "2rUKDjKbYHH8",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 312
        },
        "outputId": "681aae8d-ae38-4d1c-a163-33e836b24d4c"
      },
      "source": [
        "tfs.id_to_piece(ids, model_proto=model_proto)"
      ],
      "execution_count": 19,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "<tf.Tensor: id=42, shape=(5, 13), dtype=string, numpy=\n",
              "array([[b'\\xe2\\x96\\x81he', b'll', b'o', b'\\xe2\\x96\\x81world', b'.',\n",
              "        b'<unk>', b'<unk>', b'<unk>', b'<unk>', b'<unk>', b'<unk>',\n",
              "        b'<unk>', b'<unk>'],\n",
              "       [b'\\xe2\\x96\\x81I', b'\\xe2\\x96\\x81have', b'\\xe2\\x96\\x81a',\n",
              "        b'\\xe2\\x96\\x81do', b'g', b'.', b'<unk>', b'<unk>', b'<unk>',\n",
              "        b'<unk>', b'<unk>', b'<unk>', b'<unk>'],\n",
              "       [b'\\xe2\\x96\\x81I', b'\\xe2\\x96\\x81have', b'\\xe2\\x96\\x81an',\n",
              "        b'\\xe2\\x96\\x81a', b'p', b'p', b'le', b'.', b'<unk>', b'<unk>',\n",
              "        b'<unk>', b'<unk>', b'<unk>'],\n",
              "       [b'\\xe2\\x96\\x81this', b'\\xe2\\x96\\x81is', b'\\xe2\\x96\\x81a',\n",
              "        b'\\xe2\\x96\\x81pro', b'ble', b'm', b'\\xe2\\x96\\x81that',\n",
              "        b'\\xe2\\x96\\x81we', b'\\xe2\\x96\\x81have', b'\\xe2\\x96\\x81to',\n",
              "        b'\\xe2\\x96\\x81so', b'l', b've'],\n",
              "       [b'\\xe2\\x96\\x81S', b'u', b'y', b'ash', b'\\xe2\\x96\\x81is',\n",
              "        b'\\xe2\\x96\\x81a', b'\\xe2\\x96\\x81good', b'\\xe2\\x96\\x81bo', b'y',\n",
              "        b'<unk>', b'<unk>', b'<unk>', b'<unk>']], dtype=object)>"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 19
        }
      ]
    }
  ]
 }
	{
	"nbformat": 4,
	"nbformat_minor": 0,
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.6.8"
	},
	"colab": {
	"name": "spm2.ipynb",
	"version": "0.3.2",
	"provenance": [],
	"collapsed_sections": []
	}
	},
	"cells": [
	{
	"cell_type": "code",
	"metadata": {
	"id": "2hEa2g4yX0Y5",
	"colab_type": "code",
	"colab": {
	"base_uri": "https://localhost:8080/",
	"height": 680
	},
	"outputId": "a84e4aaa-5392-4b29-a2d2-c882562d44dd"
	},
	"source": [
	"# Pin every package (not only tensorflow) so a fresh runtime resolves the same,\n",
	"# mutually compatible versions this notebook was last executed with.\n",
	"!pip install tensorflow==2.0.0b1 sentencepiece==0.1.83 tf_sentencepiece==0.1.83"
	],
	"execution_count": 1,
	"outputs": [
	{
	"output_type": "stream",
	"text": [
	"Collecting tensorflow==2.0.0b1\n",
	"\u001b[?25l Downloading https://files.pythonhosted.org/packages/29/6c/2c9a5c4d095c63c2fb37d20def0e4f92685f7aee9243d6aae25862694fd1/tensorflow-2.0.0b1-cp36-cp36m-manylinux1_x86_64.whl (87.9MB)\n",
	"\u001b[K |████████████████████████████████| 87.9MB 346kB/s \n",
	"\u001b[?25hCollecting sentencepiece\n",
	"\u001b[?25l Downloading https://files.pythonhosted.org/packages/14/3d/efb655a670b98f62ec32d66954e1109f403db4d937c50d779a75b9763a29/sentencepiece-0.1.83-cp36-cp36m-manylinux1_x86_64.whl (1.0MB)\n",
	"\u001b[K |████████████████████████████████| 1.0MB 35.0MB/s \n",
	"\u001b[?25hCollecting tf_sentencepiece\n",
	"\u001b[?25l Downloading https://files.pythonhosted.org/packages/dc/2c/20800032089a9271757921f3adc1f2c7ec2d294ec9fa07b3115fab9d27c2/tf_sentencepiece-0.1.83-py2.py3-none-manylinux1_x86_64.whl (2.7MB)\n",
	"\u001b[K |████████████████████████████████| 2.7MB 36.8MB/s \n",
	"\u001b[?25hRequirement already satisfied: gast>=0.2.0 in /usr/local/lib/python3.6/dist-packages (from tensorflow==2.0.0b1) (0.2.2)\n",
	"Requirement already satisfied: wrapt>=1.11.1 in /usr/local/lib/python3.6/dist-packages (from tensorflow==2.0.0b1) (1.11.2)\n",
	"Collecting tb-nightly<1.14.0a20190604,>=1.14.0a20190603 (from tensorflow==2.0.0b1)\n",
	"\u001b[?25l Downloading https://files.pythonhosted.org/packages/a4/96/571b875cd81dda9d5dfa1422a4f9d749e67c0a8d4f4f0b33a4e5f5f35e27/tb_nightly-1.14.0a20190603-py3-none-any.whl (3.1MB)\n",
	"\u001b[K |████████████████████████████████| 3.1MB 38.1MB/s \n",
	"\u001b[?25hRequirement already satisfied: astor>=0.6.0 in /usr/local/lib/python3.6/dist-packages (from tensorflow==2.0.0b1) (0.8.0)\n",
	"Requirement already satisfied: protobuf>=3.6.1 in /usr/local/lib/python3.6/dist-packages (from tensorflow==2.0.0b1) (3.7.1)\n",
	"Requirement already satisfied: numpy<2.0,>=1.14.5 in /usr/local/lib/python3.6/dist-packages (from tensorflow==2.0.0b1) (1.16.4)\n",
	"Requirement already satisfied: termcolor>=1.1.0 in /usr/local/lib/python3.6/dist-packages (from tensorflow==2.0.0b1) (1.1.0)\n",
	"Requirement already satisfied: keras-preprocessing>=1.0.5 in /usr/local/lib/python3.6/dist-packages (from tensorflow==2.0.0b1) (1.1.0)\n",
	"Collecting tf-estimator-nightly<1.14.0.dev2019060502,>=1.14.0.dev2019060501 (from tensorflow==2.0.0b1)\n",
	"\u001b[?25l Downloading https://files.pythonhosted.org/packages/32/dd/99c47dd007dcf10d63fd895611b063732646f23059c618a373e85019eb0e/tf_estimator_nightly-1.14.0.dev2019060501-py2.py3-none-any.whl (496kB)\n",
	"\u001b[K |████████████████████████████████| 501kB 31.9MB/s \n",
	"\u001b[?25hRequirement already satisfied: wheel>=0.26 in /usr/local/lib/python3.6/dist-packages (from tensorflow==2.0.0b1) (0.33.4)\n",
	"Requirement already satisfied: six>=1.10.0 in /usr/local/lib/python3.6/dist-packages (from tensorflow==2.0.0b1) (1.12.0)\n",
	"Requirement already satisfied: grpcio>=1.8.6 in /usr/local/lib/python3.6/dist-packages (from tensorflow==2.0.0b1) (1.15.0)\n",
	"Requirement already satisfied: absl-py>=0.7.0 in /usr/local/lib/python3.6/dist-packages (from tensorflow==2.0.0b1) (0.7.1)\n",
	"Requirement already satisfied: google-pasta>=0.1.6 in /usr/local/lib/python3.6/dist-packages (from tensorflow==2.0.0b1) (0.1.7)\n",
	"Requirement already satisfied: keras-applications>=1.0.6 in /usr/local/lib/python3.6/dist-packages (from tensorflow==2.0.0b1) (1.0.8)\n",
	"Requirement already satisfied: markdown>=2.6.8 in /usr/local/lib/python3.6/dist-packages (from tb-nightly<1.14.0a20190604,>=1.14.0a20190603->tensorflow==2.0.0b1) (3.1.1)\n",
	"Requirement already satisfied: werkzeug>=0.11.15 in /usr/local/lib/python3.6/dist-packages (from tb-nightly<1.14.0a20190604,>=1.14.0a20190603->tensorflow==2.0.0b1) (0.15.5)\n",
	"Requirement already satisfied: setuptools>=41.0.0 in /usr/local/lib/python3.6/dist-packages (from tb-nightly<1.14.0a20190604,>=1.14.0a20190603->tensorflow==2.0.0b1) (41.0.1)\n",
	"Requirement already satisfied: h5py in /usr/local/lib/python3.6/dist-packages (from keras-applications>=1.0.6->tensorflow==2.0.0b1) (2.8.0)\n",
	"Installing collected packages: tb-nightly, tf-estimator-nightly, tensorflow, sentencepiece, tf-sentencepiece\n",
	" Found existing installation: tensorflow 1.14.0\n",
	" Uninstalling tensorflow-1.14.0:\n",
	" Successfully uninstalled tensorflow-1.14.0\n",
	"Successfully installed sentencepiece-0.1.83 tb-nightly-1.14.0a20190603 tensorflow-2.0.0b1 tf-estimator-nightly-1.14.0.dev2019060501 tf-sentencepiece-0.1.83\n"
	],
	"name": "stdout"
	}
	]
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "mndedWQYX0ZE",
	"colab_type": "code",
	"colab": {}
	},
	"source": [
	"import sentencepiece as spm\n",
	"import tensorflow as tf\n",
	"import tf_sentencepiece as tfs"
	],
	"execution_count": 0,
	"outputs": []
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "X8UBiY3CX0ZL",
	"colab_type": "code",
	"colab": {
	"base_uri": "https://localhost:8080/",
	"height": 208
	},
	"outputId": "a8796557-7450-49a0-fbb5-902a031d8c90"
	},
	"source": [
	"# -nc (no-clobber): re-running this cell keeps the existing botchan.txt\n",
	"# instead of saving a duplicate copy as botchan.txt.1.\n",
	"!wget -nc https://raw.githubusercontent.com/google/sentencepiece/master/data/botchan.txt"
	],
	"execution_count": 3,
	"outputs": [
	{
	"output_type": "stream",
	"text": [
	"--2019-08-21 04:12:26-- https://raw.githubusercontent.com/google/sentencepiece/master/data/botchan.txt\n",
	"Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...\n",
	"Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.\n",
	"HTTP request sent, awaiting response... 200 OK\n",
	"Length: 278779 (272K) [text/plain]\n",
	"Saving to: ‘botchan.txt’\n",
	"\n",
	"\rbotchan.txt 0%[ ] 0 --.-KB/s \rbotchan.txt 100%[===================>] 272.25K --.-KB/s in 0.03s \n",
	"\n",
	"2019-08-21 04:12:27 (9.21 MB/s) - ‘botchan.txt’ saved [278779/278779]\n",
	"\n"
	],
	"name": "stdout"
	}
	]
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "gmBlKDIQX0ZS",
	"colab_type": "code",
	"colab": {
	"base_uri": "https://localhost:8080/",
	"height": 34
	},
	"outputId": "a0732ecd-7f5d-4439-b3ca-946d740487be"
	},
	"source": [
	"spm.SentencePieceTrainer.train('--model_prefix=m --input=botchan.txt --vocab_size=1200')"
	],
	"execution_count": 4,
	"outputs": [
	{
	"output_type": "execute_result",
	"data": {
	"text/plain": [
	"True"
	]
	},
	"metadata": {
	"tags": []
	},
	"execution_count": 4
	}
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {
	"id": "AQ3Z5PgNX0ZW",
	"colab_type": "text"
	},
	"source": [
	"### Get piece size"
	]
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "y40NpIjuX0ZY",
	"colab_type": "code",
	"colab": {
	"base_uri": "https://localhost:8080/",
	"height": 34
	},
	"outputId": "20011571-aa58-4220-c5bd-db9e7f39a4ea"
	},
	"source": [
	"size = tfs.piece_size(model_file='m.model')\n",
	"size"
	],
	"execution_count": 5,
	"outputs": [
	{
	"output_type": "execute_result",
	"data": {
	"text/plain": [
	"<tf.Tensor: id=0, shape=(), dtype=int32, numpy=1200>"
	]
	},
	"metadata": {
	"tags": []
	},
	"execution_count": 5
	}
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {
	"id": "v35YSLUxX0Zd",
	"colab_type": "text"
	},
	"source": [
	"### id_to_piece and piece_to_id (constant)"
	]
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "ubx0qO1nX0Ze",
	"colab_type": "code",
	"colab": {
	"base_uri": "https://localhost:8080/",
	"height": 34
	},
	"outputId": "2d3004d9-2b1c-4604-a5af-171a1fddeb11"
	},
	"source": [
	"input_ids = tf.constant(100, dtype=tf.int32)\n",
	"pieces = tfs.id_to_piece(input_ids, model_file='m.model')\n",
	"pieces"
	],
	"execution_count": 6,
	"outputs": [
	{
	"output_type": "execute_result",
	"data": {
	"text/plain": [
	"<tf.Tensor: id=3, shape=(), dtype=string, numpy=b'll'>"
	]
	},
	"metadata": {
	"tags": []
	},
	"execution_count": 6
	}
	]
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "MSf54BtcX0Zk",
	"colab_type": "code",
	"colab": {
	"base_uri": "https://localhost:8080/",
	"height": 34
	},
	"outputId": "d46a62e9-7311-4a25-ff83-37f5f16f0e33"
	},
	"source": [
	"tfs.piece_to_id(pieces, model_file='m.model')"
	],
	"execution_count": 7,
	"outputs": [
	{
	"output_type": "execute_result",
	"data": {
	"text/plain": [
	"<tf.Tensor: id=5, shape=(), dtype=int32, numpy=100>"
	]
	},
	"metadata": {
	"tags": []
	},
	"execution_count": 7
	}
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {
	"id": "C8lw1u6BX0Zr",
	"colab_type": "text"
	},
	"source": [
	"### id_to_piece and piece_to_id (1D)"
	]
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "IgB4-Kx8X0Zt",
	"colab_type": "code",
	"colab": {
	"base_uri": "https://localhost:8080/",
	"height": 69
	},
	"outputId": "b8ce5df7-de3f-458c-e9f9-24679dcdafa5"
	},
	"source": [
	"input_ids = tf.constant([0,1,2,3,4,5], dtype=tf.int32)\n",
	"pieces = tfs.id_to_piece(input_ids, model_file='m.model')\n",
	"pieces"
	],
	"execution_count": 8,
	"outputs": [
	{
	"output_type": "execute_result",
	"data": {
	"text/plain": [
	"<tf.Tensor: id=8, shape=(6,), dtype=string, numpy=\n",
	"array([b'<unk>', b'<s>', b'</s>', b',', b'.', b'\\xe2\\x96\\x81the'],\n",
	" dtype=object)>"
	]
	},
	"metadata": {
	"tags": []
	},
	"execution_count": 8
	}
	]
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "TWHri41YX0Z0",
	"colab_type": "code",
	"colab": {
	"base_uri": "https://localhost:8080/",
	"height": 34
	},
	"outputId": "b54b7643-34c3-4218-f14b-7e635f428ac2"
	},
	"source": [
	"ids = tfs.piece_to_id(pieces, model_file='m.model')\n",
	"ids"
	],
	"execution_count": 9,
	"outputs": [
	{
	"output_type": "execute_result",
	"data": {
	"text/plain": [
	"<tf.Tensor: id=10, shape=(6,), dtype=int32, numpy=array([0, 1, 2, 3, 4, 5], dtype=int32)>"
	]
	},
	"metadata": {
	"tags": []
	},
	"execution_count": 9
	}
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {
	"id": "ZiexeLJFX0Z5",
	"colab_type": "text"
	},
	"source": [
	"### id_to_piece and piece_to_id (2D)"
	]
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "KjvvkQ8UX0Z8",
	"colab_type": "code",
	"colab": {
	"base_uri": "https://localhost:8080/",
	"height": 86
	},
	"outputId": "7cfdd4ab-6bad-4df8-8d09-73cb9cfa6a30"
	},
	"source": [
	"input_ids = tf.constant([[0,1,2,3,4],[5,6,7,8,9]], dtype=tf.int32)\n",
	"pieces = tfs.id_to_piece(input_ids, model_file='m.model')\n",
	"pieces"
	],
	"execution_count": 10,
	"outputs": [
	{
	"output_type": "execute_result",
	"data": {
	"text/plain": [
	"<tf.Tensor: id=13, shape=(2, 5), dtype=string, numpy=\n",
	"array([[b'<unk>', b'<s>', b'</s>', b',', b'.'],\n",
	" [b'\\xe2\\x96\\x81the', b's', b'\\xe2\\x96\\x81I', b'\\xe2\\x96\\x81',\n",
	" b'\\xe2\\x96\\x81to']], dtype=object)>"
	]
	},
	"metadata": {
	"tags": []
	},
	"execution_count": 10
	}
	]
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "F9twViRVX0aA",
	"colab_type": "code",
	"colab": {
	"base_uri": "https://localhost:8080/",
	"height": 69
	},
	"outputId": "488e3a11-cb55-42d3-e812-0ee917d4c401"
	},
	"source": [
	"ids = tfs.piece_to_id(pieces, model_file='m.model')\n",
	"ids"
	],
	"execution_count": 11,
	"outputs": [
	{
	"output_type": "execute_result",
	"data": {
	"text/plain": [
	"<tf.Tensor: id=15, shape=(2, 5), dtype=int32, numpy=\n",
	"array([[0, 1, 2, 3, 4],\n",
	" [5, 6, 7, 8, 9]], dtype=int32)>"
	]
	},
	"metadata": {
	"tags": []
	},
	"execution_count": 11
	}
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {
	"id": "7HPssPr_X0aF",
	"colab_type": "text"
	},
	"source": [
	"### proto"
	]
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "GuphTn1sX0aH",
	"colab_type": "code",
	"colab": {
	"base_uri": "https://localhost:8080/",
	"height": 34
	},
	"outputId": "744a139d-ad32-4087-c38f-707de7c9dd3f"
	},
	"source": [
	"proto = tf.io.gfile.GFile('m.model', 'rb').read()\n",
	"tfs.piece_size(model_proto=proto)"
	],
	"execution_count": 12,
	"outputs": [
	{
	"output_type": "execute_result",
	"data": {
	"text/plain": [
	"<tf.Tensor: id=17, shape=(), dtype=int32, numpy=1200>"
	]
	},
	"metadata": {
	"tags": []
	},
	"execution_count": 12
	}
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {
	"id": "S96BAnfMX0aL",
	"colab_type": "text"
	},
	"source": [
	"### is_unknown and is_control"
	]
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "vIJ2q9GsX0aO",
	"colab_type": "code",
	"colab": {
	"base_uri": "https://localhost:8080/",
	"height": 52
	},
	"outputId": "bcb7a5d5-712f-46e8-ebeb-315d25f68d12"
	},
	"source": [
	"input_ids = tf.constant([0,1,2,3,4,5], dtype=tf.int32)\n",
	"is_unknown = tfs.is_unknown(input_ids, model_file='m.model')\n",
	"is_control = tfs.is_control(input_ids, model_file='m.model')\n",
	"is_unknown, is_control"
	],
	"execution_count": 13,
	"outputs": [
	{
	"output_type": "execute_result",
	"data": {
	"text/plain": [
	"(<tf.Tensor: id=20, shape=(6,), dtype=bool, numpy=array([ True, False, False, False, False, False])>,\n",
	" <tf.Tensor: id=21, shape=(6,), dtype=bool, numpy=array([False, True, True, False, False, False])>)"
	]
	},
	"metadata": {
	"tags": []
	},
	"execution_count": 13
	}
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {
	"id": "N-HeR_hYX0aS",
	"colab_type": "text"
	},
	"source": [
	"### encode, encode_sparse, decode"
	]
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "jgQtpZRfX0aU",
	"colab_type": "code",
	"colab": {}
	},
	"source": [
	"input_text = ['hello world.', 'I have a dog.', 'I have an apple.', 'this is a problem that we have to solve', 'Suyash is a good boy']"
	],
	"execution_count": 0,
	"outputs": []
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "f7c9-05ZX0aX",
	"colab_type": "code",
	"colab": {}
	},
	"source": [
	"model_proto = tf.io.gfile.GFile('m.model', 'rb').read()"
	],
	"execution_count": 0,
	"outputs": []
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "uopMpoumX0ab",
	"colab_type": "code",
	"colab": {
	"base_uri": "https://localhost:8080/",
	"height": 156
	},
	"outputId": "345ab5cd-4c0b-41b4-b8d9-aa80944474b7"
	},
	"source": [
	"ids, seq_len = tfs.encode(input_text, model_proto=model_proto)\n",
	"ids, seq_len"
	],
	"execution_count": 16,
	"outputs": [
	{
	"output_type": "execute_result",
	"data": {
	"text/plain": [
	"(<tf.Tensor: id=27, shape=(5, 13), dtype=int32, numpy=\n",
	" array([[ 35, 100, 22, 940, 4, 0, 0, 0, 0, 0, 0, 0, 0],\n",
	" [ 7, 68, 10, 85, 46, 4, 0, 0, 0, 0, 0, 0, 0],\n",
	" [ 7, 68, 154, 10, 37, 37, 78, 4, 0, 0, 0, 0, 0],\n",
	" [ 56, 42, 10, 223, 339, 30, 28, 112, 68, 9, 63, 44, 143],\n",
	" [210, 54, 31, 439, 42, 10, 281, 316, 31, 0, 0, 0, 0]],\n",
	" dtype=int32)>,\n",
	" <tf.Tensor: id=28, shape=(5,), dtype=int32, numpy=array([ 5, 6, 8, 13, 9], dtype=int32)>)"
	]
	},
	"metadata": {
	"tags": []
	},
	"execution_count": 16
	}
	]
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "zwMD1qpUX0af",
	"colab_type": "code",
	"colab": {
	"base_uri": "https://localhost:8080/",
	"height": 139
	},
	"outputId": "122fbe20-24ad-4785-e0f8-e50c9f0b0bea"
	},
	"source": [
	"sparse_ids = tfs.encode_sparse(input_text, model_proto=model_proto)\n",
	"tf.sparse.to_dense(sparse_ids)"
	],
	"execution_count": 17,
	"outputs": [
	{
	"output_type": "execute_result",
	"data": {
	"text/plain": [
	"<tf.Tensor: id=38, shape=(5, 13), dtype=int32, numpy=\n",
	"array([[ 35, 100, 22, 940, 4, 0, 0, 0, 0, 0, 0, 0, 0],\n",
	" [ 7, 68, 10, 85, 46, 4, 0, 0, 0, 0, 0, 0, 0],\n",
	" [ 7, 68, 154, 10, 37, 37, 78, 4, 0, 0, 0, 0, 0],\n",
	" [ 56, 42, 10, 223, 339, 30, 28, 112, 68, 9, 63, 44, 143],\n",
	" [210, 54, 31, 439, 42, 10, 281, 316, 31, 0, 0, 0, 0]],\n",
	" dtype=int32)>"
	]
	},
	"metadata": {
	"tags": []
	},
	"execution_count": 17
	}
	]
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "wEC5G5y5X0ai",
	"colab_type": "code",
	"colab": {
	"base_uri": "https://localhost:8080/",
	"height": 86
	},
	"outputId": "f86ff06c-0139-43bc-ae27-1ae76409768b"
	},
	"source": [
	"tfs.decode(ids, seq_len, model_proto=model_proto)"
	],
	"execution_count": 18,
	"outputs": [
	{
	"output_type": "execute_result",
	"data": {
	"text/plain": [
	"<tf.Tensor: id=40, shape=(5,), dtype=string, numpy=\n",
	"array([b'hello world.', b'I have a dog.', b'I have an apple.',\n",
	" b'this is a problem that we have to solve',\n",
	" b'Suyash is a good boy'], dtype=object)>"
	]
	},
	"metadata": {
	"tags": []
	},
	"execution_count": 18
	}
	]
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "2rUKDjKbYHH8",
	"colab_type": "code",
	"colab": {
	"base_uri": "https://localhost:8080/",
	"height": 312
	},
	"outputId": "681aae8d-ae38-4d1c-a163-33e836b24d4c"
	},
	"source": [
	"tfs.id_to_piece(ids, model_proto=model_proto)"
	],
	"execution_count": 19,
	"outputs": [
	{
	"output_type": "execute_result",
	"data": {
	"text/plain": [
	"<tf.Tensor: id=42, shape=(5, 13), dtype=string, numpy=\n",
	"array([[b'\\xe2\\x96\\x81he', b'll', b'o', b'\\xe2\\x96\\x81world', b'.',\n",
	" b'<unk>', b'<unk>', b'<unk>', b'<unk>', b'<unk>', b'<unk>',\n",
	" b'<unk>', b'<unk>'],\n",
	" [b'\\xe2\\x96\\x81I', b'\\xe2\\x96\\x81have', b'\\xe2\\x96\\x81a',\n",
	" b'\\xe2\\x96\\x81do', b'g', b'.', b'<unk>', b'<unk>', b'<unk>',\n",
	" b'<unk>', b'<unk>', b'<unk>', b'<unk>'],\n",
	" [b'\\xe2\\x96\\x81I', b'\\xe2\\x96\\x81have', b'\\xe2\\x96\\x81an',\n",
	" b'\\xe2\\x96\\x81a', b'p', b'p', b'le', b'.', b'<unk>', b'<unk>',\n",
	" b'<unk>', b'<unk>', b'<unk>'],\n",
	" [b'\\xe2\\x96\\x81this', b'\\xe2\\x96\\x81is', b'\\xe2\\x96\\x81a',\n",
	" b'\\xe2\\x96\\x81pro', b'ble', b'm', b'\\xe2\\x96\\x81that',\n",
	" b'\\xe2\\x96\\x81we', b'\\xe2\\x96\\x81have', b'\\xe2\\x96\\x81to',\n",
	" b'\\xe2\\x96\\x81so', b'l', b've'],\n",
	" [b'\\xe2\\x96\\x81S', b'u', b'y', b'ash', b'\\xe2\\x96\\x81is',\n",
	" b'\\xe2\\x96\\x81a', b'\\xe2\\x96\\x81good', b'\\xe2\\x96\\x81bo', b'y',\n",
	" b'<unk>', b'<unk>', b'<unk>', b'<unk>']], dtype=object)>"
	]
	},
	"metadata": {
	"tags": []
	},
	"execution_count": 19
	}
	]
	}
	]
	}
No results found