Skip to content

Instantly share code, notes, and snippets.

@praateekmahajan
Last active December 16, 2020 05:05
Show Gist options
  • Select an option

  • Save praateekmahajan/bc2ace6477d04927038a7a5acaf30349 to your computer and use it in GitHub Desktop.

Select an option

Save praateekmahajan/bc2ace6477d04927038a7a5acaf30349 to your computer and use it in GitHub Desktop.

Revisions

  1. praateekmahajan revised this gist Dec 16, 2020. 1 changed file with 0 additions and 193 deletions.
    193 changes: 0 additions & 193 deletions padding_slow.ipynb
    Original file line number Diff line number Diff line change
    @@ -1,13 +1,4 @@
    {
    "cells": [
    {
    "cell_type": "code",
    "execution_count": 1,
    "metadata": {},
    "outputs": [],
    "source": [
    "import random\n",
    "from torch.utils.data import Dataset, Data{
    "cells": [
    {
    "cell_type": "code",
    @@ -200,187 +191,3 @@
    "nbformat": 4,
    "nbformat_minor": 2
    }
    Loader\n",
    "from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence\n",
    "import torch.nn as nn\n",
    "import torch\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import time\n",
    "from tqdm.notebook import tqdm\n",
    "\n",
    "num_rows = 100000\n",
    "max_length = 100\n",
    "\n",
    "df = pd.DataFrame(\n",
    " [\n",
    " {\"data\": np.random.randn(np.random.randint(1, max_length))}\n",
    " for _ in range(num_rows)\n",
    " ]\n",
    ")\n",
    "df[\"size\"] = df[\"data\"].apply(len)\n",
    "\n",
    "\n",
    "class DummyDataset(Dataset):\n",
    " def __init__(self, file_name_or_df, max_seq_len=50):\n",
    " self.max_seq_len = max_seq_len\n",
    " if isinstance(file_name_or_df, str):\n",
    " self.data = pd.read_json(file_name_or_df)\n",
    " else:\n",
    " self.data = file_name_or_df\n",
    "\n",
    " def __len__(self):\n",
    " return len(self.data)\n",
    "\n",
    " def __getitem__(self, idx):\n",
    " sample = self.data.iloc[idx]\n",
    " seq_len = sample[\"size\"]\n",
    " seq = sample[\"data\"]\n",
    " if seq_len > self.max_seq_len:\n",
    " seq = np.asarray(seq[-self.max_seq_len :])\n",
    " else:\n",
    " seq = np.pad(\n",
    " seq, pad_width=(0, self.max_seq_len - seq_len), constant_values=0\n",
    " )\n",
    "\n",
    " return {\n",
    " \"data\": torch.tensor(seq, dtype=torch.float),\n",
    " \"size\": min(seq_len, self.max_seq_len),\n",
    " }\n",
    "\n",
    "\n",
    "class DummyModel(nn.Module):\n",
    " def __init__(self, should_pack):\n",
    " super(DummyModel, self).__init__()\n",
    "\n",
    " self.rnn_in_dim = 1\n",
    " self.rnn_out_dim = 3\n",
    " self.rnn_num_layers = 1\n",
    "\n",
    " self.should_pack = should_pack\n",
    "\n",
    " self.rnn = nn.RNN(\n",
    " input_size=self.rnn_in_dim,\n",
    " hidden_size=self.rnn_out_dim,\n",
    " num_layers=self.rnn_num_layers,\n",
    " batch_first=True,\n",
    " )\n",
    "\n",
    " self.fc = nn.Linear(self.rnn_out_dim, self.num_classes)\n",
    "\n",
    " def forward(self, batch):\n",
    "\n",
    " input_rnn = batch[\"data\"].unsqueeze(-1)\n",
    " if self.should_pack:\n",
    " packed_input = pack_padded_sequence(\n",
    " input_rnn, batch[\"size\"], batch_first=True, enforce_sorted=False\n",
    " )\n",
    " packed_rnn_out, _ = self.rnn(packed_input)\n",
    " rnn_out, _ = pad_packed_sequence(packed_rnn_out, batch_first=True)\n",
    " else:\n",
    " rnn_out, _ = self.rnn(input_rnn)\n",
    "\n",
    " return rnn_out"
    ]
    },
    {
    "cell_type": "code",
    "execution_count": 2,
    "metadata": {},
    "outputs": [
    {
    "data": {
    "application/vnd.jupyter.widget-view+json": {
    "model_id": "81b65336788c4025b46205c8087e233b",
    "version_major": 2,
    "version_minor": 0
    },
    "text/plain": [
    "HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=3125.0), HTML(value='')))"
    ]
    },
    "metadata": {},
    "output_type": "display_data"
    },
    {
    "name": "stdout",
    "output_type": "stream",
    "text": [
    "\n",
    "Time taken when packing is enabled : 22.701847791671753\n"
    ]
    }
    ],
    "source": [
    "ds = DummyDataset(df)\n",
    "dl = DataLoader(ds, batch_size=32, shuffle=True)\n",
    "\n",
    "model = DummyModel(should_pack=True)\n",
    "start_time = time.time()\n",
    "for batch_idx, batch in enumerate(tqdm(dl)):\n",
    " model(batch)\n",
    "print(f\"Time taken when packing is enabled : {time.time() - start_time}\")"
    ]
    },
    {
    "cell_type": "code",
    "execution_count": 3,
    "metadata": {},
    "outputs": [
    {
    "data": {
    "application/vnd.jupyter.widget-view+json": {
    "model_id": "ffb9dbb8b7b549ed83f3496f51a348d5",
    "version_major": 2,
    "version_minor": 0
    },
    "text/plain": [
    "HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=3125.0), HTML(value='')))"
    ]
    },
    "metadata": {},
    "output_type": "display_data"
    },
    {
    "name": "stdout",
    "output_type": "stream",
    "text": [
    "\n",
    "Time taken when packing is disabled : 19.904962062835693\n"
    ]
    }
    ],
    "source": [
    "ds = DummyDataset(df)\n",
    "dl = DataLoader(ds, batch_size=32, shuffle=True)\n",
    "\n",
    "model = DummyModel(should_pack=False)\n",
    "start_time = time.time()\n",
    "for batch_idx, batch in enumerate(tqdm(dl)):\n",
    " model(batch)\n",
    "print(f\"Time taken when packing is disabled : {time.time() - start_time}\")"
    ]
    }
    ],
    "metadata": {
    "kernelspec": {
    "display_name": "Python 3",
    "language": "python",
    "name": "python3"
    },
    "language_info": {
    "codemirror_mode": {
    "name": "ipython",
    "version": 3
    },
    "file_extension": ".py",
    "mimetype": "text/x-python",
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
    "version": "3.7.4"
    }
    },
    "nbformat": 4,
    "nbformat_minor": 2
    }
  2. praateekmahajan revised this gist Dec 16, 2020. 1 changed file with 228 additions and 13 deletions.
    241 changes: 228 additions & 13 deletions padding_slow.ipynb
    Original file line number Diff line number Diff line change
    @@ -2,7 +2,16 @@
    "cells": [
    {
    "cell_type": "code",
    "execution_count": 10,
    "execution_count": 1,
    "metadata": {},
    "outputs": [],
    "source": [
    "import random\n",
    "from torch.utils.data import Dataset, Data{
    "cells": [
    {
    "cell_type": "code",
    "execution_count": 1,
    "metadata": {},
    "outputs": [],
    "source": [
    @@ -57,15 +66,14 @@
    "\n",
    "\n",
    "class DummyModel(nn.Module):\n",
    " def __init__(self, should_pad):\n",
    " def __init__(self, should_pack):\n",
    " super(DummyModel, self).__init__()\n",
    "\n",
    " self.num_classes = 2\n",
    " self.rnn_in_dim = 1\n",
    " self.rnn_out_dim = 3\n",
    " self.rnn_num_layers = 1\n",
    "\n",
    " self.should_pad = should_pad\n",
    " self.should_pack = should_pack\n",
    "\n",
    " self.rnn = nn.RNN(\n",
    " input_size=self.rnn_in_dim,\n",
    @@ -79,7 +87,7 @@
    " def forward(self, batch):\n",
    "\n",
    " input_rnn = batch[\"data\"].unsqueeze(-1)\n",
    " if self.should_pad:\n",
    " if self.should_pack:\n",
    " packed_input = pack_padded_sequence(\n",
    " input_rnn, batch[\"size\"], batch_first=True, enforce_sorted=False\n",
    " )\n",
    @@ -93,13 +101,13 @@
    },
    {
    "cell_type": "code",
    "execution_count": 11,
    "execution_count": 2,
    "metadata": {},
    "outputs": [
    {
    "data": {
    "application/vnd.jupyter.widget-view+json": {
    "model_id": "dac4d14c37b44a2db8823d2c308f5b61",
    "model_id": "81b65336788c4025b46205c8087e233b",
    "version_major": 2,
    "version_minor": 0
    },
    @@ -115,35 +123,242 @@
    "output_type": "stream",
    "text": [
    "\n",
    "Time taken when padding is enabled : 22.755648136138916\n"
    "Time taken when packing is enabled : 22.701847791671753\n"
    ]
    }
    ],
    "source": [
    "ds = DummyDataset(df)\n",
    "dl = DataLoader(ds, batch_size=32, shuffle=True)\n",
    "\n",
    "model = DummyModel(should_pad=True)\n",
    "model = DummyModel(should_pack=True)\n",
    "start_time = time.time()\n",
    "for batch_idx, batch in enumerate(tqdm(dl)):\n",
    " model(batch)\n",
    "print(f\"Time taken when padding is enabled : {time.time() - start_time}\")"
    "print(f\"Time taken when packing is enabled : {time.time() - start_time}\")"
    ]
    },
    {
    "cell_type": "code",
    "execution_count": 3,
    "metadata": {},
    "outputs": [],
    "outputs": [
    {
    "data": {
    "application/vnd.jupyter.widget-view+json": {
    "model_id": "ffb9dbb8b7b549ed83f3496f51a348d5",
    "version_major": 2,
    "version_minor": 0
    },
    "text/plain": [
    "HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=3125.0), HTML(value='')))"
    ]
    },
    "metadata": {},
    "output_type": "display_data"
    },
    {
    "name": "stdout",
    "output_type": "stream",
    "text": [
    "\n",
    "Time taken when packing is disabled : 19.904962062835693\n"
    ]
    }
    ],
    "source": [
    "ds = DummyDataset(df)\n",
    "dl = DataLoader(ds, batch_size=32, shuffle=True)\n",
    "\n",
    "model = DummyModel(should_pack=False)\n",
    "start_time = time.time()\n",
    "for batch_idx, batch in enumerate(tqdm(dl)):\n",
    " model(batch)\n",
    "print(f\"Time taken when packing is disabled : {time.time() - start_time}\")"
    ]
    }
    ],
    "metadata": {
    "kernelspec": {
    "display_name": "Python 3",
    "language": "python",
    "name": "python3"
    },
    "language_info": {
    "codemirror_mode": {
    "name": "ipython",
    "version": 3
    },
    "file_extension": ".py",
    "mimetype": "text/x-python",
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
    "version": "3.7.4"
    }
    },
    "nbformat": 4,
    "nbformat_minor": 2
    }
    Loader\n",
    "from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence\n",
    "import torch.nn as nn\n",
    "import torch\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import time\n",
    "from tqdm.notebook import tqdm\n",
    "\n",
    "num_rows = 100000\n",
    "max_length = 100\n",
    "\n",
    "df = pd.DataFrame(\n",
    " [\n",
    " {\"data\": np.random.randn(np.random.randint(1, max_length))}\n",
    " for _ in range(num_rows)\n",
    " ]\n",
    ")\n",
    "df[\"size\"] = df[\"data\"].apply(len)\n",
    "\n",
    "\n",
    "class DummyDataset(Dataset):\n",
    " def __init__(self, file_name_or_df, max_seq_len=50):\n",
    " self.max_seq_len = max_seq_len\n",
    " if isinstance(file_name_or_df, str):\n",
    " self.data = pd.read_json(file_name_or_df)\n",
    " else:\n",
    " self.data = file_name_or_df\n",
    "\n",
    " def __len__(self):\n",
    " return len(self.data)\n",
    "\n",
    " def __getitem__(self, idx):\n",
    " sample = self.data.iloc[idx]\n",
    " seq_len = sample[\"size\"]\n",
    " seq = sample[\"data\"]\n",
    " if seq_len > self.max_seq_len:\n",
    " seq = np.asarray(seq[-self.max_seq_len :])\n",
    " else:\n",
    " seq = np.pad(\n",
    " seq, pad_width=(0, self.max_seq_len - seq_len), constant_values=0\n",
    " )\n",
    "\n",
    " return {\n",
    " \"data\": torch.tensor(seq, dtype=torch.float),\n",
    " \"size\": min(seq_len, self.max_seq_len),\n",
    " }\n",
    "\n",
    "\n",
    "class DummyModel(nn.Module):\n",
    " def __init__(self, should_pack):\n",
    " super(DummyModel, self).__init__()\n",
    "\n",
    " self.rnn_in_dim = 1\n",
    " self.rnn_out_dim = 3\n",
    " self.rnn_num_layers = 1\n",
    "\n",
    " self.should_pack = should_pack\n",
    "\n",
    " self.rnn = nn.RNN(\n",
    " input_size=self.rnn_in_dim,\n",
    " hidden_size=self.rnn_out_dim,\n",
    " num_layers=self.rnn_num_layers,\n",
    " batch_first=True,\n",
    " )\n",
    "\n",
    " self.fc = nn.Linear(self.rnn_out_dim, self.num_classes)\n",
    "\n",
    " def forward(self, batch):\n",
    "\n",
    " input_rnn = batch[\"data\"].unsqueeze(-1)\n",
    " if self.should_pack:\n",
    " packed_input = pack_padded_sequence(\n",
    " input_rnn, batch[\"size\"], batch_first=True, enforce_sorted=False\n",
    " )\n",
    " packed_rnn_out, _ = self.rnn(packed_input)\n",
    " rnn_out, _ = pad_packed_sequence(packed_rnn_out, batch_first=True)\n",
    " else:\n",
    " rnn_out, _ = self.rnn(input_rnn)\n",
    "\n",
    " return rnn_out"
    ]
    },
    {
    "cell_type": "code",
    "execution_count": 2,
    "metadata": {},
    "outputs": [
    {
    "data": {
    "application/vnd.jupyter.widget-view+json": {
    "model_id": "81b65336788c4025b46205c8087e233b",
    "version_major": 2,
    "version_minor": 0
    },
    "text/plain": [
    "HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=3125.0), HTML(value='')))"
    ]
    },
    "metadata": {},
    "output_type": "display_data"
    },
    {
    "name": "stdout",
    "output_type": "stream",
    "text": [
    "\n",
    "Time taken when packing is enabled : 22.701847791671753\n"
    ]
    }
    ],
    "source": [
    "ds = DummyDataset(df)\n",
    "dl = DataLoader(ds, batch_size=32, shuffle=True)\n",
    "\n",
    "model = DummyModel(should_pack=True)\n",
    "start_time = time.time()\n",
    "for batch_idx, batch in enumerate(tqdm(dl)):\n",
    " model(batch)\n",
    "print(f\"Time taken when packing is enabled : {time.time() - start_time}\")"
    ]
    },
    {
    "cell_type": "code",
    "execution_count": 3,
    "metadata": {},
    "outputs": [
    {
    "data": {
    "application/vnd.jupyter.widget-view+json": {
    "model_id": "ffb9dbb8b7b549ed83f3496f51a348d5",
    "version_major": 2,
    "version_minor": 0
    },
    "text/plain": [
    "HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=3125.0), HTML(value='')))"
    ]
    },
    "metadata": {},
    "output_type": "display_data"
    },
    {
    "name": "stdout",
    "output_type": "stream",
    "text": [
    "\n",
    "Time taken when packing is disabled : 19.904962062835693\n"
    ]
    }
    ],
    "source": [
    "ds = DummyDataset(df)\n",
    "dl = DataLoader(ds, batch_size=32, shuffle=True)\n",
    "\n",
    "model = DummyModel(should_pad=False)\n",
    "model = DummyModel(should_pack=False)\n",
    "start_time = time.time()\n",
    "for batch_idx, batch in enumerate(tqdm(dl)):\n",
    " model(batch)\n",
    "print(f\"Time taken when padding is disabled : {time.time() - start_time}\")"
    "print(f\"Time taken when packing is disabled : {time.time() - start_time}\")"
    ]
    }
    ],
  3. praateekmahajan created this gist Dec 16, 2020.
    171 changes: 171 additions & 0 deletions padding_slow.ipynb
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,171 @@
    {
    "cells": [
    {
    "cell_type": "code",
    "execution_count": 10,
    "metadata": {},
    "outputs": [],
    "source": [
    "import random\n",
    "from torch.utils.data import Dataset, DataLoader\n",
    "from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence\n",
    "import torch.nn as nn\n",
    "import torch\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import time\n",
    "from tqdm.notebook import tqdm\n",
    "\n",
    "num_rows = 100000\n",
    "max_length = 100\n",
    "\n",
    "df = pd.DataFrame(\n",
    " [\n",
    " {\"data\": np.random.randn(np.random.randint(1, max_length))}\n",
    " for _ in range(num_rows)\n",
    " ]\n",
    ")\n",
    "df[\"size\"] = df[\"data\"].apply(len)\n",
    "\n",
    "\n",
    "class DummyDataset(Dataset):\n",
    " def __init__(self, file_name_or_df, max_seq_len=50):\n",
    " self.max_seq_len = max_seq_len\n",
    " if isinstance(file_name_or_df, str):\n",
    " self.data = pd.read_json(file_name_or_df)\n",
    " else:\n",
    " self.data = file_name_or_df\n",
    "\n",
    " def __len__(self):\n",
    " return len(self.data)\n",
    "\n",
    " def __getitem__(self, idx):\n",
    " sample = self.data.iloc[idx]\n",
    " seq_len = sample[\"size\"]\n",
    " seq = sample[\"data\"]\n",
    " if seq_len > self.max_seq_len:\n",
    " seq = np.asarray(seq[-self.max_seq_len :])\n",
    " else:\n",
    " seq = np.pad(\n",
    " seq, pad_width=(0, self.max_seq_len - seq_len), constant_values=0\n",
    " )\n",
    "\n",
    " return {\n",
    " \"data\": torch.tensor(seq, dtype=torch.float),\n",
    " \"size\": min(seq_len, self.max_seq_len),\n",
    " }\n",
    "\n",
    "\n",
    "class DummyModel(nn.Module):\n",
    " def __init__(self, should_pad):\n",
    " super(DummyModel, self).__init__()\n",
    "\n",
    " self.num_classes = 2\n",
    " self.rnn_in_dim = 1\n",
    " self.rnn_out_dim = 3\n",
    " self.rnn_num_layers = 1\n",
    "\n",
    " self.should_pad = should_pad\n",
    "\n",
    " self.rnn = nn.RNN(\n",
    " input_size=self.rnn_in_dim,\n",
    " hidden_size=self.rnn_out_dim,\n",
    " num_layers=self.rnn_num_layers,\n",
    " batch_first=True,\n",
    " )\n",
    "\n",
    " self.fc = nn.Linear(self.rnn_out_dim, self.num_classes)\n",
    "\n",
    " def forward(self, batch):\n",
    "\n",
    " input_rnn = batch[\"data\"].unsqueeze(-1)\n",
    " if self.should_pad:\n",
    " packed_input = pack_padded_sequence(\n",
    " input_rnn, batch[\"size\"], batch_first=True, enforce_sorted=False\n",
    " )\n",
    " packed_rnn_out, _ = self.rnn(packed_input)\n",
    " rnn_out, _ = pad_packed_sequence(packed_rnn_out, batch_first=True)\n",
    " else:\n",
    " rnn_out, _ = self.rnn(input_rnn)\n",
    "\n",
    " return rnn_out"
    ]
    },
    {
    "cell_type": "code",
    "execution_count": 11,
    "metadata": {},
    "outputs": [
    {
    "data": {
    "application/vnd.jupyter.widget-view+json": {
    "model_id": "dac4d14c37b44a2db8823d2c308f5b61",
    "version_major": 2,
    "version_minor": 0
    },
    "text/plain": [
    "HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=3125.0), HTML(value='')))"
    ]
    },
    "metadata": {},
    "output_type": "display_data"
    },
    {
    "name": "stdout",
    "output_type": "stream",
    "text": [
    "\n",
    "Time taken when padding is enabled : 22.755648136138916\n"
    ]
    }
    ],
    "source": [
    "ds = DummyDataset(df)\n",
    "dl = DataLoader(ds, batch_size=32, shuffle=True)\n",
    "\n",
    "model = DummyModel(should_pad=True)\n",
    "start_time = time.time()\n",
    "for batch_idx, batch in enumerate(tqdm(dl)):\n",
    " model(batch)\n",
    "print(f\"Time taken when padding is enabled : {time.time() - start_time}\")"
    ]
    },
    {
    "cell_type": "code",
    "execution_count": 3,
    "metadata": {},
    "outputs": [],
    "source": [
    "ds = DummyDataset(df)\n",
    "dl = DataLoader(ds, batch_size=32, shuffle=True)\n",
    "\n",
    "model = DummyModel(should_pad=False)\n",
    "start_time = time.time()\n",
    "for batch_idx, batch in enumerate(tqdm(dl)):\n",
    " model(batch)\n",
    "print(f\"Time taken when padding is disabled : {time.time() - start_time}\")"
    ]
    }
    ],
    "metadata": {
    "kernelspec": {
    "display_name": "Python 3",
    "language": "python",
    "name": "python3"
    },
    "language_info": {
    "codemirror_mode": {
    "name": "ipython",
    "version": 3
    },
    "file_extension": ".py",
    "mimetype": "text/x-python",
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
    "version": "3.7.4"
    }
    },
    "nbformat": 4,
    "nbformat_minor": 2
    }