Last active
December 16, 2020 05:05
-
-
Save praateekmahajan/bc2ace6477d04927038a7a5acaf30349 to your computer and use it in GitHub Desktop.
Revisions
-
praateekmahajan revised this gist
Dec 16, 2020 . 1 changed file with 0 additions and 193 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -1,13 +1,4 @@ { "cells": [ { "cell_type": "code", @@ -200,187 +191,3 @@ "nbformat": 4, "nbformat_minor": 2 } -
praateekmahajan revised this gist
Dec 16, 2020 . 1 changed file with 228 additions and 13 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -2,7 +2,16 @@ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import random\n", "from torch.utils.data import Dataset, Data{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -57,15 +66,14 @@ "\n", "\n", "class DummyModel(nn.Module):\n", " def __init__(self, should_pack):\n", " super(DummyModel, self).__init__()\n", "\n", " self.rnn_in_dim = 1\n", " self.rnn_out_dim = 3\n", " self.rnn_num_layers = 1\n", "\n", " self.should_pack = should_pack\n", "\n", " self.rnn = nn.RNN(\n", " input_size=self.rnn_in_dim,\n", @@ -79,7 +87,7 @@ " def forward(self, batch):\n", "\n", " input_rnn = batch[\"data\"].unsqueeze(-1)\n", " if self.should_pack:\n", " packed_input = pack_padded_sequence(\n", " input_rnn, batch[\"size\"], batch_first=True, enforce_sorted=False\n", " )\n", @@ -93,13 +101,13 @@ }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "81b65336788c4025b46205c8087e233b", "version_major": 2, "version_minor": 0 }, @@ -115,35 +123,242 @@ "output_type": "stream", "text": [ "\n", "Time taken when packing is enabled : 22.701847791671753\n" ] } ], "source": [ "ds = DummyDataset(df)\n", "dl = DataLoader(ds, batch_size=32, shuffle=True)\n", "\n", "model = DummyModel(should_pack=True)\n", "start_time = time.time()\n", "for batch_idx, batch in enumerate(tqdm(dl)):\n", " model(batch)\n", "print(f\"Time taken when packing is enabled : {time.time() - start_time}\")" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "ffb9dbb8b7b549ed83f3496f51a348d5", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=3125.0), HTML(value='')))" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Time taken when packing is disabled : 19.904962062835693\n" ] } ], "source": [ "ds = DummyDataset(df)\n", "dl = DataLoader(ds, batch_size=32, shuffle=True)\n", "\n", "model = DummyModel(should_pack=False)\n", "start_time = time.time()\n", "for batch_idx, batch in enumerate(tqdm(dl)):\n", " model(batch)\n", "print(f\"Time taken when packing is disabled : {time.time() - start_time}\")" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.4" } }, "nbformat": 4, "nbformat_minor": 2 } Loader\n", "from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence\n", "import torch.nn as nn\n", "import torch\n", "import numpy as np\n", "import pandas as pd\n", "import time\n", "from tqdm.notebook import tqdm\n", "\n", "num_rows = 100000\n", "max_length = 100\n", "\n", "df = pd.DataFrame(\n", " [\n", " {\"data\": np.random.randn(np.random.randint(1, max_length))}\n", " for _ in range(num_rows)\n", " ]\n", ")\n", "df[\"size\"] = df[\"data\"].apply(len)\n", "\n", "\n", "class DummyDataset(Dataset):\n", " def __init__(self, file_name_or_df, max_seq_len=50):\n", " self.max_seq_len = max_seq_len\n", " if isinstance(file_name_or_df, str):\n", " self.data = pd.read_json(file_name_or_df)\n", " else:\n", " self.data = file_name_or_df\n", "\n", " def __len__(self):\n", " return len(self.data)\n", "\n", " def __getitem__(self, idx):\n", " sample = self.data.iloc[idx]\n", " seq_len = sample[\"size\"]\n", " seq = sample[\"data\"]\n", " if seq_len > self.max_seq_len:\n", " seq = np.asarray(seq[-self.max_seq_len :])\n", " else:\n", " seq = np.pad(\n", " seq, pad_width=(0, self.max_seq_len - seq_len), constant_values=0\n", " )\n", "\n", " return {\n", " \"data\": torch.tensor(seq, dtype=torch.float),\n", " \"size\": min(seq_len, self.max_seq_len),\n", " }\n", "\n", "\n", "class DummyModel(nn.Module):\n", " def __init__(self, should_pack):\n", " super(DummyModel, self).__init__()\n", "\n", " self.rnn_in_dim = 1\n", " self.rnn_out_dim = 3\n", " self.rnn_num_layers = 1\n", "\n", " self.should_pack = should_pack\n", "\n", " self.rnn = nn.RNN(\n", " input_size=self.rnn_in_dim,\n", " hidden_size=self.rnn_out_dim,\n", " num_layers=self.rnn_num_layers,\n", " batch_first=True,\n", " )\n", "\n", " self.fc = nn.Linear(self.rnn_out_dim, self.num_classes)\n", "\n", " def forward(self, batch):\n", "\n", " input_rnn = batch[\"data\"].unsqueeze(-1)\n", " if self.should_pack:\n", " packed_input = pack_padded_sequence(\n", " input_rnn, batch[\"size\"], batch_first=True, enforce_sorted=False\n", " )\n", " packed_rnn_out, _ = self.rnn(packed_input)\n", " rnn_out, _ = pad_packed_sequence(packed_rnn_out, batch_first=True)\n", " else:\n", " rnn_out, _ = self.rnn(input_rnn)\n", "\n", " return rnn_out" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "81b65336788c4025b46205c8087e233b", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=3125.0), HTML(value='')))" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Time taken when packing is enabled : 22.701847791671753\n" ] } ], "source": [ "ds = DummyDataset(df)\n", "dl = DataLoader(ds, batch_size=32, shuffle=True)\n", "\n", "model = DummyModel(should_pack=True)\n", "start_time = time.time()\n", "for batch_idx, batch in enumerate(tqdm(dl)):\n", " model(batch)\n", "print(f\"Time taken when packing is enabled : {time.time() - start_time}\")" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "ffb9dbb8b7b549ed83f3496f51a348d5", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=3125.0), HTML(value='')))" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Time taken when packing is disabled : 19.904962062835693\n" ] } ], "source": [ "ds = DummyDataset(df)\n", "dl = DataLoader(ds, batch_size=32, shuffle=True)\n", "\n", "model = DummyModel(should_pack=False)\n", "start_time = time.time()\n", "for batch_idx, batch in enumerate(tqdm(dl)):\n", " model(batch)\n", "print(f\"Time taken when packing is disabled : {time.time() - start_time}\")" ] } ], -
praateekmahajan created this gist
Dec 16, 2020 .There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,171 @@ { "cells": [ { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "import random\n", "from torch.utils.data import Dataset, DataLoader\n", "from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence\n", "import torch.nn as nn\n", "import torch\n", "import numpy as np\n", "import pandas as pd\n", "import time\n", "from tqdm.notebook import tqdm\n", "\n", "num_rows = 100000\n", "max_length = 100\n", "\n", "df = pd.DataFrame(\n", " [\n", " {\"data\": np.random.randn(np.random.randint(1, max_length))}\n", " for _ in range(num_rows)\n", " ]\n", ")\n", "df[\"size\"] = df[\"data\"].apply(len)\n", "\n", "\n", "class DummyDataset(Dataset):\n", " def __init__(self, file_name_or_df, max_seq_len=50):\n", " self.max_seq_len = max_seq_len\n", " if isinstance(file_name_or_df, str):\n", " self.data = pd.read_json(file_name_or_df)\n", " else:\n", " self.data = file_name_or_df\n", "\n", " def __len__(self):\n", " return len(self.data)\n", "\n", " def __getitem__(self, idx):\n", " sample = self.data.iloc[idx]\n", " seq_len = sample[\"size\"]\n", " seq = sample[\"data\"]\n", " if seq_len > self.max_seq_len:\n", " seq = np.asarray(seq[-self.max_seq_len :])\n", " else:\n", " seq = np.pad(\n", " seq, pad_width=(0, self.max_seq_len - seq_len), constant_values=0\n", " )\n", "\n", " return {\n", " \"data\": torch.tensor(seq, dtype=torch.float),\n", " \"size\": min(seq_len, self.max_seq_len),\n", " }\n", "\n", "\n", "class DummyModel(nn.Module):\n", " def __init__(self, should_pad):\n", " super(DummyModel, self).__init__()\n", "\n", " self.num_classes = 2\n", " self.rnn_in_dim = 1\n", " self.rnn_out_dim = 3\n", " self.rnn_num_layers = 1\n", "\n", " self.should_pad = should_pad\n", "\n", " self.rnn = nn.RNN(\n", " input_size=self.rnn_in_dim,\n", " hidden_size=self.rnn_out_dim,\n", " num_layers=self.rnn_num_layers,\n", " batch_first=True,\n", " )\n", "\n", " self.fc = nn.Linear(self.rnn_out_dim, self.num_classes)\n", "\n", " def forward(self, batch):\n", "\n", " input_rnn = batch[\"data\"].unsqueeze(-1)\n", " if self.should_pad:\n", " packed_input = pack_padded_sequence(\n", " input_rnn, batch[\"size\"], batch_first=True, enforce_sorted=False\n", " )\n", " packed_rnn_out, _ = self.rnn(packed_input)\n", " rnn_out, _ = pad_packed_sequence(packed_rnn_out, batch_first=True)\n", " else:\n", " rnn_out, _ = self.rnn(input_rnn)\n", "\n", " return rnn_out" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "dac4d14c37b44a2db8823d2c308f5b61", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=3125.0), HTML(value='')))" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Time taken when padding is enabled : 22.755648136138916\n" ] } ], "source": [ "ds = DummyDataset(df)\n", "dl = DataLoader(ds, batch_size=32, shuffle=True)\n", "\n", "model = DummyModel(should_pad=True)\n", "start_time = time.time()\n", "for batch_idx, batch in enumerate(tqdm(dl)):\n", " model(batch)\n", "print(f\"Time taken when padding is enabled : {time.time() - start_time}\")" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "ds = DummyDataset(df)\n", "dl = DataLoader(ds, batch_size=32, shuffle=True)\n", "\n", "model = DummyModel(should_pad=False)\n", "start_time = time.time()\n", "for batch_idx, batch in enumerate(tqdm(dl)):\n", " model(batch)\n", "print(f\"Time taken when padding is disabled : {time.time() - start_time}\")" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.4" } }, "nbformat": 4, "nbformat_minor": 2 }