Skip to content

Instantly share code, notes, and snippets.

@Sparrow0hawk
Last active April 15, 2022 19:09
Show Gist options
  • Select an option

  • Save Sparrow0hawk/bf3bf62abc4a623cf1c5f5cb20d806a6 to your computer and use it in GitHub Desktop.

Select an option

Save Sparrow0hawk/bf3bf62abc4a623cf1c5f5cb20d806a6 to your computer and use it in GitHub Desktop.
Extracting topic scores for documents using LDA Gensim (SOO answer)
Display the source blob
Display the rendered blob
Raw
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "Untitled28.ipynb",
"provenance": [],
"authorship_tag": "ABX9TyMbQk1gQNIBS7J1ybWMNhHa",
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/Sparrow0hawk/bf3bf62abc4a623cf1c5f5cb20d806a6/untitled28.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"id": "rD5gmpMum-Fe"
},
"outputs": [],
"source": [
"from gensim.test.utils import common_texts, common_corpus, common_dictionary\n",
"from gensim.models import LdaModel\n",
"\n",
"# train a quick lda model using the common _corpus, _dictionary and _texts from gensim\n",
"optimal_model = LdaModel(common_corpus, id2word=common_dictionary, num_topics=10)"
]
},
{
"cell_type": "code",
"source": [
"import pandas as pd\n",
"\n",
"##dominant topic for each document\n",
"def format_topics_sentences(ldamodel=optimal_model, \n",
" corpus=common_corpus, \n",
" texts=common_texts, \n",
" n=1):\n",
" \"\"\"\n",
" A function for extracting a number of dominant topics for a given document\n",
" using an existing LDA model\n",
" \"\"\"\n",
" # Init output\n",
" sent_topics_df = pd.DataFrame()\n",
"\n",
"\n",
" # Get main topic in each document\n",
" for i, row in enumerate(ldamodel[corpus]):\n",
" row = sorted(row, key=lambda x: (x[1]), reverse=True)\n",
" # Get the Dominant topic, Perc Contribution and Keywords for each document\n",
" for j, (topic_num, prop_topic) in enumerate(row):\n",
" # we use range here to iterate over the n parameter\n",
" if j in range(n): # => dominant topic\n",
" wp = ldamodel.show_topic(topic_num)\n",
" topic_keywords = \", \".join([word for word, prop in wp])\n",
" sent_topics_df = sent_topics_df.append(\n",
" # and also use the i value here to get the document label\n",
" pd.Series([int(i), int(topic_num), round(prop_topic, 4), topic_keywords]),\n",
" ignore_index=True,\n",
" )\n",
" else:\n",
" break\n",
" sent_topics_df.columns = [\"Document\", \"Dominant_Topic\", \"Perc_Contribution\", \"Topic_Keywords\"]\n",
"\n",
" # Add original text to the end of the output\n",
" text_col = [texts[int(i)] for i in sent_topics_df.Document.tolist()]\n",
" contents = pd.Series(text_col, name='original_texts')\n",
" sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)\n",
" return sent_topics_df"
],
"metadata": {
"id": "ylYlNPQLm_Fb"
},
"execution_count": 2,
"outputs": []
},
{
"cell_type": "code",
"source": [
"format_topics_sentences(ldamodel=optimal_model, corpus=common_corpus, texts=common_texts, n=2)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 614
},
"id": "OgK69on8nDsT",
"outputId": "40b988aa-f19b-41cc-923a-96fb034be298"
},
"execution_count": 3,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" Document Dominant_Topic Perc_Contribution \\\n",
"0 0.0 7.0 0.7750 \n",
"1 0.0 5.0 0.0250 \n",
"2 1.0 4.0 0.8714 \n",
"3 1.0 3.0 0.0143 \n",
"4 2.0 5.0 0.8200 \n",
"5 2.0 0.0 0.0200 \n",
"6 3.0 0.0 0.8200 \n",
"7 3.0 5.0 0.0200 \n",
"8 4.0 3.0 0.7750 \n",
"9 4.0 4.0 0.0250 \n",
"10 5.0 8.0 0.5500 \n",
"11 5.0 4.0 0.0500 \n",
"12 6.0 8.0 0.7000 \n",
"13 6.0 2.0 0.0333 \n",
"14 7.0 8.0 0.7750 \n",
"15 7.0 2.0 0.0250 \n",
"16 8.0 2.0 0.7750 \n",
"17 8.0 8.0 0.0250 \n",
"\n",
" Topic_Keywords \\\n",
"0 human, interface, computer, trees, graph, syst... \n",
"1 system, user, interface, eps, trees, graph, co... \n",
"2 user, survey, time, computer, system, response... \n",
"3 user, response, time, trees, graph, system, in... \n",
"4 system, user, interface, eps, trees, graph, co... \n",
"5 system, eps, human, trees, graph, user, comput... \n",
"6 system, eps, human, trees, graph, user, comput... \n",
"7 system, user, interface, eps, trees, graph, co... \n",
"8 user, response, time, trees, graph, system, in... \n",
"9 user, survey, time, computer, system, response... \n",
"10 graph, trees, minors, system, interface, time,... \n",
"11 user, survey, time, computer, system, response... \n",
"12 graph, trees, minors, system, interface, time,... \n",
"13 minors, survey, graph, trees, system, user, hu... \n",
"14 graph, trees, minors, system, interface, time,... \n",
"15 minors, survey, graph, trees, system, user, hu... \n",
"16 minors, survey, graph, trees, system, user, hu... \n",
"17 graph, trees, minors, system, interface, time,... \n",
"\n",
" original_texts \n",
"0 [human, interface, computer] \n",
"1 [human, interface, computer] \n",
"2 [survey, user, computer, system, response, time] \n",
"3 [survey, user, computer, system, response, time] \n",
"4 [eps, user, interface, system] \n",
"5 [eps, user, interface, system] \n",
"6 [system, human, system, eps] \n",
"7 [system, human, system, eps] \n",
"8 [user, response, time] \n",
"9 [user, response, time] \n",
"10 [trees] \n",
"11 [trees] \n",
"12 [graph, trees] \n",
"13 [graph, trees] \n",
"14 [graph, minors, trees] \n",
"15 [graph, minors, trees] \n",
"16 [graph, minors, survey] \n",
"17 [graph, minors, survey] "
],
"text/html": [
"\n",
" <div id=\"df-269dacc6-7800-4c13-a48d-88ad0314e5c2\">\n",
" <div class=\"colab-df-container\">\n",
" <div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Document</th>\n",
" <th>Dominant_Topic</th>\n",
" <th>Perc_Contribution</th>\n",
" <th>Topic_Keywords</th>\n",
" <th>original_texts</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0.0</td>\n",
" <td>7.0</td>\n",
" <td>0.7750</td>\n",
" <td>human, interface, computer, trees, graph, syst...</td>\n",
" <td>[human, interface, computer]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0.0</td>\n",
" <td>5.0</td>\n",
" <td>0.0250</td>\n",
" <td>system, user, interface, eps, trees, graph, co...</td>\n",
" <td>[human, interface, computer]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1.0</td>\n",
" <td>4.0</td>\n",
" <td>0.8714</td>\n",
" <td>user, survey, time, computer, system, response...</td>\n",
" <td>[survey, user, computer, system, response, time]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1.0</td>\n",
" <td>3.0</td>\n",
" <td>0.0143</td>\n",
" <td>user, response, time, trees, graph, system, in...</td>\n",
" <td>[survey, user, computer, system, response, time]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>2.0</td>\n",
" <td>5.0</td>\n",
" <td>0.8200</td>\n",
" <td>system, user, interface, eps, trees, graph, co...</td>\n",
" <td>[eps, user, interface, system]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>2.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0200</td>\n",
" <td>system, eps, human, trees, graph, user, comput...</td>\n",
" <td>[eps, user, interface, system]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>3.0</td>\n",
" <td>0.0</td>\n",
" <td>0.8200</td>\n",
" <td>system, eps, human, trees, graph, user, comput...</td>\n",
" <td>[system, human, system, eps]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>3.0</td>\n",
" <td>5.0</td>\n",
" <td>0.0200</td>\n",
" <td>system, user, interface, eps, trees, graph, co...</td>\n",
" <td>[system, human, system, eps]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>4.0</td>\n",
" <td>3.0</td>\n",
" <td>0.7750</td>\n",
" <td>user, response, time, trees, graph, system, in...</td>\n",
" <td>[user, response, time]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>4.0</td>\n",
" <td>4.0</td>\n",
" <td>0.0250</td>\n",
" <td>user, survey, time, computer, system, response...</td>\n",
" <td>[user, response, time]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>5.0</td>\n",
" <td>8.0</td>\n",
" <td>0.5500</td>\n",
" <td>graph, trees, minors, system, interface, time,...</td>\n",
" <td>[trees]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>5.0</td>\n",
" <td>4.0</td>\n",
" <td>0.0500</td>\n",
" <td>user, survey, time, computer, system, response...</td>\n",
" <td>[trees]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>6.0</td>\n",
" <td>8.0</td>\n",
" <td>0.7000</td>\n",
" <td>graph, trees, minors, system, interface, time,...</td>\n",
" <td>[graph, trees]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>6.0</td>\n",
" <td>2.0</td>\n",
" <td>0.0333</td>\n",
" <td>minors, survey, graph, trees, system, user, hu...</td>\n",
" <td>[graph, trees]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>7.0</td>\n",
" <td>8.0</td>\n",
" <td>0.7750</td>\n",
" <td>graph, trees, minors, system, interface, time,...</td>\n",
" <td>[graph, minors, trees]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>7.0</td>\n",
" <td>2.0</td>\n",
" <td>0.0250</td>\n",
" <td>minors, survey, graph, trees, system, user, hu...</td>\n",
" <td>[graph, minors, trees]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>8.0</td>\n",
" <td>2.0</td>\n",
" <td>0.7750</td>\n",
" <td>minors, survey, graph, trees, system, user, hu...</td>\n",
" <td>[graph, minors, survey]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>8.0</td>\n",
" <td>8.0</td>\n",
" <td>0.0250</td>\n",
" <td>graph, trees, minors, system, interface, time,...</td>\n",
" <td>[graph, minors, survey]</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>\n",
" <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-269dacc6-7800-4c13-a48d-88ad0314e5c2')\"\n",
" title=\"Convert this dataframe to an interactive table.\"\n",
" style=\"display:none;\">\n",
" \n",
" <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
" width=\"24px\">\n",
" <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n",
" <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n",
" </svg>\n",
" </button>\n",
" \n",
" <style>\n",
" .colab-df-container {\n",
" display:flex;\n",
" flex-wrap:wrap;\n",
" gap: 12px;\n",
" }\n",
"\n",
" .colab-df-convert {\n",
" background-color: #E8F0FE;\n",
" border: none;\n",
" border-radius: 50%;\n",
" cursor: pointer;\n",
" display: none;\n",
" fill: #1967D2;\n",
" height: 32px;\n",
" padding: 0 0 0 0;\n",
" width: 32px;\n",
" }\n",
"\n",
" .colab-df-convert:hover {\n",
" background-color: #E2EBFA;\n",
" box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
" fill: #174EA6;\n",
" }\n",
"\n",
" [theme=dark] .colab-df-convert {\n",
" background-color: #3B4455;\n",
" fill: #D2E3FC;\n",
" }\n",
"\n",
" [theme=dark] .colab-df-convert:hover {\n",
" background-color: #434B5C;\n",
" box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
" filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
" fill: #FFFFFF;\n",
" }\n",
" </style>\n",
"\n",
" <script>\n",
" const buttonEl =\n",
" document.querySelector('#df-269dacc6-7800-4c13-a48d-88ad0314e5c2 button.colab-df-convert');\n",
" buttonEl.style.display =\n",
" google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
"\n",
" async function convertToInteractive(key) {\n",
" const element = document.querySelector('#df-269dacc6-7800-4c13-a48d-88ad0314e5c2');\n",
" const dataTable =\n",
" await google.colab.kernel.invokeFunction('convertToInteractive',\n",
" [key], {});\n",
" if (!dataTable) return;\n",
"\n",
" const docLinkHtml = 'Like what you see? Visit the ' +\n",
" '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
" + ' to learn more about interactive tables.';\n",
" element.innerHTML = '';\n",
" dataTable['output_type'] = 'display_data';\n",
" await google.colab.output.renderOutput(dataTable, element);\n",
" const docLink = document.createElement('div');\n",
" docLink.innerHTML = docLinkHtml;\n",
" element.appendChild(docLink);\n",
" }\n",
" </script>\n",
" </div>\n",
" </div>\n",
" "
]
},
"metadata": {},
"execution_count": 3
}
]
},
{
"cell_type": "code",
"source": [
"for i in optimal_model[common_corpus]:\n",
" print(i)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "b_-wOVlInHLB",
"outputId": "4b03ce2d-ec95-4a79-b016-5bf212e3f2b0"
},
"execution_count": 5,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"[(0, 0.025002241), (1, 0.025000002), (2, 0.025), (3, 0.025), (4, 0.02500137), (5, 0.025002241), (6, 0.025000002), (7, 0.77499413), (8, 0.025), (9, 0.025000002)]\n",
"[(0, 0.014288797), (1, 0.014285716), (2, 0.014287288), (3, 0.014290491), (4, 0.8714151), (5, 0.014288196), (6, 0.014285716), (7, 0.014287288), (8, 0.014285714), (9, 0.014285716)]\n",
"[(0, 0.020005856), (1, 0.020000003), (2, 0.020000001), (3, 0.020002093), (4, 0.020002015), (5, 0.81998795), (6, 0.020000003), (7, 0.020002093), (8, 0.020000001), (9, 0.020000003)]\n",
"[(0, 0.8199941), (1, 0.020000001), (2, 0.02), (3, 0.02), (4, 0.020000812), (5, 0.02000298), (6, 0.020000001), (7, 0.020002091), (8, 0.02), (9, 0.020000001)]\n",
"[(0, 0.025), (1, 0.025000002), (2, 0.025), (3, 0.77499324), (4, 0.025004486), (5, 0.025002241), (6, 0.025000002), (7, 0.025), (8, 0.025), (9, 0.025000002)]\n",
"[(0, 0.05), (1, 0.050000004), (2, 0.05), (3, 0.05), (4, 0.050006837), (5, 0.05), (6, 0.050000004), (7, 0.05), (8, 0.54999316), (9, 0.050000004)]\n",
"[(0, 0.033333335), (1, 0.033333335), (2, 0.033338174), (3, 0.033333335), (4, 0.03333516), (5, 0.033333335), (6, 0.033333335), (7, 0.033333335), (8, 0.6999933), (9, 0.033333335)]\n",
"[(0, 0.025), (1, 0.025000002), (2, 0.025009582), (3, 0.025), (4, 0.02500085), (5, 0.025), (6, 0.025000002), (7, 0.025), (8, 0.77498955), (9, 0.025000002)]\n",
"[(0, 0.025), (1, 0.025000002), (2, 0.7749882), (3, 0.025), (4, 0.025001371), (5, 0.025), (6, 0.025000002), (7, 0.025), (8, 0.025010476), (9, 0.025000002)]\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"list(optimal_model[common_corpus])"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "Pcwk9DDGnLSW",
"outputId": "7ffe6328-98e2-468a-bccc-39e55350f780"
},
"execution_count": 7,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"[[(0, 0.025002243),\n",
" (1, 0.025000002),\n",
" (2, 0.025),\n",
" (3, 0.025),\n",
" (4, 0.02500137),\n",
" (5, 0.025002243),\n",
" (6, 0.025000002),\n",
" (7, 0.7749942),\n",
" (8, 0.025),\n",
" (9, 0.025000002)],\n",
" [(0, 0.014288797),\n",
" (1, 0.014285716),\n",
" (2, 0.014287288),\n",
" (3, 0.01429052),\n",
" (4, 0.8714151),\n",
" (5, 0.014288196),\n",
" (6, 0.014285716),\n",
" (7, 0.014287288),\n",
" (8, 0.014285714),\n",
" (9, 0.014285716)],\n",
" [(0, 0.02000577),\n",
" (1, 0.020000001),\n",
" (2, 0.02),\n",
" (3, 0.020002091),\n",
" (4, 0.020002013),\n",
" (5, 0.8199881),\n",
" (6, 0.020000001),\n",
" (7, 0.020002091),\n",
" (8, 0.02),\n",
" (9, 0.020000001)],\n",
" [(0, 0.8199941),\n",
" (1, 0.02),\n",
" (2, 0.019999998),\n",
" (3, 0.019999998),\n",
" (4, 0.02000081),\n",
" (5, 0.020002991),\n",
" (6, 0.02),\n",
" (7, 0.02000209),\n",
" (8, 0.019999998),\n",
" (9, 0.02)],\n",
" [(0, 0.025000002),\n",
" (1, 0.025000004),\n",
" (2, 0.025000002),\n",
" (3, 0.77499366),\n",
" (4, 0.02500412),\n",
" (5, 0.025002243),\n",
" (6, 0.025000004),\n",
" (7, 0.025000002),\n",
" (8, 0.025000002),\n",
" (9, 0.025000004)],\n",
" [(0, 0.049999993),\n",
" (1, 0.049999997),\n",
" (2, 0.049999993),\n",
" (3, 0.049999993),\n",
" (4, 0.05000683),\n",
" (5, 0.049999993),\n",
" (6, 0.049999997),\n",
" (7, 0.049999993),\n",
" (8, 0.54999316),\n",
" (9, 0.049999997)],\n",
" [(0, 0.033333335),\n",
" (1, 0.033333335),\n",
" (2, 0.033337615),\n",
" (3, 0.033333335),\n",
" (4, 0.033335157),\n",
" (5, 0.033333335),\n",
" (6, 0.033333335),\n",
" (7, 0.033333335),\n",
" (8, 0.69999385),\n",
" (9, 0.033333335)],\n",
" [(0, 0.025),\n",
" (1, 0.025000002),\n",
" (2, 0.02500647),\n",
" (3, 0.025),\n",
" (4, 0.025000848),\n",
" (5, 0.025),\n",
" (6, 0.025000002),\n",
" (7, 0.025),\n",
" (8, 0.77499264),\n",
" (9, 0.025000002)],\n",
" [(0, 0.025000002),\n",
" (1, 0.025000004),\n",
" (2, 0.77499217),\n",
" (3, 0.025000002),\n",
" (4, 0.025001371),\n",
" (5, 0.025000002),\n",
" (6, 0.025000004),\n",
" (7, 0.025000002),\n",
" (8, 0.025006471),\n",
" (9, 0.025000004)]]"
]
},
"metadata": {},
"execution_count": 7
}
]
},
{
"cell_type": "code",
"source": [
""
],
"metadata": {
"id": "jxG1iDelndlS"
},
"execution_count": null,
"outputs": []
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment