Last active
August 8, 2021 17:17
-
-
Save SandieIJ/69fc80c372e823fecfd4eeeda2156936 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "code", | |
| "execution_count": 5, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "Requirement already satisfied: spacy in /Users/sandiejirongo/opt/anaconda3/lib/python3.7/site-packages (2.2.4)\n", | |
| "Requirement already satisfied: catalogue<1.1.0,>=0.0.7 in /Users/sandiejirongo/opt/anaconda3/lib/python3.7/site-packages (from spacy) (1.0.0)\n", | |
| "Requirement already satisfied: wasabi<1.1.0,>=0.4.0 in /Users/sandiejirongo/opt/anaconda3/lib/python3.7/site-packages (from spacy) (0.6.0)\n", | |
| "Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /Users/sandiejirongo/opt/anaconda3/lib/python3.7/site-packages (from spacy) (3.0.2)\n", | |
| "Requirement already satisfied: plac<1.2.0,>=0.9.6 in /Users/sandiejirongo/opt/anaconda3/lib/python3.7/site-packages (from spacy) (1.1.3)\n", | |
| "Requirement already satisfied: numpy>=1.15.0 in /Users/sandiejirongo/opt/anaconda3/lib/python3.7/site-packages (from spacy) (1.17.2)\n", | |
| "Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /Users/sandiejirongo/opt/anaconda3/lib/python3.7/site-packages (from spacy) (1.0.2)\n", | |
| "Requirement already satisfied: setuptools in /Users/sandiejirongo/opt/anaconda3/lib/python3.7/site-packages (from spacy) (41.4.0)\n", | |
| "Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /Users/sandiejirongo/opt/anaconda3/lib/python3.7/site-packages (from spacy) (2.0.3)\n", | |
| "Requirement already satisfied: thinc==7.4.0 in /Users/sandiejirongo/opt/anaconda3/lib/python3.7/site-packages (from spacy) (7.4.0)\n", | |
| "Requirement already satisfied: requests<3.0.0,>=2.13.0 in /Users/sandiejirongo/opt/anaconda3/lib/python3.7/site-packages (from spacy) (2.22.0)\n", | |
| "Requirement already satisfied: blis<0.5.0,>=0.4.0 in /Users/sandiejirongo/opt/anaconda3/lib/python3.7/site-packages (from spacy) (0.4.1)\n", | |
| "Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in /Users/sandiejirongo/opt/anaconda3/lib/python3.7/site-packages (from spacy) (4.43.0)\n", | |
| "Requirement already satisfied: srsly<1.1.0,>=1.0.2 in /Users/sandiejirongo/opt/anaconda3/lib/python3.7/site-packages (from spacy) (1.0.2)\n", | |
| "Requirement already satisfied: importlib-metadata>=0.20; python_version < \"3.8\" in /Users/sandiejirongo/opt/anaconda3/lib/python3.7/site-packages (from catalogue<1.1.0,>=0.0.7->spacy) (0.23)\n", | |
| "Requirement already satisfied: chardet<3.1.0,>=3.0.2 in /Users/sandiejirongo/opt/anaconda3/lib/python3.7/site-packages (from requests<3.0.0,>=2.13.0->spacy) (3.0.4)\n", | |
| "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /Users/sandiejirongo/opt/anaconda3/lib/python3.7/site-packages (from requests<3.0.0,>=2.13.0->spacy) (1.24.2)\n", | |
| "Requirement already satisfied: certifi>=2017.4.17 in /Users/sandiejirongo/opt/anaconda3/lib/python3.7/site-packages (from requests<3.0.0,>=2.13.0->spacy) (2019.9.11)\n", | |
| "Requirement already satisfied: idna<2.9,>=2.5 in /Users/sandiejirongo/opt/anaconda3/lib/python3.7/site-packages (from requests<3.0.0,>=2.13.0->spacy) (2.8)\n", | |
| "Requirement already satisfied: zipp>=0.5 in /Users/sandiejirongo/opt/anaconda3/lib/python3.7/site-packages (from importlib-metadata>=0.20; python_version < \"3.8\"->catalogue<1.1.0,>=0.0.7->spacy) (0.6.0)\n", | |
| "Requirement already satisfied: more-itertools in /Users/sandiejirongo/opt/anaconda3/lib/python3.7/site-packages (from zipp>=0.5->importlib-metadata>=0.20; python_version < \"3.8\"->catalogue<1.1.0,>=0.0.7->spacy) (7.2.0)\n", | |
| "Requirement already satisfied: gensim in /Users/sandiejirongo/opt/anaconda3/lib/python3.7/site-packages (3.8.1)\n", | |
| "Requirement already satisfied: six>=1.5.0 in /Users/sandiejirongo/opt/anaconda3/lib/python3.7/site-packages (from gensim) (1.12.0)\n", | |
| "Requirement already satisfied: numpy>=1.11.3 in /Users/sandiejirongo/opt/anaconda3/lib/python3.7/site-packages (from gensim) (1.17.2)\n", | |
| "Requirement already satisfied: smart-open>=1.8.1 in /Users/sandiejirongo/opt/anaconda3/lib/python3.7/site-packages (from gensim) (1.9.0)\n", | |
| "Requirement already satisfied: scipy>=0.18.1 in /Users/sandiejirongo/opt/anaconda3/lib/python3.7/site-packages (from gensim) (1.4.1)\n", | |
| "Requirement already satisfied: boto>=2.32 in /Users/sandiejirongo/opt/anaconda3/lib/python3.7/site-packages (from smart-open>=1.8.1->gensim) (2.49.0)\n", | |
| "Requirement already satisfied: boto3 in /Users/sandiejirongo/opt/anaconda3/lib/python3.7/site-packages (from smart-open>=1.8.1->gensim) (1.12.11)\n", | |
| "Requirement already satisfied: requests in /Users/sandiejirongo/opt/anaconda3/lib/python3.7/site-packages (from smart-open>=1.8.1->gensim) (2.22.0)\n", | |
| "Requirement already satisfied: botocore<1.16.0,>=1.15.11 in /Users/sandiejirongo/opt/anaconda3/lib/python3.7/site-packages (from boto3->smart-open>=1.8.1->gensim) (1.15.11)\n", | |
| "Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /Users/sandiejirongo/opt/anaconda3/lib/python3.7/site-packages (from boto3->smart-open>=1.8.1->gensim) (0.9.5)\n", | |
| "Requirement already satisfied: s3transfer<0.4.0,>=0.3.0 in /Users/sandiejirongo/opt/anaconda3/lib/python3.7/site-packages (from boto3->smart-open>=1.8.1->gensim) (0.3.3)\n", | |
| "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /Users/sandiejirongo/opt/anaconda3/lib/python3.7/site-packages (from requests->smart-open>=1.8.1->gensim) (1.24.2)\n", | |
| "Requirement already satisfied: chardet<3.1.0,>=3.0.2 in /Users/sandiejirongo/opt/anaconda3/lib/python3.7/site-packages (from requests->smart-open>=1.8.1->gensim) (3.0.4)\n", | |
| "Requirement already satisfied: certifi>=2017.4.17 in /Users/sandiejirongo/opt/anaconda3/lib/python3.7/site-packages (from requests->smart-open>=1.8.1->gensim) (2019.9.11)\n", | |
| "Requirement already satisfied: idna<2.9,>=2.5 in /Users/sandiejirongo/opt/anaconda3/lib/python3.7/site-packages (from requests->smart-open>=1.8.1->gensim) (2.8)\n", | |
| "Requirement already satisfied: docutils<0.16,>=0.10 in /Users/sandiejirongo/opt/anaconda3/lib/python3.7/site-packages (from botocore<1.16.0,>=1.15.11->boto3->smart-open>=1.8.1->gensim) (0.15.2)\n", | |
| "Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /Users/sandiejirongo/opt/anaconda3/lib/python3.7/site-packages (from botocore<1.16.0,>=1.15.11->boto3->smart-open>=1.8.1->gensim) (2.8.0)\n", | |
| "Collecting https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.0/en_core_web_sm-2.2.0.tar.gz\n", | |
| "\u001b[?25l Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.0/en_core_web_sm-2.2.0.tar.gz (12.0MB)\n", | |
| "\u001b[K |████████████████████████████████| 12.0MB 11.5MB/s eta 0:00:01\n", | |
| "\u001b[?25hRequirement already satisfied (use --upgrade to upgrade): en-core-web-sm==2.2.0 from https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.0/en_core_web_sm-2.2.0.tar.gz in /Users/sandiejirongo/opt/anaconda3/lib/python3.7/site-packages\n", | |
| "Requirement already satisfied: spacy>=2.2.0 in /Users/sandiejirongo/opt/anaconda3/lib/python3.7/site-packages (from en-core-web-sm==2.2.0) (2.2.4)\n", | |
| "Requirement already satisfied: wasabi<1.1.0,>=0.4.0 in /Users/sandiejirongo/opt/anaconda3/lib/python3.7/site-packages (from spacy>=2.2.0->en-core-web-sm==2.2.0) (0.6.0)\n", | |
| "Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /Users/sandiejirongo/opt/anaconda3/lib/python3.7/site-packages (from spacy>=2.2.0->en-core-web-sm==2.2.0) (1.0.2)\n", | |
| "Requirement already satisfied: plac<1.2.0,>=0.9.6 in /Users/sandiejirongo/opt/anaconda3/lib/python3.7/site-packages (from spacy>=2.2.0->en-core-web-sm==2.2.0) (1.1.3)\n", | |
| "Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /Users/sandiejirongo/opt/anaconda3/lib/python3.7/site-packages (from spacy>=2.2.0->en-core-web-sm==2.2.0) (2.0.3)\n", | |
| "Requirement already satisfied: numpy>=1.15.0 in /Users/sandiejirongo/opt/anaconda3/lib/python3.7/site-packages (from spacy>=2.2.0->en-core-web-sm==2.2.0) (1.17.2)\n", | |
| "Requirement already satisfied: blis<0.5.0,>=0.4.0 in /Users/sandiejirongo/opt/anaconda3/lib/python3.7/site-packages (from spacy>=2.2.0->en-core-web-sm==2.2.0) (0.4.1)\n", | |
| "Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in /Users/sandiejirongo/opt/anaconda3/lib/python3.7/site-packages (from spacy>=2.2.0->en-core-web-sm==2.2.0) (4.43.0)\n", | |
| "Requirement already satisfied: thinc==7.4.0 in /Users/sandiejirongo/opt/anaconda3/lib/python3.7/site-packages (from spacy>=2.2.0->en-core-web-sm==2.2.0) (7.4.0)\n", | |
| "Requirement already satisfied: srsly<1.1.0,>=1.0.2 in /Users/sandiejirongo/opt/anaconda3/lib/python3.7/site-packages (from spacy>=2.2.0->en-core-web-sm==2.2.0) (1.0.2)\n", | |
| "Requirement already satisfied: catalogue<1.1.0,>=0.0.7 in /Users/sandiejirongo/opt/anaconda3/lib/python3.7/site-packages (from spacy>=2.2.0->en-core-web-sm==2.2.0) (1.0.0)\n", | |
| "Requirement already satisfied: requests<3.0.0,>=2.13.0 in /Users/sandiejirongo/opt/anaconda3/lib/python3.7/site-packages (from spacy>=2.2.0->en-core-web-sm==2.2.0) (2.22.0)\n", | |
| "Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /Users/sandiejirongo/opt/anaconda3/lib/python3.7/site-packages (from spacy>=2.2.0->en-core-web-sm==2.2.0) (3.0.2)\n" | |
| ] | |
| }, | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "Requirement already satisfied: setuptools in /Users/sandiejirongo/opt/anaconda3/lib/python3.7/site-packages (from spacy>=2.2.0->en-core-web-sm==2.2.0) (41.4.0)\n", | |
| "Requirement already satisfied: importlib-metadata>=0.20; python_version < \"3.8\" in /Users/sandiejirongo/opt/anaconda3/lib/python3.7/site-packages (from catalogue<1.1.0,>=0.0.7->spacy>=2.2.0->en-core-web-sm==2.2.0) (0.23)\n", | |
| "Requirement already satisfied: certifi>=2017.4.17 in /Users/sandiejirongo/opt/anaconda3/lib/python3.7/site-packages (from requests<3.0.0,>=2.13.0->spacy>=2.2.0->en-core-web-sm==2.2.0) (2019.9.11)\n", | |
| "Requirement already satisfied: chardet<3.1.0,>=3.0.2 in /Users/sandiejirongo/opt/anaconda3/lib/python3.7/site-packages (from requests<3.0.0,>=2.13.0->spacy>=2.2.0->en-core-web-sm==2.2.0) (3.0.4)\n", | |
| "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /Users/sandiejirongo/opt/anaconda3/lib/python3.7/site-packages (from requests<3.0.0,>=2.13.0->spacy>=2.2.0->en-core-web-sm==2.2.0) (1.24.2)\n", | |
| "Requirement already satisfied: idna<2.9,>=2.5 in /Users/sandiejirongo/opt/anaconda3/lib/python3.7/site-packages (from requests<3.0.0,>=2.13.0->spacy>=2.2.0->en-core-web-sm==2.2.0) (2.8)\n", | |
| "Requirement already satisfied: zipp>=0.5 in /Users/sandiejirongo/opt/anaconda3/lib/python3.7/site-packages (from importlib-metadata>=0.20; python_version < \"3.8\"->catalogue<1.1.0,>=0.0.7->spacy>=2.2.0->en-core-web-sm==2.2.0) (0.6.0)\n", | |
| "Requirement already satisfied: more-itertools in /Users/sandiejirongo/opt/anaconda3/lib/python3.7/site-packages (from zipp>=0.5->importlib-metadata>=0.20; python_version < \"3.8\"->catalogue<1.1.0,>=0.0.7->spacy>=2.2.0->en-core-web-sm==2.2.0) (7.2.0)\n", | |
| "Building wheels for collected packages: en-core-web-sm\n", | |
| " Building wheel for en-core-web-sm (setup.py) ... \u001b[?25ldone\n", | |
| "\u001b[?25h Created wheel for en-core-web-sm: filename=en_core_web_sm-2.2.0-cp37-none-any.whl size=12019125 sha256=63a6868af18bc78b6d100ef11c2b99366673b8d9ac50a3aa3ca4c5fa3d1e120b\n", | |
| " Stored in directory: /Users/sandiejirongo/Library/Caches/pip/wheels/48/5c/1c/15f9d02afc8221a668d2172446dd8467b20cdb9aef80a172a4\n", | |
| "Successfully built en-core-web-sm\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "!pip install spacy\n", | |
| "!pip install gensim\n", | |
| "!pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.0/en_core_web_sm-2.2.0.tar.gz" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 6, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "import spacy\n", | |
| "import re\n", | |
| "import pandas as pd\n", | |
| "import gensim\n", | |
| "from gensim.utils import simple_preprocess \n", | |
| "import gensim.corpora as corpora\n", | |
| "from pprint import pprint" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 7, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>name</th>\n", | |
| " <th>description</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <td>0</td>\n", | |
| " <td>Legend Fire Squad survival: Free Fire Battlegr...</td>\n", | |
| " <td>Ready to play an amazing and exciting best sho...</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>1</td>\n", | |
| " <td>Ambulance Game</td>\n", | |
| " <td>You must be a fan of the driving games. We ass...</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>2</td>\n", | |
| " <td>Beam Drive NG Death Stair Car Crash Simulator</td>\n", | |
| " <td>Beam Drive NG Death Stair Car Crash Accidents ...</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>3</td>\n", | |
| " <td>Kelime İncileri</td>\n", | |
| " <td>Yeni Kelime Bulmaca Oyununuz! Kelime Arama ve ...</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>4</td>\n", | |
| " <td>Word Blocks</td>\n", | |
| " <td>Word Blocks is a new kind of word search puzzl...</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>5</td>\n", | |
| " <td>Free Fire Commando - Counter Attack FPS 2019</td>\n", | |
| " <td>Free Fire Commando - Counter Attack FPS 2019 i...</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>6</td>\n", | |
| " <td>Fall Race 3D</td>\n", | |
| " <td>The most exciting sky race!Run through the sky...</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>7</td>\n", | |
| " <td>Math School Game Basic: Crazy Principal</td>\n", | |
| " <td>Your school principal went crazy and locked yo...</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>8</td>\n", | |
| " <td>Jump Cube</td>\n", | |
| " <td>Jump Cube is an addictive game, tap the right ...</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>9</td>\n", | |
| " <td>Tien Len Offline</td>\n", | |
| " <td>Một tựa game cũng như cách chơi ko thể quen th...</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " name \\\n", | |
| "0 Legend Fire Squad survival: Free Fire Battlegr... \n", | |
| "1 Ambulance Game \n", | |
| "2 Beam Drive NG Death Stair Car Crash Simulator \n", | |
| "3 Kelime İncileri \n", | |
| "4 Word Blocks \n", | |
| "5 Free Fire Commando - Counter Attack FPS 2019 \n", | |
| "6 Fall Race 3D \n", | |
| "7 Math School Game Basic: Crazy Principal \n", | |
| "8 Jump Cube \n", | |
| "9 Tien Len Offline \n", | |
| "\n", | |
| " description \n", | |
| "0 Ready to play an amazing and exciting best sho... \n", | |
| "1 You must be a fan of the driving games. We ass... \n", | |
| "2 Beam Drive NG Death Stair Car Crash Accidents ... \n", | |
| "3 Yeni Kelime Bulmaca Oyununuz! Kelime Arama ve ... \n", | |
| "4 Word Blocks is a new kind of word search puzzl... \n", | |
| "5 Free Fire Commando - Counter Attack FPS 2019 i... \n", | |
| "6 The most exciting sky race!Run through the sky... \n", | |
| "7 Your school principal went crazy and locked yo... \n", | |
| "8 Jump Cube is an addictive game, tap the right ... \n", | |
| "9 Một tựa game cũng như cách chơi ko thể quen th... " | |
| ] | |
| }, | |
| "execution_count": 7, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "# Load the app names and descriptions from the CSV file\n", | |
| "data = pd.read_csv(\"https://raw.githubusercontent.com/SandieIJ/Capstone/master/data/sandra_csv_results-20190723-155508.csv\")\n", | |
| "\n", | |
| "data.head(10)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 8, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "descriptions = data.description.values.tolist()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 9, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "# Remove new line characters\n", | |
| "no_new_lines = [re.sub('\\s+', ' ', sent) for sent in descriptions] \n", | |
| "\n", | |
| "#Remove non letter characters\n", | |
| "non_letters = [re.sub('[^a-zA-Z]', ' ', no_new_line) for no_new_line in no_new_lines]\n", | |
| "\n", | |
| "# Remove distracting single quotes\n", | |
| "no_quotes = [re.sub(\"\\'\", '', non_letter) for non_letter in non_letters]\n", | |
| "\n", | |
| "#break down sentences into words\n", | |
| "def sent_to_words(sentences): \n", | |
| " for sentence in sentences:\n", | |
| " yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))\n", | |
| "\n", | |
| "data_words = list(sent_to_words(descriptions)) " | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "# Build the bigram and trigram models\n", | |
| "bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) \n", | |
| "trigram = gensim.models.Phrases(bigram[data_words], threshold=100)\n", | |
| "\n", | |
| "# Wrap the trained models in Phraser objects, a faster way to merge detected bigrams/trigrams in a sentence\n", | |
| "bigram_mod = gensim.models.phrases.Phraser(bigram) \n", | |
| "\n", | |
| "trigram_mod = gensim.models.phrases.Phraser(trigram)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "# Initialize spacy\n", | |
| "nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])\n", | |
| "\n", | |
| "def make_bigrams(texts):\n", | |
| " return [bigram_mod[doc] for doc in texts]\n", | |
| "\n", | |
| "def make_trigrams(texts):\n", | |
| " return [trigram_mod[bigram_mod[doc]] for doc in texts]\n", | |
| "\n", | |
| "def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']): \n", | |
| " \"\"\"https://spacy.io/api/annotation\"\"\"\n", | |
| " texts_out = []\n", | |
| " for sent in texts:\n", | |
| " doc = nlp(\" \".join(sent))\n", | |
| " texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags]) \n", | |
| " return texts_out\n", | |
| "\n", | |
| "# Form Bigrams\n", | |
| "data_words_bigrams = make_bigrams(data_words)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "# Perform lemmatization keeping only nouns, adjectives, verbs and adverbs\n", | |
| "data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']) \n", | |
| "\n", | |
| "print(data_lemmatized[:1])" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "texts = data_lemmatized\n", | |
| "\n", | |
| "# a mapping between words and their corresponding integer values\n", | |
| "id2word = corpora.Dictionary(texts) \n", | |
| "\n", | |
| "# Term-document frequency: doc2bow turns each document into (word_id, count) pairs using the dictionary's ids\n", | |
| "corpus = [id2word.doc2bow(text) for text in texts]\n", | |
| "\n", | |
| "# This corpus is a mapping of (word_id, word_frequency)\n", | |
| "print(corpus[:1])" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "# Human readable format of corpus (term-frequency)\n", | |
| "[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Python 3", | |
| "language": "python", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.7.4" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 2 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment