{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!pip install spacy\n", "!pip install gensim\n", "!pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.0/en_core_web_sm-2.2.0.tar.gz" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "import spacy\n", "import re\n", "import pandas as pd\n", "import gensim\n", "from gensim.utils import simple_preprocess \n", "import gensim.corpora as corpora\n", "from pprint import pprint" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
namedescription
0Legend Fire Squad survival: Free Fire Battlegr...Ready to play an amazing and exciting best sho...
1Ambulance GameYou must be a fan of the driving games. We ass...
2Beam Drive NG Death Stair Car Crash SimulatorBeam Drive NG Death Stair Car Crash Accidents ...
3Kelime İncileriYeni Kelime Bulmaca Oyununuz! Kelime Arama ve ...
4Word BlocksWord Blocks is a new kind of word search puzzl...
5Free Fire Commando - Counter Attack FPS 2019Free Fire Commando - Counter Attack FPS 2019 i...
6Fall Race 3DThe most exciting sky race!Run through the sky...
7Math School Game Basic: Crazy PrincipalYour school principal went crazy and locked yo...
8Jump CubeJump Cube is an addictive game, tap the right ...
9Tien Len OfflineMột tựa game cũng như cách chơi ko thể quen th...
\n", "
" ], "text/plain": [ " name \\\n", "0 Legend Fire Squad survival: Free Fire Battlegr... \n", "1 Ambulance Game \n", "2 Beam Drive NG Death Stair Car Crash Simulator \n", "3 Kelime İncileri \n", "4 Word Blocks \n", "5 Free Fire Commando - Counter Attack FPS 2019 \n", "6 Fall Race 3D \n", "7 Math School Game Basic: Crazy Principal \n", "8 Jump Cube \n", "9 Tien Len Offline \n", "\n", " description \n", "0 Ready to play an amazing and exciting best sho... \n", "1 You must be a fan of the driving games. We ass... \n", "2 Beam Drive NG Death Stair Car Crash Accidents ... \n", "3 Yeni Kelime Bulmaca Oyununuz! Kelime Arama ve ... \n", "4 Word Blocks is a new kind of word search puzzl... \n", "5 Free Fire Commando - Counter Attack FPS 2019 i... \n", "6 The most exciting sky race!Run through the sky... \n", "7 Your school principal went crazy and locked yo... \n", "8 Jump Cube is an addictive game, tap the right ... \n", "9 Một tựa game cũng như cách chơi ko thể quen th... " ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Reading loading/data\n", "data = pd.read_csv(\"https://raw.githubusercontent.com/SandieIJ/Capstone/master/data/sandra_csv_results-20190723-155508.csv\")\n", "\n", "data.head(10)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "descriptions = data.description.values.tolist()" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "# Remove new line characters\n", "no_new_lines = [re.sub('\\s+', ' ', sent) for sent in descriptions] \n", "\n", "#Remove non letter characters\n", "non_letters = [re.sub('[^a-zA-Z]', ' ', no_new_line) for no_new_line in no_new_lines]\n", "\n", "# Remove distracting single quotes\n", "no_quotes = [re.sub(\"\\'\", '', non_letter) for non_letter in non_letters]\n", "\n", "#break down sentences into words\n", "def sent_to_words(sentences): \n", " for sentence in sentences:\n", " yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))\n", "\n", "data_words = list(sent_to_words(descriptions)) " ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "# Build the bigram and trigram models\n", "bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) \n", "trigram = gensim.models.Phrases(bigram[data_words], threshold=100)\n", "\n", "# Faster way to get a sentence clubbed as a trigram/bigram\n", "bigram_mod = gensim.models.phrases.Phraser(bigram) \n", "\n", "trigram_mod = gensim.models.phrases.Phraser(trigram)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "# Initialize spacy\n", "nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])\n", "\n", "def make_bigrams(texts):\n", " return [bigram_mod[doc] for doc in texts]\n", "\n", "def make_trigrams(texts):\n", " return [trigram_mod[bigram_mod[doc]] for doc in texts]\n", "\n", "def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']): \n", " \"\"\"https://spacy.io/api/annotation\"\"\"\n", " texts_out = []\n", " for sent in texts:\n", " doc = nlp(\" \".join(sent))\n", " texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags]) \n", " return texts_out\n", "\n", "# Form Bigrams\n", "data_words_bigrams = make_bigrams(data_words)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[['ready', 'play', 'amazing', 'exciting', 'good', 'shooting', 'game', 'fire', 'shoot', 'game', 'war', 'shooting', 'game', 'free', 'unknown', 'battle', 'strike', 'free', 'survival', 'mission', 'free', 'fire', 'unknown', 'shoot', 'action', 'game', 'face', 'dangerous', 'death', 'mission', 'exciting', 'survival', 'free', 'firing_squad', 'free', 'fire', 'shooting', 'game', 'commando', 'shoot', 'survival', 'game', 'army', 'soldier', 'crazy', 'challenging', 'shooting', 'arena', 'where', 'training', 'face', 'crazy', 'dangerous', 'death', 'mission', 'enemy', 'free', 'fire', 'shoot', 'unknown', 'battleground', 'mission', 'best', 'offline', 'shoot', 'game', 'commando', 'training', 'skill', 'squad', 'survival', 'mission', 'battleground', 'survival', 'free', 'fire', 'game', 'depend', 'war', 'shoot', 'squad', 'free', 'fire', 'battleground', 'war', 'battleground', 'game', 'army', 'last', 'player', 'firing_squad', 'face', 'crazy', 'death', 'mission', 'legend', 'fire', 'fire', 'free', 'fire', 'battleground', 'battleground', 'cross', 'fire', 'surgical_strike', 'fill', 'fierce', 'shooting', 'game', 'training', 'skill', 'fire', 'battleground', 'game', 'world', 'war', 'mission', 'where', 'commando', 'mission', 'good', 'shooting', 'survival', 'unknown', 'battle', 'strike', 'control', 'wait', 'sniper', 'shooting', 'skill', 'start', 'survival', 'battleground', 'strike', 'journey', 'modern', 'weapon', 'free', 'fire', 'survival', 'shoot', 'mission', 'sniper', 'gun', 'other', 'shoot', 'battlefield', 'weapon', 'graphic', 'real', 'firing_squad', 'mind_blowing', 'fire', 'squad', 'survival', 'mission', 'survival', 'strike', 'journey', 'legend', 'battle', 'strike', 'game', 'good', 'shooting', 'game', 'lot', 'gun', 'see', 'game', 'feel', 'good', 'gun', 'game', 'show', 'world', 'war', 'commando', 'training', 'skill', 'modern', 'weapon', 'sniper', 'gun', 'unknown', 'enemy', 'squad', 'commando', 'training', 'skill', 'free', 'fire', 'battleground', 'feature', 'variety', 'weapon', 'available', 'free', 'fire', 'shoot', 'missionsdozen', 'mission', 'war', 'shoot', 'squadreal', 'enemy', 'terrorist', 'ai', 'unknown_battleground', 'environment', 'system', 'detect', 'enemy', 'position', 'surgical', 'strikesimple', 'smooth', 'control', 'download', 'play', 'store', 'good', 'legend', 'free', 'fire', 'totally', 'free']]\n" ] } ], "source": [ "# Perform lemmatization keeping only nouns, adjectives, verbs and adjectives\n", "data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']) \n", "\n", "print(data_lemmatized[:1])" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 2), (5, 1), (6, 3), (7, 1), (8, 9), (9, 1), (10, 1), (11, 5), (12, 2), (13, 3), (14, 1), (15, 2), (16, 3), (17, 1), (18, 1), (19, 1), (20, 4), (21, 1), (22, 2), (23, 3), (24, 1), (25, 1), (26, 1), (27, 1), (28, 16), (29, 3), (30, 14), (31, 15), (32, 5), (33, 1), (34, 4), (35, 2), (36, 1), (37, 3), (38, 1), (39, 1), (40, 11), (41, 1), (42, 2), (43, 1), (44, 1), (45, 2), (46, 1), (47, 1), (48, 1), (49, 1), (50, 1), (51, 10), (52, 8), (53, 1), (54, 5), (55, 1), (56, 3), (57, 1), (58, 4), (59, 1), (60, 1), (61, 1), (62, 5), (63, 1), (64, 1), (65, 1), (66, 10), (67, 1), (68, 1), (69, 1), (70, 5), (71, 5), (72, 1), (73, 1), (74, 1), (75, 6), (76, 4), (77, 2), (78, 2)]]\n" ] } ], "source": [ "texts = data_lemmatized\n", "\n", "# a mapping between words and their corresponding integer values\n", "id2word = corpora.Dictionary(texts) \n", "\n", "# Term Document Frequency and gensim creates a unique id for each word in the document\n", "corpus = [id2word.doc2bow(text) for text in texts]\n", "\n", "# This corpus is a mapping of (word_id, word_frequency)\n", "print(corpus[:1])" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[[('action', 1),\n", " ('ai', 1),\n", " ('amazing', 1),\n", " ('arena', 1),\n", " ('army', 2),\n", " ('available', 1),\n", " ('battle', 3),\n", " ('battlefield', 1),\n", " ('battleground', 9),\n", " ('best', 1),\n", " ('challenging', 1),\n", " ('commando', 5),\n", " ('control', 2),\n", " ('crazy', 3),\n", " ('cross', 1),\n", " ('dangerous', 2),\n", " ('death', 3),\n", " ('depend', 1),\n", " ('detect', 1),\n", " ('download', 1),\n", " ('enemy', 4),\n", " ('environment', 1),\n", " ('exciting', 2),\n", " ('face', 3),\n", " ('feature', 1),\n", " ('feel', 1),\n", " ('fierce', 1),\n", " ('fill', 1),\n", " ('fire', 16),\n", " ('firing_squad', 3),\n", " ('free', 14),\n", " ('game', 15),\n", " ('good', 5),\n", " ('graphic', 1),\n", " ('gun', 4),\n", " ('journey', 2),\n", " ('last', 1),\n", " ('legend', 3),\n", " ('lot', 1),\n", " ('mind_blowing', 1),\n", " ('mission', 11),\n", " ('missionsdozen', 1),\n", " ('modern', 2),\n", " ('offline', 1),\n", " ('other', 1),\n", " ('play', 2),\n", " ('player', 1),\n", " ('position', 1),\n", " ('ready', 1),\n", " ('real', 1),\n", " ('see', 1),\n", " ('shoot', 10),\n", " ('shooting', 8),\n", " ('show', 1),\n", " ('skill', 5),\n", " ('smooth', 1),\n", " ('sniper', 3),\n", " ('soldier', 1),\n", " ('squad', 4),\n", " ('squadreal', 1),\n", " ('start', 1),\n", " ('store', 1),\n", " ('strike', 5),\n", " ('strikesimple', 1),\n", " ('surgical', 1),\n", " ('surgical_strike', 1),\n", " ('survival', 10),\n", " ('system', 1),\n", " ('terrorist', 1),\n", " ('totally', 1),\n", " ('training', 5),\n", " ('unknown', 5),\n", " ('unknown_battleground', 1),\n", " ('variety', 1),\n", " ('wait', 1),\n", " ('war', 6),\n", " ('weapon', 4),\n", " ('where', 2),\n", " ('world', 2)]]" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Human readable format of corpus (term-frequency)\n", "[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.4" } }, "nbformat": 4, "nbformat_minor": 2 }