Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Select an option

  • Save SandieIJ/69fc80c372e823fecfd4eeeda2156936 to your computer and use it in GitHub Desktop.

Select an option

Save SandieIJ/69fc80c372e823fecfd4eeeda2156936 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Environment setup: use %pip (not !pip) so packages install into the running kernel's environment\n",
"%pip install spacy\n",
"%pip install gensim\n",
"%pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.0/en_core_web_sm-2.2.0.tar.gz"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"import spacy\n",
"import re\n",
"import pandas as pd\n",
"import gensim\n",
"from gensim.utils import simple_preprocess \n",
"import gensim.corpora as corpora\n",
"from pprint import pprint"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>name</th>\n",
" <th>description</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <td>0</td>\n",
" <td>Legend Fire Squad survival: Free Fire Battlegr...</td>\n",
" <td>Ready to play an amazing and exciting best sho...</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>Ambulance Game</td>\n",
" <td>You must be a fan of the driving games. We ass...</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>Beam Drive NG Death Stair Car Crash Simulator</td>\n",
" <td>Beam Drive NG Death Stair Car Crash Accidents ...</td>\n",
" </tr>\n",
" <tr>\n",
" <td>3</td>\n",
" <td>Kelime İncileri</td>\n",
" <td>Yeni Kelime Bulmaca Oyununuz! Kelime Arama ve ...</td>\n",
" </tr>\n",
" <tr>\n",
" <td>4</td>\n",
" <td>Word Blocks</td>\n",
" <td>Word Blocks is a new kind of word search puzzl...</td>\n",
" </tr>\n",
" <tr>\n",
" <td>5</td>\n",
" <td>Free Fire Commando - Counter Attack FPS 2019</td>\n",
" <td>Free Fire Commando - Counter Attack FPS 2019 i...</td>\n",
" </tr>\n",
" <tr>\n",
" <td>6</td>\n",
" <td>Fall Race 3D</td>\n",
" <td>The most exciting sky race!Run through the sky...</td>\n",
" </tr>\n",
" <tr>\n",
" <td>7</td>\n",
" <td>Math School Game Basic: Crazy Principal</td>\n",
" <td>Your school principal went crazy and locked yo...</td>\n",
" </tr>\n",
" <tr>\n",
" <td>8</td>\n",
" <td>Jump Cube</td>\n",
" <td>Jump Cube is an addictive game, tap the right ...</td>\n",
" </tr>\n",
" <tr>\n",
" <td>9</td>\n",
" <td>Tien Len Offline</td>\n",
" <td>Một tựa game cũng như cách chơi ko thể quen th...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" name \\\n",
"0 Legend Fire Squad survival: Free Fire Battlegr... \n",
"1 Ambulance Game \n",
"2 Beam Drive NG Death Stair Car Crash Simulator \n",
"3 Kelime İncileri \n",
"4 Word Blocks \n",
"5 Free Fire Commando - Counter Attack FPS 2019 \n",
"6 Fall Race 3D \n",
"7 Math School Game Basic: Crazy Principal \n",
"8 Jump Cube \n",
"9 Tien Len Offline \n",
"\n",
" description \n",
"0 Ready to play an amazing and exciting best sho... \n",
"1 You must be a fan of the driving games. We ass... \n",
"2 Beam Drive NG Death Stair Car Crash Accidents ... \n",
"3 Yeni Kelime Bulmaca Oyununuz! Kelime Arama ve ... \n",
"4 Word Blocks is a new kind of word search puzzl... \n",
"5 Free Fire Commando - Counter Attack FPS 2019 i... \n",
"6 The most exciting sky race!Run through the sky... \n",
"7 Your school principal went crazy and locked yo... \n",
"8 Jump Cube is an addictive game, tap the right ... \n",
"9 Một tựa game cũng như cách chơi ko thể quen th... "
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Load the dataset (app name + description columns) from the Capstone repo on GitHub\n",
"data = pd.read_csv(\"https://raw.githubusercontent.com/SandieIJ/Capstone/master/data/sandra_csv_results-20190723-155508.csv\")\n",
"\n",
"data.head(10)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"# Extract the raw description strings as a plain Python list\n",
"descriptions = data[\"description\"].tolist()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"# Collapse whitespace runs (including newlines) to single spaces\n",
"no_new_lines = [re.sub('\\s+', ' ', sent) for sent in descriptions]\n",
"\n",
"# Replace non-letter characters with spaces\n",
"non_letters = [re.sub('[^a-zA-Z]', ' ', no_new_line) for no_new_line in no_new_lines]\n",
"\n",
"# Remove distracting single quotes\n",
"no_quotes = [re.sub(\"\\'\", '', non_letter) for non_letter in non_letters]\n",
"\n",
"def sent_to_words(sentences):\n",
"    \"\"\"Tokenize each text into a list of lowercase words; deacc=True strips accents.\"\"\"\n",
"    for sentence in sentences:\n",
"        yield gensim.utils.simple_preprocess(str(sentence), deacc=True)\n",
"\n",
"# Bug fix: tokenize the cleaned text (no_quotes), not the raw descriptions --\n",
"# previously the three cleaning steps above were computed and then discarded\n",
"data_words = list(sent_to_words(no_quotes))"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"# Train phrase-detection models: bigrams first, then trigrams on top of the bigrammed corpus.\n",
"# min_count / threshold control how aggressively adjacent tokens are merged into phrases.\n",
"bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100)\n",
"trigram = gensim.models.Phrases(bigram[data_words], threshold=100)\n",
"\n",
"# Freeze each Phrases model into a lightweight Phraser for faster phrase lookup\n",
"bigram_mod = gensim.models.phrases.Phraser(bigram)\n",
"trigram_mod = gensim.models.phrases.Phraser(trigram)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"# Load spaCy's small English model; parser and NER are disabled because\n",
"# only POS tags and lemmas are needed here\n",
"nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])\n",
"\n",
"def make_bigrams(texts):\n",
"    \"\"\"Merge detected bigram phrases in each tokenized document.\"\"\"\n",
"    return [bigram_mod[doc] for doc in texts]\n",
"\n",
"def make_trigrams(texts):\n",
"    \"\"\"Merge bigram then trigram phrases in each tokenized document.\"\"\"\n",
"    return [trigram_mod[bigram_mod[doc]] for doc in texts]\n",
"\n",
"def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):\n",
"    \"\"\"Lemmatize each token list, keeping only tokens whose POS tag is in\n",
"    allowed_postags. Tag set: https://spacy.io/api/annotation\"\"\"\n",
"    texts_out = []\n",
"    for sent in texts:\n",
"        doc = nlp(\" \".join(sent))\n",
"        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])\n",
"    return texts_out\n",
"\n",
"# Form bigrams from the tokenized descriptions\n",
"data_words_bigrams = make_bigrams(data_words)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[['ready', 'play', 'amazing', 'exciting', 'good', 'shooting', 'game', 'fire', 'shoot', 'game', 'war', 'shooting', 'game', 'free', 'unknown', 'battle', 'strike', 'free', 'survival', 'mission', 'free', 'fire', 'unknown', 'shoot', 'action', 'game', 'face', 'dangerous', 'death', 'mission', 'exciting', 'survival', 'free', 'firing_squad', 'free', 'fire', 'shooting', 'game', 'commando', 'shoot', 'survival', 'game', 'army', 'soldier', 'crazy', 'challenging', 'shooting', 'arena', 'where', 'training', 'face', 'crazy', 'dangerous', 'death', 'mission', 'enemy', 'free', 'fire', 'shoot', 'unknown', 'battleground', 'mission', 'best', 'offline', 'shoot', 'game', 'commando', 'training', 'skill', 'squad', 'survival', 'mission', 'battleground', 'survival', 'free', 'fire', 'game', 'depend', 'war', 'shoot', 'squad', 'free', 'fire', 'battleground', 'war', 'battleground', 'game', 'army', 'last', 'player', 'firing_squad', 'face', 'crazy', 'death', 'mission', 'legend', 'fire', 'fire', 'free', 'fire', 'battleground', 'battleground', 'cross', 'fire', 'surgical_strike', 'fill', 'fierce', 'shooting', 'game', 'training', 'skill', 'fire', 'battleground', 'game', 'world', 'war', 'mission', 'where', 'commando', 'mission', 'good', 'shooting', 'survival', 'unknown', 'battle', 'strike', 'control', 'wait', 'sniper', 'shooting', 'skill', 'start', 'survival', 'battleground', 'strike', 'journey', 'modern', 'weapon', 'free', 'fire', 'survival', 'shoot', 'mission', 'sniper', 'gun', 'other', 'shoot', 'battlefield', 'weapon', 'graphic', 'real', 'firing_squad', 'mind_blowing', 'fire', 'squad', 'survival', 'mission', 'survival', 'strike', 'journey', 'legend', 'battle', 'strike', 'game', 'good', 'shooting', 'game', 'lot', 'gun', 'see', 'game', 'feel', 'good', 'gun', 'game', 'show', 'world', 'war', 'commando', 'training', 'skill', 'modern', 'weapon', 'sniper', 'gun', 'unknown', 'enemy', 'squad', 'commando', 'training', 'skill', 'free', 'fire', 'battleground', 'feature', 'variety', 'weapon', 'available', 'free', 
'fire', 'shoot', 'missionsdozen', 'mission', 'war', 'shoot', 'squadreal', 'enemy', 'terrorist', 'ai', 'unknown_battleground', 'environment', 'system', 'detect', 'enemy', 'position', 'surgical', 'strikesimple', 'smooth', 'control', 'download', 'play', 'store', 'good', 'legend', 'free', 'fire', 'totally', 'free']]\n"
]
}
],
"source": [
"# Perform lemmatization keeping only nouns, adjectives, verbs and adverbs\n",
"data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']) \n",
"\n",
"print(data_lemmatized[:1])"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 2), (5, 1), (6, 3), (7, 1), (8, 9), (9, 1), (10, 1), (11, 5), (12, 2), (13, 3), (14, 1), (15, 2), (16, 3), (17, 1), (18, 1), (19, 1), (20, 4), (21, 1), (22, 2), (23, 3), (24, 1), (25, 1), (26, 1), (27, 1), (28, 16), (29, 3), (30, 14), (31, 15), (32, 5), (33, 1), (34, 4), (35, 2), (36, 1), (37, 3), (38, 1), (39, 1), (40, 11), (41, 1), (42, 2), (43, 1), (44, 1), (45, 2), (46, 1), (47, 1), (48, 1), (49, 1), (50, 1), (51, 10), (52, 8), (53, 1), (54, 5), (55, 1), (56, 3), (57, 1), (58, 4), (59, 1), (60, 1), (61, 1), (62, 5), (63, 1), (64, 1), (65, 1), (66, 10), (67, 1), (68, 1), (69, 1), (70, 5), (71, 5), (72, 1), (73, 1), (74, 1), (75, 6), (76, 4), (77, 2), (78, 2)]]\n"
]
}
],
"source": [
"texts = data_lemmatized\n",
"\n",
"# Build the vocabulary: a Dictionary mapping each word to a unique integer id\n",
"id2word = corpora.Dictionary(texts) \n",
"\n",
"# Convert each document to bag-of-words form (term-document frequency)\n",
"corpus = [id2word.doc2bow(text) for text in texts]\n",
"\n",
"# Each document is now a list of (word_id, word_frequency) pairs\n",
"print(corpus[:1])"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[[('action', 1),\n",
" ('ai', 1),\n",
" ('amazing', 1),\n",
" ('arena', 1),\n",
" ('army', 2),\n",
" ('available', 1),\n",
" ('battle', 3),\n",
" ('battlefield', 1),\n",
" ('battleground', 9),\n",
" ('best', 1),\n",
" ('challenging', 1),\n",
" ('commando', 5),\n",
" ('control', 2),\n",
" ('crazy', 3),\n",
" ('cross', 1),\n",
" ('dangerous', 2),\n",
" ('death', 3),\n",
" ('depend', 1),\n",
" ('detect', 1),\n",
" ('download', 1),\n",
" ('enemy', 4),\n",
" ('environment', 1),\n",
" ('exciting', 2),\n",
" ('face', 3),\n",
" ('feature', 1),\n",
" ('feel', 1),\n",
" ('fierce', 1),\n",
" ('fill', 1),\n",
" ('fire', 16),\n",
" ('firing_squad', 3),\n",
" ('free', 14),\n",
" ('game', 15),\n",
" ('good', 5),\n",
" ('graphic', 1),\n",
" ('gun', 4),\n",
" ('journey', 2),\n",
" ('last', 1),\n",
" ('legend', 3),\n",
" ('lot', 1),\n",
" ('mind_blowing', 1),\n",
" ('mission', 11),\n",
" ('missionsdozen', 1),\n",
" ('modern', 2),\n",
" ('offline', 1),\n",
" ('other', 1),\n",
" ('play', 2),\n",
" ('player', 1),\n",
" ('position', 1),\n",
" ('ready', 1),\n",
" ('real', 1),\n",
" ('see', 1),\n",
" ('shoot', 10),\n",
" ('shooting', 8),\n",
" ('show', 1),\n",
" ('skill', 5),\n",
" ('smooth', 1),\n",
" ('sniper', 3),\n",
" ('soldier', 1),\n",
" ('squad', 4),\n",
" ('squadreal', 1),\n",
" ('start', 1),\n",
" ('store', 1),\n",
" ('strike', 5),\n",
" ('strikesimple', 1),\n",
" ('surgical', 1),\n",
" ('surgical_strike', 1),\n",
" ('survival', 10),\n",
" ('system', 1),\n",
" ('terrorist', 1),\n",
" ('totally', 1),\n",
" ('training', 5),\n",
" ('unknown', 5),\n",
" ('unknown_battleground', 1),\n",
" ('variety', 1),\n",
" ('wait', 1),\n",
" ('war', 6),\n",
" ('weapon', 4),\n",
" ('where', 2),\n",
" ('world', 2)]]"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Human readable format of corpus: map each (word_id, frequency) pair back to its word\n",
"[[(id2word[word_id], freq) for word_id, freq in cp] for cp in corpus[:1]]"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment