{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Install spaCy and Gensim, as well as a preferred spaCy model; here I use the English model.**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!pip install -U spacy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!pip install pyLDAvis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!pip install osqp"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!pip install gensim"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.0/en_core_web_sm-2.2.0.tar.gz"
   ]
  },
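  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "*Note (my addition): `python -m spacy download en_core_web_sm` is an equivalent way to fetch the model, and it picks a version compatible with the installed spaCy release; the direct tarball URL above pins the model to 2.2.0, which must match your spaCy version. The cell below is a sketch of the alternative, not part of the original notebook.*"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Alternative model install: let spaCy resolve a compatible model version\n",
    "!python -m spacy download en_core_web_sm\n",
    "\n",
    "# Print the spaCy version so model/library mismatches are easy to spot\n",
    "import spacy\n",
    "print(spacy.__version__)"
   ]
  },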
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**To carry out pre-processing, we will use the following libraries and modules.**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import spacy\n",
    "import re\n",
    "import pandas as pd\n",
    "import gensim\n",
    "from gensim.utils import simple_preprocess\n",
    "import gensim.corpora as corpora\n",
    "from pprint import pprint"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the data\n",
    "data = pd.read_csv(\"https://raw.githubusercontent.com/SandieIJ/Capstone/master/data/sandra_csv_results-20190723-155508.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Preview the data\n",
    "data.head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Isolate the descriptions\n",
    "descriptions = data.description.values.tolist()"
   ]
  },
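  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "*Note (my addition): if the `description` column contains missing values (an assumption; I have not inspected this dataset), they surface in the list as floats (`NaN`) and would break the regex cleanup below, so it is worth filtering them out first.*"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep only actual strings; NaN entries from pandas are floats (defensive guard)\n",
    "descriptions = [d for d in descriptions if isinstance(d, str)]\n",
    "print(len(descriptions))"
   ]
  },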
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Collapse runs of whitespace (including newlines) into single spaces\n",
    "no_new_lines = [re.sub('\\s+', ' ', sent) for sent in descriptions]\n",
    "\n",
    "# Replace non-letter characters with spaces\n",
    "non_letters = [re.sub('[^a-zA-Z]', ' ', no_new_line) for no_new_line in no_new_lines]\n",
    "\n",
    "# Remove distracting single quotes\n",
    "no_quotes = [re.sub(\"\\'\", '', non_letter) for non_letter in non_letters]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Preview the cleaned data\n",
    "pprint(no_quotes[:1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Tokenize each description into a list of lowercase words;\n",
    "# deacc=True also strips accents and punctuation\n",
    "def sent_to_words(sentences):\n",
    "    for sentence in sentences:\n",
    "        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Tokenize the cleaned descriptions\n",
    "data_words = list(sent_to_words(no_quotes))\n",
    "\n",
    "print(data_words[:1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build the bigram and trigram models\n",
    "bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100)\n",
    "trigram = gensim.models.Phrases(bigram[data_words], threshold=100)\n",
    "\n",
    "# Phraser wraps a trained Phrases model for faster application to new sentences\n",
    "bigram_mod = gensim.models.phrases.Phraser(bigram)\n",
    "trigram_mod = gensim.models.phrases.Phraser(trigram)"
   ]
  },
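  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "*Note (my addition): a quick sanity check of the phraser. Applying `bigram_mod` to a tokenized description should merge frequently co-occurring pairs into single `word_word` tokens; whether any merges appear depends on the corpus and on `min_count`/`threshold`, so the output is only illustrative.*"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Compare a raw tokenized description with its bigram-merged version\n",
    "print(data_words[0])\n",
    "print(bigram_mod[data_words[0]])"
   ]
  },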
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Initialize spaCy, disabling the parser and NER since we only need POS tags and lemmas\n",
    "nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def make_bigrams(texts):\n",
    "    return [bigram_mod[doc] for doc in texts]\n",
    "\n",
    "def make_trigrams(texts):\n",
    "    return [trigram_mod[bigram_mod[doc]] for doc in texts]\n",
    "\n",
    "def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):\n",
    "    \"\"\"Lemmatize, keeping only the given POS tags (https://spacy.io/api/annotation)\"\"\"\n",
    "    texts_out = []\n",
    "    for sent in texts:\n",
    "        doc = nlp(\" \".join(sent))\n",
    "        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])\n",
    "    return texts_out"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Form bigrams\n",
    "data_words_bigrams = make_bigrams(data_words)\n",
    "\n",
    "# Perform lemmatization, keeping only nouns, adjectives, verbs, and adverbs\n",
    "data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])\n",
    "\n",
    "print(data_lemmatized[:1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "texts = data_lemmatized\n",
    "\n",
    "# Create a mapping between words and their corresponding integer ids\n",
    "id2word = corpora.Dictionary(texts)\n",
    "\n",
    "# Convert each document into a bag-of-words (term-document frequency) representation\n",
    "corpus = [id2word.doc2bow(text) for text in texts]\n",
    "\n",
    "# Each document in the corpus is a list of (word_id, word_frequency) tuples\n",
    "print(corpus[:1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Human-readable format of the corpus: (term, frequency)\n",
    "[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]"
   ]
  },
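  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "*Note (my addition): the notebook installs pyLDAvis but stops after building the corpus, so below is a minimal sketch of the usual next step: training a gensim `LdaModel` and visualizing it. `num_topics=10` and the other hyperparameters are illustrative assumptions, not values from the original; on pyLDAvis 3.x the gensim helper lives at `pyLDAvis.gensim_models` rather than `pyLDAvis.gensim`.*"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Train an LDA topic model on the bag-of-words corpus\n",
    "# (hyperparameters below are illustrative assumptions)\n",
    "lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,\n",
    "                                            id2word=id2word,\n",
    "                                            num_topics=10,\n",
    "                                            random_state=100,\n",
    "                                            passes=10,\n",
    "                                            alpha='auto')\n",
    "\n",
    "# Inspect the discovered topics as weighted keyword lists\n",
    "pprint(lda_model.print_topics())\n",
    "\n",
    "# Interactive topic visualization (use pyLDAvis.gensim_models on pyLDAvis >= 3)\n",
    "import pyLDAvis\n",
    "import pyLDAvis.gensim\n",
    "pyLDAvis.enable_notebook()\n",
    "pyLDAvis.gensim.prepare(lda_model, corpus, id2word)"
   ]
  }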
 ],
 "metadata": {
| "kernelspec": { | |
| "display_name": "Python 3", | |
| "language": "python", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.7.4" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 2 | |
| } |