Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Select an option

  • Save SandieIJ/69fc80c372e823fecfd4eeeda2156936 to your computer and use it in GitHub Desktop.

Select an option

Save SandieIJ/69fc80c372e823fecfd4eeeda2156936 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Install SpaCy and Gensim as well as a preferred SpaCy model, here I use the English model**"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Pin spaCy to 2.2.0 to match the en_core_web_sm-2.2.0 model installed below;\n",
"# `%pip` (not bare `pip`) guarantees the install targets this kernel's environment.\n",
"%pip install spacy==2.2.0"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# `%pip` installs into the active kernel's environment (bare `pip` may target another interpreter)\n",
"%pip install pyLDAvis"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# `%pip` installs into the active kernel's environment (bare `pip` may target another interpreter)\n",
"%pip install osqp"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# `%pip` installs into the active kernel's environment (bare `pip` may target another interpreter)\n",
"%pip install gensim"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Install the English model matching the pinned spaCy 2.2.x above\n",
"%pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.0/en_core_web_sm-2.2.0.tar.gz"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**In order to carry out pre-processing we will use the following libraries and modules**"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import spacy\n",
"import re\n",
"import pandas as pd\n",
"import gensim\n",
"from gensim.utils import simple_preprocess \n",
"import gensim.corpora as corpora\n",
"from pprint import pprint"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
" # Loading the data\n",
"data = pd.read_csv(\"https://raw.githubusercontent.com/SandieIJ/Capstone/master/data/sandra_csv_results-20190723-155508.csv\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Preview the data\n",
"data.head(10)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Isolate the descriptions\n",
"descriptions = data.description.values.tolist()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Collapse whitespace runs (incl. newlines) into single spaces.\n",
"# Raw strings (r'...') avoid invalid-escape DeprecationWarnings in regex patterns.\n",
"no_new_lines = [re.sub(r'\\s+', ' ', sent) for sent in descriptions]\n",
"\n",
"# Replace every non-letter character with a space\n",
"non_letters = [re.sub('[^a-zA-Z]', ' ', no_new_line) for no_new_line in no_new_lines]\n",
"\n",
"# Remove distracting single quotes (already stripped by the step above; kept as a safety net)\n",
"no_quotes = [re.sub(\"'\", '', non_letter) for non_letter in non_letters]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Preview the data\n",
"\n",
"pprint(descriptions[:1])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Break down sentences into words\n",
"def sent_to_words(sentences): \n",
" for sentence in sentences:\n",
" yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Tokenize the CLEANED text (no_quotes) -- feeding raw `descriptions` here would\n",
"# silently discard all the regex cleaning performed above.\n",
"data_words = list(sent_to_words(no_quotes))\n",
"\n",
"print(data_words[:1])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Build the bigram and trigram models\n",
"\n",
"bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) \n",
"trigram = gensim.models.Phrases(bigram[data_words], threshold=100)\n",
"\n",
"# Faster way to get a sentence clubbed as a trigram/bigram\n",
"bigram_mod = gensim.models.phrases.Phraser(bigram) \n",
"trigram_mod = gensim.models.phrases.Phraser(trigram)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Initialize spacy\n",
"\n",
"nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def make_bigrams(texts):\n",
"    \"\"\"Merge frequent word pairs in each tokenized document into bigram tokens.\"\"\"\n",
"    return [bigram_mod[doc] for doc in texts]\n",
"\n",
"def make_trigrams(texts):\n",
"    \"\"\"Merge frequent pairs/triples in each tokenized document into bigram/trigram tokens.\"\"\"\n",
"    return [trigram_mod[bigram_mod[doc]] for doc in texts]\n",
"\n",
"def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):\n",
"    \"\"\"Lemmatize each tokenized document, keeping only tokens whose POS tag\n",
"    is in `allowed_postags`. See https://spacy.io/api/annotation for the tag set.\n",
"    \"\"\"\n",
"    texts_out = []\n",
"    for sent in texts:\n",
"        doc = nlp(\" \".join(sent))\n",
"        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])\n",
"    return texts_out"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Form Bigrams\n",
"data_words_bigrams = make_bigrams(data_words)\n",
"\n",
"# Perform lemmatization keeping only nouns, adjectives, verbs and adjectives\n",
"data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']) \n",
"\n",
"print(data_lemmatized[:1])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"texts = data_lemmatized\n",
"\n",
"# Creates a mapping between words and their corresponding integer values\n",
"id2word = corpora.Dictionary(texts) \n",
"\n",
"# Term Document Frequency and gensim creates a unique id for each word in the document\n",
"corpus = [id2word.doc2bow(text) for text in texts]\n",
"\n",
"# This corpus is a mapping of (word_id, word_frequency)\n",
"print(corpus[:1])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Human readable format of corpus (term-frequency)\n",
"[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment