{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Install spaCy and Gensim, as well as a preferred spaCy model; here I use the English model.**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!pip install -U spacy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!pip install pyLDAvis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!pip install osqp"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!pip install gensim"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.0/en_core_web_sm-2.2.0.tar.gz"
   ]
  },
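  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "*Note (my addition): `python -m spacy download en_core_web_sm` is an equivalent way to fetch the model, and it picks a version compatible with the installed spaCy release; the direct tarball URL above pins the model to 2.2.0, which must match your spaCy version. The cell below is a sketch of the alternative, not part of the original notebook.*"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Alternative model install: let spaCy resolve a compatible model version\n",
    "!python -m spacy download en_core_web_sm\n",
    "\n",
    "# Print the spaCy version so model/library mismatches are easy to spot\n",
    "import spacy\n",
    "print(spacy.__version__)"
   ]
  },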
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**To carry out pre-processing, we will use the following libraries and modules.**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import spacy\n",
    "import re\n",
    "import pandas as pd\n",
    "import gensim\n",
    "from gensim.utils import simple_preprocess\n",
    "import gensim.corpora as corpora\n",
    "from pprint import pprint"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the data\n",
    "data = pd.read_csv(\"https://raw.githubusercontent.com/SandieIJ/Capstone/master/data/sandra_csv_results-20190723-155508.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Preview the data\n",
    "data.head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Isolate the descriptions\n",
    "descriptions = data.description.values.tolist()"
   ]
  },
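  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "*Note (my addition): if the `description` column contains missing values (an assumption; I have not inspected this dataset), they surface in the list as floats (`NaN`) and would break the regex cleanup below, so it is worth filtering them out first.*"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep only actual strings; NaN entries from pandas are floats (defensive guard)\n",
    "descriptions = [d for d in descriptions if isinstance(d, str)]\n",
    "print(len(descriptions))"
   ]
  },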
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Collapse runs of whitespace (including newlines) into single spaces\n",
    "no_new_lines = [re.sub('\\s+', ' ', sent) for sent in descriptions]\n",
    "\n",
    "# Replace non-letter characters with spaces\n",
    "non_letters = [re.sub('[^a-zA-Z]', ' ', no_new_line) for no_new_line in no_new_lines]\n",
    "\n",
    "# Remove distracting single quotes\n",
    "no_quotes = [re.sub(\"\\'\", '', non_letter) for non_letter in non_letters]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Preview the cleaned data\n",
    "pprint(no_quotes[:1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Tokenize each description into a list of lowercase words;\n",
    "# deacc=True also strips accents and punctuation\n",
    "def sent_to_words(sentences):\n",
    "    for sentence in sentences:\n",
    "        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Tokenize the cleaned descriptions\n",
    "data_words = list(sent_to_words(no_quotes))\n",
    "\n",
    "print(data_words[:1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build the bigram and trigram models\n",
    "bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100)\n",
    "trigram = gensim.models.Phrases(bigram[data_words], threshold=100)\n",
    "\n",
    "# Phraser wraps a trained Phrases model for faster application to new sentences\n",
    "bigram_mod = gensim.models.phrases.Phraser(bigram)\n",
    "trigram_mod = gensim.models.phrases.Phraser(trigram)"
   ]
  },
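  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "*Note (my addition): a quick sanity check of the phraser. Applying `bigram_mod` to a tokenized description should merge frequently co-occurring pairs into single `word_word` tokens; whether any merges appear depends on the corpus and on `min_count`/`threshold`, so the output is only illustrative.*"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Compare a raw tokenized description with its bigram-merged version\n",
    "print(data_words[0])\n",
    "print(bigram_mod[data_words[0]])"
   ]
  },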
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Initialize spaCy, disabling the parser and NER since we only need POS tags and lemmas\n",
    "nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def make_bigrams(texts):\n",
    "    return [bigram_mod[doc] for doc in texts]\n",
    "\n",
    "def make_trigrams(texts):\n",
    "    return [trigram_mod[bigram_mod[doc]] for doc in texts]\n",
    "\n",
    "def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):\n",
    "    \"\"\"Lemmatize, keeping only the given POS tags (https://spacy.io/api/annotation)\"\"\"\n",
    "    texts_out = []\n",
    "    for sent in texts:\n",
    "        doc = nlp(\" \".join(sent))\n",
    "        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])\n",
    "    return texts_out"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Form bigrams\n",
    "data_words_bigrams = make_bigrams(data_words)\n",
    "\n",
    "# Perform lemmatization, keeping only nouns, adjectives, verbs, and adverbs\n",
    "data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])\n",
    "\n",
    "print(data_lemmatized[:1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "texts = data_lemmatized\n",
    "\n",
    "# Create a mapping between words and their corresponding integer ids\n",
    "id2word = corpora.Dictionary(texts)\n",
    "\n",
    "# Convert each document into a bag-of-words (term-document frequency) representation\n",
    "corpus = [id2word.doc2bow(text) for text in texts]\n",
    "\n",
    "# Each document in the corpus is a list of (word_id, word_frequency) tuples\n",
    "print(corpus[:1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Human-readable format of the corpus: (term, frequency)\n",
    "[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]"
   ]
  },
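  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "*Note (my addition): the notebook installs pyLDAvis but stops after building the corpus, so below is a minimal sketch of the usual next step: training a gensim `LdaModel` and visualizing it. `num_topics=10` and the other hyperparameters are illustrative assumptions, not values from the original; on pyLDAvis 3.x the gensim helper lives at `pyLDAvis.gensim_models` rather than `pyLDAvis.gensim`.*"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Train an LDA topic model on the bag-of-words corpus\n",
    "# (hyperparameters below are illustrative assumptions)\n",
    "lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,\n",
    "                                            id2word=id2word,\n",
    "                                            num_topics=10,\n",
    "                                            random_state=100,\n",
    "                                            passes=10,\n",
    "                                            alpha='auto')\n",
    "\n",
    "# Inspect the discovered topics as weighted keyword lists\n",
    "pprint(lda_model.print_topics())\n",
    "\n",
    "# Interactive topic visualization (use pyLDAvis.gensim_models on pyLDAvis >= 3)\n",
    "import pyLDAvis\n",
    "import pyLDAvis.gensim\n",
    "pyLDAvis.enable_notebook()\n",
    "pyLDAvis.gensim.prepare(lda_model, corpus, id2word)"
   ]
  }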
 ],
 "metadata": {
| "kernelspec": { | |
| "display_name": "Python 3", | |
| "language": "python", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.7.4" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 2 | |
| } |