Created
October 9, 2017 03:02
-
-
Save GeorgeMcIntire/d003ece0b6ca171be8e095e0bc90af7e to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "code", | |
| "execution_count": 2, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "Script took 38.52 seconds to complete\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "#Imports\n", | |
| "import time\n", | |
| "#Record the start time before the heavy imports so the reported duration covers the whole script.\n", | |
| "start = time.time()\n", | |
| "import pandas as pd\n", | |
| "import numpy as np\n", | |
| "#ArticleException is imported alongside Article because the except clause below catches it;\n", | |
| "#without this import a failed parse would raise NameError instead of being handled.\n", | |
| "from newspaper import Article, ArticleException\n", | |
| "#Load the pickle file of links and keep only the first 50 links.\n", | |
| "#NOTE: unpickling can execute arbitrary code, so only load pickle files from a trusted source.\n", | |
| "links = pd.read_pickle(\"nytimes_links.pkl\")[:50]\n", | |
| "\n", | |
| "#Keys that receive null values when an article cannot be parsed\n", | |
| "parsed_fields = [\"text\", \"title\", \"date\", \"keywords\", \"summary\"]\n", | |
| "#Initialize the articles_info list\n", | |
| "articles_info = []\n", | |
| "for i in links:\n", | |
| "    #Initialize dictionary\n", | |
| "    article_dict = {}\n", | |
| "    #Insert link \"i\" into the dictionary\n", | |
| "    article_dict[\"link\"] = i\n", | |
| "    #Build an Article object from the link\n", | |
| "    art = Article(i)\n", | |
| "    #Download contents of art object\n", | |
| "    art.download()\n", | |
| "    \n", | |
| "    #Try/except is included because not all articles can be parsed\n", | |
| "    try:\n", | |
| "        #If the article is parsed successfully, insert its text, title, publish_date,\n", | |
| "        #keywords and summary into the corresponding keys\n", | |
| "        art.parse()\n", | |
| "        article_dict[\"text\"] = art.text\n", | |
| "        article_dict[\"title\"] = art.title\n", | |
| "        article_dict[\"date\"] = art.publish_date\n", | |
| "        #keywords and summary are read only after nlp() has been called\n", | |
| "        art.nlp()\n", | |
| "        article_dict[\"keywords\"] = art.keywords\n", | |
| "        article_dict[\"summary\"] = art.summary\n", | |
| "    except ArticleException:\n", | |
| "        #If the article cannot be parsed, insert null values for every parsed field\n", | |
| "        #(this also overwrites any fields that were filled in before the failure)\n", | |
| "        for field in parsed_fields:\n", | |
| "            article_dict[field] = np.nan\n", | |
| "    \n", | |
| "    #Insert dictionary of article info into the articles_info list\n", | |
| "    articles_info.append(article_dict)\n", | |
| "#Pass the list of dictionaries into a pandas data frame\n", | |
| "corpus = pd.DataFrame(articles_info)\n", | |
| "#Print how long the process took\n", | |
| "print(\"Script took {:.2f} seconds to complete\".format(time.time() - start))" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 3, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style>\n", | |
| " .dataframe thead tr:only-child th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: left;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>date</th>\n", | |
| " <th>keywords</th>\n", | |
| " <th>link</th>\n", | |
| " <th>summary</th>\n", | |
| " <th>text</th>\n", | |
| " <th>title</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>2017-09-08</td>\n", | |
| " <td>[intelligence, researchers, sexism, female, la...</td>\n", | |
| " <td>https://www.nytimes.com/2017/09/08/upshot/sexi...</td>\n", | |
| " <td>Across all the categories analyzed and all the...</td>\n", | |
| " <td>First, here’s what they reported finding last ...</td>\n", | |
| " <td>Sexism and Shopping: Female Players Get Most o...</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>2017-09-08</td>\n", | |
| " <td>[latest, versus, intelligence, tegmark, patter...</td>\n", | |
| " <td>https://www.nytimes.com/2017/09/08/books/revie...</td>\n", | |
| " <td>PhotoGODZILLA VERSUS MOTHRA: James Patterson’s...</td>\n", | |
| " <td>Photo\\n\\nGODZILLA VERSUS MOTHRA: James Patters...</td>\n", | |
| " <td>James Patterson’s Latest Villain Looks a Lot L...</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>2017-09-01</td>\n", | |
| " <td>[intelligence, human, harm, systems, regulate,...</td>\n", | |
| " <td>https://www.nytimes.com/2017/09/01/opinion/art...</td>\n", | |
| " <td>PhotoThe technology entrepreneur Elon Musk rec...</td>\n", | |
| " <td>Photo\\n\\nThe technology entrepreneur Elon Musk...</td>\n", | |
| " <td>How to Regulate Artificial Intelligence</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td>2017-09-14</td>\n", | |
| " <td>[intelligence, teaching, human, experiments, e...</td>\n", | |
| " <td>https://www.nytimes.com/2017/09/14/opinion/art...</td>\n", | |
| " <td>PhotoTo the Editor:Re “How to Regulate Artific...</td>\n", | |
| " <td>Photo\\n\\nTo the Editor:\\n\\nRe “How to Regulate...</td>\n", | |
| " <td>Ethics and Artificial Intelligence</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4</th>\n", | |
| " <td>2017-08-30</td>\n", | |
| " <td>[assistants, mr, working, way, cortana, bezos,...</td>\n", | |
| " <td>https://www.nytimes.com/2017/08/30/technology/...</td>\n", | |
| " <td>But Mr. Bezos and Mr. Nadella are concerned th...</td>\n", | |
| " <td>But Mr. Bezos and Mr. Nadella are concerned th...</td>\n", | |
| " <td>‘Cortana, Open Alexa,’ Amazon Says. And Micros...</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " date keywords \\\n", | |
| "0 2017-09-08 [intelligence, researchers, sexism, female, la... \n", | |
| "1 2017-09-08 [latest, versus, intelligence, tegmark, patter... \n", | |
| "2 2017-09-01 [intelligence, human, harm, systems, regulate,... \n", | |
| "3 2017-09-14 [intelligence, teaching, human, experiments, e... \n", | |
| "4 2017-08-30 [assistants, mr, working, way, cortana, bezos,... \n", | |
| "\n", | |
| " link \\\n", | |
| "0 https://www.nytimes.com/2017/09/08/upshot/sexi... \n", | |
| "1 https://www.nytimes.com/2017/09/08/books/revie... \n", | |
| "2 https://www.nytimes.com/2017/09/01/opinion/art... \n", | |
| "3 https://www.nytimes.com/2017/09/14/opinion/art... \n", | |
| "4 https://www.nytimes.com/2017/08/30/technology/... \n", | |
| "\n", | |
| " summary \\\n", | |
| "0 Across all the categories analyzed and all the... \n", | |
| "1 PhotoGODZILLA VERSUS MOTHRA: James Patterson’s... \n", | |
| "2 PhotoThe technology entrepreneur Elon Musk rec... \n", | |
| "3 PhotoTo the Editor:Re “How to Regulate Artific... \n", | |
| "4 But Mr. Bezos and Mr. Nadella are concerned th... \n", | |
| "\n", | |
| " text \\\n", | |
| "0 First, here’s what they reported finding last ... \n", | |
| "1 Photo\\n\\nGODZILLA VERSUS MOTHRA: James Patters... \n", | |
| "2 Photo\\n\\nThe technology entrepreneur Elon Musk... \n", | |
| "3 Photo\\n\\nTo the Editor:\\n\\nRe “How to Regulate... \n", | |
| "4 But Mr. Bezos and Mr. Nadella are concerned th... \n", | |
| "\n", | |
| " title \n", | |
| "0 Sexism and Shopping: Female Players Get Most o... \n", | |
| "1 James Patterson’s Latest Villain Looks a Lot L... \n", | |
| "2 How to Regulate Artificial Intelligence \n", | |
| "3 Ethics and Artificial Intelligence \n", | |
| "4 ‘Cortana, Open Alexa,’ Amazon Says. And Micros... " | |
| ] | |
| }, | |
| "execution_count": 3, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "#Preview the first five rows of the corpus data frame\n", | |
| "corpus.head(n=5)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Python 3", | |
| "language": "python", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.6.2" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 2 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Unable to locate nytimes_links.pkl. Please let us know where to download it from.