GeorgeMcIntire · October 9, 2017 03:02 · amaurya72 · Oct 14, 2017
diff --git a/nyt_to_pandas.ipynb b/nyt_to_pandas.ipynb
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Script took 38.52 seconds to complete\n"
     ]
    }
   ],
   "source": [
    "#Imports\n",
    "import time\n",
    "#Start the timer right after importing time, so the reported duration also\n",
    "#covers the remaining imports below.\n",
    "start = time.time()\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "from newspaper import Article\n",
    "#ArticleException is named in the except clause below. Without this import any\n",
    "#error raised inside the try block surfaces as a NameError instead of being handled.\n",
    "from newspaper.article import ArticleException\n",
    "#Load in pickle file with links and slice the first 50 links\n",
    "#NOTE(review): unpickling can run arbitrary code, so only load a pickle file you trust.\n",
    "links = pd.read_pickle(\"nytimes_links.pkl\")[:50]\n",
    "\n",
    "#Initialize the articles_info list; it collects one dictionary per link\n",
    "articles_info = []\n",
    "for i in links:\n",
    "    #Initialize dictionary\n",
    "    article_dict = {}\n",
    "    #Insert link \"i\" into the dictionary\n",
    "    article_dict[\"link\"] = i\n",
    "    #Pass link into Article() function\n",
    "    art = Article(i)\n",
    "    #Download contents of art object\n",
    "    art.download()\n",
    "    \n",
    "    #Try/except is included because not all articles can be parsed\n",
    "    try:\n",
    "        #If article can be successfully parsed then insert its text, title, publish_date, keywords\n",
    "        #and summary into corresponding keys\n",
    "        art.parse()\n",
    "        article_dict[\"text\"] = art.text\n",
    "        article_dict[\"title\"] = art.title\n",
    "        article_dict[\"date\"] = art.publish_date\n",
    "        art.nlp()\n",
    "        article_dict[\"keywords\"] = art.keywords\n",
    "        article_dict[\"summary\"] = art.summary\n",
    "    except ArticleException:\n",
    "        #If article cannot be parsed then insert null values for the following keys:\n",
    "        #\"text\", \"title\", \"date\", \"keywords\", and \"summary\"\n",
    "        #All five keys are reset, so values stored before the failing call are\n",
    "        #replaced too and every row ends up with the same set of columns.\n",
    "        article_dict[\"text\"] = np.nan\n",
    "        article_dict[\"title\"] = np.nan\n",
    "        article_dict[\"date\"] = np.nan\n",
    "        article_dict[\"keywords\"] = np.nan\n",
    "        article_dict[\"summary\"] = np.nan\n",
    "        \n",
    "    #Insert dictionary of article info into the articles_info list\n",
    "    articles_info.append(article_dict)\n",
    "#Pass the list of dictionaries into a pandas data frame\n",
    "corpus = pd.DataFrame(articles_info)\n",
    "#Print how long the process took\n",
    "print(\"Script took {:.2f} seconds to complete\".format(time.time() - start))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>date</th>\n",
       "      <th>keywords</th>\n",
       "      <th>link</th>\n",
       "      <th>summary</th>\n",
       "      <th>text</th>\n",
       "      <th>title</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2017-09-08</td>\n",
       "      <td>[intelligence, researchers, sexism, female, la...</td>\n",
       "      <td>https://www.nytimes.com/2017/09/08/upshot/sexi...</td>\n",
       "      <td>Across all the categories analyzed and all the...</td>\n",
       "      <td>First, here’s what they reported finding last ...</td>\n",
       "      <td>Sexism and Shopping: Female Players Get Most o...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2017-09-08</td>\n",
       "      <td>[latest, versus, intelligence, tegmark, patter...</td>\n",
       "      <td>https://www.nytimes.com/2017/09/08/books/revie...</td>\n",
       "      <td>PhotoGODZILLA VERSUS MOTHRA: James Patterson’s...</td>\n",
       "      <td>Photo\\n\\nGODZILLA VERSUS MOTHRA: James Patters...</td>\n",
       "      <td>James Patterson’s Latest Villain Looks a Lot L...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2017-09-01</td>\n",
       "      <td>[intelligence, human, harm, systems, regulate,...</td>\n",
       "      <td>https://www.nytimes.com/2017/09/01/opinion/art...</td>\n",
       "      <td>PhotoThe technology entrepreneur Elon Musk rec...</td>\n",
       "      <td>Photo\\n\\nThe technology entrepreneur Elon Musk...</td>\n",
       "      <td>How to Regulate Artificial Intelligence</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2017-09-14</td>\n",
       "      <td>[intelligence, teaching, human, experiments, e...</td>\n",
       "      <td>https://www.nytimes.com/2017/09/14/opinion/art...</td>\n",
       "      <td>PhotoTo the Editor:Re “How to Regulate Artific...</td>\n",
       "      <td>Photo\\n\\nTo the Editor:\\n\\nRe “How to Regulate...</td>\n",
       "      <td>Ethics and Artificial Intelligence</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2017-08-30</td>\n",
       "      <td>[assistants, mr, working, way, cortana, bezos,...</td>\n",
       "      <td>https://www.nytimes.com/2017/08/30/technology/...</td>\n",
       "      <td>But Mr. Bezos and Mr. Nadella are concerned th...</td>\n",
       "      <td>But Mr. Bezos and Mr. Nadella are concerned th...</td>\n",
       "      <td>‘Cortana, Open Alexa,’ Amazon Says. And Micros...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        date                                           keywords  \\\n",
       "0 2017-09-08  [intelligence, researchers, sexism, female, la...   \n",
       "1 2017-09-08  [latest, versus, intelligence, tegmark, patter...   \n",
       "2 2017-09-01  [intelligence, human, harm, systems, regulate,...   \n",
       "3 2017-09-14  [intelligence, teaching, human, experiments, e...   \n",
       "4 2017-08-30  [assistants, mr, working, way, cortana, bezos,...   \n",
       "\n",
       "                                                link  \\\n",
       "0  https://www.nytimes.com/2017/09/08/upshot/sexi...   \n",
       "1  https://www.nytimes.com/2017/09/08/books/revie...   \n",
       "2  https://www.nytimes.com/2017/09/01/opinion/art...   \n",
       "3  https://www.nytimes.com/2017/09/14/opinion/art...   \n",
       "4  https://www.nytimes.com/2017/08/30/technology/...   \n",
       "\n",
       "                                             summary  \\\n",
       "0  Across all the categories analyzed and all the...   \n",
       "1  PhotoGODZILLA VERSUS MOTHRA: James Patterson’s...   \n",
       "2  PhotoThe technology entrepreneur Elon Musk rec...   \n",
       "3  PhotoTo the Editor:Re “How to Regulate Artific...   \n",
       "4  But Mr. Bezos and Mr. Nadella are concerned th...   \n",
       "\n",
       "                                                text  \\\n",
       "0  First, here’s what they reported finding last ...   \n",
       "1  Photo\\n\\nGODZILLA VERSUS MOTHRA: James Patters...   \n",
       "2  Photo\\n\\nThe technology entrepreneur Elon Musk...   \n",
       "3  Photo\\n\\nTo the Editor:\\n\\nRe “How to Regulate...   \n",
       "4  But Mr. Bezos and Mr. Nadella are concerned th...   \n",
       "\n",
       "                                               title  \n",
       "0  Sexism and Shopping: Female Players Get Most o...  \n",
       "1  James Patterson’s Latest Villain Looks a Lot L...  \n",
       "2            How to Regulate Artificial Intelligence  \n",
       "3                 Ethics and Artificial Intelligence  \n",
       "4  ‘Cortana, Open Alexa,’ Amazon Says. And Micros...  "
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#Preview the first five rows of the corpus data frame\n",
    "corpus.head(n=5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
 }
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Script took 38.52 seconds to complete\n"
	]
	}
	],
	"source": [
	"#Imports\n",
	"import time\n",
	"#Start the timer right after importing time, so the reported duration also\n",
	"#covers the remaining imports below.\n",
	"start = time.time()\n",
	"import pandas as pd\n",
	"import numpy as np\n",
	"from newspaper import Article\n",
	"#ArticleException is named in the except clause below. Without this import any\n",
	"#error raised inside the try block surfaces as a NameError instead of being handled.\n",
	"from newspaper.article import ArticleException\n",
	"#Load in pickle file with links and slice the first 50 links\n",
	"#NOTE(review): unpickling can run arbitrary code, so only load a pickle file you trust.\n",
	"links = pd.read_pickle(\"nytimes_links.pkl\")[:50]\n",
	"\n",
	"#Initialize the articles_info list; it collects one dictionary per link\n",
	"articles_info = []\n",
	"for i in links:\n",
	"    #Initialize dictionary\n",
	"    article_dict = {}\n",
	"    #Insert link \"i\" into the dictionary\n",
	"    article_dict[\"link\"] = i\n",
	"    #Pass link into Article() function\n",
	"    art = Article(i)\n",
	"    #Download contents of art object\n",
	"    art.download()\n",
	"    \n",
	"    #Try/except is included because not all articles can be parsed\n",
	"    try:\n",
	"        #If article can be successfully parsed then insert its text, title, publish_date, keywords\n",
	"        #and summary into corresponding keys\n",
	"        art.parse()\n",
	"        article_dict[\"text\"] = art.text\n",
	"        article_dict[\"title\"] = art.title\n",
	"        article_dict[\"date\"] = art.publish_date\n",
	"        art.nlp()\n",
	"        article_dict[\"keywords\"] = art.keywords\n",
	"        article_dict[\"summary\"] = art.summary\n",
	"    except ArticleException:\n",
	"        #If article cannot be parsed then insert null values for the following keys:\n",
	"        #\"text\", \"title\", \"date\", \"keywords\", and \"summary\"\n",
	"        #All five keys are reset, so values stored before the failing call are\n",
	"        #replaced too and every row ends up with the same set of columns.\n",
	"        article_dict[\"text\"] = np.nan\n",
	"        article_dict[\"title\"] = np.nan\n",
	"        article_dict[\"date\"] = np.nan\n",
	"        article_dict[\"keywords\"] = np.nan\n",
	"        article_dict[\"summary\"] = np.nan\n",
	"        \n",
	"    #Insert dictionary of article info into the articles_info list\n",
	"    articles_info.append(article_dict)\n",
	"#Pass the list of dictionaries into a pandas data frame\n",
	"corpus = pd.DataFrame(articles_info)\n",
	"#Print how long the process took\n",
	"print(\"Script took {:.2f} seconds to complete\".format(time.time() - start))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<style>\n",
	" .dataframe thead tr:only-child th {\n",
	" text-align: right;\n",
	" }\n",
	"\n",
	" .dataframe thead th {\n",
	" text-align: left;\n",
	" }\n",
	"\n",
	" .dataframe tbody tr th {\n",
	" vertical-align: top;\n",
	" }\n",
	"</style>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>date</th>\n",
	" <th>keywords</th>\n",
	" <th>link</th>\n",
	" <th>summary</th>\n",
	" <th>text</th>\n",
	" <th>title</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>0</th>\n",
	" <td>2017-09-08</td>\n",
	" <td>[intelligence, researchers, sexism, female, la...</td>\n",
	" <td>https://www.nytimes.com/2017/09/08/upshot/sexi...</td>\n",
	" <td>Across all the categories analyzed and all the...</td>\n",
	" <td>First, here’s what they reported finding last ...</td>\n",
	" <td>Sexism and Shopping: Female Players Get Most o...</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1</th>\n",
	" <td>2017-09-08</td>\n",
	" <td>[latest, versus, intelligence, tegmark, patter...</td>\n",
	" <td>https://www.nytimes.com/2017/09/08/books/revie...</td>\n",
	" <td>PhotoGODZILLA VERSUS MOTHRA: James Patterson’s...</td>\n",
	" <td>Photo\\n\\nGODZILLA VERSUS MOTHRA: James Patters...</td>\n",
	" <td>James Patterson’s Latest Villain Looks a Lot L...</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>2</th>\n",
	" <td>2017-09-01</td>\n",
	" <td>[intelligence, human, harm, systems, regulate,...</td>\n",
	" <td>https://www.nytimes.com/2017/09/01/opinion/art...</td>\n",
	" <td>PhotoThe technology entrepreneur Elon Musk rec...</td>\n",
	" <td>Photo\\n\\nThe technology entrepreneur Elon Musk...</td>\n",
	" <td>How to Regulate Artificial Intelligence</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>3</th>\n",
	" <td>2017-09-14</td>\n",
	" <td>[intelligence, teaching, human, experiments, e...</td>\n",
	" <td>https://www.nytimes.com/2017/09/14/opinion/art...</td>\n",
	" <td>PhotoTo the Editor:Re “How to Regulate Artific...</td>\n",
	" <td>Photo\\n\\nTo the Editor:\\n\\nRe “How to Regulate...</td>\n",
	" <td>Ethics and Artificial Intelligence</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>4</th>\n",
	" <td>2017-08-30</td>\n",
	" <td>[assistants, mr, working, way, cortana, bezos,...</td>\n",
	" <td>https://www.nytimes.com/2017/08/30/technology/...</td>\n",
	" <td>But Mr. Bezos and Mr. Nadella are concerned th...</td>\n",
	" <td>But Mr. Bezos and Mr. Nadella are concerned th...</td>\n",
	" <td>‘Cortana, Open Alexa,’ Amazon Says. And Micros...</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"text/plain": [
	" date keywords \\\n",
	"0 2017-09-08 [intelligence, researchers, sexism, female, la... \n",
	"1 2017-09-08 [latest, versus, intelligence, tegmark, patter... \n",
	"2 2017-09-01 [intelligence, human, harm, systems, regulate,... \n",
	"3 2017-09-14 [intelligence, teaching, human, experiments, e... \n",
	"4 2017-08-30 [assistants, mr, working, way, cortana, bezos,... \n",
	"\n",
	" link \\\n",
	"0 https://www.nytimes.com/2017/09/08/upshot/sexi... \n",
	"1 https://www.nytimes.com/2017/09/08/books/revie... \n",
	"2 https://www.nytimes.com/2017/09/01/opinion/art... \n",
	"3 https://www.nytimes.com/2017/09/14/opinion/art... \n",
	"4 https://www.nytimes.com/2017/08/30/technology/... \n",
	"\n",
	" summary \\\n",
	"0 Across all the categories analyzed and all the... \n",
	"1 PhotoGODZILLA VERSUS MOTHRA: James Patterson’s... \n",
	"2 PhotoThe technology entrepreneur Elon Musk rec... \n",
	"3 PhotoTo the Editor:Re “How to Regulate Artific... \n",
	"4 But Mr. Bezos and Mr. Nadella are concerned th... \n",
	"\n",
	" text \\\n",
	"0 First, here’s what they reported finding last ... \n",
	"1 Photo\\n\\nGODZILLA VERSUS MOTHRA: James Patters... \n",
	"2 Photo\\n\\nThe technology entrepreneur Elon Musk... \n",
	"3 Photo\\n\\nTo the Editor:\\n\\nRe “How to Regulate... \n",
	"4 But Mr. Bezos and Mr. Nadella are concerned th... \n",
	"\n",
	" title \n",
	"0 Sexism and Shopping: Female Players Get Most o... \n",
	"1 James Patterson’s Latest Villain Looks a Lot L... \n",
	"2 How to Regulate Artificial Intelligence \n",
	"3 Ethics and Artificial Intelligence \n",
	"4 ‘Cortana, Open Alexa,’ Amazon Says. And Micros... "
	]
	},
	"execution_count": 3,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"#Preview the first five rows of the corpus data frame\n",
	"corpus.head(n=5)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": []
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.6.2"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 2
	}
No results found