{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Yahoo!知恵袋のスクレイピングのPythonサンプルプログラム\n", "\n", "Seleniumを使ったYahoo!知恵袋のスクレイピングのサンプルプログラムです。
\n", "質問の検索結果まで出力できます。
\n", "出力結果は、csvファイルに書き出します。
\n", "\n", "\n", "参考にしたコード
\n", "【Python×Selenium】超簡単にWebサイトをスクレイピングしてみる
\n", "https://miyanetdev.com/archives/327" ] },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from selenium import webdriver\n",
    "from selenium.webdriver.common.by import By\n",
    "from time import sleep\n",
    "import urllib\n",
    "import re\n",
    "import pandas as pd\n",
    "\n",
    "PAGE_LIMIT = 20  # maximum number of page transitions\n",
    "PAGE_WAIT_SECONDS = 5  # pause (seconds) before each page transition\n",
    "HEADLESS = False  # set to True when running without a GUI (e.g. on a Linux server)\n",
    "SEARCH_QUERY = \"プログラミング\"\n",
    "SQRAPING_URL = \"https://chiebukuro.yahoo.co.jp/\"\n",
    "\n",
    "# CSV file that receives the scraped results\n",
    "csv_file_name = SEARCH_QUERY + \".csv\"\n",
    "\n",
    "# Set up the driver; headless mode is only switched on when HEADLESS is True.\n",
    "options = webdriver.ChromeOptions()\n",
    "if HEADLESS:\n",
    "    options.add_argument('--headless')\n",
    "\n",
    "# `options` is passed by keyword on purpose: given positionally it would be bound\n",
    "# to a different constructor parameter (the port in Selenium 3).\n",
    "# NOTE(review): recent Selenium 4 releases take the driver path through a Service\n",
    "# object instead of a positional path - adjust if your installed version rejects this.\n",
    "driver = webdriver.Chrome('./chromedriver', options=options)\n",
    "\n",
    "# Load the Chiebukuro top page\n",
    "driver.get(SQRAPING_URL)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def analysis_action():\n",
    "    \"\"\"Parse the search-result page currently loaded in the global ``driver``.\n",
    "\n",
    "    Returns:\n",
    "        list of dict: one dict per result item with the keys ``query_key``,\n",
    "        ``rs_title``, ``rs_link`` and ``rs_summary``; empty when the page\n",
    "        contains no result items.\n",
    "    \"\"\"\n",
    "    elems = driver.find_elements(By.XPATH, '//*[@id=\"sr\"]/ul/li[*]')\n",
    "\n",
    "    out_puts = []\n",
    "\n",
    "    if not elems:\n",
    "        print(\"ページは存在しないよ〜\")\n",
    "        return out_puts\n",
    "\n",
    "    for elem in elems:\n",
    "        title_links = elem.find_elements(By.XPATH, 'h3/a')\n",
    "        if not title_links:\n",
    "            # Skip items without a title link instead of raising IndexError\n",
    "            # and aborting the whole run.\n",
    "            continue\n",
    "        summaries = elem.find_elements(By.XPATH, 'p[1]')\n",
    "\n",
    "        out_puts.append({\n",
    "            'query_key': SEARCH_QUERY,\n",
    "            'rs_title': title_links[0].text,\n",
    "            'rs_link': title_links[0].get_attribute('href'),\n",
    "            # A missing summary paragraph is stored as an empty string.\n",
    "            'rs_summary': summaries[0].text if summaries else '',\n",
    "        })\n",
    "\n",
    "    return out_puts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def next_page_action():\n",
    "    \"\"\"Navigate the global ``driver`` from the current result page to the next one.\n",
    "\n",
    "    Looks for a pager link whose text is \"次へ\" and loads its href.\n",
    "\n",
    "    Returns:\n",
    "        bool: True when a next-page link was found and loaded, False otherwise.\n",
    "    \"\"\"\n",
    "    elems = driver.find_elements(By.XPATH, '//*[@id=\"pg_low\"]/div/a[*]')\n",
    "\n",
    "    # Report the page we are leaving\n",
    "    print(\"ページ遷移前のurl:\")\n",
    "    print(driver.current_url)\n",
    "\n",
    "    for elem in elems:\n",
    "        if elem.text != \"次へ\":\n",
    "            continue\n",
    "        url = elem.get_attribute('href')\n",
    "        if not url:\n",
    "            # A link without an href cannot be followed.\n",
    "            continue\n",
    "        driver.get(url)\n",
    "        return True\n",
    "\n",
    "    # Reached both when the pager is absent and when it has no usable \"次へ\" link\n",
    "    # (e.g. on the last page).\n",
    "    print(\"次のページは存在しないよ〜\")\n",
    "    return False\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Run the initial search\n",
    "search_box = driver.find_element(By.CSS_SELECTOR, 'input.txtKeyword')\n",
    "# Clear first so that re-running this cell does not append the query twice.\n",
    "search_box.clear()\n",
    "search_box.send_keys(SEARCH_QUERY)\n",
    "search_button_container = driver.find_element(By.CSS_SELECTOR, 'p.btnSearch')\n",
    "search_button = search_button_container.find_element(By.CSS_SELECTOR, 'input')\n",
    "search_button.click()\n",
    "sleep(2)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Collect the Chiebukuro search results with pandas and write them to CSV.\n",
    "# The CSV is rewritten after every page so that partial results survive an interruption.\n",
    "\n",
    "try:\n",
    "    d = analysis_action()\n",
    "    analysis_list = list(d)\n",
    "    df = pd.DataFrame(analysis_list)\n",
    "    df.to_csv(csv_file_name, encoding=\"utf_8_sig\")\n",
    "\n",
    "    for page in range(PAGE_LIMIT):\n",
    "\n",
    "        print(\"ページ %dを実行中\" % page)\n",
    "        sleep(PAGE_WAIT_SECONDS)\n",
    "\n",
    "        # Move on to the next page; stop when there is none\n",
    "        if not next_page_action():\n",
    "            break\n",
    "\n",
    "        # Store the question list of the newly loaded page\n",
    "        d = analysis_action()\n",
    "        if d:\n",
    "            analysis_list.extend(d)\n",
    "            df = pd.DataFrame(analysis_list)\n",
    "            df.to_csv(csv_file_name, encoding=\"utf_8_sig\")\n",
    "finally:\n",
    "    # Always shut the browser down, even when scraping fails part-way through.\n",
    "    driver.quit()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}