Last active
December 4, 2024 12:46
-
-
Save jshirius/e8992c0e7620de098a43d77e4bd91859 to your computer and use it in GitHub Desktop.
Yahoo知恵袋のスクレイピングのPythonサンプルプログラム
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Yahoo知恵袋のスクレイピングのPythonサンプルプログラム\n", | |
"\n", | |
"Seleniumを使ったYahoo!知恵袋のスクレイピングのサンプルプログラムです。<br>\n", | |
"質問の検索結果まで出力できます。<br>\n", | |
"出力結果は、csvファイルに書き出します。<br>\n", | |
"\n", | |
"\n", | |
"参考にしたコード<br>\n", | |
"【Python×Selenium】超簡単にWebサイトをスクレイピングしてみる<br>\n", | |
"https://miyanetdev.com/archives/327" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from selenium import webdriver\n", | |
"from time import sleep\n", | |
"import urllib\n", | |
"import re\n", | |
"import pandas as pd\n", | |
"\n", | |
"PAGE_LIMIT = 20 #ページ遷移の最大の回数\n", | |
"SEARCH_QUERY = \"プログラミング\"\n", | |
"SQRAPING_URL = \"https://chiebukuro.yahoo.co.jp/\"\n", | |
"\n", | |
"#出力結果を格納するcsvファイル\n", | |
"csv_file_name = SEARCH_QUERY + \".csv\"\n", | |
"\n", | |
"#ドライバーを設定する\n", | |
"#linuxなどGUIがない環境で動かす場合は、ヘッドレスモードを入れておく\n", | |
"#options = webdriver.ChromeOptions()\n", | |
"#options.add_argument('--headless')\n", | |
"\n", | |
"\n", | |
"#driver = webdriver.Chrome('./chromedriver', options)\n", | |
"driver = webdriver.Chrome('./chromedriver')\n", | |
"\n", | |
"#知恵袋ページを読み込む\n", | |
"driver.get(SQRAPING_URL)\n", | |
"\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"#該当ページを解析する\n", | |
"def analysis_action():\n", | |
"\n", | |
" elems = driver.find_elements_by_xpath('//*[@id=\"sr\"]/ul/li[*]')\n", | |
" # 取得した要素を1件ずつ辞書にまとめて out_puts に追加する\n", | |
"\n", | |
" out_puts = []\n", | |
"\n", | |
" if(len(elems) == 0):\n", | |
" print(\"ページは存在しないよ〜\")\n", | |
" else:\n", | |
" for elem in elems:\n", | |
" out_dic ={}\n", | |
" out_dic['query_key'] = SEARCH_QUERY\n", | |
" out_dic['rs_title'] = elem.find_elements_by_xpath('h3/a')[0].text\n", | |
" out_dic['rs_link'] = elem.find_elements_by_xpath('h3/a')[0].get_attribute('href')\n", | |
" out_dic['rs_summary'] = elem.find_elements_by_xpath('p[1]')[0].text\n", | |
" #print(out_dic)\n", | |
" out_puts.append(out_dic)\n", | |
" #print(\"*\" * 60)\n", | |
" \n", | |
" return out_puts" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def next_page_action():\n", | |
" \"\"\"\n", | |
" 現在のページから次のページを読み込むアクションを実行する\n", | |
" \"\"\"\n", | |
" rtn = False\n", | |
" \n", | |
" #「次へ」リンクを探すため、pg_low配下のリンク要素を取得する(クリックではなくhrefへ遷移する)\n", | |
" elems = driver.find_elements_by_xpath('//*[@id=\"pg_low\"]/div/a[*]')\n", | |
"\n", | |
" #現在のページ\n", | |
" print(\"ページ遷移前のurl:\")\n", | |
" print(driver.current_url)\n", | |
" if(len(elems) == 0):\n", | |
" print(\"次のページは存在しないよ〜\")\n", | |
" else:\n", | |
" for elem in elems:\n", | |
" #print(elem.text)\n", | |
" if(elem.text != \"次へ\"):\n", | |
" continue\n", | |
" url = elem.get_attribute('href')\n", | |
" driver.get(url)\n", | |
" rtn = True\n", | |
" break\n", | |
"\n", | |
" return rtn\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# 最初の検索を実行する\n", | |
"search_box = driver.find_element_by_css_selector('input.txtKeyword')\n", | |
"search_box.send_keys(SEARCH_QUERY)\n", | |
"search_button_container = driver.find_element_by_css_selector('p.btnSearch')\n", | |
"search_button = search_button_container.find_element_by_css_selector('input')\n", | |
"search_button.click()\n", | |
"sleep(2)\n", | |
"\n", | |
"\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"#知恵袋の検索結果の一覧をpandasに格納してcsvに書き出す\n", | |
"#csvには、途中で止まっても良いように、1ページ終わったら書き出すようにしている\n", | |
"\n", | |
"d = analysis_action()\n", | |
"df=pd.DataFrame(d) \n", | |
"df.to_csv(csv_file_name, encoding=\"utf_8_sig\")\n", | |
"\n", | |
"analysis_list = []\n", | |
"analysis_list.extend(d)\n", | |
"\n", | |
"for page in range(PAGE_LIMIT):\n", | |
" \n", | |
" print(\"ページ %dを実行中\" % page)\n", | |
" sleep(5)\n", | |
" \n", | |
" #次のページに遷移する\n", | |
" rtn = next_page_action()\n", | |
" if(rtn == False):\n", | |
" break\n", | |
" \n", | |
" #知恵袋の質問リストを格納する\n", | |
" d = analysis_action()\n", | |
" if(len(d) > 0):\n", | |
" analysis_list.extend(d)\n", | |
" df=pd.DataFrame(analysis_list) \n", | |
" df.to_csv(csv_file_name, encoding=\"utf_8_sig\")\n", | |
" \n", | |
"driver.close()\n", | |
"driver.quit()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.6.8" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment