Last active
May 18, 2020 20:04
-
-
Save linzino7/2d9888f36d45555cc3ac49d20ad4fd91 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*-
"""
Fetch the PTT Gossiping board index page and print its article titles.

Created on Tue May 12 04:38:47 2020
https://medium.com/p/a8216873a9d3
@author: Zino
"""
# Third-party modules: requests sends the HTTP request, bs4 (BeautifulSoup)
# parses the returned HTML.
import bs4
import requests

# Address of the PTT Gossiping board index page.
URL = "https://www.ptt.cc/bbs/Gossiping/index.html"

# Request headers carrying the "over18=1" cookie.
# NOTE(review): presumably this gets the request past the board's
# age-confirmation page -- confirm against the site's behaviour.
my_headers = {'cookie': 'over18=1;'}

# Step 1: send a GET request to the board index.
# A timeout keeps the script from hanging forever on a stalled connection.
response = requests.get(URL, headers=my_headers, timeout=10)

# Stop here on a 4xx/5xx answer instead of parsing an error page as if it
# were the board index.
response.raise_for_status()

# Print the raw HTML that came back.
print(response.text)

# Step 2-1: hand the HTML to BeautifulSoup, using the built-in
# "html.parser" backend.
soup = bs4.BeautifulSoup(response.text, "html.parser")

# Each article title on the page is expected to look like this:
#
#   <div class="title">
#       <a href="/bbs/Gossiping/M.1589705973.A.912.html">
#           (article title text)
#       </a>
#   </div>

# Step 2-2: collect every <div> element whose class is "title".
titles = soup.find_all('div', 'title')

# Step 2-3: extract the text.
# `titles` holds many tags, so loop over them and print the text of each.
for t in titles:
    print(t.text)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment