Last active
August 14, 2019 17:12
-
-
Save lawrencechen0921/1488de391a6500c12189c1c83e1cda5e to your computer and use it in GitHub Desktop.
技術分享 拆解台灣新聞 twnews
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# First, install the package:  pip3 install twnews
# Then use whichever of its tools you need to analyze the major news media sites.

# Example - decomposing a single news article
from twnews.soup import NewsSoup

# Build a NewsSoup for one Apple Daily article URL.
nsoup = NewsSoup('https://tw.news.appledaily.com/local/realtime/20181025/1453825')

# `channel` is read as an attribute; the remaining fields are method calls.
print('頻道: {}'.format(nsoup.channel))
print('標題: {}'.format(nsoup.title()))
print('日期: {}'.format(nsoup.date()))
print('記者: {}'.format(nsoup.author()))
print('內文:')
print(nsoup.contents())

# The rate is scaled by 100 for display as a percentage
# (presumably effective_text_rate() returns a fraction -- confirm in twnews docs).
print('有效內容率: {:.2f}%'.format(nsoup.effective_text_rate() * 100))
# *********************************************************************************************
# Example - keyword search + decomposing each news article found
from twnews.search import NewsSearch

nsearch = NewsSearch(
    'ltn',
    limit=10,
    beg_date='2018-08-03',  # Liberty Times only accepts a date range within 90 days
    end_date='2018-11-01'
)

# Search by keyword (title_only=True) and convert the results to a list of
# soup objects, one per article.
nsoups = nsearch.by_keyword('上吊', title_only=True).to_soup_list()

for i, nsoup in enumerate(nsoups):
    print('{:03d}: {}'.format(i, nsoup.path))
    if nsoup.title() is not None:
        print(' 記者: {} / 日期: {}'.format(nsoup.author(), nsoup.date()))
        print(' 標題: {}'.format(nsoup.title()))
        # Show only the first 30 characters of the body as a preview.
        print(' {} ...'.format(nsoup.contents()[0:30]))
    else:
        # A missing title is treated as a failed parse of the article page.
        print(' 新聞分解失敗,無法識別 DOM 結構')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment