{
"cells": [
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"crawl_rss\n",
"crawl_article\n",
"[Parsing Title]\n",
"국내 보안업체 '디지털 서명' 탑재한 악성코드 발견\n",
"[parsing query ] \n",
"[['국내', '보안', '업체', '디지털 서명', '탑재', '악성', '코드', '발견']]\n",
"[parsing Text]\n",
"--------------------------------------------------\n",
"e: 0.7499875006249688 \n",
"title: 스타트업, 손에 잡힐 듯한 독도 가상여행 개발 \n",
"text\n",
" http://www.etnews.com/20190724000436\n",
"e: 0.7083315972366897 \n",
"title: 트리엠, 국내 최초 '조달ERP 표준서비스' 4분기 출시… 제조기업서 솔루션기업 제2도약 \n",
"text\n",
" http://www.etnews.com/20190730000120\n",
"e: 0.681816528940646 \n",
"title: 인프론티브, 토종 KVM 보안 스위치로 외산 시장 판도 바꾼다…신보에 2600여대 대량 공급 \n",
"text\n",
" http://www.etnews.com/20190726000388\n",
"e: 0.6666638889351844 \n",
"title: 포스코ICT-제우스, 스마트공장·로봇사업 다각화 협력한다 \n",
"text\n",
" http://www.etnews.com/20190730000250\n",
"e: 0.6666611112962901 \n",
"title: 한국SW산업협회, 해외진출위원회 발족 \n",
"text\n",
" http://www.etnews.com/20190724000326\n",
"e: 0.6428551020699704 \n",
"title: SAP코리아, 뱅크웨어글로벌과 금융시장 공략 본격화 \n",
"text\n",
" http://www.etnews.com/20190724000467\n",
"e: 0.6111098765569272 \n",
"title: NIPA, 극한직업 CG사 참가 컴퓨터그래픽 리크루팅 캠프 개최 \n",
"text\n",
" http://www.etnews.com/20190730000135\n"
]
}
],
"source": [
"import feedparser\n",
"from newspaper import Article\n",
"from konlpy.tag import Komoran # used to extract only the nouns\n",
"from konlpy.tag import Okt\n",
"from collections import Counter # how many times does each noun appear in an article? most_common() also sorts them\n",
"from operator import eq # checks whether two words are equal\n",
"import math\n",
"import numpy as np\n",
"\n",
"komoran = Komoran()\n",
"\n",
"urls = (\n",
"    'http://rss.etnews.com/04043.xml',\n",
"    'http://rss.etnews.com/04044.xml',\n",
"    'http://rss.etnews.com/04042.xml',\n",
")\n",
"\n",
"def crawl_rss(urls):\n",
"    print(\"crawl_rss\")\n",
"    arr_rss = []\n",
"    for url in urls:\n",
"        parse_rss = feedparser.parse(url) # fetch the feed at url and parse its items into attributes and values\n",
"        for p in parse_rss.entries: # each entry carries the information of one article\n",
"            arr_rss.append({'title': p.title, 'link': p.link}) # keep only the article's title and URL\n",
"    return arr_rss\n",
"\n",
"def crawl_article(url, language='ko'): # given a link, return the title and body text of that page\n",
"    var_article = Article(url, language=language)\n",
"    var_article.download()\n",
"    var_article.parse()\n",
"    return var_article.title, var_article.text\n",
"\n",
"# get_tags: returns (number of distinct nouns kept, count of the most frequent noun,\n",
"# list of {'tag', 'count'} dicts) for the given text\n",
"def get_tags(text, ntags=50):\n",
"    spliter = Okt()\n",
"    num_unique_words = 0\n",
"    num_most_freq = 0\n",
"    nouns = spliter.nouns(text)\n",
"    count = Counter(nouns) # occurrence count of every noun in the text\n",
"    return_list = []\n",
"    \n",
"    for n, c in count.most_common(ntags): # most frequent nouns first (noun : count)\n",
"        temp = {'tag': n, 'count': c}\n",
"        return_list.append(temp)\n",
"        num_unique_words = num_unique_words + 1\n",
"        if num_unique_words == 1: # the first item from most_common() is the most frequent noun\n",
"            num_most_freq = c # remember its count; TF() uses it as the normaliser\n",
"    return num_unique_words, num_most_freq, return_list\n",
"\n",
"def TF(request, most_freq, tag):\n",
"    # double-normalised (augmented) term frequency: 0.5 + 0.5 * f(request) / f_max,\n",
"    # with a small epsilon so an empty tag list does not divide by zero\n",
"    return 0.5 + 0.5 * Howmanywords(request, tag)/(most_freq+0.0001)\n",
"\n",
"def Howmanywords(request, tag): # how often does request occur in the tag list? 0 if absent\n",
"    noWords = 0\n",
"    for n in tag:\n",
"        noun = n['tag']\n",
"        count = n['count']\n",
"        if eq(noun, request):\n",
"            return count\n",
"    return noWords\n",
"\n",
"# main(): crawl the RSS feeds, fetch every article body, extract nouns,\n",
"# score each article against the query nouns with TF(), and print the\n",
"# articles whose mean score is at least 0.6\n",
"def main():\n",
"    article_list = crawl_rss(urls) # one dict per article, holding title and link\n",
"    print(\"crawl_article\")\n",
"    for article in article_list: # fetch the body text of every article\n",
"        _, text = crawl_article(article['link']) # given the link, returns the title and the body text\n",
"        article['text'] = text\n",
"    \n",
"    print('[Parsing Title]')\n",
"    noun_title = [komoran.nouns(a['title']) for a in article_list] # nouns of every article title\n",
"    query = input()\n",
"    print('[parsing query ] ')\n",
"    noun_query = [komoran.nouns(query)]\n",
"    print(noun_query)\n",
"    print('[parsing Text]')\n",
"    noun_text = [] # initialize list\n",
"    for a in article_list:\n",
"        # tags: noun and occurrence count\n",
"        num_unique_words, num_most_freq, tags = get_tags(a['text'])\n",
"        noun_text.append({'num_unique_words': num_unique_words,\n",
"                          'num_most_freq': num_most_freq, 'tags': tags})\n",
"    tf_idf_title = [] # TF of the title nouns per article (currently unused)\n",
"    tf_idf_mean = [] # mean TF of the query nouns per article\n",
"    ##tf_idf_query = [] # term freq inv document freq query\n",
"    ##tf_idft_mean_query = [] # term freq inv document freq query mean\n",
"    ##tag_list = [a['tags'] for a in noun_text]\n",
"    # score every title noun against its own article text (the appends are commented out, so unused)\n",
"    for i, nouns in enumerate(noun_title):\n",
"        tfs = [TF(req, noun_text[i]['num_most_freq'], noun_text[i]['tags']) for req in nouns]\n",
"        #_tfidf = [tfs[j] for j,n in enumerate(nouns)]\n",
"        # tf_idf_title.append(_tfidf)\n",
"        # tf_idf_mean.append(np.mean(_tfidf))\n",
"    print(\"----------\"*5)\n",
"    tmp = 0\n",
"    index = 0\n",
"    result = []\n",
"    sort_result = []\n",
"    for i in range(len(article_list)): # score every article against the query\n",
"        tfs_query = [TF(req, noun_text[i]['num_most_freq'], noun_text[i]['tags']) for req in noun_query[0]]\n",
"        tfidf = [tfs_query[j] for j, n in enumerate(noun_query[0])] # one TF score per query noun\n",
"        tf_idf_mean.append(np.mean(tfidf))\n",
"        #if(tmp < tfs_query[0]):\n",
"        #    result = tfs_query\n",
"        #    index = i\n",
"        #tmp = tfs_query[0]\n",
"    # print(tf_idf_mean)\n",
"    for i in range(len(tf_idf_mean)):\n",
"        sort_result.append([tf_idf_mean[i], i])\n",
"    sort_result.sort(reverse=True) # highest mean TF first\n",
"    for e, i in sort_result:\n",
"        if e >= 0.6: # only report articles that match the query reasonably well\n",
"            print(\"e: \",e,\"\\ntitle: \",article_list[i]['title'], \"\\ntext\\n\", article_list[i]['link'])\n",
"    \n",
"    # print(\"final tfs : \", result)\n",
"    # print(\"title: \", article_list[index]['title'])\n",
"    # print(\"text\\n\", article_list[index]['text'])\n",
"\n",
"# start execution from main()\n",
"if __name__ == \"__main__\":\n",
"    main()"
]
},
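{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# A minimal toy check of the TF scoring above. The tag list and words here\n",
"# are made up for illustration and are not from the crawled articles; it\n",
"# assumes the previous cell has run so TF() and Howmanywords() are defined.\n",
"toy_tags = [{'tag': 'security', 'count': 4},\n",
"            {'tag': 'malware', 'count': 2},\n",
"            {'tag': 'travel', 'count': 1}]\n",
"most_freq = toy_tags[0]['count'] # count of the most frequent noun, as get_tags() would report it\n",
"for word in ['security', 'travel', 'blockchain']: # the last word is absent from the tags\n",
"    print(word, round(TF(word, most_freq, toy_tags), 3))\n",
"# prints roughly: security 1.0, travel 0.625, blockchain 0.5"
]
},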
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}