Skip to content

Instantly share code, notes, and snippets.

@gihyunkim
Created August 5, 2019 00:52
Show Gist options
  • Save gihyunkim/f734364186a04c8458933fee748a2ec1 to your computer and use it in GitHub Desktop.
Save gihyunkim/f734364186a04c8458933fee748a2ec1 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"crawl_rss\n",
"crawl_article\n",
"[Parsing Title]\n",
"국내 보안업체 '디지털 서명' 탑재한 악성코드 발견\n",
"[parsing query ] \n",
"[['국내', '보안', '업체', '디지털 서명', '탑재', '악성', '코드', '발견']]\n",
"[parsing Text]\n",
"--------------------------------------------------\n",
"e: 0.7499875006249688 \n",
"title: 스타트업, 손에 잡힐 듯한 독도 가상여행 개발 \n",
"text\n",
" http://www.etnews.com/20190724000436\n",
"e: 0.7083315972366897 \n",
"title: 트리엠, 국내 최초 '조달ERP 표준서비스' 4분기 출시… 제조기업서 솔루션기업 제2도약 \n",
"text\n",
" http://www.etnews.com/20190730000120\n",
"e: 0.681816528940646 \n",
"title: 인프론티브, 토종 KVM 보안 스위치로 외산 시장 판도 바꾼다…신보에 2600여대 대량 공급 \n",
"text\n",
" http://www.etnews.com/20190726000388\n",
"e: 0.6666638889351844 \n",
"title: 포스코ICT-제우스, 스마트공장·로봇사업 다각화 협력한다 \n",
"text\n",
" http://www.etnews.com/20190730000250\n",
"e: 0.6666611112962901 \n",
"title: 한국SW산업협회, 해외진출위원회 발족 \n",
"text\n",
" http://www.etnews.com/20190724000326\n",
"e: 0.6428551020699704 \n",
"title: SAP코리아, 뱅크웨어글로벌과 금융시장 공략 본격화 \n",
"text\n",
" http://www.etnews.com/20190724000467\n",
"e: 0.6111098765569272 \n",
"title: NIPA, 극한직업 CG사 참가 컴퓨터그래픽 리크루팅 캠프 개최 \n",
"text\n",
" http://www.etnews.com/20190730000135\n"
]
}
],
"source": [
"import feedparser\n",
"from newspaper import Article\n",
"from konlpy.tag import Komoran # 명사만을 추출할 때 쓰임\n",
"from konlpy.tag import Okt\n",
"from collections import Counter # 특정 명사가 기사에 몇 개나 들어있나? Sorting까지 해준다.\n",
"from operator import eq # word끼리 같은지 검사\n",
"import math\n",
"import numpy as np\n",
"\n",
"# Shared Komoran analyzer; main() uses it to extract nouns.\n",
"komoran = Komoran()\n",
"\n",
"# RSS feeds to crawl. Every entry must be a real feed URL because\n",
"# crawl_rss() passes the entries on to feedparser.parse().\n",
"urls = (\n",
"    'http://rss.etnews.com/04043.xml',\n",
"    'http://rss.etnews.com/04044.xml',\n",
"    'http://rss.etnews.com/04042.xml',\n",
")\n",
"\n",
"def crawl_rss(urls):\n",
"    \"\"\"Collect the title and link of every entry in the given RSS feeds.\n",
"\n",
"    Args:\n",
"        urls: iterable of feed URLs; falsy entries (e.g. None) are skipped.\n",
"\n",
"    Returns:\n",
"        A list of {'title': ..., 'link': ...} dicts, one per feed entry,\n",
"        in the order the feeds and their entries were read.\n",
"    \"\"\"\n",
"    print(\"crawl_rss\")\n",
"    arr_rss = []\n",
"    for url in urls:\n",
"        if not url:  # placeholder entry, nothing to fetch\n",
"            continue\n",
"        parse_rss = feedparser.parse(url)\n",
"        arr_rss.extend({'title': p.title, 'link': p.link} for p in parse_rss.entries)\n",
"    return arr_rss\n",
" \n",
"def crawl_article(url, language='ko'):\n",
"    \"\"\"Download and parse the article at *url*; return (title, body text).\"\"\"\n",
"    page = Article(url, language=language)\n",
"    page.download()\n",
"    page.parse()\n",
"    title, body = page.title, page.text\n",
"    return title, body\n",
"\n",
"def get_tags(text, ntags=50):\n",
"    \"\"\"Count the nouns Okt finds in *text* and keep the most frequent ones.\n",
"\n",
"    Args:\n",
"        text: article body to analyse.\n",
"        ntags: how many of the most common nouns to keep.\n",
"\n",
"    Returns:\n",
"        A 3-tuple: the number of tags kept, the count of the most frequent\n",
"        noun (0 when no noun was found), and the tags themselves as a list\n",
"        of {'tag': noun, 'count': occurrences} dicts, most frequent first.\n",
"    \"\"\"\n",
"    frequencies = Counter(Okt().nouns(text))\n",
"    ranked = frequencies.most_common(ntags)\n",
"    tag_list = [{'tag': noun, 'count': hits} for noun, hits in ranked]\n",
"    top_count = ranked[0][1] if ranked else 0\n",
"    return len(tag_list), top_count, tag_list\n",
"\n",
"def TF(request, most_freq, tag):\n",
"    \"\"\"Return 0.5 + 0.5 * count(request) / most_freq for one article.\n",
"\n",
"    *tag* is the list of {'tag', 'count'} dicts searched by Howmanywords();\n",
"    *most_freq* is the count to normalise by.\n",
"    \"\"\"\n",
"    occurrences = Howmanywords(request, tag)\n",
"    # The small epsilon keeps the division defined when most_freq is 0.\n",
"    return 0.5 + 0.5 * occurrences / (most_freq + 0.0001)\n",
"\n",
"def Howmanywords(request, tag):\n",
"    \"\"\"Return the count stored for *request* in *tag*, or 0 when it is absent.\n",
"\n",
"    *tag* is a list of {'tag': noun, 'count': occurrences} dicts; the first\n",
"    entry whose noun equals *request* wins.\n",
"    \"\"\"\n",
"    for entry in tag:\n",
"        word, occurrences = entry['tag'], entry['count']\n",
"        if word == request:\n",
"            return occurrences\n",
"    return 0\n",
"\n",
"# main()\n",
"def main(min_score=0.6):\n",
"    \"\"\"Crawl the configured feeds and print the articles that best match a query.\n",
"\n",
"    One query line is read from stdin and reduced to its nouns with Komoran.\n",
"    Each crawled article is scored with the mean TF() of those nouns against\n",
"    the noun counts returned by get_tags() for the article body, and articles\n",
"    scoring at least *min_score* are printed from highest to lowest score.\n",
"\n",
"    Args:\n",
"        min_score: lowest mean score an article needs in order to be listed.\n",
"    \"\"\"\n",
"    article_list = crawl_rss(urls)  # dicts with 'title' and 'link'\n",
"    print(\"crawl_article\")\n",
"    for article in article_list:\n",
"        _, text = crawl_article(article['link'])\n",
"        article['text'] = text\n",
"\n",
"    # Stage marker kept so the printed log keeps its shape; article titles\n",
"    # take no part in the ranking below.\n",
"    print('[Parsing Title]')\n",
"    query = input()\n",
"    print('[parsing query ] ')\n",
"    noun_query = [komoran.nouns(query)]\n",
"    print(noun_query)\n",
"    query_nouns = noun_query[0]\n",
"\n",
"    print('[parsing Text]')\n",
"    noun_text = []\n",
"    for a in article_list:\n",
"        num_unique_words, num_most_freq, tags = get_tags(a['text'])\n",
"        noun_text.append({'num_unique_words': num_unique_words,\n",
"                          'num_most_freq': num_most_freq, 'tags': tags})\n",
"    print(\"----------\"*5)\n",
"\n",
"    if not query_nouns:\n",
"        # Nothing to score against; averaging an empty list would yield NaN.\n",
"        print('no nouns found in query')\n",
"        return\n",
"\n",
"    sort_result = []\n",
"    # Score every crawled article, however many there are, using all query nouns.\n",
"    for i, stats in enumerate(noun_text):\n",
"        tfs_query = [TF(req, stats['num_most_freq'], stats['tags']) for req in query_nouns]\n",
"        sort_result.append([np.mean(tfs_query), i])\n",
"    sort_result.sort(reverse=True)  # best score first; ties fall back to the larger index\n",
"    for e, i in sort_result:\n",
"        if e >= min_score:\n",
"            print(\"e: \",e,\"\\ntitle: \",article_list[i]['title'], \"\\ntext\\n\", article_list[i]['link'])\n",
"\n",
"# Run main() only when this code is executed as the entry point.\n",
"if __name__ == \"__main__\":\n",
"    main()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment