gihyunkim · August 5, 2019 00:52
diff --git a/rss.ipynb b/rss.ipynb
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "crawl_rss\n",
      "crawl_article\n",
      "[Parsing Title]\n",
      "국내 보안업체 '디지털 서명' 탑재한 악성코드 발견\n",
      "[parsing query ] \n",
      "[['국내', '보안', '업체', '디지털 서명', '탑재', '악성', '코드', '발견']]\n",
      "[parsing Text]\n",
      "--------------------------------------------------\n",
      "e:  0.7499875006249688 \n",
      "title:  스타트업, 손에 잡힐 듯한 독도 가상여행 개발 \n",
      "text\n",
      " http://www.etnews.com/20190724000436\n",
      "e:  0.7083315972366897 \n",
      "title:  트리엠, 국내 최초 '조달ERP 표준서비스' 4분기 출시… 제조기업서 솔루션기업 제2도약 \n",
      "text\n",
      " http://www.etnews.com/20190730000120\n",
      "e:  0.681816528940646 \n",
      "title:  인프론티브, 토종 KVM 보안 스위치로 외산 시장 판도 바꾼다…신보에 2600여대 대량 공급 \n",
      "text\n",
      " http://www.etnews.com/20190726000388\n",
      "e:  0.6666638889351844 \n",
      "title:  포스코ICT-제우스, 스마트공장·로봇사업 다각화 협력한다 \n",
      "text\n",
      " http://www.etnews.com/20190730000250\n",
      "e:  0.6666611112962901 \n",
      "title:  한국SW산업협회, 해외진출위원회 발족 \n",
      "text\n",
      " http://www.etnews.com/20190724000326\n",
      "e:  0.6428551020699704 \n",
      "title:  SAP코리아, 뱅크웨어글로벌과 금융시장 공략 본격화 \n",
      "text\n",
      " http://www.etnews.com/20190724000467\n",
      "e:  0.6111098765569272 \n",
      "title:  NIPA, 극한직업 CG사 참가 컴퓨터그래픽 리크루팅 캠프 개최 \n",
      "text\n",
      " http://www.etnews.com/20190730000135\n"
     ]
    }
   ],
   "source": [
    "import feedparser\n",
    "from newspaper import Article\n",
    "from konlpy.tag import Komoran # 명사만을 추출할 때 쓰임\n",
    "from konlpy.tag import Okt\n",
    "from collections import Counter # 특정 명사가 기사에 몇 개나 들어있나? Sorting까지 해준다.\n",
    "from operator import eq # word끼리 같은지 검사\n",
    "import math\n",
    "import numpy as np\n",
    "\n",
    "komoran = Komoran()\n",
    "\n",
    "urls = (\n",
    "        'http://rss.etnews.com/04043.xml'\n",
    "        ,'http://rss.etnews.com/04044.xml'\n",
    "        ,'http://rss.etnews.com/04042.xml'\n",
    "        , None)\n",
    "\n",
    "def crawl_rss(urls):\n",
    "    print(\"crawl_rss\")\n",
    "    arr_rss=[]\n",
    "    for url in urls:\n",
    "        parse_rss = feedparser.parse(url) # item을 기준으로 url의 rss로부터 속성과 값들을 가져옴\n",
    "        for p in parse_rss.entries: # 각 기사에 대해 어떠한 정보를 뽑아올 수 있다.\n",
    "            arr_rss.append({'title': p.title, 'link':p.link}) # 기사의 제목과 url을 따온다.\n",
    "    return arr_rss\n",
    "        \n",
    "def crawl_article(url, language= 'ko'): # link를 받아와 그 url의 본문을 반환.\n",
    "    var_article = Article(url, language=language)\n",
    "    var_article.download()\n",
    "    var_article.parse()\n",
    "    return var_article.title, var_article.text\n",
    "\n",
    "def get_tags(text, ntags=50):\n",
    "    spliter = Okt()\n",
    "    num_unique_words = 0\n",
    "    num_most_freq = 0\n",
    "    nouns = spliter.nouns(text)\n",
    "    count = Counter(nouns)# nouns가 몇 개 들어가 있나\n",
    "    return_list = []\n",
    "    \n",
    "    for n, c in count.most_common(ntags): # 가장 많이 출현 한 것 (key : count)\n",
    "        temp = {'tag':n, 'count':c}\n",
    "        return_list.append(temp)\n",
    "        num_unique_words = num_unique_words+1\n",
    "        if num_unique_words == 1: # number of unique words in the text\n",
    "            num_most_freq = c # count the number of unique words\n",
    "    return num_unique_words, num_most_freq, return_list \n",
    "\n",
    "def TF(request, most_freq,tag):\n",
    "    return 0.5 + 0.5 * Howmanywords(request, tag)/(most_freq+0.0001)\n",
    "\n",
    "def Howmanywords(request, tag):\n",
    "    noWords = 0\n",
    "    for n in tag:\n",
    "        noun = n['tag']\n",
    "        count = n['count'] \n",
    "        if eq(noun, request):\n",
    "            return count\n",
    "    return noWords\n",
    "\n",
    "# main()\n",
    "def main():\n",
    "    article_list = crawl_rss(urls) # crawling article with title and link\n",
    "    print(\"crawl_article\")\n",
    "    for article in article_list: # for all article\n",
    "        _, text = crawl_article(article['link']) # link를 넣어주면 title과 본문을 return\n",
    "        article['text'] = text\n",
    "    \n",
    "    print('[Parsing Title]')\n",
    "    noun_title = [komoran.nouns(a['title']) for a in article_list] # 모든 article에 대해 title의 명사만 뽑아서 noun_title에 넣어줌.    \n",
    "    query = input()\n",
    "    print('[parsing query ] ')\n",
    "    noun_query = [komoran.nouns(query)]\n",
    "    print(noun_query)\n",
    "    print('[parsing Text]')\n",
    "    noun_text = [] # initialize list\n",
    "    for a in article_list:\n",
    "        # tag : key and count\n",
    "        num_unique_words, num_most_freq, tags = get_tags(a['text']) \n",
    "        noun_text.append({'num_unique_words':num_unique_words,\n",
    "                         'num_most_freq':num_most_freq, 'tags':tags})\n",
    "    tf_idf_title = [] # term freq inv document freq title\n",
    "    tf_idf_mean = [] # term freq inv document freq title mean\n",
    "    ##tf_idf_query = [] # term freq inv document freq query\n",
    "    ##tf_idft_mean_query = [] # term freq inv document freq query mean\n",
    "    ##tag_list = [a['tags'] for a in noun_text]\n",
    "    for i ,nouns in enumerate(noun_title):\n",
    "         tfs = [TF(req, noun_text[i]['num_most_freq'], noun_text[i]['tags']) for req in nouns]\n",
    "     #_tfidf = [tfs[j] for j,n in enumerate(nouns)]\n",
    "      #  tf_idf_title.append(_tfidf)\n",
    "       # tf_idf_mean.append(np.mean(_tfidf))\n",
    "    print(\"----------\"*5)\n",
    "    tmp = 0\n",
    "    index = 0\n",
    "    result = []\n",
    "    sort_result=[]\n",
    "    for i in range(50):\n",
    "        tfs_query = [TF(req, noun_text[i]['num_most_freq'], noun_text[i]['tags']) for req in noun_query[0]]\n",
    "        tfidf = [tfs_query[j] for j, n in enumerate(noun_query)]\n",
    "        tf_idf_mean.append(np.mean(tfidf))       \n",
    "        #if(tmp < tfs_query[0]):\n",
    "         #   result = tfs_query\n",
    "          #  index = i\n",
    "        #tmp = tfs_query[0]\n",
    "   # print(tf_idf_mean)\n",
    "    for i in range(50):\n",
    "        sort_result.append([tf_idf_mean[i], i])\n",
    "    sort_result.sort(reverse=True)\n",
    "    for e, i in sort_result:\n",
    "        if e >= 0.6:\n",
    "            print(\"e: \",e,\"\\ntitle: \",article_list[i]['title'], \"\\ntext\\n\", article_list[i]['link'])\n",
    "    \n",
    "   # print(\"최종 tfs : \", result)\n",
    "   # print(\"제목: \", article_list[index]['title'])\n",
    "   # print(\"내용\\n\", article_list[index]['text'])\n",
    "\n",
    "# main() 함수부터 시작하고 싶다\n",
    "if __name__ == \"__main__\":\n",
    "    main()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
 }
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 18,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"crawl_rss\n",
	"crawl_article\n",
	"[Parsing Title]\n",
	"국내 보안업체 '디지털 서명' 탑재한 악성코드 발견\n",
	"[parsing query ] \n",
	"[['국내', '보안', '업체', '디지털 서명', '탑재', '악성', '코드', '발견']]\n",
	"[parsing Text]\n",
	"--------------------------------------------------\n",
	"e: 0.7499875006249688 \n",
	"title: 스타트업, 손에 잡힐 듯한 독도 가상여행 개발 \n",
	"text\n",
	" http://www.etnews.com/20190724000436\n",
	"e: 0.7083315972366897 \n",
	"title: 트리엠, 국내 최초 '조달ERP 표준서비스' 4분기 출시… 제조기업서 솔루션기업 제2도약 \n",
	"text\n",
	" http://www.etnews.com/20190730000120\n",
	"e: 0.681816528940646 \n",
	"title: 인프론티브, 토종 KVM 보안 스위치로 외산 시장 판도 바꾼다…신보에 2600여대 대량 공급 \n",
	"text\n",
	" http://www.etnews.com/20190726000388\n",
	"e: 0.6666638889351844 \n",
	"title: 포스코ICT-제우스, 스마트공장·로봇사업 다각화 협력한다 \n",
	"text\n",
	" http://www.etnews.com/20190730000250\n",
	"e: 0.6666611112962901 \n",
	"title: 한국SW산업협회, 해외진출위원회 발족 \n",
	"text\n",
	" http://www.etnews.com/20190724000326\n",
	"e: 0.6428551020699704 \n",
	"title: SAP코리아, 뱅크웨어글로벌과 금융시장 공략 본격화 \n",
	"text\n",
	" http://www.etnews.com/20190724000467\n",
	"e: 0.6111098765569272 \n",
	"title: NIPA, 극한직업 CG사 참가 컴퓨터그래픽 리크루팅 캠프 개최 \n",
	"text\n",
	" http://www.etnews.com/20190730000135\n"
	]
	}
	],
	"source": [
	"import feedparser\n",
	"from newspaper import Article\n",
	"from konlpy.tag import Komoran # 명사만을 추출할 때 쓰임\n",
	"from konlpy.tag import Okt\n",
	"from collections import Counter # 특정 명사가 기사에 몇 개나 들어있나? Sorting까지 해준다.\n",
	"from operator import eq # word끼리 같은지 검사\n",
	"import math\n",
	"import numpy as np\n",
	"\n",
	"komoran = Komoran()\n",
	"\n",
	"urls = (\n",
	" 'http://rss.etnews.com/04043.xml'\n",
	" ,'http://rss.etnews.com/04044.xml'\n",
	" ,'http://rss.etnews.com/04042.xml'\n",
	" , None)\n",
	"\n",
	"def crawl_rss(urls):\n",
	" print(\"crawl_rss\")\n",
	" arr_rss=[]\n",
	" for url in urls:\n",
	" parse_rss = feedparser.parse(url) # item을 기준으로 url의 rss로부터 속성과 값들을 가져옴\n",
	" for p in parse_rss.entries: # 각 기사에 대해 어떠한 정보를 뽑아올 수 있다.\n",
	" arr_rss.append({'title': p.title, 'link':p.link}) # 기사의 제목과 url을 따온다.\n",
	" return arr_rss\n",
	" \n",
	"def crawl_article(url, language= 'ko'): # link를 받아와 그 url의 본문을 반환.\n",
	" var_article = Article(url, language=language)\n",
	" var_article.download()\n",
	" var_article.parse()\n",
	" return var_article.title, var_article.text\n",
	"\n",
	"def get_tags(text, ntags=50):\n",
	" spliter = Okt()\n",
	" num_unique_words = 0\n",
	" num_most_freq = 0\n",
	" nouns = spliter.nouns(text)\n",
	" count = Counter(nouns)# nouns가 몇 개 들어가 있나\n",
	" return_list = []\n",
	" \n",
	" for n, c in count.most_common(ntags): # 가장 많이 출현 한 것 (key : count)\n",
	" temp = {'tag':n, 'count':c}\n",
	" return_list.append(temp)\n",
	" num_unique_words = num_unique_words+1\n",
	" if num_unique_words == 1: # number of unique words in the text\n",
	" num_most_freq = c # count the number of unique words\n",
	" return num_unique_words, num_most_freq, return_list \n",
	"\n",
	"def TF(request, most_freq,tag):\n",
	" return 0.5 + 0.5 * Howmanywords(request, tag)/(most_freq+0.0001)\n",
	"\n",
	"def Howmanywords(request, tag):\n",
	" noWords = 0\n",
	" for n in tag:\n",
	" noun = n['tag']\n",
	" count = n['count'] \n",
	" if eq(noun, request):\n",
	" return count\n",
	" return noWords\n",
	"\n",
	"# main()\n",
	"def main():\n",
	" article_list = crawl_rss(urls) # crawling article with title and link\n",
	" print(\"crawl_article\")\n",
	" for article in article_list: # for all article\n",
	" _, text = crawl_article(article['link']) # link를 넣어주면 title과 본문을 return\n",
	" article['text'] = text\n",
	" \n",
	" print('[Parsing Title]')\n",
	" noun_title = [komoran.nouns(a['title']) for a in article_list] # 모든 article에 대해 title의 명사만 뽑아서 noun_title에 넣어줌. \n",
	" query = input()\n",
	" print('[parsing query ] ')\n",
	" noun_query = [komoran.nouns(query)]\n",
	" print(noun_query)\n",
	" print('[parsing Text]')\n",
	" noun_text = [] # initialize list\n",
	" for a in article_list:\n",
	" # tag : key and count\n",
	" num_unique_words, num_most_freq, tags = get_tags(a['text']) \n",
	" noun_text.append({'num_unique_words':num_unique_words,\n",
	" 'num_most_freq':num_most_freq, 'tags':tags})\n",
	" tf_idf_title = [] # term freq inv document freq title\n",
	" tf_idf_mean = [] # term freq inv document freq title mean\n",
	" ##tf_idf_query = [] # term freq inv document freq query\n",
	" ##tf_idft_mean_query = [] # term freq inv document freq query mean\n",
	" ##tag_list = [a['tags'] for a in noun_text]\n",
	" for i ,nouns in enumerate(noun_title):\n",
	" tfs = [TF(req, noun_text[i]['num_most_freq'], noun_text[i]['tags']) for req in nouns]\n",
	" #_tfidf = [tfs[j] for j,n in enumerate(nouns)]\n",
	" # tf_idf_title.append(_tfidf)\n",
	" # tf_idf_mean.append(np.mean(_tfidf))\n",
	" print(\"----------\"*5)\n",
	" tmp = 0\n",
	" index = 0\n",
	" result = []\n",
	" sort_result=[]\n",
	" for i in range(50):\n",
	" tfs_query = [TF(req, noun_text[i]['num_most_freq'], noun_text[i]['tags']) for req in noun_query[0]]\n",
	" tfidf = [tfs_query[j] for j, n in enumerate(noun_query)]\n",
	" tf_idf_mean.append(np.mean(tfidf)) \n",
	" #if(tmp < tfs_query[0]):\n",
	" # result = tfs_query\n",
	" # index = i\n",
	" #tmp = tfs_query[0]\n",
	" # print(tf_idf_mean)\n",
	" for i in range(50):\n",
	" sort_result.append([tf_idf_mean[i], i])\n",
	" sort_result.sort(reverse=True)\n",
	" for e, i in sort_result:\n",
	" if e >= 0.6:\n",
	" print(\"e: \",e,\"\\ntitle: \",article_list[i]['title'], \"\\ntext\\n\", article_list[i]['link'])\n",
	" \n",
	" # print(\"최종 tfs : \", result)\n",
	" # print(\"제목: \", article_list[index]['title'])\n",
	" # print(\"내용\\n\", article_list[index]['text'])\n",
	"\n",
	"# main() 함수부터 시작하고 싶다\n",
	"if __name__ == \"__main__\":\n",
	" main()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": []
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": []
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.7.3"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 2
	}