@kf0jvt
Created December 15, 2014 16:19
Code to extract article text from VCDB
{
"metadata": {
"name": ""
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Part of the reason that these machine learning algorithms aren't working well for me might be a case of 'garbage in garbage out.' The process of downloading the news articles has resulted in some shitty text. This notebook is where I will work on improving the quality of the downloaded text.\n",
"\n",
"Bob Goblin (@hrbrmstr) uses a module called readability to get just the important bits of an article. That might help us to have better results."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"from urlparse import urlparse\n",
"import re\n",
"import os\n",
"from glob import glob\n",
"import json\n",
"import requests\n",
"from bs4 import BeautifulSoup\n",
"from readability.readability import Document\n",
"\n",
"vcdb_path = '/Users/v527234/Documents/development/python/vcdb/data/json'\n",
"agent_string = \"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.114 Safari/537.36\""
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 34
},
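{
"cell_type": "markdown",
"metadata": {},
"source": [
"Before fetching anything, here is a quick sketch of what the readability pipeline actually does: run `Document(...).summary()` and then `BeautifulSoup(...).get_text()` over a small inline HTML snippet, so it works without touching the network. The snippet and its tag names are made up for illustration."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# A minimal sketch of the readability + BeautifulSoup pipeline used below.\n",
"# The HTML snippet is a made-up example, not a real article.\n",
"sample_html = \"\"\"<html><head><title>Example</title></head><body>\n",
"<div id=\"nav\"><a href=\"/\">Home</a> | <a href=\"/about\">About</a></div>\n",
"<div id=\"story\"><p>Attackers defaced the county website on Tuesday,\n",
"replacing the front page with a protest message.</p>\n",
"<p>Officials restored the site from backups within a few hours.</p></div>\n",
"</body></html>\"\"\"\n",
"\n",
"# summary() tries to keep just the main content block (the 'story' div here)\n",
"# and drop navigation chrome like the 'nav' div.\n",
"print BeautifulSoup(Document(sample_html).summary()).get_text()"
],
"language": "python",
"metadata": {},
"outputs": []
},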
{
"cell_type": "code",
"collapsed": false,
"input": [
"def getUrls(inString):\n",
" # we're going to replace all the semicolons with spaces so the regex can pick out\n",
" # urls more easily\n",
" dirtyUrls = re.findall(r'(https?://\\S+)', inString.replace(';', ' '))\n",
" cleanUrls = []\n",
" for eachUrl in dirtyUrls:\n",
" pieces = urlparse(eachUrl)\n",
" # Right now I can't handle pdfs\n",
" if pieces.path.lower().endswith('pdf'):\n",
" continue\n",
" cleanUrls.append(pieces.geturl())\n",
" for index, each in enumerate(cleanUrls):\n",
" if each.endswith(','):\n",
" cleanUrls[index] = each[:-1]\n",
" return cleanUrls\n",
"\n",
"def isDDOS(inDict):\n",
" if 'DoS' in inDict['action'].get('hacking',{}).get('variety',[]):\n",
" return True\n",
" return False\n",
"\n",
"def isDefacement(inDict):\n",
" if 'Defacement' in inDict['attribute'].get('integrity', {}).get('variety', []):\n",
" return True\n",
" return False\n",
"\n",
"def isLossTheft(inDict):\n",
" if 'Loss' in inDict['action'].get('error', {}).get('variety', []):\n",
" return True\n",
" if 'Theft' in inDict['action'].get('physical', {}).get('variety', []):\n",
" return True\n",
" return False"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 35
},
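{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick sanity check of the helpers on a hand-built incident dict before running them over the whole corpus. The dict below is hypothetical, not a real VCDB record, but it follows the same `action`/`attribute`/`reference` shape the functions expect."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Hypothetical incident dict for testing; not a real VCDB record.\n",
"fake_incident = {\n",
"    'action': {'hacking': {'variety': ['DoS']}},\n",
"    'attribute': {},\n",
"    'reference': 'http://example.com/story.html; http://example.com/report.pdf'\n",
"}\n",
"print isDDOS(fake_incident)                # True\n",
"print isDefacement(fake_incident)          # False\n",
"print getUrls(fake_incident['reference'])  # the PDF link gets filtered out"
],
"language": "python",
"metadata": {},
"outputs": []
},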
{
"cell_type": "code",
"collapsed": false,
"input": [
"articlesPlusTags = {'dos':[], 'deface':[], 'losstheft':[], 'other':[]}\n",
"for fullFileName in glob(vcdb_path+'/*.json'):\n",
" try:\n",
" incident = json.loads(open(fullFileName).read())\n",
" except:\n",
" print fullFileName\n",
" continue\n",
" if isDDOS(incident):\n",
" articlesPlusTags['dos'] += getUrls(incident.get('reference', ''))\n",
" continue\n",
" if isDefacement(incident):\n",
" articlesPlusTags['deface'] += getUrls(incident.get('reference', ''))\n",
" continue\n",
" if isLossTheft(incident):\n",
" articlesPlusTags['losstheft'] += getUrls(incident.get('reference', ''))\n",
" continue\n",
" articlesPlusTags['other'] += getUrls(incident.get('reference', ''))\n",
"\n",
"# This is just pruning out any duplicate urls. Sometimes we code multiple\n",
"# incidents from a single story.\n",
"for eachTag in articlesPlusTags.keys():\n",
" articlesPlusTags[eachTag] = list(set(articlesPlusTags[eachTag]))"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 36
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"print \"Articles about Denial of Service: %s\" % len(articlesPlusTags['dos'])\n",
"print \"Articles about Defacement: %s\" % len(articlesPlusTags['deface'])\n",
"print \"Articles about Loss and Theft: %s\" % len(articlesPlusTags['losstheft'])\n",
"print \"All other articles: %s\" % len(articlesPlusTags['other'])"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"Articles about Denial of Service: 119\n",
"Articles about Defacement: 86\n",
"Articles about Loss and Theft: 507\n",
"All other articles: 2154\n"
]
}
],
"prompt_number": 37
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"for dirName in articlesPlusTags.keys():\n",
" try:\n",
" os.mkdir(dirName)\n",
" print \"Created folder %s.\" % dirName\n",
" except OSError as e:\n",
" if e[1] == \"File exists\":\n",
" print \"%s already exists. Deleting its files\" % dirName\n",
" for each in os.listdir(dirName):\n",
" os.remove(os.path.join(dirName,each))"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"deface already exists. Deleting its files\n",
"dos already exists. Deleting its files"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"other already exists. Deleting its files"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"losstheft already exists. Deleting its files"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n"
]
}
],
"prompt_number": 38
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"for eachLabel in articlesPlusTags.keys():\n",
" for index, eachUrl in enumerate(articlesPlusTags[eachLabel]):\n",
" try:\n",
" rawHTML = requests.get(eachUrl, headers={'User-Agent':agent_string})\n",
" except:\n",
" continue\n",
" # print rawHTML.status_code\n",
" if rawHTML.status_code >= 400 and rawHTML.status_code != 405 and rawHTML.status_code != 403:\n",
" continue\n",
" try:\n",
" readable = BeautifulSoup(Document(rawHTML.content).summary()).get_text()\n",
" except:\n",
" pass\n",
" with open(os.path.join(eachLabel,str(index).zfill(4)+'.txt'),'w') as outfile:\n",
" outfile.write(readable.encode('UTF-8'))\n",
" # print readable"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stderr",
"text": [
"INFO:requests.packages.urllib3.connectionpool:Starting new HTTP connection (1): www.ehackingnews.com\n"
]
},
{
"output_type": "stream",
"stream": "stderr",
"text": [
"INFO:requests.packages.urllib3.connectionpool:Starting new HTTP connection (1): news.softpedia.com\n"
]
},
{
"output_type": "stream",
"stream": "stderr",
"text": [
"INFO:requests.packages.urllib3.connectionpool:Starting new HTTP connection (1): www.theverge.com\n"
]
}
],
"prompt_number": "*"
},
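{
"cell_type": "markdown",
"metadata": {},
"source": [
"Once the download cell finishes, it's worth spot-checking a saved file or two to see whether the extracted text actually reads like article prose. A minimal check, assuming the cell above has already written at least one file into `dos/`:"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Spot-check the first saved file in one category to eyeball extraction quality.\n",
"sample_files = sorted(glob(os.path.join('dos', '*.txt')))\n",
"if sample_files:\n",
"    print open(sample_files[0]).read()[:500]\n",
"else:\n",
"    print \"No files saved in dos/ yet.\""
],
"language": "python",
"metadata": {},
"outputs": []
},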
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": []
}
],
"metadata": {}
}
]
}