Created December 15, 2014 16:19
Code to extract article text from VCDB
{
"metadata": {
"name": ""
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Part of the reason that these machine learning algorithms aren't working well for me might be a case of 'garbage in garbage out.' The process of downloading the news articles has resulted in some shitty text. This notebook is where I will work on improving the quality of the downloaded text.\n",
"\n",
"Bob Goblin (@hrbrmstr) uses a module called readability to get just the important bits of an article. That might help us to have better results."
]
},
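{
"cell_type": "markdown",
"metadata": {},
"source": [
"Before pointing this at real articles, here's a quick sketch of what that readability + BeautifulSoup chain does to a toy page. The HTML below is made up, and readability's heuristics are tuned for full-size news pages, so the output on something this small may be rough."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"from bs4 import BeautifulSoup\n",
"from readability.readability import Document\n",
"\n",
"# A made-up page: some navigation, a short 'article', and a footer.\n",
"sample_html = '''<html><head><title>Breach at Example Corp</title></head>\n",
"<body>\n",
"<div id='nav'>Home - About - Contact</div>\n",
"<div id='story'><p>Example Corp said attackers defaced its website on Tuesday.</p>\n",
"<p>The company is still investigating how the attackers got in.</p></div>\n",
"<div id='footer'>Copyright 2014</div>\n",
"</body></html>'''\n",
"\n",
"# Document.summary() returns the chunk readability scores as the article body;\n",
"# get_text() then strips whatever tags are left.\n",
"print BeautifulSoup(Document(sample_html).summary()).get_text()"
],
"language": "python",
"metadata": {},
"outputs": []
},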
{
"cell_type": "code",
"collapsed": false,
"input": [
"from urlparse import urlparse\n",
"import re\n",
"import os\n",
"from glob import glob\n",
"import json\n",
"import requests\n",
"from bs4 import BeautifulSoup\n",
"from readability.readability import Document\n",
"\n",
"vcdb_path = '/Users/v527234/Documents/development/python/vcdb/data/json'\n",
"agent_string = \"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.114 Safari/537.36\""
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 34
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"def getUrls(inString):\n",
"    # we're going to replace all the semicolons with spaces so the regex can pick out\n",
"    # urls more easily\n",
"    dirtyUrls = re.findall(r'(https?://\\S+)', inString.replace(';', ' '))\n",
"    cleanUrls = []\n",
"    for eachUrl in dirtyUrls:\n",
"        pieces = urlparse(eachUrl)\n",
"        # Right now I can't handle pdfs\n",
"        if pieces.path.lower().endswith('.pdf'):\n",
"            continue\n",
"        cleanUrls.append(pieces.geturl())\n",
"    # strip trailing commas left over from the reference field\n",
"    for index, each in enumerate(cleanUrls):\n",
"        if each.endswith(','):\n",
"            cleanUrls[index] = each[:-1]\n",
"    return cleanUrls\n",
"\n",
"# True if the incident's VERIS hacking action includes a DoS variety\n",
"def isDDOS(inDict):\n",
"    if 'DoS' in inDict['action'].get('hacking', {}).get('variety', []):\n",
"        return True\n",
"    return False\n",
"\n",
"# True if the incident's integrity attribute includes a Defacement variety\n",
"def isDefacement(inDict):\n",
"    if 'Defacement' in inDict['attribute'].get('integrity', {}).get('variety', []):\n",
"        return True\n",
"    return False\n",
"\n",
"# True if the incident is a lost asset (error action) or a physical theft\n",
"def isLossTheft(inDict):\n",
"    if 'Loss' in inDict['action'].get('error', {}).get('variety', []):\n",
"        return True\n",
"    if 'Theft' in inDict['action'].get('physical', {}).get('variety', []):\n",
"        return True\n",
"    return False"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 35
},
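{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a quick sanity check, here's how these helpers behave on a made-up incident. The reference string and the VERIS fields below are invented for illustration; real VCDB records carry a lot more structure."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Invented incident fragment: one DoS hacking action and a reference field\n",
"# with a pdf mixed in. Not a real VCDB record.\n",
"fake_incident = {\n",
"    'reference': 'http://example.com/story1; http://example.com/report.pdf; http://example.com/story2,',\n",
"    'action': {'hacking': {'variety': ['DoS']}},\n",
"    'attribute': {'integrity': {'variety': []}}\n",
"}\n",
"\n",
"print getUrls(fake_incident['reference'])   # the .pdf link is dropped and the trailing comma stripped\n",
"print isDDOS(fake_incident)                 # True\n",
"print isDefacement(fake_incident)           # False"
],
"language": "python",
"metadata": {},
"outputs": []
},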
{
"cell_type": "code",
"collapsed": false,
"input": [
"articlesPlusTags = {'dos':[], 'deface':[], 'losstheft':[], 'other':[]}\n",
"for fullFileName in glob(vcdb_path+'/*.json'):\n",
"    try:\n",
"        incident = json.loads(open(fullFileName).read())\n",
"    except:\n",
"        print fullFileName\n",
"        continue\n",
"    if isDDOS(incident):\n",
"        articlesPlusTags['dos'] += getUrls(incident.get('reference', ''))\n",
"        continue\n",
"    if isDefacement(incident):\n",
"        articlesPlusTags['deface'] += getUrls(incident.get('reference', ''))\n",
"        continue\n",
"    if isLossTheft(incident):\n",
"        articlesPlusTags['losstheft'] += getUrls(incident.get('reference', ''))\n",
"        continue\n",
"    articlesPlusTags['other'] += getUrls(incident.get('reference', ''))\n",
"\n",
"# This is just pruning out any duplicate urls. Sometimes we code multiple\n",
"# incidents from a single story.\n",
"for eachTag in articlesPlusTags.keys():\n",
"    articlesPlusTags[eachTag] = list(set(articlesPlusTags[eachTag]))"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 36
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"print \"Articles about Denial of Service: %s\" % len(articlesPlusTags['dos'])\n",
"print \"Articles about Defacement: %s\" % len(articlesPlusTags['deface'])\n",
"print \"Articles about Loss and Theft: %s\" % len(articlesPlusTags['losstheft'])\n",
"print \"All other articles: %s\" % len(articlesPlusTags['other'])"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"Articles about Denial of Service: 119\n",
"Articles about Defacement: 86\n",
"Articles about Loss and Theft: 507\n",
"All other articles: 2154\n"
]
}
],
"prompt_number": 37
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"import errno\n",
"\n",
"for dirName in articlesPlusTags.keys():\n",
"    try:\n",
"        os.mkdir(dirName)\n",
"        print \"Created folder %s.\" % dirName\n",
"    except OSError as e:\n",
"        if e.errno == errno.EEXIST:\n",
"            print \"%s already exists. Deleting its files\" % dirName\n",
"            for each in os.listdir(dirName):\n",
"                os.remove(os.path.join(dirName, each))\n",
"        else:\n",
"            raise"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"deface already exists. Deleting its files\n",
"dos already exists. Deleting its files"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"other already exists. Deleting its files"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"losstheft already exists. Deleting its files"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n"
]
}
],
"prompt_number": 38
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"for eachLabel in articlesPlusTags.keys():\n",
"    for index, eachUrl in enumerate(articlesPlusTags[eachLabel]):\n",
"        try:\n",
"            rawHTML = requests.get(eachUrl, headers={'User-Agent': agent_string})\n",
"        except:\n",
"            continue\n",
"        # print rawHTML.status_code\n",
"        # skip error responses; 403 and 405 are let through anyway\n",
"        if rawHTML.status_code >= 400 and rawHTML.status_code != 405 and rawHTML.status_code != 403:\n",
"            continue\n",
"        try:\n",
"            readable = BeautifulSoup(Document(rawHTML.content).summary()).get_text()\n",
"        except:\n",
"            # if readability can't parse the page, skip it rather than writing stale text\n",
"            continue\n",
"        with open(os.path.join(eachLabel, str(index).zfill(4)+'.txt'), 'w') as outfile:\n",
"            outfile.write(readable.encode('UTF-8'))\n",
"        # print readable"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stderr",
"text": [
"INFO:requests.packages.urllib3.connectionpool:Starting new HTTP connection (1): www.ehackingnews.com\n"
]
},
{
"output_type": "stream",
"stream": "stderr",
"text": [
"INFO:requests.packages.urllib3.connectionpool:Starting new HTTP connection (1): news.softpedia.com\n"
]
},
{
"output_type": "stream",
"stream": "stderr",
"text": [
"INFO:requests.packages.urllib3.connectionpool:Starting new HTTP connection (1): www.theverge.com\n"
]
}
],
"prompt_number": "*"
},
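{
"cell_type": "markdown",
"metadata": {},
"source": [
"One weakness in the loop above is that requests.get() has no timeout, so a single unresponsive site can stall the whole crawl. Here is a rough sketch of folding the fetch and the extraction into a helper that just returns None on any failure; the 30 second timeout is an arbitrary choice, and it reuses the agent_string defined earlier."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"def fetchArticleText(url, timeout=30):\n",
"    # Returns the readability-extracted text for url, or None if anything goes wrong.\n",
"    try:\n",
"        resp = requests.get(url, headers={'User-Agent': agent_string}, timeout=timeout)\n",
"    except requests.exceptions.RequestException:\n",
"        return None\n",
"    # mirror the status handling above: only the 403/405 error codes are tolerated\n",
"    if resp.status_code >= 400 and resp.status_code not in (403, 405):\n",
"        return None\n",
"    try:\n",
"        return BeautifulSoup(Document(resp.content).summary()).get_text()\n",
"    except Exception:\n",
"        return None\n",
"\n",
"# example call (hypothetical url):\n",
"# text = fetchArticleText('http://example.com/some-breach-story')"
],
"language": "python",
"metadata": {},
"outputs": []
},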
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": []
}
],
"metadata": {}
}
]
}