@kf0jvt
Created December 15, 2014 16:19
Code to extract article text from VCDB
{
"metadata": {
"name": ""
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Part of the reason that these machine learning algorithms aren't working well for me might be a case of 'garbage in garbage out.' The process of downloading the news articles has resulted in some shitty text. This notebook is where I will work on improving the quality of the downloaded text.\n",
"\n",
"Bob Goblin (@hrbrmstr) uses a module called readability to get just the important bits of an article. That might help us to have better results."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"from urlparse import urlparse\n",
"import re\n",
"import os\n",
"from glob import glob\n",
"import json\n",
"import requests\n",
"from bs4 import BeautifulSoup\n",
"from readability.readability import Document\n",
"\n",
"vcdb_path = '/Users/v527234/Documents/development/python/vcdb/data/json'\n",
"agent_string = \"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.114 Safari/537.36\""
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 34
},
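{
"cell_type": "markdown",
"metadata": {},
"source": [
"Before fetching anything, here is a quick sketch of what the readability pipeline actually does: run `Document(...).summary()` and then `BeautifulSoup(...).get_text()` over a small inline HTML snippet, so it works without touching the network. The snippet and its tag names are made up for illustration."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# A minimal sketch of the readability + BeautifulSoup pipeline used below.\n",
"# The HTML snippet is a made-up example, not a real article.\n",
"sample_html = \"\"\"<html><head><title>Example</title></head><body>\n",
"<div id=\"nav\"><a href=\"/\">Home</a> | <a href=\"/about\">About</a></div>\n",
"<div id=\"story\"><p>Attackers defaced the county website on Tuesday,\n",
"replacing the front page with a protest message.</p>\n",
"<p>Officials restored the site from backups within a few hours.</p></div>\n",
"</body></html>\"\"\"\n",
"\n",
"# summary() tries to keep just the main content block (the 'story' div here)\n",
"# and drop navigation chrome like the 'nav' div.\n",
"print BeautifulSoup(Document(sample_html).summary()).get_text()"
],
"language": "python",
"metadata": {},
"outputs": []
},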
{
"cell_type": "code",
"collapsed": false,
"input": [
"def getUrls(inString):\n",
" # we're going to replace all the semicolons with spaces so the regex can pick out\n",
" # urls more easily\n",
" dirtyUrls = re.findall(r'(https?://\\S+)', inString.replace(';', ' '))\n",
" cleanUrls = []\n",
" for eachUrl in dirtyUrls:\n",
" pieces = urlparse(eachUrl)\n",
" # Right now I can't handle pdfs\n",
" if pieces.path.lower().endswith('pdf'):\n",
" continue\n",
" cleanUrls.append(pieces.geturl())\n",
" for index, each in enumerate(cleanUrls):\n",
" if each.endswith(','):\n",
" cleanUrls[index] = each[:-1]\n",
" return cleanUrls\n",
"\n",
"def isDDOS(inDict):\n",
" if 'DoS' in inDict['action'].get('hacking',{}).get('variety',[]):\n",
" return True\n",
" return False\n",
"\n",
"def isDefacement(inDict):\n",
" if 'Defacement' in inDict['attribute'].get('integrity', {}).get('variety', []):\n",
" return True\n",
" return False\n",
"\n",
"def isLossTheft(inDict):\n",
" if 'Loss' in inDict['action'].get('error', {}).get('variety', []):\n",
" return True\n",
" if 'Theft' in inDict['action'].get('physical', {}).get('variety', []):\n",
" return True\n",
" return False"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 35
},
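{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick sanity check of the helpers on a hand-built incident dict before running them over the whole corpus. The dict below is hypothetical, not a real VCDB record, but it follows the same `action`/`attribute`/`reference` shape the functions expect."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Hypothetical incident dict for testing; not a real VCDB record.\n",
"fake_incident = {\n",
"    'action': {'hacking': {'variety': ['DoS']}},\n",
"    'attribute': {},\n",
"    'reference': 'http://example.com/story.html; http://example.com/report.pdf'\n",
"}\n",
"print isDDOS(fake_incident)                # True\n",
"print isDefacement(fake_incident)          # False\n",
"print getUrls(fake_incident['reference'])  # the PDF link gets filtered out"
],
"language": "python",
"metadata": {},
"outputs": []
},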
{
"cell_type": "code",
"collapsed": false,
"input": [
"articlesPlusTags = {'dos':[], 'deface':[], 'losstheft':[], 'other':[]}\n",
"for fullFileName in glob(vcdb_path+'/*.json'):\n",
" try:\n",
" incident = json.loads(open(fullFileName).read())\n",
" except:\n",
" print fullFileName\n",
" continue\n",
" if isDDOS(incident):\n",
" articlesPlusTags['dos'] += getUrls(incident.get('reference', ''))\n",
" continue\n",
" if isDefacement(incident):\n",
" articlesPlusTags['deface'] += getUrls(incident.get('reference', ''))\n",
" continue\n",
" if isLossTheft(incident):\n",
" articlesPlusTags['losstheft'] += getUrls(incident.get('reference', ''))\n",
" continue\n",
" articlesPlusTags['other'] += getUrls(incident.get('reference', ''))\n",
"\n",
"# This is just pruning out any duplicate urls. Sometimes we code multiple\n",
"# incidents from a single story.\n",
"for eachTag in articlesPlusTags.keys():\n",
" articlesPlusTags[eachTag] = list(set(articlesPlusTags[eachTag]))"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 36
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"print \"Articles about Denial of Service: %s\" % len(articlesPlusTags['dos'])\n",
"print \"Articles about Defacement: %s\" % len(articlesPlusTags['deface'])\n",
"print \"Articles about Loss and Theft: %s\" % len(articlesPlusTags['losstheft'])\n",
"print \"All other articles: %s\" % len(articlesPlusTags['other'])"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"Articles about Denial of Service: 119\n",
"Articles about Defacement: 86\n",
"Articles about Loss and Theft: 507\n",
"All other articles: 2154\n"
]
}
],
"prompt_number": 37
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"for dirName in articlesPlusTags.keys():\n",
" try:\n",
" os.mkdir(dirName)\n",
" print \"Created folder %s.\" % dirName\n",
" except OSError as e:\n",
" if e[1] == \"File exists\":\n",
" print \"%s already exists. Deleting its files\" % dirName\n",
" for each in os.listdir(dirName):\n",
" os.remove(os.path.join(dirName,each))"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"deface already exists. Deleting its files\n",
"dos already exists. Deleting its files"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"other already exists. Deleting its files"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"losstheft already exists. Deleting its files"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n"
]
}
],
"prompt_number": 38
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"for eachLabel in articlesPlusTags.keys():\n",
" for index, eachUrl in enumerate(articlesPlusTags[eachLabel]):\n",
" try:\n",
" rawHTML = requests.get(eachUrl, headers={'User-Agent':agent_string})\n",
" except:\n",
" continue\n",
" # print rawHTML.status_code\n",
" if rawHTML.status_code >= 400 and rawHTML.status_code != 405 and rawHTML.status_code != 403:\n",
" continue\n",
" try:\n",
" readable = BeautifulSoup(Document(rawHTML.content).summary()).get_text()\n",
" except:\n",
" pass\n",
" with open(os.path.join(eachLabel,str(index).zfill(4)+'.txt'),'w') as outfile:\n",
" outfile.write(readable.encode('UTF-8'))\n",
" # print readable"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stderr",
"text": [
"INFO:requests.packages.urllib3.connectionpool:Starting new HTTP connection (1): www.ehackingnews.com\n"
]
},
{
"output_type": "stream",
"stream": "stderr",
"text": [
"INFO:requests.packages.urllib3.connectionpool:Starting new HTTP connection (1): news.softpedia.com\n"
]
},
{
"output_type": "stream",
"stream": "stderr",
"text": [
"INFO:requests.packages.urllib3.connectionpool:Starting new HTTP connection (1): www.theverge.com\n"
]
}
],
"prompt_number": "*"
},
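{
"cell_type": "markdown",
"metadata": {},
"source": [
"Once the download cell finishes, it's worth spot-checking a saved file or two to see whether the extracted text actually reads like article prose. A minimal check, assuming the cell above has already written at least one file into `dos/`:"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Spot-check the first saved file in one category to eyeball extraction quality.\n",
"sample_files = sorted(glob(os.path.join('dos', '*.txt')))\n",
"if sample_files:\n",
"    print open(sample_files[0]).read()[:500]\n",
"else:\n",
"    print \"No files saved in dos/ yet.\""
],
"language": "python",
"metadata": {},
"outputs": []
},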
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": []
}
],
"metadata": {}
}
]
}