Created
March 23, 2018 01:04
-
-
Save thebecwar/198c32f04d54a22cdd37d8264be4d02d to your computer and use it in GitHub Desktop.
Killed By Police Data Grab
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// ==UserScript== | |
// @name KBP | |
// @namespace http://tampermonkey.net/ | |
// @version 0.1 | |
// @description Parse KilledByPolice page into JSON data | |
// @author You | |
// @match http://killedbypolice.net/* | |
// @match http://www.killedbypolice.net/* | |
// @require https://ajax.googleapis.com/ajax/libs/jquery/3.3.1/jquery.min.js | |
// @grant none | |
// ==/UserScript== | |
// Matches one "(sequence) word word word" entry in the date cell of a row.
// Capture groups: 1 = text inside the parentheses (stored as `sequence`),
// 2, 3, 4 = the next three word tokens (stored as `month`, `day`, `year`).
// The regex is global so repeated exec() calls walk every entry in one cell.
var dateRegex = /\(([^)]*)\)[^\w]*([\w]*)[^\w]([\w]*)[^\w]*([\w]*)/g;

$(document).ready(function () {
    var items = [];

    // Trimmed text of cell `index`, or "" when the row has no such cell.
    function cellText(cells, index) {
        var cell = cells[index];
        return cell ? cell.innerText.trim() : "";
    }

    // Trimmed line number `lineNo` of cell `index`, or "" when the cell or
    // that line does not exist (a cell with fewer lines than date entries).
    function cellLine(cells, index, lineNo) {
        var cell = cells[index];
        if (!cell) {
            return "";
        }
        var lines = cell.innerText.split('\n');
        return lineNo < lines.length ? lines[lineNo].trim() : "";
    }

    // Build one output record from a regex match and the row's cells.
    // `lineNo` selects which line of the multi-line gr / nameAge cells
    // belongs to this entry; the other cells are copied whole.
    function buildItem(match, cells, lineNo) {
        return {
            sequence: match[1],
            month: match[2],
            day: match[3],
            year: match[4],
            state: cellText(cells, 1),
            gr: cellLine(cells, 2, lineNo),
            nameAge: cellLine(cells, 3, lineNo),
            method: cellText(cells, 4),
            fb: cellText(cells, 5),
            news: cellText(cells, 6)
        };
    }

    $('tr').each(function () {
        var cells = $(this).children('td');
        if (cells.length < 2) {
            return;
        }

        var dateText = cells[0].innerText;
        dateRegex.lastIndex = 0; // global regex keeps state between rows
        var match = dateRegex.exec(dateText);
        if (match === null) {
            return;
        }

        // The first entry always maps to line 0 of the per-person cells.
        items.push(buildItem(match, cells, 0));

        // Additional entries in the same row: each accepted entry consumes
        // the next line. The counter lives outside the loop so that it
        // actually advances from one entry to the next.
        var lineNo = 1;
        while ((match = dateRegex.exec(dateText)) !== null) {
            // Entries whose parenthesised text starts with 'K' are skipped
            // and do not consume a line.
            if (!match[1].startsWith('K')) {
                items.push(buildItem(match, cells, lineNo));
                lineNo++;
            }
        }
    });

    console.log(items);

    // Items parsed; expose them as a JSON string with a copy button.
    $("body").prepend("<button id='copyclip'>Copy To Clipboard (" + items.length + ")</button>");
    $("#copyclip").click(function () {
        var active = document.activeElement;
        var box = $("#JSON")[0];
        box.focus();
        box.select();
        document.execCommand("copy");
        if (active) {
            active.focus();
        }
    });

    // Set the JSON through .val() instead of concatenating it into markup,
    // so the text is never parsed as HTML (no entity decoding, and a
    // "</textarea>" inside the data cannot truncate it).
    $("body").append($("<textarea id='JSON'></textarea>").val(JSON.stringify(items)));
});
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json | |
class Victim():
    """One parsed record produced by the userscript.

    The constructor takes the raw string fields of one JSON object
    (keyword names match the JSON keys) and normalises them:

    * every field is stripped of surrounding whitespace;
    * ``gr`` is split on ``/`` into ``gender`` and ``race`` (``race`` is
      ``"?"`` when there is no ``/``);
    * ``nameAge`` is split on its *last* comma into ``name`` and ``age``
      (``age`` is ``"?"`` when there is no comma), so a comma inside the
      name does not displace the age;
    * embedded newlines in ``method``, ``fb`` and ``news`` are replaced so
      that a record always fits on a single output line.
    """

    def __init__(self, sequence, month, day, year, state, gr, nameAge, method, fb, news):
        self.sequence = sequence.strip()
        self.month = month.strip()
        self.day = day.strip()
        self.year = year.strip()
        self.state = state.strip()

        if '/' in gr:
            parts = gr.split('/')
            self.gender = parts[0].strip()
            self.race = parts[1].strip()
        else:
            self.gender = gr.strip()
            self.race = "?"

        # Split on the last comma: the age is the trailing component, and
        # the name itself may contain commas (e.g. "John Doe, Jr., 32").
        name, comma, age = nameAge.rpartition(',')
        if comma:
            self.name = name.strip()
            self.age = age.strip()
        else:
            self.name = nameAge.strip()
            self.age = '?'

        self.method = method.strip().replace('\n', ',')
        # Flatten newlines here as well; a raw newline would break the
        # one-record-per-line output format.
        self.facebook = fb.strip().replace('\n', ' ')
        self.news = news.strip().replace('\n', ' ')

    def to_record(self, separator='|'):
        """Return the record as one line of text.

        Fields appear in the same order as the columns of ``header()``;
        each field, including the last, is followed by ``separator``.
        """
        fields = (
            self.sequence,
            self.month,
            self.day,
            self.year,
            self.state,
            self.gender,
            self.race,
            self.name,
            self.age,
            self.method,
            self.facebook,
            self.news,
        )
        return separator.join(fields) + separator

    @staticmethod
    def header(separator='|'):
        """Return the column-title line matching ``to_record()``.

        Declared static so it can be called on the class or on an instance.
        """
        titles = (
            'Sequence',
            'Month',
            'Day',
            'Year',
            'State',
            'Gender',
            'Race',
            'Name',
            'Age',
            'Method',
            'Facebook',
            'News',
        )
        return separator.join(titles) + separator
class KilledByPolice():
    """Accumulates ``Victim`` records parsed from userscript JSON dumps."""

    def __init__(self):
        # Victim objects, in the order they were read.
        self.results = []

    def convert(self, filename, encoding=None):
        """Parse ``filename`` and append its records to ``self.results``.

        Each non-blank line of the file must be a JSON array of objects
        whose keys match the parameters of ``Victim.__init__``.  Blank
        lines (for example a trailing empty line) are skipped instead of
        raising a decode error.

        Args:
            filename: Path of the text file to read.
            encoding: Text encoding passed to ``open``; ``None`` keeps the
                platform default, as before.

        Raises:
            json.JSONDecodeError: If a non-blank line is not valid JSON.
        """
        with open(filename, encoding=encoding) as f:
            for line in f:
                if not line.strip():
                    continue
                for entry in json.loads(line):
                    self.results.append(Victim(**entry))
# Driver: read the userscript dumps and write them out as delimited text.
# rawdata.txt is plain text holding one page's worth of userscript JSON
# per line.
kbp = KilledByPolice()
kbp.convert("c:\\temp\\rawdata.txt")

header = Victim.header()
results = [victim.to_record() for victim in kbp.results]

# Header line first, then one line per record.
with open("c:\\temp\\outfile.txt", "w") as f:
    f.writelines(row + "\n" for row in [header] + results)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment