Skip to content

Instantly share code, notes, and snippets.

@thebecwar
Created March 23, 2018 01:04
Show Gist options
  • Save thebecwar/198c32f04d54a22cdd37d8264be4d02d to your computer and use it in GitHub Desktop.
Save thebecwar/198c32f04d54a22cdd37d8264be4d02d to your computer and use it in GitHub Desktop.
Killed By Police Data Grab
// ==UserScript==
// @name KBP
// @namespace http://tampermonkey.net/
// @version 0.1
// @description Parse KilledByPolice page into JSON data
// @author You
// @match http://killedbypolice.net/*
// @match http://www.killedbypolice.net/*
// @require https://ajax.googleapis.com/ajax/libs/jquery/3.3.1/jquery.min.js
// @grant none
// ==/UserScript==
// Matches one entry of the first table cell: a parenthesised token followed by
// three word runs. Group 1 is the text inside the parentheses (stored as
// `sequence`); groups 2-4 are stored as `month`, `day` and `year`.
// NOTE(review): presumably the cell looks like "(123) March 22, 2018" - confirm
// against the live page.
// The /g flag makes exec() stateful, so parseRow() resets lastIndex before use.
const dateRegex = /\(([^)]*)\)[^\w]*([\w]*)[^\w]([\w]*)[^\w]*([\w]*)/g;

/**
 * Returns the trimmed line at position `index` of a multi-line cell text.
 *
 * @param {string} text - Raw cell text, lines separated by '\n'.
 * @param {number} index - Zero-based line number.
 * @returns {string} The trimmed line, or '' when the cell has no such line.
 */
function lineAt(text, index) {
  const lines = text.split('\n');
  return index < lines.length ? lines[index].trim() : '';
}

/**
 * Converts the cell texts of one table row into zero or more records.
 *
 * A row yields one record per date entry found in the first cell. The n-th
 * emitted record takes the n-th line of the gender/race cell (index 2) and of
 * the name/age cell (index 3); the remaining cells are shared by every record
 * of the row. After the first entry, entries whose sequence starts with 'K'
 * are skipped and do not consume a line.
 *
 * @param {Array<string>} cellTexts - innerText of each <td>, in column order.
 *   Missing cells are treated as empty strings instead of throwing.
 * @returns {Array<Object>} Records with the keys sequence, month, day, year,
 *   state, gr, nameAge, method, fb and news.
 */
function parseRow(cellTexts) {
  const textAt = (i) => (cellTexts[i] == null ? '' : String(cellTexts[i]));

  const dateText = textAt(0);
  const grText = textAt(2);
  const nameAgeText = textAt(3);
  const state = textAt(1).trim();
  const method = textAt(4).trim();
  const fb = textAt(5).trim();
  const news = textAt(6).trim();

  const records = [];
  // Line of the multi-line cells that the next emitted record reads. It must
  // live outside the loop: it advances once per record that is kept.
  let lineIndex = 0;

  dateRegex.lastIndex = 0;
  for (
    let match = dateRegex.exec(dateText);
    match !== null;
    match = dateRegex.exec(dateText)
  ) {
    // The first entry of a row is always kept; later 'K…' entries are dropped.
    if (lineIndex > 0 && match[1].startsWith('K')) {
      continue;
    }
    records.push({
      sequence: match[1],
      month: match[2],
      day: match[3],
      year: match[4],
      state: state,
      gr: lineAt(grText, lineIndex),
      nameAge: lineAt(nameAgeText, lineIndex),
      method: method,
      fb: fb,
      news: news,
    });
    lineIndex++;
  }
  return records;
}

$(document).ready(function () {
  const items = [];

  $('tr').each(function () {
    const cells = $(this).children('td');
    // Rows with fewer than two <td> cells carry no record.
    if (cells.length < 2) {
      return;
    }
    const cellTexts = Array.from(cells, (td) => td.innerText);
    parseRow(cellTexts).forEach((record) => items.push(record));
  });

  console.log(items);

  // Items parsed; expose them as a JSON string plus a copy button.
  $('body').prepend("<button id='copyclip'>Copy To Clipboard (" + items.length + ')</button>');
  $('#copyclip').click(function () {
    const previouslyFocused = document.activeElement;
    const jsonArea = $('#JSON')[0];
    jsonArea.focus();
    jsonArea.select();
    // The async Clipboard API needs a secure context and the @match patterns
    // are plain http, so the legacy command is kept.
    document.execCommand('copy');
    if (previouslyFocused) {
      previouslyFocused.focus();
    }
  });

  // Set the value through .val() rather than concatenating it into markup, so
  // scraped text containing '&' entities or '</textarea>' cannot alter the JSON.
  $("<textarea id='JSON'></textarea>").val(JSON.stringify(items)).appendTo('body');
});
import json
class Victim:
    """One record parsed by the userscript, normalised for flat-file export.

    The constructor takes the keys of one JSON object produced by the
    userscript (``sequence``, ``month``, ``day``, ``year``, ``state``, ``gr``,
    ``nameAge``, ``method``, ``fb``, ``news``), all strings, and stores
    whitespace-stripped values.  ``gr`` is split on ``/`` into ``gender`` and
    ``race``; ``nameAge`` is split on ``,`` into ``name`` and ``age``.  When
    the delimiter is absent the second half is ``'?'``.
    """

    # Column titles, in the same order as the values emitted by to_record().
    _COLUMNS = (
        'Sequence', 'Month', 'Day', 'Year', 'State', 'Gender',
        'Race', 'Name', 'Age', 'Method', 'Facebook', 'News',
    )

    def __init__(self, sequence, month, day, year, state, gr, nameAge, method, fb, news):
        self.sequence = sequence.strip()
        self.month = month.strip()
        self.day = day.strip()
        self.year = year.strip()
        self.state = state.strip()
        self.gender, self.race = self._split_pair(gr, '/')
        self.name, self.age = self._split_pair(nameAge, ',')
        # Embedded newlines would break the one-record-per-line output format.
        self.method = method.strip().replace('\n', ',')
        self.facebook = fb.strip()
        self.news = news.strip().replace('\n', ' ')

    @staticmethod
    def _split_pair(text, delimiter, missing='?'):
        """Split ``text`` on ``delimiter`` and return its first two parts, stripped.

        Parts after the second are ignored.  If ``delimiter`` does not occur,
        returns ``(text.strip(), missing)``.
        """
        parts = text.split(delimiter)
        if len(parts) < 2:
            return text.strip(), missing
        return parts[0].strip(), parts[1].strip()

    def to_record(self, separator='|'):
        """Return the record as one line of text.

        Every field, including the last one, is followed by ``separator``;
        the column order matches ``header()``.
        """
        fields = (
            self.sequence, self.month, self.day, self.year, self.state,
            self.gender, self.race, self.name, self.age, self.method,
            self.facebook, self.news,
        )
        return separator.join(fields) + separator

    @staticmethod
    def header(separator='|'):
        """Return the column-title line matching ``to_record()``.

        Declared static so it works both as ``Victim.header()`` and on an
        instance; every title, including the last, is followed by
        ``separator``.
        """
        return separator.join(Victim._COLUMNS) + separator
class KilledByPolice:
    """Accumulates ``Victim`` records loaded from userscript JSON dumps."""

    def __init__(self):
        # Victim objects, in the order they were read.
        self.results = []

    def convert(self, filename, encoding='utf-8-sig'):
        """Parse ``filename`` and append one ``Victim`` per JSON object to ``self.results``.

        Each non-blank line of the file must be a JSON array of objects whose
        keys match the ``Victim`` constructor parameters.  Blank or
        whitespace-only lines are skipped rather than handed to the JSON
        parser, which would reject them.

        ``encoding`` is passed to ``open()``.  The default ``'utf-8-sig'``
        reads UTF-8 with or without a byte-order mark, so the result no
        longer depends on the platform's default encoding.

        Raises whatever ``open()``, ``json.loads()`` or ``Victim()`` raise;
        records parsed before the failure stay in ``self.results``.
        """
        with open(filename, encoding=encoding) as f:
            for line in f:
                if not line.strip():
                    continue
                for entry in json.loads(line):
                    self.results.append(Victim(**entry))
# Driver: turn the collected userscript output into a '|'-separated text file.
kbp = KilledByPolice()
# rawdata.txt is plain-text, one page worth of JSON from the userscript per line.
kbp.convert("c:\\temp\\rawdata.txt")
header = Victim.header()
results = [r.to_record() for r in kbp.results]
# Write UTF-8 explicitly: with the platform default encoding, a name containing
# a character outside that code page would raise UnicodeEncodeError.
with open('c:\\temp\\outfile.txt', 'w', encoding='utf-8') as f:
    f.write(header + '\n')
    f.writelines(record + '\n' for record in results)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment