Created
August 4, 2017 03:10
-
-
Save JordanMakesMaps/5995cada8ccd985795d888141678efe4 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
This is a program to help clean up the School Shooting Dataset that was | |
web-scraped from Wikipedia. | |
Basically removes all of the excess data from the file and outputs the | |
location, date, deaths, injuries, and description of the event to a | |
clean .csv which can then easily be thrown into ArcGIS Online or any | |
other mapping software. Because the locations are only given as cities | |
and states, I'm using geopy's geolocator to get an approximate lat/lon | |
the point-shape files can be plotted in ArcGIS. | |
Unforunately, because I'm based in China and geopy uses Google's API, | |
finishing the program before it throws an exception was impossible. | |
I finally found someone advising a clever use of recursive | |
functions that keeps calling until it obtains the coordinates. | |
Hopefully this can help someone working with similar source data. | |
""" | |
import pyexcel | |
import pandas as pd | |
import numpy as np | |
import geopy | |
from geopy import Nominatim | |
from geopy.exc import GeocoderTimedOut | |
geolocator = Nominatim() | |
#recursive function that keeps calling until it gets | |
# the coordinates from the Geopy's geolocator. | |
# Works, but stay connected to the internet and | |
#make sure the computer stays plugged in :/ | |
def getGeo(location): | |
try: | |
return geolocator.geocode(location) | |
except GeocoderTimedOut: | |
return getGeo(location) | |
#main - Inputting the web-scraped data | |
filehandle = open("Shooting.csv") | |
shootingFile = filehandle.read() | |
#for pandas | |
dates = [] | |
locations = [] | |
deaths = [] | |
injuries = [] | |
descs = [] | |
lat = [] | |
lon = [] | |
#splitting by each line which contains one event/record | |
perShooting = shootingFile.split('"\n') | |
#loops through the entire file parsing the information into | |
# categories. Date, location, deaths, injuries, & description. | |
for thisLine in perShooting: | |
#EOF | |
if not thisLine: | |
break | |
#removes the beginning and saves the year | |
thisLine = thisLine.strip('"0') | |
myDate = int(thisLine[:4]) | |
#removes the beginning and saves the location | |
thisLine = thisLine[thisLine.index(',"') + 2:] | |
myLocation = thisLine.partition("!")[0] | |
geoAdd = getGeo(myLocation) | |
#removes the beginning and saves the # of deaths | |
thisLine = thisLine[thisLine.index('",')+2:] | |
myDeath = int(thisLine.partition(',')[0]) | |
#removes the beginning and saves the # of injuries | |
thisLine = thisLine[thisLine.index(',')+1:] | |
myInjury = (thisLine.partition(',')[0]) | |
if myInjury == "": | |
myInjury = "0" | |
#takes the rest and stores as the description | |
myDesc = thisLine[thisLine.index('"')+1:] | |
#for pandas dataframe to .csv | |
dates.append(myDate) | |
locations.append(myLocation) | |
deaths.append(myDeath) | |
injuries.append(myInjury) | |
descs.append(myDesc) | |
lat.append(geoAdd.latitude) | |
lon.append(geoAdd.longitude) | |
#acts as a progress bar | |
print(myDate, myLocation, geoAdd.latitude, geoAdd.longitude) | |
#stores the information as a dictionary (string/list) | |
data = {'Date': dates, 'Location':locations, 'Latitude':lat, 'Longitude':lon, 'Deaths':deaths, 'Injuries':injuries, 'Description':descs} | |
#writes the dictonary as a panda dataframe, acts like an excel spreadsheet then poops it out to a .csv | |
df = pd.DataFrame(data, columns=['Date', 'Location', 'Latitude', 'Longitude', 'Deaths', 'Injuries', 'Description']) | |
df.to_csv('CleanSchoolShooting.csv') | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment