Last active
June 13, 2022 20:04
-
-
Save dannguyen/57423dbcb1713d31b659 to your computer and use it in GitHub Desktop.
A Bash script, using the jq JSON parser, to scrape all the NHTSA 5-star vehicle ratings from its API: http://www.nhtsa.gov/webapi/Default.aspx?SafetyRatings/API/5
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## Note: this is deprecated. jq is still awesome, so now we just get JSON all the way
#
# Scrape the NHTSA SafetyRatings API and save one CSV file per vehicle
# (./<VehicleId>.csv in the current directory).
#
# Requires: curl, and the jq JSON parser, which is awesome:
#   http://stedolan.github.io/jq/
#
# The NHTSA API is pretty clunky: you first have to get the list of all model
# years, then all the makes for a year, then all the models for a make, and
# only then do you get the vehicle IDs needed to query the endpoint for one
# vehicle at a time.
#
# JSON is requested for every listing step of the loop; only the final
# per-vehicle request asks for CSV.
#
# Note: there are a lot of errors in the API, because the NHTSA doesn't
# properly escape the "/" in a car's name, plus many other whitespace-related
# errors. Failed requests are reported on stderr and skipped; the scrape
# itself keeps going (best effort).

BURL='http://www.nhtsa.gov/webapi/api/SafetyRatings'

#######################################
# Turn make/model names into URL path segments.
# Spaces become %20 and "&" becomes "_".
# NOTE(review): the "_" substitution is carried over unchanged; presumably the
# API rejects a raw "&" in the path - confirm before changing it.
# Inputs:  names on stdin, one per line
# Outputs: escaped names on stdout
#######################################
path_segment() {
  sed -e 's/ /%20/g' -e 's/&/_/g'
}

#######################################
# Print one field of every entry in the .Results array of an API listing.
# Globals:   BURL (read)
# Arguments: $1 - path below $BURL ("" for the list of model years)
#            $2 - name of the field to print
# Outputs:   one value per line on stdout; curl errors on stderr
#######################################
list_field() {
  # -S surfaces transport errors that plain -s hides; -f keeps HTTP error
  # pages out of jq, so a failed listing simply yields no lines.
  curl -sS -f "${BURL}${1}?format=json" |
    jq -r --arg field "$2" '.Results[] | .[$field]'
}

# Get all the years first.
# IFS is left at its default for every `read` so that stray leading/trailing
# blanks are trimmed; -r keeps backslashes literal.
list_field '' ModelYear |
  while read -r year; do
    echo "$year"
    echo "######"

    # Get the makes for this year
    list_field "/modelyear/$year" Make | path_segment |
      while read -r carmake; do
        echo " $carmake"
        echo " ======="

        # Get the models for this year and make
        list_field "/modelyear/$year/make/$carmake" Model | path_segment |
          while read -r model; do
            echo " $model"
            echo " -------"

            # Get the vehicle IDs for this year, make, and model
            list_field "/modelyear/$year/make/$carmake/model/$model" VehicleId |
              while read -r id; do
                echo " $id: $year - $carmake - $model"
                # -f: do not save an HTTP error page as if it were CSV data.
                curl -sS -f "$BURL/VehicleId/$id?format=csv" -o "$id.csv" ||
                  echo "warning: could not download CSV for vehicle $id" >&2
              done
            echo ' '
          done
      done
  done
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# As it turns out, the CSV produced by NHTSA is broken.
# So now, let's just iterate through all possible vehicle IDs and fetch JSON
# (assuming no car has an ID above 10000), then use jq to collect all possible
# keys (which vary widely between vehicles), and then map every result onto
# that list of keys.
#
# Requires: bash, curl, jq, csvfix
# Outputs:  json/vehicles/<id>.json  raw API responses
#           all-vehicles.psv         pipe-separated table, header row first
#           all-vehicles.json        JSON array with one object per vehicle

#######################################
# Concatenate every downloaded vehicle JSON file to stdout.
#######################################
all_vehicle_json() {
  find ./json/vehicles -name "*.json" -exec cat {} +
}

#######################################
# Remove the literal character sequences \r\n and \u00A0 (i.e. the JSON
# escape sequences, backslash included) from the raw JSON text on stdin,
# because these characters are inexplicably displayed as literal characters,
# until they aren't.
#######################################
strip_escapes() {
  sed -e 's/\\r\\n//g' -e 's/\\u00A0//g'
}

mkdir -p json/vehicles
for ((id = 1; id <= 10000; id++)); do
  echo "$id.json"
  curl -sS "http://www.nhtsa.gov/webapi/api/SafetyRatings/VehicleId/$id?format=json" -o "json/vehicles/$id.json"
done

# Remove bad JSON: responses that are really HTML pages.
# `-exec ... {} +` and `rm -f` both cope with "nothing matched" without
# printing a spurious "missing operand" error.
find ./json/vehicles -name "*.json" -exec grep -l '</html>' {} + | xargs rm -f --

# Get the keys: the sorted union of the keys of every single-result response,
# as a comma-separated list of jq paths (.Key1,.Key2,...).
allkeys=$(
  all_vehicle_json |
    jq -r 'select(.Count == 1) | .Results[0] | keys[]' |
    sort -u |
    sed 's/^/./' |
    paste -s -d ',' -
)

# An empty key list would expand to the invalid jq program `map()` below.
if [[ -z "$allkeys" ]]; then
  echo "error: no usable vehicle records found in ./json/vehicles" >&2
  exit 1
fi

# Same list without the leading dots: Key1,Key2,...
keynames=${allkeys//./}

# Header row, then one row per vehicle with the columns in the same key order.
printf '%s\n' "$keynames" | csvfix echo -osep '|' -smq > all-vehicles.psv
all_vehicle_json | strip_escapes |
  jq -r "select(.Count == 1) | .Results | map($allkeys) | @csv" |
  csvfix echo -osep '|' -smq >> all-vehicles.psv

# MAKE A JSON FOR GOOD TIMES SAKE
# TODO: refactor later
# Every object gets the full key set (jq's {Key1,Key2,...} shorthand); the
# second jq slurps the stream of objects into a single array.
all_vehicle_json | strip_escapes |
  jq --sort-keys "select(.Count == 1) | .Results[0] | {$keynames}" |
  jq --slurp '.' > all-vehicles.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"ComplaintsCount": 185, | |
"FrontCrashDriversideNotes": null, | |
"FrontCrashDriversideRating": "3", | |
"FrontCrashDriversideSafetyConcern": null, | |
"FrontCrashPassengersideNotes": null, | |
"FrontCrashPassengersideRating": "2", | |
"FrontCrashPicture": "http://www.safercar.gov/staticfiles/DOT/safercar/ncapmedia/images/2011/v07127P062.jpg", | |
"FrontCrashVideo": "http://www.safercar.gov/staticfiles/DOT/safercar/ncapmedia/movies/2011/v07127C017.wmv", | |
"FrontPassengersideSafetyConcern": null, | |
"InvestigationCount": 1, | |
"Make": "RAM", | |
"Model": "1500 QUAD", | |
"ModelYear": 2011, | |
"NHTSAElectronicStabilityControl": "Standard", | |
"NHTSAForwardCollisionWarning": "No", | |
"NHTSALaneDepartureWarning": "No", | |
"NHTSARearviewVideoSystems": null, | |
"OverallFrontCrashRating": "2", | |
"OverallRating": "3", | |
"OverallSideCrashRating": "5", | |
"RecallsCount": 2, | |
"RolloverNotes": null, | |
"RolloverPossibility": 0.198, | |
"RolloverPossibility2": 0, | |
"RolloverRating": "4", | |
"RolloverRating2": "Not Rated", | |
"SideCrashDriversideNotes": null, | |
"SideCrashDriversideRating": "5", | |
"SideCrashDriversideSafetyConcern": null, | |
"SideCrashPassengersideNotes": null, | |
"SideCrashPassengersideRating": "5", | |
"SideCrashPassengersideSafetyConcern": null, | |
"SideCrashPicture": "http://www.safercar.gov/staticfiles/DOT/safercar/ncapmedia/images/2011/v07129P078.jpg", | |
"SideCrashVideo": "http://www.safercar.gov/staticfiles/DOT/safercar/ncapmedia/movies/2011/v07129C013.wmv", | |
"SidePoleCrashRating": "1", | |
"SidePoleNotes": "Although not included in the star rating, the driver dummy's abdomen rib deflection and thoracic rib deflection readings were elevated", | |
"SidePolePicture": "http://www.safercar.gov/staticfiles/DOT/safercar/ncapmedia/images/2011/v07065P010.jpg", | |
"SidePoleSafetyConcern": "Due to the intrusion of the driver door during the side impact pole test, the interior door panel struck the torso of the driver dummy, causing high resultant lower spine acceleration of 87 g's. High resultant lower spine accelerations, in excess of 82 g's, have a higher likelihood of thoracic injury.", | |
"SidePoleVideo": "http://www.safercar.gov/staticfiles/DOT/safercar/ncapmedia/movies/2011/v07065C012.wmv", | |
"VehicleDescription": "2011 Ram 1500 Quad PU/CC RWD", | |
"VehicleId": 109, | |
"VehiclePicture": "http://www.safercar.gov/staticfiles/DOT/safercar/ncapmedia/images/2011/v07127P005.jpg" | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment