Created
January 26, 2024 18:00
-
-
Save colehocking/bb65132c332321719d49959e1b70605d to your computer and use it in GitHub Desktop.
Extract a line-separated list of IPs from a pdf
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Extract a line-separated list of IPs from a PDF.
# Assumes the dots in each IP are enclosed in square brackets (e.g. 10[.]0[.]0[.]1).
# -- Cole Hocking
# Path to the input PDF, taken from the first command-line argument.
# ${1:-} yields an empty string when no argument is given, so the assignment
# stays valid even if the script is ever run under `set -u`.
PDF_FILE="${1:-}"

# Base name of the PDF (directory components stripped).
FILENAME="$(basename -- "${PDF_FILE}")"

# File extension (text after the last dot of the base name).
# Not referenced by the functions visible in this script; kept for compatibility.
EXT="${FILENAME##*.}"

# Base name without its extension.
FILE_HEAD="${FILENAME%.*}"

# Intermediate text file produced from the PDF. It is built from the base
# name only, so it is a path relative to the current working directory.
TXT_FILE="${FILE_HEAD}.txt"

# Output file that receives only the extracted IPs.
IP_FILE="${FILE_HEAD}_ip_list.txt"
# Convert the PDF to a plain-text file.
# Requires 'pdftotext' (install with: sudo apt install poppler-utils).
# Globals:  PDF_FILE (read) - path to the input PDF
#           TXT_FILE (read) - path the text output is written to
# Returns:  the exit status of pdftotext
convert_pdf() {
  # Name the output file explicitly. Without a second argument pdftotext
  # derives the .txt path from the PDF path, i.e. it lands next to the PDF,
  # which is not where TXT_FILE points when the PDF lives outside the
  # current directory.
  pdftotext "${PDF_FILE}" "${TXT_FILE}"
}
# Extract the bracket-obfuscated IP addresses from the converted text file.
# Globals:  TXT_FILE (read)    - text conversion of the PDF
#           IP_FILE  (written) - matches are appended, one IP per line
# Outputs:  an error message on stderr when TXT_FILE is missing
# Exits:    1 when TXT_FILE does not exist
extract_ips() {
  # Four groups of 1-3 digits separated by a literal "[.]",
  # e.g. 10[.]0[.]0[.]1
  local -r ip_pattern='[0-9]{1,3}\[\.\][0-9]{1,3}\[\.\][0-9]{1,3}\[\.\][0-9]{1,3}'

  # Check to ensure the text file was created
  if [[ ! -f "${TXT_FILE}" ]]; then
    echo "${TXT_FILE} -- File not found." >&2
    exit 1
  fi

  # -o prints only the matched addresses, one per line, instead of the whole
  # line that contains them; surrounding text and the form-feed characters
  # that start some lines are therefore never emitted.
  # sed then removes the square brackets around the dots.
  grep -oE -- "${ip_pattern}" "${TXT_FILE}" \
    | sed 's/[][]//g' >> "${IP_FILE}"
}
# Entry point: convert the PDF to text, extract the IPs, then remove the
# intermediate text file.
# Globals:  PDF_FILE, TXT_FILE, IP_FILE (read)
# Outputs:  success message on stdout; usage and errors on stderr
# Exits:    1 when PDF_FILE is not an existing regular file or when the
#           conversion fails
main() {
  # check that the pdf file is supplied
  if [[ ! -f "${PDF_FILE}" ]]; then
    echo "Usage: ./extract_ips.txt <file>.pdf" >&2
    exit 1
  fi

  # Stop here when the conversion fails; otherwise a stale text file left
  # over from an earlier run could be parsed and then deleted.
  if ! convert_pdf; then
    echo "Failed to convert ${PDF_FILE} to text." >&2
    exit 1
  fi

  extract_ips
  echo "IP List Created: ${IP_FILE}"
  # The text file is only an intermediate artifact.
  rm -- "${TXT_FILE}"
}
# Invoke the entry point, forwarding the command-line arguments.
main "$@"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment