Created
August 20, 2016 00:37
-
-
Save gojun077/b3be8c56c9706522ffb338dea5de9bfe to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash | |
# http-rpmlist-parser.sh | |
# Copyright (C) 2016 Jun Go | |
# This program is free software: you can redistribute it and/or modify | |
# it under the terms of the GNU General Public License as published by | |
# the Free Software Foundation, either version 3 of the License, or | |
# (at your option) any later version. | |
# This program is distributed in the hope that it will be useful, | |
# but WITHOUT ANY WARRANTY; without even the implied warranty of | |
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
# GNU General Public License for more details. | |
# You should have received a copy of the GNU General Public License | |
# along with this program. If not, see <http://www.gnu.org/licenses/>. | |
# Jun Go [email protected] | |
# Last Updated: 2016-08-18 | |
# This script uses lynx to render an html page containing a list | |
# of rpm filenames and output the raw text without html tags to | |
# a file. Then the raw text will be parsed using grep, awk, and | |
# sed to return a list of filenames that can be directly compared | |
# with the output of the RHEL command 'rpm -qa'
# USAGE: ./http-rpmlist-parser.sh [URL] [output file] | |
# EXAMPLE: | |
# ./http-rpmlist-parser.sh \
# http://vault.centos.org/6.6/updates/x86_64/Packages/ \ | |
# cent66-errata-list-clean.txt | |
# Intermediate files, one per processing stage. They are created in the
# current working directory and removed again by cleanup().
#   F0 - raw text dump of the page produced by lynx
#   F1 - lines of F0 that mention '.rpm'
#   F2 - F1 with tabs converted to spaces
#   F3 - the filename column extracted from F2
F0="lynx-temp0.txt"
F1="lynx-temp1.txt"
F2="lynx-temp2.txt"
F3="lynx-temp3.txt"

# Every temp file in one array so cleanup() can iterate over them.
TEMP=(
  "${F0}"
  "${F1}"
  "${F2}"
  "${F3}"
)
######################################## | |
### Function for removing temp files ### | |
#######################################
# Remove every temp file listed in the TEMP array.
# Globals:   TEMP (read) - array of temp file paths
# Arguments: none
# Outputs:   writes "Cannot find temp file <path>" to stderr for each
#            listed path that is not an existing regular file
#######################################
cleanup() {
  local tmp_file
  # Quoting "${TEMP[@]}" keeps each path as a single word, so names
  # containing whitespace or glob characters are handled correctly.
  for tmp_file in "${TEMP[@]}"; do
    if [[ -f "${tmp_file}" ]]; then
      # '--' stops option parsing in case a path begins with '-'
      rm -- "${tmp_file}"
    else
      echo "Cannot find temp file ${tmp_file}" >&2
    fi
  done
}
######################################## | |
# Both arguments are mandatory: $1 is the URL to fetch, $2 is the file
# that receives the cleaned package list. Errors go to stderr.
if [[ -z "$1" ]]; then
  echo "Please enter a URL to parse" >&2
  exit 1
elif [[ -z "$2" ]]; then
  echo "Please specify an output file name" >&2
  exit 1
fi

# Check that lynx is installed on the system.
# 'command -v' is used instead of 'which' because it is a shell builtin
# and does not depend on an external program being present.
if ! command -v lynx > /dev/null 2>&1; then
  echo "This script requires lynx. Please install lynx and try again" >&2
  exit 1
fi

# Parse html into tagless text using lynx browser.
# Stop here if lynx reports failure; otherwise the remaining stages
# would silently produce an empty result file. Only F0 exists at this
# point, so it is removed directly.
if ! lynx -dump -dont_wrap_pre -width=990 -nolist "$1" > "${F0}"; then
  echo "lynx failed to retrieve $1" >&2
  rm -f -- "${F0}"
  exit 1
fi

# Return lines containing the literal string '.rpm'
# (-F so that the dot is not treated as a regex wildcard)
grep -F -- ".rpm" "${F0}" > "${F1}"

# Replace every tab with a space so the columns are separated by
# spaces only before awk splits the line into fields
sed "s:\t: :g" "${F1}" > "${F2}"

# Extract the third field containing the filename
# Note that html pages containing file lists from EPEL, CentOS Vault,
# and HP all use the same format which consists of square brackets,
# package name, date, and file size (optional)
# [ ] fibreutils-3.2-6.x86_64.rpm 07-Jun-20
awk '{ print $3 }' "${F2}" > "${F3}"

# Remove the trailing ".rpm" extension from each filename so that the
# file list is directly comparable to the output of 'rpm -qa'.
# The match is anchored to the end of the line so that a ".rpm"
# appearing inside a package name is left untouched.
sed 's:\.rpm$::' "${F3}" > "$2"

# remove temp files
cleanup
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment