Skip to content

Instantly share code, notes, and snippets.

@gojun077
Created August 20, 2016 00:37
Show Gist options
  • Save gojun077/b3be8c56c9706522ffb338dea5de9bfe to your computer and use it in GitHub Desktop.
Save gojun077/b3be8c56c9706522ffb338dea5de9bfe to your computer and use it in GitHub Desktop.
#!/bin/bash
# http-rpmlist-parser.sh
# Copyright (C) 2016 Jun Go
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# Jun Go [email protected]
# Last Updated: 2016-08-18
# This script uses lynx to render an html page containing a list
# of rpm filenames and output the raw text without html tags to
# a file. Then the raw text will be parsed using grep, awk, and
# sed to return a list of filenames that can be directly compared
# with the output of the RHEL command 'rpm -qa'
# USAGE: ./http-rpmlist-parser.sh [URL] [output file]
# EXAMPLE:
# ./http-rpmlist-parser.sh \
# http://vault.centos.org/6.6/updates/x86_64/Packages/ \
# cent66-errata-list-clean.txt
# Intermediate files, one per processing stage. They are created in the
# current working directory and listed in TEMP so they can be removed
# together once the final output has been written.
TEMP=(
  "lynx-temp0.txt"
  "lynx-temp1.txt"
  "lynx-temp2.txt"
  "lynx-temp3.txt"
)
# Per-stage shorthands for the entries above.
F0="${TEMP[0]}"
F1="${TEMP[1]}"
F2="${TEMP[2]}"
F3="${TEMP[3]}"
########################################
### Function for removing temp files ###
# Deletes every regular file whose path is listed in the TEMP array.
# Globals:   TEMP (read) - array of temp file paths
# Arguments: none
# Outputs:   one "Cannot find temp file ..." notice on stderr for each
#            listed path that is not an existing regular file
# Returns:   exit status of the last rm/echo that ran
cleanup() {
  local tmpfile
  # Quoted "${TEMP[@]}" keeps each array element intact even when a path
  # contains whitespace or glob characters.
  for tmpfile in "${TEMP[@]}"; do
    if [[ -f "${tmpfile}" ]]; then
      # '--' stops option parsing in case a path starts with a dash
      rm -- "${tmpfile}"
    else
      echo "Cannot find temp file ${tmpfile}" >&2
    fi
  done
}
########################################
# Validate arguments: $1 is the URL of the HTML file listing to parse,
# $2 is the path the cleaned package list is written to.
if [[ -z "$1" ]]; then
  echo "Please enter a URL to parse" >&2
  exit 1
elif [[ -z "$2" ]]; then
  echo "Please specify an output file name" >&2
  exit 1
fi

# Check that lynx is installed on the system
# ('command -v' is a shell builtin, so no dependency on an external 'which')
if ! command -v lynx > /dev/null 2>&1; then
  echo "This script requires lynx. Please install lynx and try again" >&2
  exit 1
fi

# Temp files are created from here on; make sure they are removed on every
# exit path (success or failure), not only when the last line is reached.
trap cleanup EXIT

# Parse html into tagless text using lynx browser.
# Abort if lynx reports an error instead of silently producing an empty list.
if ! lynx -dump -dont_wrap_pre -width=990 -nolist "$1" > "${F0}"; then
  echo "lynx failed to retrieve $1" >&2
  exit 1
fi

# Keep only lines containing the literal string '.rpm'
# (-F: fixed string, so the dot is not treated as a regex wildcard)
if ! grep -F '.rpm' "${F0}" > "${F1}"; then
  echo "No lines containing '.rpm' were found at $1" >&2
fi

# Normalize tabs to spaces before field splitting. tr is used because the
# '\t' escape inside a sed pattern is a GNU extension and is not portable.
tr '\t' ' ' < "${F1}" > "${F2}"

# Extract the third field containing the filename
# Note that html pages containing file lists from EPEL, CentOS Vault,
# and HP all use the same format which consists of square brackets,
# package name, date, and file size (optional)
# [ ] fibreutils-3.2-6.x86_64.rpm 07-Jun-20
awk '{ print $3 }' "${F2}" > "${F3}"

# Remove the trailing ".rpm" extension from each filename so that the file
# list is directly comparable to the output of 'rpm -qa'. The match is
# anchored to the end of the line so only the extension is stripped.
sed 's:\.rpm$::' "${F3}" > "$2"

# Temp files are removed by the EXIT trap installed above.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment