Last active
March 7, 2018 22:56
-
-
Save CesarCapillas/add746aaed3f42a9263a7c775fbb8c34 to your computer and use it in GitHub Desktop.
Nutch recipes
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Launches the RegexURLFilter class of the urlfilter-regex plugin through
# the Nutch plugin runner.
#
# Must be run from the Nutch installation directory: the launcher is
# referenced by the relative path ./bin/nutch.
#
# NOTE(review): the value "regex" passed as urlfilter.regex.file looks
# truncated -- confirm the intended rules file name before relying on it.
./bin/nutch plugin urlfilter-regex \
  org.apache.nutch.urlfilter.regex.RegexURLFilter \
  -Durlfilter.regex.file=regex
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# check_url_innutch.sh - look up a single URL in a Nutch crawldb.
#
# Usage: check_url_innutch.sh <url> [<collection-name>]
#   <url>              URL to look up (required)
#   <collection-name>  directory that contains the crawldb (default: zylk)
#
# Must be run from the Nutch installation directory: both ./bin/nutch and
# the <collection-name>/crawldb path are relative to the current directory.

main() {
  local url=${1:-}
  # Default parameters
  local collection=${2:-zylk}

  if [[ -z "$url" ]]; then
    # Usage
    echo 'Usage: check_url_innutch.sh <url> [<collection-name>]'
    return 0
  fi

  # Use the defaulted collection name. A bare $2 is empty when the optional
  # argument is omitted, which would make the crawldb path "/crawldb".
  ./bin/nutch readdb "${collection}/crawldb" -url "$url"
}

main "$@"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# check_url_insolr.sh - report how many Solr documents have a given URL as id.
#
# Usage: check_url_insolr.sh <url> [<collection-name> <solr-server=localhost> <port=8983>]
#   <url>              value matched against the Solr "id" field (required)
#   <collection-name>  Solr collection to query (default: zylk)
#   <solr-server>      Solr host name (default: localhost)
#   <port>             Solr port (default: 8983)
#
# Output: "<numFound> <url>" on one line. <numFound> is empty when the
# request or the JSON extraction fails.
# Requires: curl, jq.

main() {
  local url=${1:-}
  # Default parameters
  local collection=${2:-zylk}
  local server=${3:-localhost}
  local port=${4:-8983}
  local found

  if [[ -z "$url" ]]; then
    # Usage
    echo 'Usage: check_url_insolr.sh <url> [<collection-name> <solr-server=localhost> <port=8983>]'
    return 0
  fi

  # -G sends the --data-urlencode pairs as the query string of a GET request.
  # URL-encoding is required because the looked-up URL usually contains
  # characters such as '&', '?', '#' or '+', which would otherwise be parsed
  # as part of the Solr request itself.
  found=$(
    curl -s -G "http://${server}:${port}/solr/${collection}/select" \
      --data-urlencode 'indent=on' \
      --data-urlencode "q=id:\"${url}\"" \
      --data-urlencode 'wt=json' \
      | jq '.response.numFound'
  )

  # Quoted arguments: the URL must not be word-split or glob-expanded.
  printf '%s %s\n' "$found" "$url"
}

main "$@"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Dumps the crawldb entries of one collection, one dump per status, and
# writes the first CSV column of each dump to stats/<status>.txt.
# (Presumably that first column is the URL -- confirm against the CSV layout
# produced by the installed Nutch version.)
#
# Works inside /opt/crawler/<collection>; the previous contents of its
# stats/ directory are deleted on every run.

export MYCOLLECTION=zylk
# Assign and export separately so a failing date command is not masked.
MYDATE=$(/bin/date +%Y%m%d)
export MYDATE

readonly BASE_DIR="/opt/crawler/${MYCOLLECTION}"
readonly STATUS_LIST=(fetched unfetched duplicate notmodified redir_temp redir_perm gone)

# Every later path is relative to BASE_DIR, so stop if it is not reachable
# instead of running nutch and awk in whatever directory we started in.
cd "$BASE_DIR" || {
  printf 'cannot cd to %s\n' "$BASE_DIR" >&2
  exit 1
}

# Start from an empty stats directory. ${BASE_DIR:?} aborts rather than
# expanding to "/stats/*" should the variable ever be empty.
mkdir -p stats
rm -rf -- "${BASE_DIR:?}"/stats/*

for status in "${STATUS_LIST[@]}"; do
  dump_dir="stats/dump_${MYDATE}_${status}"
  ./bin/nutch readdb "${MYCOLLECTION}/crawldb/" \
    -dump "$dump_dir" -status "db_${status}" -format csv
  # Keep only the first comma-separated field of each dumped line.
  awk -F, '{print $1}' "${dump_dir}/part-00000" > "stats/${status}.txt"
done
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Prints the sorted, de-duplicated list of links found on a page.
#
# Usage: <script> <url>
#   <url>  page (or local file) handed to lynx
#
# Output: the second whitespace-separated field of every line that
# "lynx -listonly -dump" prints, sorted with duplicates removed. Lines that
# have no second field (headings, blank lines) collapse into one empty line.
# Requires: lynx.

main() {
  local url=${1:-}

  if [[ -z "$url" ]]; then
    echo "Usage: ${0##*/} <url>"
    return 0
  fi

  # Quote the URL: characters such as '?' and '*' are common in URLs and
  # would otherwise be glob-expanded against the current directory.
  lynx -listonly -dump "$url" \
    | awk '{print $2}' \
    | sort -u
}

main "$@"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment