Last active
June 7, 2016 03:06
-
-
Save jachermocilla/2aba93544c5cb5822123391b9bf8817c to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# by [email protected]
#
# Recrawls the sites listed in URLS_DIR with Apache Nutch and forwards the
# results to Solr.
#
# WARNING: this script deletes everything in the Nutch crawl directory
# (CRAWL_DIR) and every document in the Solr index at SOLR_URL before it
# starts crawling.
#
# Usage: run with no arguments; edit the settings below to match the host.

set -euo pipefail

# Print an error message to stderr and abort the script.
die() {
  printf 'error: %s\n' "$*" >&2
  exit 1
}

# POST one XML update message ($1) to the Solr update handler.
# --fail makes curl return non-zero on an HTTP error status, so a rejected
# request is not silently ignored.
solr_update() {
  curl --fail "$SOLR_URL/update" \
    --data "$1" \
    -H 'Content-type:text/xml; charset=utf-8'
}

# Derive JAVA_HOME from whatever /usr/bin/java ultimately points to by
# stripping the trailing "bin/java" (the result keeps its trailing slash).
# It is exported so that child processes such as the Nutch launcher see it.
java_bin=$(readlink -f /usr/bin/java) || die "cannot resolve /usr/bin/java"
JAVA_HOME=${java_bin%bin/java}
export JAVA_HOME
[[ -x "${JAVA_HOME}bin/java" ]] || die "no java executable under '${JAVA_HOME}'"

# Settings.
NUTCH_HOME=/home/ubuntu/apache-nutch-1.11
URLS_DIR=$NUTCH_HOME/urls      # seed URL directory handed to bin/crawl
CRAWL_DIR=$NUTCH_HOME/crawl    # crawl data directory; wiped on every run
SOLR_URL=http://10.0.3.234:8080/solr
COUNT=2                        # last argument to bin/crawl (number of rounds)

# Verify the crawl can actually start before destroying any existing data.
[[ -x "$NUTCH_HOME/bin/crawl" ]] || die "Nutch crawl script not found: $NUTCH_HOME/bin/crawl"
[[ -d "$URLS_DIR" ]] || die "seed directory not found: $URLS_DIR"

# Remove the previous crawl data. ${VAR:?} aborts instead of expanding to an
# empty string, so a blank CRAWL_DIR can never turn into a stray rm -rf.
rm -rf -- "${CRAWL_DIR:?}"

# Empty the Solr index, then commit so the deletion becomes visible.
solr_update '<delete><query>*:*</query></delete>' \
  || die "failed to delete existing documents from Solr at $SOLR_URL"
solr_update '<commit/>' \
  || die "failed to commit the Solr deletion at $SOLR_URL"

# Crawl the seed URLs and index the results into Solr (-i).
"$NUTCH_HOME/bin/crawl" -i -D "solr.server.url=$SOLR_URL" \
  "$URLS_DIR" "$CRAWL_DIR" "$COUNT"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment