Skip to content

Instantly share code, notes, and snippets.

@rubeniskov
Last active September 29, 2022 00:39
Show Gist options
  • Save rubeniskov/52dc8c39ac33f1e78eed11aa981ea010 to your computer and use it in GitHub Desktop.
Save rubeniskov/52dc8c39ac33f1e78eed11aa981ea010 to your computer and use it in GitHub Desktop.
Makefile snippet: crawl a site and list its URLs
# $(call crawler,<url>,<depth_levels>)
#
# Expands to a shell pipeline (intended for use inside a recipe) that crawls
# <url> with `wget --spider` and prints a sorted, de-duplicated list of URLs.
#
# Arguments:
#   <url>           Start URL. Its third '/'-separated field (the host part of
#                   scheme://host/path) is passed to wget --domains.
#   <depth_levels>  Number of chained crawl stages to generate. Defaults to 1
#                   when empty.
#
# How it works:
#   One `xargs | wget | grep | awk` stage is generated per depth level and the
#   stages are piped into each other. Each stage reads URLs on stdin, runs
#   wget on every one of them with --level=1 (DEPTH_LEVEL is always the token
#   "1"; only the NUMBER of tokens matters), keeps the wget log lines that
#   start with `--` and prints their third field (presumably the requested
#   URL in wget's log format). Only the last stage's output reaches the final
#   `sort -u`.
#
# Notes:
#   - The token list is built with a POSIX `while` loop instead of bash brace
#     expansion ({1..N}): $(shell ...) runs under make's SHELL, which defaults
#     to /bin/sh and need not support brace expansion (e.g. dash), in which
#     case only a single stage would ever be generated.
#   - awk drops every URL it has already printed (not just adjacent repeats,
#     as `uniq` would), so a stage never hands the same URL to the next stage
#     twice. The final result is unchanged because of the trailing `sort -u`.
#   - xargs parallelism is 32 x CPU count, taken from `nproc`, else from
#     `sysctl -n hw.physicalcpu`, else 1 so that -P always gets a number.
#   - robots.txt is ignored and a Googlebot user-agent is sent; make sure you
#     are allowed to crawl the target this way.
define crawler
{ echo '$(1)'\
$(foreach DEPTH_LEVEL, $(shell i=0; while [ "$$i" -lt '$(or $(strip $(2)),1)' ]; do printf '1 '; i=$$((i + 1)); done), \
| xargs -n 1 -P $(shell echo $$(($$(nproc 2>/dev/null || sysctl -n hw.physicalcpu 2>/dev/null || echo 1) * 32))) \
wget \
--delete-after \
--level=$(DEPTH_LEVEL) \
--domains="$(shell echo '$(1)'|awk -F/ '{print $$3}')" \
--spider \
--reject="css,js,jpg,jpeg,png,gif" \
--force-html \
--no-parent \
--recursive \
--random-wait \
--limit-rate=20k \
--execute="robots=off" \
--user-agent="Googlebot/2.1 (+http://www.google.com/bot.html)" 2>&1 |\
grep --line-buffered '^--' |\
awk '!seen[$$3]++ { print $$3 }' ); } | sort -u
endef
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment