avblink · July 6, 2017 16:14
diff --git a/SEO:Crawling:UrlTitle b/SEO:Crawling:UrlTitle
 #!/bin/bash
 #
 # Crawl selected sections of a site and record every page URL with its title.
 #
 # For each entry in `directories` the script spiders "$domain$directory" with
 # wget, fetches every discovered URL with curl, and writes one
 # "URL<TAB>title" line per page to "<section>.txt" in the current directory.
 # A section whose "<section>.txt" already exists is skipped.
 #
 # Requires: wget, curl, grep with -P support, tr, php (CLI).

 set -u

 #Domain name without trailing slash please
 domain='http://local.vwt'

 # Site sections to crawl. Every entry needs its leading slash: the value is
 # appended verbatim to $domain and is also passed to wget -I.
 directories=(
   '/markets'
   '/applications'
   '/technologies'
   '/products'
   '/services'
   '/customer-services'
   '/about-us'
   '/careers'
   '/news'
   '/tech-resources'
   '/case-studies'
   '/contact'
   '/privacy-policy'
   '/cookie-policy'
 )

 #######################################
 # Spider one section and list the page URLs found in it.
 # Arguments:
 #   $1 - start URL
 #   $2 - directory passed to wget -I (include list)
 #   $3 - file that receives a copy of the raw wget log
 # Outputs:
 #   unique URLs, sorted, one per line, on stdout
 #######################################
 collect_links() {
   local start_url=$1 include_dir=$2 raw_log=$3

   # The wget log is merged into stdout (2>&1); the lines starting with "--"
   # carry the requested URL as their third whitespace-separated field.
   # URLs ending in a static-asset extension are dropped.
   wget -r --spider --delete-after --force-html -e robots=off \
     -I "$include_dir" \
     "$start_url" 2>&1 \
     | tee "$raw_log" \
     | grep '^--' \
     | awk '{ print $3 }' \
     | grep -Ev '\.(css|js|png|gif|jpg)$' \
     | sort -u
 }

 #######################################
 # Fetch a page and print the text of its <title> element.
 # Arguments:
 #   $1 - page URL
 # Outputs:
 #   the entity-decoded title on stdout, whitespace runs squeezed to single
 #   spaces, no trailing newline; empty when no "<title>...</title>" is found
 #   on a single line of the response
 #######################################
 page_title() {
   local url=$1 title

   #To convert html entities you could also use one of the commands below
   #instead of php:
   #  recode html..ascii
   #  perl -MHTML::Entities -pe 'decode_entities($_);'
   title=$(
     curl -s "$url" \
       | grep -Po '(?<=<title>).*(?=</title>)' \
       | php -r 'while(($line=fgets(STDIN)) !== FALSE) echo html_entity_decode($line, ENT_QUOTES|ENT_HTML401);' \
       | tr -s '[:space:]' ' '
   )

   # tr turned every newline into a space; trim the edges.
   title=${title# }
   title=${title% }
   printf '%s' "$title"
 }

 for directory in "${directories[@]}"; do
   url="$domain$directory"
   file="${directory#/}"

   #Skip processing if file already exists
   if [[ -f "$file.txt" ]]; then
     continue
   fi

   echo "Processing $url"
   echo "Collecting links"
   collect_links "$url" "$directory" "$file-raw.txt" > "$file-links.txt"

   echo "Parsing titles"

   # Build the result under a temporary name and move it into place only once
   # the section is complete; otherwise an interrupted run would leave a
   # partial "$file.txt" that the skip check above treats as finished.
   : > "$file.txt.part"

   # printf (rather than xargs echo -e) keeps titles containing quotes,
   # apostrophes or backslashes intact.
   while IFS= read -r link || [[ -n "$link" ]]; do
     printf '%s\t%s\n' "$link" "$(page_title "$link")" >> "$file.txt.part"
   done < "$file-links.txt"

   mv -- "$file.txt.part" "$file.txt"
   rm -- "$file-raw.txt" "$file-links.txt"
 done
	#!/bin/bash
	#
	# Crawl selected sections of a site and record every page URL with its title.
	#
	# For each entry in `directories` the script spiders "$domain$directory" with
	# wget, fetches every discovered URL with curl, and writes one
	# "URL<TAB>title" line per page to "<section>.txt" in the current directory.
	# A section whose "<section>.txt" already exists is skipped.
	#
	# Requires: wget, curl, grep with -P support, tr, php (CLI).

	set -u

	#Domain name without trailing slash please
	domain='http://local.vwt'

	# Site sections to crawl. Every entry needs its leading slash: the value is
	# appended verbatim to $domain and is also passed to wget -I.
	directories=(
	  '/markets'
	  '/applications'
	  '/technologies'
	  '/products'
	  '/services'
	  '/customer-services'
	  '/about-us'
	  '/careers'
	  '/news'
	  '/tech-resources'
	  '/case-studies'
	  '/contact'
	  '/privacy-policy'
	  '/cookie-policy'
	)

	#######################################
	# Spider one section and list the page URLs found in it.
	# Arguments:
	#   $1 - start URL
	#   $2 - directory passed to wget -I (include list)
	#   $3 - file that receives a copy of the raw wget log
	# Outputs:
	#   unique URLs, sorted, one per line, on stdout
	#######################################
	collect_links() {
	  local start_url=$1 include_dir=$2 raw_log=$3

	  # The wget log is merged into stdout (2>&1); the lines starting with "--"
	  # carry the requested URL as their third whitespace-separated field.
	  # URLs ending in a static-asset extension are dropped.
	  wget -r --spider --delete-after --force-html -e robots=off \
	    -I "$include_dir" \
	    "$start_url" 2>&1 \
	    | tee "$raw_log" \
	    | grep '^--' \
	    | awk '{ print $3 }' \
	    | grep -Ev '\.(css|js|png|gif|jpg)$' \
	    | sort -u
	}

	#######################################
	# Fetch a page and print the text of its <title> element.
	# Arguments:
	#   $1 - page URL
	# Outputs:
	#   the entity-decoded title on stdout, whitespace runs squeezed to single
	#   spaces, no trailing newline; empty when no "<title>...</title>" is found
	#   on a single line of the response
	#######################################
	page_title() {
	  local url=$1 title

	  #To convert html entities you could also use one of the commands below
	  #instead of php:
	  #  recode html..ascii
	  #  perl -MHTML::Entities -pe 'decode_entities($_);'
	  title=$(
	    curl -s "$url" \
	      | grep -Po '(?<=<title>).*(?=</title>)' \
	      | php -r 'while(($line=fgets(STDIN)) !== FALSE) echo html_entity_decode($line, ENT_QUOTES|ENT_HTML401);' \
	      | tr -s '[:space:]' ' '
	  )

	  # tr turned every newline into a space; trim the edges.
	  title=${title# }
	  title=${title% }
	  printf '%s' "$title"
	}

	for directory in "${directories[@]}"; do
	  url="$domain$directory"
	  file="${directory#/}"

	  #Skip processing if file already exists
	  if [[ -f "$file.txt" ]]; then
	    continue
	  fi

	  echo "Processing $url"
	  echo "Collecting links"
	  collect_links "$url" "$directory" "$file-raw.txt" > "$file-links.txt"

	  echo "Parsing titles"

	  # Build the result under a temporary name and move it into place only once
	  # the section is complete; otherwise an interrupted run would leave a
	  # partial "$file.txt" that the skip check above treats as finished.
	  : > "$file.txt.part"

	  # printf (rather than xargs echo -e) keeps titles containing quotes,
	  # apostrophes or backslashes intact.
	  while IFS= read -r link || [[ -n "$link" ]]; do
	    printf '%s\t%s\n' "$link" "$(page_title "$link")" >> "$file.txt.part"
	  done < "$file-links.txt"

	  mv -- "$file.txt.part" "$file.txt"
	  rm -- "$file-raw.txt" "$file-links.txt"
	done
No results found