sunapi386 · June 24, 2018 21:56
diff --git a/storename-scraper.fish b/storename-scraper.fish
 #! /usr/bin/fish
 # Author: Jason Sun
 #
 # To extract and parse safeway website buyable items.
 # A tutorial in fish shell script.
 # I decided to do it in fish because I started playing with wget and checking out the data with vim, and
 # seems gluing linux commands were pretty straight forward, quick and easy. Obviously the downside is
 # this script is platform dependent. I'm using vanilla Ubuntu 16.04 Linux with the 4.4.0-127-generic kernel.
 #

 # Noticed the starting point can be from https://shop.safeway.com/aisles.1483.html
 # we can find all the aisles links in here by
 # $ grep -o -P '/aisles/.*.html' aisles.1483.html
 #
 # but there are duplicates, and nesting, e.g.
 # /aisles/baby-care.1483.html
 # /aisles/baby-care/baby-bath-skin-care.1483.html
 # /aisles/baby-care/diapers-wipes.1483.html
 # /aisles/baby-care/formula-baby-food.1483.html
 # /aisles/baby-care.1483.html
 # /aisles/beverages.1483.html
 # /aisles/beverages/juice-nectars.1483.html
 # /aisles/beverages/soft-drinks-seltzer.1483.html
 # /aisles/beverages/water-drinks.1483.html
 # /aisles/beverages.1483.html
 #
 # we need to get rid of the base category links,
 # e.g. /aisles/baby-care.1483.html
 # because any aisles with baby-care would have sub-content
 # e.g. /aisles/baby-care/baby-bath-skin-care.1483.html
 # notice the ones we want have at least 3 slashes '/', so look for at least 3 (more is okay)
 # $ grep '/.*/.*/'
 #
 # reconstruct the url by prepending the base url
 # $ awk '{print "https://shop.safeway.com" $0}'
 #
 # let's make this into a function called extract_links

 # extract_links FILE...
 # Print, one per line and without duplicates, every aisle URL found in FILE
 # whose path contains at least 3 '/' (i.e. it is nested below a base
 # category), with the site prefix prepended.
 function extract_links
   # [^"'<> ]*? keeps each match inside a single attribute and stops at the
   # first ".html", so several links on one HTML line come out as separate
   # matches (a greedy '.*' glues them into one bogus path); '\.' only
   # matches a literal dot.
   # awk splits on '/', so NF >= 4 means "at least 3 slashes".
   grep -o -P '/aisles/[^"\'<> ]*?\.html' $argv \
     | sort -u \
     | awk -F/ 'NF >= 4 {print "https://shop.safeway.com" $0}'
 end

 # note in fish shell, argv is a list of all the items, in here we assume it is 1
 #
 # from this extract_links, we get bunch of interesting links we want to explore, such as
 # https://shop.safeway.com/aisles/baby-care/baby-bath-skin-care.1483.html
 # now we want fetch the page and in detail links, ignoring less nested links
 # e.g.
 # https://shop.safeway.com/aisles/baby-care/baby-bath-skin-care.1483.html
 # https://shop.safeway.com/aisles/baby-care/baby-bath-skin-care/baby-body-wash.1483.html
 # https://shop.safeway.com/aisles/baby-care/baby-bath-skin-care/baby-lotion-oil.1483.html
 # we want the last 2, because the 1st one we got from calling extract_links already

 # extract_links_1_more_slash FILE...
 # Same as the aisle link extraction, but one nesting level deeper: print,
 # one per line and without duplicates, every aisle URL found in FILE whose
 # path contains at least 4 '/', with the site prefix prepended.
 function extract_links_1_more_slash
   # [^"'<> ]*? keeps each match inside a single attribute and stops at the
   # first ".html", so several links on one HTML line are matched separately
   # instead of being glued together by a greedy '.*'.
   # awk splits on '/', so NF >= 5 means "at least 4 slashes".
   grep -o -P '/aisles/[^"\'<> ]*?\.html' $argv \
     | sort -u \
     | awk -F/ 'NF >= 5 {print "https://shop.safeway.com" $0}'
 end

 # example result extract_links_1_more_slash:
 # https://shop.safeway.com/aisles/baby-care/baby-bath-skin-care/baby-body-wash.1483.html
 #
 # with this page, we find item descriptions.
 # e.g.
 # <input type="hidden" name="gridDataSource"
 # value="{&#34;ack&#34;:false,&#34;products&#34;:[{&#34;id&#34;:&#34;
 # #960120416&#34;,&#34;name&#34;:&#34;Ore-Ida Potatoes Spicy Seasoned With Skins
 # - 28 Oz&#34;,&#34;quantity&#34;:&#34;0& #34;,&#34;price&#34;:4.49,&#34;image&#
 # 34;:&#34;https://https://shop.safeway.com/productimages/100x100/960120416_100x
 # 100.jpg/productimages/100x100/960120416_100x100.jpg&#34;,&#34;unitOfMeasure&#3
 # 4;:&#34;OUNCE&#34;,&#34;pricePer&#34;:0.16,&#34;promoDescription&#34;:&#34;&#3
 # 4;,&#34;promoEndDate&#34;:&#34;01 Jan 1950 00:00:00&#34;,&#34;description&#34;
 # :null,&#34;salesRank&#34;:17778,&#34;aisleId&#34;:&#34;1_15_5_1&#34;,&#34;depa
 # rtmentName&#34;:&#34;Frozen Foods&#34;,&#34;aisleName&#34;:&#34;Frozen
 # Vegetables&#34;,&#34;shelfName&#34;:&#34;Frozen Potatoes &amp; Onions&#34;,&#3
 # 4;restrictedValue&#34;:0},{&#34;id&#34;:&#34;960096001&#34;,&#34;name&#34;:&#3
 # 4;Alexia Smart Classics Roasted Crinkle Cut Fries With Sea Salt - 32
 # Oz&#34;,&#34;quantity&
 #
 # extracting this one item can be done via regex, looking for expression `value="{.*}"`
 #
 # $ grep -o -P 'value="{.*}"' frozen-potatoes-onions.1483.html
 #
 # results in just the json contents nested within the `value="` and `"`, so we'll trim that
 #
 # $ sed 's/^value="//' | sed 's/"$//'
 #
 # then there is the encoding `&#34;` -> `"` mapping, so that needs to get replaced
 #
 # $ sed 's/&#34;/"/g'
 #
 # lastly make it nice human formatted with jq tool
 #
 # $ jq '.'
 #
 # putting it all together in a function:

 # extract_items_to_json FILE...
 # Pull the JSON stored in the value="{...}" attribute of FILE, undo the
 # HTML entity encoding and pretty-print the result with jq on stdout.
 function extract_items_to_json
   # Every quote inside the attribute is entity-encoded, so the value is
   # exactly the run of non-'"' characters between value=" and the closing
   # quote; \K and the lookahead drop those delimiters from the match.
   # &amp; is decoded last so that text such as "&amp;lt;" is not decoded twice.
   grep -o -P 'value="\K\{[^"]*\}(?=")' $argv \
     | sed -e 's/&#34;/"/g' -e 's/&quot;/"/g' -e "s/&#39;/'/g" \
           -e 's/&lt;/</g' -e 's/&gt;/>/g' -e 's/&amp;/\&/g' \
     | jq '.'
 end

 # with these functions, we can get detail list of all the aisles
 #
 # lastly, we need a helper to extract the page from a link
 # we'll use awk to split string on '/' and print the last element

 # extract_page_from_link LINK
 # Print the text that follows the last '/' of LINK (the page file name).
 function extract_page_from_link
   # split() returns the number of pieces, so parts[n] is the last one
   echo $argv | awk '{n = split($0, parts, "/"); print parts[n]}'
 end


 # lastly a few helpers to smooth out operations
 #
 # helpers
 #
 # a helper to sleep random time interval to avoid hitting the server too fast we might get in trouble

 # random_sleep
 # Announce and then sleep for a random whole number of seconds: a fixed
 # minimum of 2 plus (random) modulo 3.
 function random_sleep
   set min_delay 2
   set spread 3
   # the '%' stays inside quotes so the whole expression is a single argument
   set pause (math (random)"%$spread+$min_delay")
   echo "sleep $pause"
   sleep $pause
 end

 # a helper to both print to screen and write to log at the same time with the current timestamp

 # date_print_log MESSAGE...
 # Prefix MESSAGE with the current timestamp, print it, and append the same
 # line to last_run.log in the current directory.
 function date_print_log
   set stamp (date +%Y_%m_%d-%H:%M:%S_%Z)
   # tee -a shows the line on stdout and appends it to the log in one go
   echo "$stamp $argv" | tee -a last_run.log
 end

 # inspecting the dumped json, it seems there are some issues
 # 1. the image link seems malformed
 # 2. we get extra info, but perhaps this isn't so bad, so we'll ignore it for now.
 #   "ack": false,
 #   "products": [
 #     {
 # e.g.
 # "image": "https://https://shop.safeway.com/productimages/100x100/960131033_100x100.jpg/productimages/100x100/960131033_100x100.jpg"
 # using a regex this can be trimmed, we know this regex works because this grep returns the portion we want
 #
 # $ grep -oP 'https://shop.*?.jpg'
 #
 # "image": "https://shop.safeway.com/productimages/100x100/960131033_100x100.jpg"
 #
 # we need a buffer in order to fix this, so first we'll find all these image links, fix them, and put it back.
 # select all the image links with
 #
 # $ jq '.products[].image'
 #
 # but reading the manual, what we want is an update-assignment with |=
 # https://stedolan.github.io/jq/manual/
 # so that each value is replaced with just the part that matches the regex above

 # fix_json_image_link FILE
 # Rewrite every .products[].image in FILE so that it only holds the first
 # "https://shop....jpg" URL found in the malformed value. The untouched
 # input is kept as FILE.bak. Returns 1 (leaving FILE as it was) when no file
 # is given or jq fails.
 function fix_json_image_link
   if test (count $argv) -lt 1
     echo "fix_json_image_link: missing file argument" >&2
     return 1
   end
   set -l filename $argv[1]
   cp $filename $filename.bak
   echo "$filename -> $filename.bak"
   # jq's match() yields a match object, so .string is needed to store the
   # matched text itself; "// ." keeps the old value when nothing matches and
   # the type test leaves null/non-string images alone. [.] is a literal dot.
   if jq '.products[].image |= (if type == "string" then (match("https://shop.*?[.]jpg").string // .) else . end)' $filename > $filename.jq_tmpfile
     mv $filename.jq_tmpfile $filename
   else
     # never replace the data with a truncated temp file
     echo "fix_json_image_link: jq failed on $filename, leaving it unchanged" >&2
     rm -f $filename.jq_tmpfile
     return 1
   end
 end

 # in order to trim all the extra info and put it into one file, we'll make a function to put all them together
 # note the result in unified_db.json is not actually json because commas are missing between }{ characters
 # we'll fix it with
 #
 # $ sed '/}/c\},'
 #
 # I found this by trial and error; it works because sed's 'c\' command replaces every line that matches /}/ with the text "},". Got the idea from "Mr. T"
 # https://stackoverflow.com/questions/8822097/how-to-replace-whole-line-with-sed
 # note this leaves an extra comma at the end of the elements which is problematic,
 # but thankfully
 #
 # $ sed '$ s/.$//'
 #
 # removes it

 # summarize_this_json_to_single_file FILE...
 # Report which FILE is being processed, then append each element of its
 # "products" array to unified_db.json.in_progress in the current directory.
 function summarize_this_json_to_single_file
   printf '%s\n' "summarize_this_json_to_single_file: $argv"
   # emit the products one after another, without the surrounding array
   jq '.products | .[]' $argv >> unified_db.json.in_progress
 end

 ######
 # setup
 ######
 # Create a fresh, timestamped working directory and move into it.
 date_print_log "Start"
 set curr_date (date +%Y_%m_%d-%H:%M:%S_%Z)
 set new_dir_name "scrape-$curr_date"
 date_print_log "working in directory: $new_dir_name"
 # The rest of the script removes and moves files by wildcard, so it must
 # never run anywhere but inside the new directory: stop if it cannot be
 # created or entered.
 mkdir $new_dir_name
 and cd $new_dir_name
 or begin
   date_print_log "cannot create or enter $new_dir_name, aborting"
   exit 1
 end


 ######
 # main
 ######
 # Crawl the site: main aisles page -> aisle pages -> detailed aisle pages,
 # turning every detailed page into extract_items_to_json-<page>.json.
 # A page that fails to download is logged and skipped instead of being
 # parsed from a file that does not exist.
 if not wget -q https://shop.safeway.com/aisles.1483.html  # we start with the main aisles page
   date_print_log "could not download the main aisles page, aborting"
   exit 1
 end
 extract_links aisles.1483.html > extract_links.log  # kept on disk for debugging
 set aisles_links (cat extract_links.log)  # command substitution gives one list item per line
 set n_aisles_links (count $aisles_links)  # counter to track progress
 for a_l in $aisles_links  # explore each of the links from extract_links
   date_print_log "$n_aisles_links links remaining, currently: $a_l"
   if wget -q $a_l
     set a_l_file (extract_page_from_link $a_l)
     extract_links_1_more_slash $a_l_file > extract_links_1_more_slash-$a_l_file.log  # for debug
     set detailed_aisles_links (cat extract_links_1_more_slash-$a_l_file.log)
     set n_detailed_aisles_links (count $detailed_aisles_links)
     for d_a_l in $detailed_aisles_links
       date_print_log "$n_aisles_links : $n_detailed_aisles_links processing $d_a_l to json"
       if wget -q $d_a_l
         set d_a_l_file (extract_page_from_link $d_a_l)
         extract_items_to_json $d_a_l_file > extract_items_to_json-$d_a_l_file.json
         fix_json_image_link extract_items_to_json-$d_a_l_file.json
       else
         date_print_log "download failed, skipping: $d_a_l"
       end
       set n_detailed_aisles_links (math $n_detailed_aisles_links - 1)
       random_sleep  # stay polite to the server even after a failure
     end
   else
     date_print_log "download failed, skipping: $a_l"
   end
   set n_aisles_links (math $n_aisles_links - 1)  # decrement counter
   random_sleep
 end

 # Merge every per-page file into unified_db.json.
 rm -f unified_db.json unified_db.json.bak unified_db.json.in_progress
 touch unified_db.json.in_progress  # so the merge also works when nothing was scraped
 for j_file in extract_items_to_json-*.json  # glob directly instead of parsing ls
   summarize_this_json_to_single_file $j_file
 end
 # jq --slurp reads the concatenated product objects into one array, which
 # replaces the sed comma-patching recipe described in the comments above:
 # that recipe rewrote any line containing '}' (including one inside a
 # product name) and produced invalid JSON when there were no products.
 jq -s '{products: .}' unified_db.json.in_progress > unified_db.json
 cp unified_db.json unified_db.json.bak
 rm unified_db.json.in_progress

 set n_jsons (count *.json)
 date_print_log "Done. $n_jsons json files created."

 # cleanup: sort the artefacts into one directory per file type
 mkdir html json log bak
 mv *.html html
 mv *.json json
 mv *.log log
 mv *.bak bak
 rm *.1  # duplicates wget created when a file name was downloaded twice


 # ###
 # # debug version of main, basically don't spam the server with wget, just get 1 of each type of links
 # ###
 # wget https://shop.safeway.com/aisles.1483.html  # we start with the main aisles page
 # extract_links aisles.1483.html > extract_links.log  # optional, dump the links for debugging
 # set aisles_links (extract_links aisles.1483.html)  # run again because set converts newline to space
 # set n_aisles_links (count $aisles_links)  # counter to track progress
 # set a_l (echo $aisles_links | cut -d' ' -f1)  # -debug removed- for a_l in $aisles_links
 #   echo "$n_aisles_links links remaining, currently: $a_l"
 #   wget $a_l
 #   set a_l_file (extract_page_from_link $a_l)
 #   extract_links_1_more_slash $a_l_file > extract_links_1_more_slash-$a_l_file.log  # for debug
 #   set detailed_aisles_links (extract_links_1_more_slash $a_l_file)
 #   set n_detailed_aisles_links (count $detailed_aisles_links)
 #   set d_a_l (echo $detailed_aisles_links | cut -d' ' -f1)  # -debug removed- for d_a_l in $detailed_aisles_links
 #     echo "$n_aisles_links : $n_detailed_aisles_links processing $d_a_l to json"
 #     wget $d_a_l
 #     set d_a_l_file (extract_page_from_link $d_a_l)
 #     extract_items_to_json $d_a_l_file > extract_items_to_json-$d_a_l_file.json
 #     set n_detailed_aisles_links (math $n_detailed_aisles_links - 1)
 #     sleep 1
 #   # -debug removed- end
 #   set n_aisles_links (math $n_aisles_links - 1)  # decrement counter
 #   sleep 1  # don't do this too quickly or we might get banned
 # # -debug removed- end
	#! /usr/bin/fish
	# Author: Jason Sun
	#
	# To extract and parse safeway website buyable items.
	# A tutorial in fish shell script.
	# I decided to do it in fish because I started playing with wget and checking out the data with vim, and
	# seems gluing linux commands were pretty straight forward, quick and easy. Obviously the downside is
	# this script is platform dependent. I'm using vanilla Ubuntu 16.04 Linux with the 4.4.0-127-generic kernel.
	#

	# Noticed the starting point can be from https://shop.safeway.com/aisles.1483.html
	# we can find all the aisles links in here by
	# $ grep -o -P '/aisles/.*.html' aisles.1483.html
	#
	# but there are duplicates, and nesting, e.g.
	# /aisles/baby-care.1483.html
	# /aisles/baby-care/baby-bath-skin-care.1483.html
	# /aisles/baby-care/diapers-wipes.1483.html
	# /aisles/baby-care/formula-baby-food.1483.html
	# /aisles/baby-care.1483.html
	# /aisles/beverages.1483.html
	# /aisles/beverages/juice-nectars.1483.html
	# /aisles/beverages/soft-drinks-seltzer.1483.html
	# /aisles/beverages/water-drinks.1483.html
	# /aisles/beverages.1483.html
	#
	# we need to get rid of the base category links,
	# e.g. /aisles/baby-care.1483.html
	# because any aisles with baby-care would have sub-content
	# e.g. /aisles/baby-care/baby-bath-skin-care.1483.html
	# notice the ones we want have at least 3 slashes '/', so look for at least 3 (more is okay)
	# $ grep '/.*/.*/'
	#
	# reconstruct the url by prepending the base url
	# $ awk '{print "https://shop.safeway.com" $0}'
	#
	# let's make this into a function called extract_links

	# extract_links FILE...
	# Print, one per line and without duplicates, every aisle URL found in FILE
	# whose path contains at least 3 '/' (i.e. it is nested below a base
	# category), with the site prefix prepended.
	function extract_links
	  # The stages are joined by real pipes; an escaped '\|' is handed to grep
	  # as a literal argument and the pipeline never runs.
	  # [^"'<> ]*? keeps each match inside a single attribute and stops at the
	  # first ".html", so several links on one HTML line come out as separate
	  # matches; '\.' only matches a literal dot.
	  # awk splits on '/', so NF >= 4 means "at least 3 slashes".
	  grep -o -P '/aisles/[^"\'<> ]*?\.html' $argv \
	    | sort -u \
	    | awk -F/ 'NF >= 4 {print "https://shop.safeway.com" $0}'
	end

	# note in fish shell, argv is a list of all the items, in here we assume it is 1
	#
	# from this extract_links, we get bunch of interesting links we want to explore, such as
	# https://shop.safeway.com/aisles/baby-care/baby-bath-skin-care.1483.html
	# now we want fetch the page and in detail links, ignoring less nested links
	# e.g.
	# https://shop.safeway.com/aisles/baby-care/baby-bath-skin-care.1483.html
	# https://shop.safeway.com/aisles/baby-care/baby-bath-skin-care/baby-body-wash.1483.html
	# https://shop.safeway.com/aisles/baby-care/baby-bath-skin-care/baby-lotion-oil.1483.html
	# we want the last 2, because the 1st one we got from calling extract_links already

	# extract_links_1_more_slash FILE...
	# Same as the aisle link extraction, but one nesting level deeper: print,
	# one per line and without duplicates, every aisle URL found in FILE whose
	# path contains at least 4 '/', with the site prefix prepended.
	function extract_links_1_more_slash
	  # The stages are joined by real pipes; an escaped '\|' is handed to grep
	  # as a literal argument and the pipeline never runs.
	  # [^"'<> ]*? keeps each match inside a single attribute and stops at the
	  # first ".html", so several links on one HTML line are matched separately.
	  # awk splits on '/', so NF >= 5 means "at least 4 slashes".
	  grep -o -P '/aisles/[^"\'<> ]*?\.html' $argv \
	    | sort -u \
	    | awk -F/ 'NF >= 5 {print "https://shop.safeway.com" $0}'
	end

	# example result extract_links_1_more_slash:
	# https://shop.safeway.com/aisles/baby-care/baby-bath-skin-care/baby-body-wash.1483.html
	#
	# with this page, we find item descriptions.
	# e.g.
	# <input type="hidden" name="gridDataSource"
	# value="{&#34;ack&#34;:false,&#34;products&#34;:[{&#34;id&#34;:&#34;
	# #960120416&#34;,&#34;name&#34;:&#34;Ore-Ida Potatoes Spicy Seasoned With Skins
	# - 28 Oz&#34;,&#34;quantity&#34;:&#34;0& #34;,&#34;price&#34;:4.49,&#34;image&#
	# 34;:&#34;https://https://shop.safeway.com/productimages/100x100/960120416_100x
	# 100.jpg/productimages/100x100/960120416_100x100.jpg&#34;,&#34;unitOfMeasure&#3
	# 4;:&#34;OUNCE&#34;,&#34;pricePer&#34;:0.16,&#34;promoDescription&#34;:&#34;&#3
	# 4;,&#34;promoEndDate&#34;:&#34;01 Jan 1950 00:00:00&#34;,&#34;description&#34;
	# :null,&#34;salesRank&#34;:17778,&#34;aisleId&#34;:&#34;1_15_5_1&#34;,&#34;depa
	# rtmentName&#34;:&#34;Frozen Foods&#34;,&#34;aisleName&#34;:&#34;Frozen
	# Vegetables&#34;,&#34;shelfName&#34;:&#34;Frozen Potatoes &amp; Onions&#34;,&#3
	# 4;restrictedValue&#34;:0},{&#34;id&#34;:&#34;960096001&#34;,&#34;name&#34;:&#3
	# 4;Alexia Smart Classics Roasted Crinkle Cut Fries With Sea Salt - 32
	# Oz&#34;,&#34;quantity&
	#
	# extracting this one item can be done via regex, looking for expression `value="{.*}"`
	#
	# $ grep -o -P 'value="{.*}"' frozen-potatoes-onions.1483.html
	#
	# results in just the json contents nested within the `value="` and `"`, so we'll trim that
	#
	# $ sed 's/^value="//' | sed 's/"$//'
	#
	# then there is the encoding `&#34;` -> `"` mapping, so that needs to get replaced
	#
	# $ sed 's/&#34;/"/g'
	#
	# lastly make it nice human formatted with jq tool
	#
	# $ jq '.'
	#
	# putting it all together in a function:

	# extract_items_to_json FILE...
	# Pull the JSON stored in the value="{...}" attribute of FILE, undo the
	# HTML entity encoding and pretty-print the result with jq on stdout.
	function extract_items_to_json
	  # The stages are joined by real pipes; an escaped '\|' is handed to grep
	  # as a literal argument and the pipeline never runs.
	  # Every quote inside the attribute is entity-encoded, so the value is
	  # exactly the run of non-'"' characters between value=" and the closing
	  # quote; \K and the lookahead drop those delimiters from the match.
	  # &amp; is decoded last so that text such as "&amp;lt;" is not decoded twice.
	  grep -o -P 'value="\K\{[^"]*\}(?=")' $argv \
	    | sed -e 's/&#34;/"/g' -e 's/&quot;/"/g' -e "s/&#39;/'/g" \
	          -e 's/&lt;/</g' -e 's/&gt;/>/g' -e 's/&amp;/\&/g' \
	    | jq '.'
	end

	# with these functions, we can get detail list of all the aisles
	#
	# lastly, we need a helper to extract the page from a link
	# we'll use awk to split string on '/' and print the last element

	# extract_page_from_link LINK
	# Print the text that follows the last '/' of LINK (the page file name).
	function extract_page_from_link
	  # A real pipe feeds the link to awk; with an escaped '\|' echo would just
	  # print the awk command as text. -F/ splits on '/', $NF is the last field.
	  echo $argv | awk -F/ '{print $NF}'
	end


	# lastly a few helpers to smooth out operations
	#
	# helpers
	#
	# a helper to sleep random time interval to avoid hitting the server too fast we might get in trouble

	# random_sleep
	# Announce and then sleep for a random whole number of seconds: a fixed
	# minimum of 2 plus (random) modulo 3.
	function random_sleep
	  set min_delay 2
	  set spread 3
	  # the '%' stays inside quotes so the whole expression is a single argument
	  set pause (math (random)"%$spread+$min_delay")
	  echo "sleep $pause"
	  sleep $pause
	end

	# a helper to both print to screen and write to log at the same time with the current timestamp

	# date_print_log MESSAGE...
	# Prefix MESSAGE with the current timestamp, print it, and append the same
	# line to last_run.log in the current directory.
	function date_print_log
	  set stamp (date +%Y_%m_%d-%H:%M:%S_%Z)
	  # tee -a shows the line on stdout and appends it to the log in one go
	  echo "$stamp $argv" | tee -a last_run.log
	end

	# inspecting the dumped json, it seems there are some issues
	# 1. the image link seems malformed
	# 2. we get extra info, but perhaps this isn't so bad, so we'll ignore it for now.
	# "ack": false,
	# "products": [
	# {
	# e.g.
	# "image": "https://https://shop.safeway.com/productimages/100x100/960131033_100x100.jpg/productimages/100x100/960131033_100x100.jpg"
	# using a regex this can be trimmed, we know this regex works because this grep returns the portion we want
	#
	# $ grep -oP 'https://shop.*?.jpg'
	#
	# "image": "https://shop.safeway.com/productimages/100x100/960131033_100x100.jpg"
	#
	# we need a buffer in order to fix this, so first we'll find all these image links, fix them, and put it back.
	# select all the image links with
	#
	# $ jq '.products[].image'
	#
	# but reading the manual, what we want is an update-assignment with |=
	# https://stedolan.github.io/jq/manual/
	# so that each value is replaced with just the part that matches the regex above

	# fix_json_image_link FILE
	# Rewrite every .products[].image in FILE so that it only holds the first
	# "https://shop....jpg" URL found in the malformed value. The untouched
	# input is kept as FILE.bak. Returns 1 (leaving FILE as it was) when no file
	# is given or jq fails.
	function fix_json_image_link
	  if test (count $argv) -lt 1
	    echo "fix_json_image_link: missing file argument" >&2
	    return 1
	  end
	  set -l filename $argv[1]
	  cp $filename $filename.bak
	  echo "$filename -> $filename.bak"
	  # The jq operator is a plain |= (a backslash before it is a jq syntax
	  # error). jq's match() yields a match object, so .string is needed to
	  # store the matched text itself; "// ." keeps the old value when nothing
	  # matches and the type test leaves null/non-string images alone.
	  # [.] is a literal dot.
	  if jq '.products[].image |= (if type == "string" then (match("https://shop.*?[.]jpg").string // .) else . end)' $filename > $filename.jq_tmpfile
	    mv $filename.jq_tmpfile $filename
	  else
	    # never replace the data with a truncated temp file
	    echo "fix_json_image_link: jq failed on $filename, leaving it unchanged" >&2
	    rm -f $filename.jq_tmpfile
	    return 1
	  end
	end

	# in order to trim all the extra info and put it into one file, we'll make a function to put all them together
	# note the result in unified_db.json is not actually json because commas are missing between }{ characters
	# we'll fix it with
	#
	# $ sed '/}/c\},'
	#
	# I found this by trial and error; it works because sed's 'c\' command replaces every line that matches /}/ with the text "},". Got the idea from "Mr. T"
	# https://stackoverflow.com/questions/8822097/how-to-replace-whole-line-with-sed
	# note this leaves an extra comma at the end of the elements which is problematic,
	# but thankfully
	#
	# $ sed '$ s/.$//'
	#
	# removes it

	# summarize_this_json_to_single_file FILE...
	# Report which FILE is being processed, then append each element of its
	# "products" array to unified_db.json.in_progress in the current directory.
	function summarize_this_json_to_single_file
	  printf '%s\n' "summarize_this_json_to_single_file: $argv"
	  # emit the products one after another, without the surrounding array
	  jq '.products | .[]' $argv >> unified_db.json.in_progress
	end

	######
	# setup
	######
	# Create a fresh, timestamped working directory and move into it.
	date_print_log "Start"
	set curr_date (date +%Y_%m_%d-%H:%M:%S_%Z)
	set new_dir_name "scrape-$curr_date"
	date_print_log "working in directory: $new_dir_name"
	# The rest of the script removes and moves files by wildcard, so it must
	# never run anywhere but inside the new directory: stop if it cannot be
	# created or entered.
	mkdir $new_dir_name
	and cd $new_dir_name
	or begin
	  date_print_log "cannot create or enter $new_dir_name, aborting"
	  exit 1
	end


	######
	# main
	######
	# Crawl the site: main aisles page -> aisle pages -> detailed aisle pages,
	# turning every detailed page into extract_items_to_json-<page>.json.
	# A page that fails to download is logged and skipped instead of being
	# parsed from a file that does not exist.
	if not wget -q https://shop.safeway.com/aisles.1483.html  # we start with the main aisles page
	  date_print_log "could not download the main aisles page, aborting"
	  exit 1
	end
	extract_links aisles.1483.html > extract_links.log  # kept on disk for debugging
	set aisles_links (cat extract_links.log)  # command substitution gives one list item per line
	set n_aisles_links (count $aisles_links)  # counter to track progress
	for a_l in $aisles_links  # explore each of the links from extract_links
	  date_print_log "$n_aisles_links links remaining, currently: $a_l"
	  if wget -q $a_l
	    set a_l_file (extract_page_from_link $a_l)
	    extract_links_1_more_slash $a_l_file > extract_links_1_more_slash-$a_l_file.log  # for debug
	    set detailed_aisles_links (cat extract_links_1_more_slash-$a_l_file.log)
	    set n_detailed_aisles_links (count $detailed_aisles_links)
	    for d_a_l in $detailed_aisles_links
	      date_print_log "$n_aisles_links : $n_detailed_aisles_links processing $d_a_l to json"
	      if wget -q $d_a_l
	        set d_a_l_file (extract_page_from_link $d_a_l)
	        extract_items_to_json $d_a_l_file > extract_items_to_json-$d_a_l_file.json
	        fix_json_image_link extract_items_to_json-$d_a_l_file.json
	      else
	        date_print_log "download failed, skipping: $d_a_l"
	      end
	      set n_detailed_aisles_links (math $n_detailed_aisles_links - 1)
	      random_sleep  # stay polite to the server even after a failure
	    end
	  else
	    date_print_log "download failed, skipping: $a_l"
	  end
	  set n_aisles_links (math $n_aisles_links - 1)  # decrement counter
	  random_sleep
	end

	# Merge every per-page file into unified_db.json.
	rm -f unified_db.json unified_db.json.bak unified_db.json.in_progress
	touch unified_db.json.in_progress  # so the merge also works when nothing was scraped
	for j_file in extract_items_to_json-*.json  # glob directly instead of parsing ls
	  summarize_this_json_to_single_file $j_file
	end
	# jq --slurp reads the concatenated product objects into one array, which
	# replaces the sed comma-patching recipe described in the comments above:
	# that recipe rewrote any line containing '}' (including one inside a
	# product name) and produced invalid JSON when there were no products.
	jq -s '{products: .}' unified_db.json.in_progress > unified_db.json
	cp unified_db.json unified_db.json.bak
	rm unified_db.json.in_progress

	set n_jsons (count *.json)
	date_print_log "Done. $n_jsons json files created."

	# cleanup: sort the artefacts into one directory per file type
	mkdir html json log bak
	mv *.html html
	mv *.json json
	mv *.log log
	mv *.bak bak
	rm *.1  # duplicates wget created when a file name was downloaded twice


	# ###
	# # debug version of main, basically don't spam the server with wget, just get 1 of each type of links
	# ###
	# wget https://shop.safeway.com/aisles.1483.html # we start with the main aisles page
	# extract_links aisles.1483.html > extract_links.log # optional, dump the links for debugging
	# set aisles_links (extract_links aisles.1483.html) # run again because set converts newline to space
	# set n_aisles_links (count $aisles_links) # counter to track progress
	# set a_l (echo $aisles_links | cut -d' ' -f1) # -debug removed- for a_l in $aisles_links
	# echo "$n_aisles_links links remaining, currently: $a_l"
	# wget $a_l
	# set a_l_file (extract_page_from_link $a_l)
	# extract_links_1_more_slash $a_l_file > extract_links_1_more_slash-$a_l_file.log # for debug
	# set detailed_aisles_links (extract_links_1_more_slash $a_l_file)
	# set n_detailed_aisles_links (count $detailed_aisles_links)
	# set d_a_l (echo $detailed_aisles_links | cut -d' ' -f1) # -debug removed- for d_a_l in $detailed_aisles_links
	# echo "$n_aisles_links : $n_detailed_aisles_links processing $d_a_l to json"
	# wget $d_a_l
	# set d_a_l_file (extract_page_from_link $d_a_l)
	# extract_items_to_json $d_a_l_file > extract_items_to_json-$d_a_l_file.json
	# set n_detailed_aisles_links (math $n_detailed_aisles_links - 1)
	# sleep 1
	# # -debug removed- end
	# set n_aisles_links (math $n_aisles_links - 1) # decrement counter
	# sleep 1 # don't do this too quickly or we might get banned
	# # -debug removed- end
No results found