Last active
June 24, 2018 21:56
-
-
Save sunapi386/b82b2ea97ac704fcd0739328ed510af9 to your computer and use it in GitHub Desktop.
fish example web scraper tool
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #! /usr/bin/fish | |
| # Author: Jason Sun | |
| # | |
| # To extract and parse safeway website buyable items. | |
| # A tutorial in fish shell script. | |
| # I decided to do it in fish because I started by playing with wget and inspecting the data in vim, and | |
| # gluing Linux commands together turned out to be straightforward, quick and easy. The obvious downside is | |
| # that this script is platform dependent. I'm using vanilla Ubuntu 16.04 Linux with the 4.4.0-127-generic kernel. | |
| # | |
| # Noticed the starting point can be from https://shop.safeway.com/aisles.1483.html | |
| # we can find all the aisles links in here by | |
| # $ grep -o -P '/aisles/.*.html' aisles.1483.html | |
| # | |
| # but there are duplicates, and nesting, e.g. | |
| # /aisles/baby-care.1483.html | |
| # /aisles/baby-care/baby-bath-skin-care.1483.html | |
| # /aisles/baby-care/diapers-wipes.1483.html | |
| # /aisles/baby-care/formula-baby-food.1483.html | |
| # /aisles/baby-care.1483.html | |
| # /aisles/beverages.1483.html | |
| # /aisles/beverages/juice-nectars.1483.html | |
| # /aisles/beverages/soft-drinks-seltzer.1483.html | |
| # /aisles/beverages/water-drinks.1483.html | |
| # /aisles/beverages.1483.html | |
| # | |
| # we need to get rid of the base category links, | |
| # e.g. /aisles/baby-care.1483.html | |
| # because any aisles with baby-care would have sub-content | |
| # e.g. /aisles/baby-care/baby-bath-skin-care.1483.html | |
| # notice the ones we want have at least 3 slashes '/', so look for at least 3 (more is okay) | |
| # $ grep '/.*/.*/' | |
| # | |
| # reconstruct the url by prepending the base url | |
| # $ awk '{print "https://shop.safeway.com" $0}' | |
| # | |
| # let's make this into a function called extract_links | |
# Print every nested aisle link found in the given HTML page as an absolute
# URL, one per line, sorted and de-duplicated.
#   $argv - path of the downloaded HTML page to scan (one file is assumed)
function extract_links
    # The character class stops a match at quotes, angle brackets and
    # whitespace, so several links on the same HTML line are reported
    # separately instead of being merged into one greedy match; \. requires a
    # literal dot in front of "html".
    # The second grep keeps only paths with at least 3 slashes, which drops
    # base category links such as /aisles/baby-care.1483.html.
    grep -o -P '/aisles/[^"\'<>[:space:]]*\.html' $argv \
        | grep '/.*/.*/' \
        | sort -u \
        | awk '{print "https://shop.safeway.com" $0}'
end
| # note in fish shell, argv is a list of all the items, in here we assume it is 1 | |
| # | |
| # from this extract_links, we get bunch of interesting links we want to explore, such as | |
| # https://shop.safeway.com/aisles/baby-care/baby-bath-skin-care.1483.html | |
| # now we want to fetch each of these pages and extract its more detailed links, ignoring the less nested ones | |
| # e.g. | |
| # https://shop.safeway.com/aisles/baby-care/baby-bath-skin-care.1483.html | |
| # https://shop.safeway.com/aisles/baby-care/baby-bath-skin-care/baby-body-wash.1483.html | |
| # https://shop.safeway.com/aisles/baby-care/baby-bath-skin-care/baby-lotion-oil.1483.html | |
| # we want the last 2, because the 1st one we got from calling extract_links already | |
# Print every aisle link that is nested one level deeper (at least 4 slashes
# in the path) found in the given HTML page as an absolute URL, one per line,
# sorted and de-duplicated.
#   $argv - path of the downloaded HTML page to scan (one file is assumed)
function extract_links_1_more_slash
    # The character class stops a match at quotes, angle brackets and
    # whitespace, so several links on the same HTML line are reported
    # separately instead of being merged into one greedy match; \. requires a
    # literal dot in front of "html".
    # The second grep keeps only paths with at least 4 slashes, i.e. links
    # that are deeper than the ones with 3 slashes.
    grep -o -P '/aisles/[^"\'<>[:space:]]*\.html' $argv \
        | grep '/.*/.*/.*/' \
        | sort -u \
        | awk '{print "https://shop.safeway.com" $0}'
end
| # example result extract_links_1_more_slash: | |
| # https://shop.safeway.com/aisles/baby-care/baby-bath-skin-care/baby-body-wash.1483.html | |
| # | |
| # with this page, we find item descriptions. | |
| # e.g. | |
| # <input type="hidden" name="gridDataSource" | |
| # value="{&#34;ack&#34;:false,&#34;products&#34;:[{&#34;id&#34;: | |
| # &#34;960120416&#34;,&#34;name&#34;:&#34;Ore-Ida Potatoes Spicy Seasoned With Skins | |
| # - 28 Oz&#34;,&#34;quantity&#34;:&#34;0&#34;,&#34;price&#34;:4.49,&#34;image&#34;: | |
| # &#34;https://https://shop.safeway.com/productimages/100x100/960120416_100x100.jpg | |
| # /productimages/100x100/960120416_100x100.jpg&#34;,&#34;unitOfMeasure&#34;: | |
| # &#34;OUNCE&#34;,&#34;pricePer&#34;:0.16,&#34;promoDescription&#34;:&#34;&#34;, | |
| # &#34;promoEndDate&#34;:&#34;01 Jan 1950 00:00:00&#34;,&#34;description&#34; | |
| # :null,&#34;salesRank&#34;:17778,&#34;aisleId&#34;:&#34;1_15_5_1&#34;, | |
| # &#34;departmentName&#34;:&#34;Frozen Foods&#34;,&#34;aisleName&#34;:&#34;Frozen | |
| # Vegetables&#34;,&#34;shelfName&#34;:&#34;Frozen Potatoes & Onions&#34;, | |
| # &#34;restrictedValue&#34;:0},{&#34;id&#34;:&#34;960096001&#34;,&#34;name&#34;: | |
| # &#34;Alexia Smart Classics Roasted Crinkle Cut Fries With Sea Salt - 32 | |
| # Oz&#34;,&#34;quantity&#34;: ... (sample truncated) | |
| # | |
| # extracting this one item can be done via regex, looking for expression `value="{.*}"` | |
| # | |
| # $ grep -o -P 'value="{.*}"' frozen-potatoes-onions.1483.html | |
| # | |
| # results in just the json contents nested within the `value="` and `"`, so we'll trim that | |
| # | |
| # $ sed 's/^value="//' | sed 's/"$//' | |
| # | |
| # then the double quotes are HTML-encoded, i.e. `&#34;` has to be mapped back to `"`, so that needs to get replaced | |
| # | |
| # $ sed 's/&#34;/"/g' | |
| # | |
| # lastly make it nice human formatted with jq tool | |
| # | |
| # $ jq '.' | |
| # | |
| # putting it all together in a function: | |
# Extract the product JSON embedded in the value="{...}" attribute of the
# given page and print it pretty-formatted by jq.
#   $argv - path of the downloaded HTML page (one file is assumed)
function extract_items_to_json
    # 1. grep isolates the value="{...}" attribute
    # 2. sed strips the leading value=" and the trailing quote, then decodes
    #    the HTML entity &#34; back into a double quote so the text is JSON
    # 3. jq validates and pretty-prints the result
    # NOTE(review): only &#34; is decoded; any other HTML entity in the data
    # is passed through unchanged.
    grep -o -P 'value="{.*}"' $argv \
        | sed -e 's/^value="//' -e 's/"$//' -e 's/&#34;/"/g' \
        | jq '.'
end
| # with these functions, we can get detail list of all the aisles | |
| # | |
| # lastly, we need a helper to extract the page from a link | |
| # we'll use awk to split string on '/' and print the last element | |
# Print the page name of a link, i.e. whatever follows its last '/'.
#   $argv - the link to take the page name from
function extract_page_from_link
    # the greedy .*/ swallows everything up to and including the last slash
    echo $argv | sed 's|.*/||'
end
| # lastly a few helpers to smooth out operations | |
| # | |
| # helpers | |
| # | |
| # a helper to sleep random time interval to avoid hitting the server too fast we might get in trouble | |
# Sleep for a random whole number of seconds and print the chosen duration.
#   $argv[1] - optional minimum number of seconds (default 2)
#   $argv[2] - optional number of distinct values to pick from (default 3)
# With the defaults the sleep lasts 2, 3 or 4 seconds.
function random_sleep
    # -l keeps these variables local, so a global or universal variable with
    # the same name is never overwritten
    set -l base_time 2
    set -l random_range 3
    if test (count $argv) -ge 1
        set base_time $argv[1]
    end
    if test (count $argv) -ge 2
        set random_range $argv[2]
    end
    # a range below 1 would make the modulo below divide by zero
    if test $random_range -lt 1
        set random_range 1
    end
    set -l random_seconds (math (random)%$random_range+$base_time)
    echo "sleep $random_seconds"
    sleep $random_seconds
end
| # a helper to both print to screen and write to log at the same time with the current timestamp | |
# Prefix the message with the current timestamp, show it on screen and append
# the same line to last_run.log in the current directory.
#   $argv - the words of the message
function date_print_log
    set -l stamp (date +%Y_%m_%d-%H:%M:%S_%Z)
    # tee -a echoes the line to stdout and appends it to the log in one step
    echo "$stamp $argv" | tee -a last_run.log
end
| # inspecting the dumped json, it seems there are some issues | |
| # 1. the image link seems malformed | |
| # 2. we get extra info, but perhaps this isn't so bad, so we'll ignore it for now. | |
| # "ack": false, | |
| # "products": [ | |
| # { | |
| # e.g. | |
| # "image": "https://https://shop.safeway.com/productimages/100x100/960131033_100x100.jpg/productimages/100x100/960131033_100x100.jpg" | |
| # using a regex this can be trimmed, we know this regex works because this grep returns the portion we want | |
| # | |
| # $ grep -oP 'https://shop.*?.jpg' | |
| # | |
| # "image": "https://shop.safeway.com/productimages/100x100/960131033_100x100.jpg" | |
| # | |
| # we need a buffer in order to fix this, so first we'll find all these image links, fix them, and put it back. | |
| # select all the image links with | |
| # | |
| # $ jq '.products[].image' | |
| # | |
| # but reading the manual, what we want is an update-assignment with |= | |
| # https://stedolan.github.io/jq/manual/ | |
| # that replaces each value with just the part matched by the regex above | |
| # (jq's match returns an object; its .string field holds the matched text) | |
# Repair the doubled image URLs in a products JSON file, in place.
# A copy of the untouched file is kept next to it as <file>.bak.
#   $argv[1] - path of the JSON file holding a top-level "products" array
# Returns 1 and leaves the file as it was when jq fails.
function fix_json_image_link
    set -l filename $argv[1]
    cp $filename $filename.bak
    echo "$filename -> $filename.bak"
    # match() yields an object describing the match; .string is the matched
    # text itself. Values that are not strings, or that do not match, are
    # kept unchanged ("// ." falls back to the current value).
    if jq '.products[].image |= (if type == "string" then ((match("https://shop.*?[.]jpg").string) // .) else . end)' $filename > $filename.jq_tmpfile
        mv $filename.jq_tmpfile $filename
    else
        # do not replace the input with a partial or empty result
        rm -f $filename.jq_tmpfile
        return 1
    end
end
| # in order to trim all the extra info and put it into one file, we'll make a function to put all them together | |
| # note the result in unified_db.json is not actually json because commas are missing between }{ characters | |
| # we'll fix it with | |
| # | |
| # $ sed '/}/c\},' | |
| # | |
| # I found this by trial and error: the address /}/ selects every line that contains a '}', and the c command | |
| # replaces that whole line with '},'. Got the idea from "Mr. T" | |
| # https://stackoverflow.com/questions/8822097/how-to-replace-whole-line-with-sed | |
| # note this leaves an extra comma at the end of the elements which is problematic, | |
| # but thankfully | |
| # | |
| # $ sed '$ s/.$//' | |
| # | |
| # removes it | |
# Append every element of the "products" array of the given JSON file to
# unified_db.json.in_progress in the current directory, announcing the file
# on screen first.
#   $argv - path of the JSON file to take the products from
function summarize_this_json_to_single_file
    printf '%s\n' "summarize_this_json_to_single_file: $argv"
    # the result is a stream of JSON objects, not one JSON document
    jq '.products | .[]' $argv >> unified_db.json.in_progress
end
######
# setup
######
date_print_log "Start"
set curr_date (date +%Y_%m_%d-%H:%M:%S_%Z)
set new_dir_name "scrape-$curr_date"
date_print_log "working in directory: $new_dir_name"
# Stop when the working directory cannot be created or entered; otherwise the
# downloads and the cleanup at the end would run in the wrong directory.
mkdir $new_dir_name; or exit 1
cd $new_dir_name; or exit 1

######
# main
######
# Every wget call uses -O with the file name the script reads afterwards, so
# wget never stores a page under a "<name>.1" duplicate that would be ignored.
# NOTE(review): two links ending in the same page name share one local file
# (and one json file); the later download replaces the earlier one.
set index_page aisles.1483.html
if not wget -q -O $index_page https://shop.safeway.com/$index_page # we start with the main aisles page
    date_print_log "failed to download https://shop.safeway.com/$index_page"
    exit 1
end
extract_links $index_page > extract_links.log # dump the links, also useful for debugging
set aisles_links (cat extract_links.log)      # one list element per line
set n_aisles_links (count $aisles_links)      # counter to track progress
for a_l in $aisles_links                      # explore each link from extract_links
    date_print_log "$n_aisles_links links remaining, currently: $a_l"
    set a_l_file (extract_page_from_link $a_l)
    if wget -q -O $a_l_file $a_l
        extract_links_1_more_slash $a_l_file > extract_links_1_more_slash-$a_l_file.log # for debug
        set detailed_aisles_links (cat extract_links_1_more_slash-$a_l_file.log)
        set n_detailed_aisles_links (count $detailed_aisles_links)
        for d_a_l in $detailed_aisles_links
            date_print_log "$n_aisles_links : $n_detailed_aisles_links processing $d_a_l to json"
            set d_a_l_file (extract_page_from_link $d_a_l)
            if wget -q -O $d_a_l_file $d_a_l
                extract_items_to_json $d_a_l_file > extract_items_to_json-$d_a_l_file.json
                fix_json_image_link extract_items_to_json-$d_a_l_file.json
            else
                date_print_log "download failed, skipping: $d_a_l"
            end
            set n_detailed_aisles_links (math $n_detailed_aisles_links - 1)
            random_sleep
        end
    else
        date_print_log "download failed, skipping: $a_l"
    end
    set n_aisles_links (math $n_aisles_links - 1) # decrement counter
    random_sleep
end

# Merge the products of all per-page json files into unified_db.json.
# The file names are spelled out because a wildcard without matches makes fish
# refuse to run rm.
rm -f unified_db.json unified_db.json.in_progress unified_db.json.bak
touch unified_db.json.in_progress # so the merge below also works with zero products
for j_file in extract_items_to_json-*.json
    summarize_this_json_to_single_file $j_file
end
# --slurp reads the whole stream of product objects into one array, which is
# then wrapped as { "products": [...] }; jq also pretty-prints the result, so
# no line-based editing of the JSON text is needed.
jq -s '{products: .}' unified_db.json.in_progress > unified_db.json
cp unified_db.json unified_db.json.bak
rm unified_db.json.in_progress
set n_jsons (count *.json)
date_print_log "Done. $n_jsons json files created."

# cleanup: sort the produced files into one directory per type
mkdir html json log bak
mv *.html html
mv *.json json
mv *.log log
mv *.bak bak
| # ### | |
| # # debug version of main, basically don't spam the server with wget, just get 1 of each type of links | |
| # ### | |
| # wget https://shop.safeway.com/aisles.1483.html # we start with the main aisles page | |
| # extract_links aisles.1483.html > extract_links.log # optional, dump the links for debugging | |
| # set aisles_links (extract_links aisles.1483.html) # run again because set converts newline to space | |
| # set n_aisles_links (count $aisles_links) # counter to track progress | |
| # set a_l (echo $aisles_links | cut -d' ' -f1) # -debug removed- for a_l in $aisles_links | |
| # echo "$n_aisles_links links remaining, currently: $a_l" | |
| # wget $a_l | |
| # set a_l_file (extract_page_from_link $a_l) | |
| # extract_links_1_more_slash $a_l_file > extract_links_1_more_slash-$a_l_file.log # for debug | |
| # set detailed_aisles_links (extract_links_1_more_slash $a_l_file) | |
| # set n_detailed_aisles_links (count $detailed_aisles_links) | |
| # set d_a_l (echo $detailed_aisles_links | cut -d' ' -f1) # -debug removed- for d_a_l in $detailed_aisles_links | |
| # echo "$n_aisles_links : $n_detailed_aisles_links processing $d_a_l to json" | |
| # wget $d_a_l | |
| # set d_a_l_file (extract_page_from_link $d_a_l) | |
| # extract_items_to_json $d_a_l_file > extract_items_to_json-$d_a_l_file.json | |
| # set n_detailed_aisles_links (math $n_detailed_aisles_links - 1) | |
| # sleep 1 | |
| # # -debug removed- end | |
| # set n_aisles_links (math $n_aisles_links - 1) # decrement counter | |
| # sleep 1 # don't do this too quickly or we might get banned | |
| # # -debug removed- end | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment