- Python 3
- Pip 3
$ brew install python3
| # http://docs.wand-py.org/en/0.5.9/ | |
| # http://www.imagemagick.org/script/formats.php | |
| # brew install freetype imagemagick | |
| # pip3 install Pillow | |
| # brew install tesseract | |
| # pip3 install wand | |
| # pip3 install pyocr | |
| import pyocr.builders | |
| import requests | |
| from io import BytesIO |
| gz_buffer = BytesIO() | |
| json_buffer = StringIO() | |
| download_url = "{0}{1}/file".format(request_url, file_id) | |
| request_download = requests.request("GET", download_url, headers=json_header, stream=True) | |
| with zipfile.ZipFile(BytesIO(request_download.content), mode='r') as z: | |
| unzip_file = StringIO(z.read(z.infolist()[0]).decode('utf-8')) | |
| json_responses = json.load(unzip_file)['responses'] | |
| for response in json_responses: | |
| json_buffer.write(json.dumps(response)) |
| # List Comprehension: | |
| process_dict = dict([(attributes.filename, attributes.st_size) for attributes in file_list if attributes.filename.startswith('solcon')]) | |
| # Whitespace Generous: | |
| for attributes in file_list: | |
| if attributes.filename.startswith('solcon'): | |
| process_dict[attributes.filename] = attributes.st_size |
| SET hive.execution.engine = mr; | |
| SET hive.support.concurrency = false; | |
| SET hive.exec.parallel = true; | |
| SET hive.exec.dynamic.partition.mode=nonstrict; | |
| USE hosting_stats; | |
| WITH Rank AS ( | |
| SELECT | |
| cid |
| WITH Landing AS ( | |
| SELECT | |
| visit_id | |
| ,COLLECT_SET(shopper_id) AS shopper_array | |
| ,MIN(sequence) AS min_sequence | |
| FROM | |
| visits | |
| WHERE | |
| page_type = 'landing' | |
| GROUP BY |
| CHECK_HDFS="/some/path/to/file" | |
| function hdfsCheck { | |
| RETRY=0 | |
| while [ $RETRY -lt 9 ]; | |
| do | |
| COUNT=$(hdfs dfs -ls "${CHECK_HDFS}" | wc -l) 2> stderr.txt | |
| if [ $COUNT -lt 1 ]; then |
| if [ $(date +"%-H") -ge 4 ] && [ $(date +"%-H") -le 17 ]; then | |
| sleep $(((17 - $(date +"%-H")) * 60))m | |
| else | |
| sleep 5m | |
| fi |