Update blob content types - use a search index to generate a list of az storage cli cmds
from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient
from urllib.parse import urlparse
from progress.bar import Bar
from sys import platform
import re
import subprocess
import mimetypes
import sys
import os
import base64
import pdb
# ensure we have our mime types db
mimetypes.init()
# list of file extensions to skip processing for...we don't care about these
skip_extensions_list = [".fz", ".fits"]
# setup our base64 decode system cmd
if platform == "linux" or platform == "linux2":
    base_64_cmd = "echo {} | base64 -d"
elif platform == "darwin":
    base_64_cmd = "echo {} | base64 -D"
# not used since we fixed base64 decoding via the `fix_base64_encoded_string` function
def shell_base64_decode(encoded):
    decode_shell_cmd = base_64_cmd.format(encoded)
    decode_proc = subprocess.Popen(
        decode_shell_cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    stdout, stderr = decode_proc.communicate()
    blob_file_path = stdout.decode("utf-8")
    return blob_file_path
# Azure Search base64 encodes the storage path key, strips the '=' padding and
# appends a digit recording how many padding chars were removed;
# convert that back to correctly padded '=' base64 so it can be decoded
def fix_base64_encoded_string(base64_encoded):
    base64_prefix = base64_encoded[:-1]
    last_digit = base64_encoded[-1]
    padding_suffix = '=' * int(last_digit)
    return base64_prefix + padding_suffix
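# Illustrative example (assumed input, not taken from the index):
# fix_base64_encoded_string("aGVsbG8gd29ybGQ1") -> "aGVsbG8gd29ybGQ="
# which base64 decodes to "hello world".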
# We'll connect to our custom made gz subjects search index built over the
# galaxyzoosubjects storage account blobs and query it for blobs that still
# have the default 'application/octet-stream' content type.
index_name = "azureblob-index"
# get this API key from the azure portal
api_key = os.environ['SEARCH_API_KEY']
# setup an az cli cmd to update the storage blob data content type
az_cmd_prefix = "az storage blob update --auth-mode login --account-name galaxyzoosubjects"
# setup the output file for az cli commands to feed into GNU parallel
out_file_name = "www_gz_subjects_az_cli_fix_mime_type_cmds.txt"
out_file = open(out_file_name, "w")
# Create a SearchClient to send queries to our custom made gz subjects search index
endpoint = "https://gz-blob-searcher-take-2.search.windows.net/"
credential = AzureKeyCredential(api_key)
client = SearchClient(endpoint=endpoint,
                      index_name=index_name,
                      credential=credential)
# search index query setup
skip = 0  # used for paging combined with page_size
page_size = 50  # default value for $top
# search index query filter, only find the default content types for non-FITS files
search_filter = "metadata_storage_content_type eq 'application/octet-stream' and metadata_storage_file_extension ne '.fz' and metadata_storage_file_extension ne '.fits'"
total_index_count = client.search(search_text='*', include_total_count=True, filter=search_filter).get_count()
print("Found {} records in the index. Processing....".format(total_index_count))
# keep a count of the skipped files (.fits, etc)
skipped_file_count = 0
# setup our progress bar
bar = Bar('Processing the blob index', max=total_index_count)
# loop over all the docs in the index
while skip < total_index_count:
    # ALL docs that are filtered to the 'invalid' content-type
    # NOTE: for paging to work we must provide the top & skip params, without them you get an error :shrug:
    # azure.core.exceptions.HttpResponseError: Operation returned an invalid status 'Forbidden'
    # https://docs.microsoft.com/en-us/python/api/azure-search-documents/azure.search.documents.searchclient?view=azure-python
    # The number of search results to retrieve.
    # This can be used in conjunction with $skip to implement client-side paging of search results.
    # If results are truncated due to server-side paging, the response will include a continuation token that can be used to issue another Search request for the next page of results.
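    # Illustrative paging with page_size=50: skip=0 returns docs 1-50, skip=50 returns docs 51-100, and so on.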
    results = client.search(search_text='*', top=page_size, skip=skip, filter=search_filter)
    for result in results:
        # update the progress bar
        bar.next()
        # python's base64 module can't decode these encoded path strings as-is
        # (even though decoding on the terminal appeared to work), so system shell
        # cmds were used in the meantime. The root cause is Azure Search's key encoding, see
        # https://robertoprevato.github.io/Things-I-would-have-liked-to-know-about-Azure-Search/
        real_base_64_string = fix_base64_encoded_string(
            result["metadata_storage_path"])
        blob_file_path = base64.b64decode(real_base_64_string).decode("utf-8")
        file_extension = result["metadata_storage_file_extension"]
        # skip the fits files
        if file_extension in skip_extensions_list:
            # skip this entry as the fits files aren't for use in a browser
            skipped_file_count += 1
            continue
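        # mimetypes.types_map maps file extensions to MIME types,
        # e.g. '.jpeg' -> 'image/jpeg' (illustrative; exact entries depend on the local MIME db)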
        try:
            correct_mime_type = mimetypes.types_map[file_extension]
        except KeyError:
            error_msg = "Can't find the MIME type for the file path: {}\n with file extension: {}".format(
                blob_file_path, file_extension)
            # let's stop now and report the error
            sys.exit(error_msg)
        if correct_mime_type == result["metadata_storage_content_type"]:
            # skip this entry if the content type doesn't change
            continue
        # so we've got two options here:
        # 1. use the python sdk to loop through all the search index results iteratively (slow)
        # 2. use something like GNU parallel and our az storage cmd to update the content types
        #    e.g. a bash cmd like the one below would update the gz blob at the -n path with the correct mime type.
        # i'm going to pursue GNU parallel short term and see how it goes
        blob_url_parts = urlparse(blob_file_path)
        blob_url_path_parts = blob_url_parts.path.split("/")
        container_name = blob_url_path_parts[1]
        # get rid of the '' and container name in the list
        del blob_url_path_parts[0:2]
        blob_container_path = '/'.join(blob_url_path_parts)
        # $ az storage blob update --auth-mode login --account-name galaxyzoosubjects -c '$web' -n subjects/decals/thumbnail/J211326.08+005811.6_thumbnail.jpeg --content-type 'image/jpeg'
        az_cmd_suffix = "-c '{}' -n {} --content-type '{}'".format(
            container_name, blob_container_path, correct_mime_type)
        blob_update_az_cmd = "{} {}".format(az_cmd_prefix, az_cmd_suffix)
        # write the az cli cmd to the output file
        print(blob_update_az_cmd, file=out_file)
    # we've processed the result set, set up the next page (50)
    skip += page_size
# print out the total count
print("Found {} records in the index. Skipped {} files. Expecting {} unique rows in the output file {}".format(
    total_index_count, skipped_file_count, (total_index_count - skipped_file_count), out_file_name))
# cleanup our output file handle
out_file.close()
# finish the progress bar
bar.finish()
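# To run the generated commands, the output file can be fed to GNU parallel,
# e.g. (illustrative, adjust the job count as needed):
#   parallel --jobs 8 < www_gz_subjects_az_cli_fix_mime_type_cmds.txt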