Created
          September 2, 2022 20:49 
        
      - 
      
- 
        Save nickrsan/12a022b7793b08f8871f8669e4c38ed6 to your computer and use it in GitHub Desktop. 
    Download Items from Public Google Cloud Storage Bucket
  
        
  
    
      This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
      Learn more about bidirectional Unicode characters
    
  
  
    
  | import requests | |
| import re | |
| from pathlib import Path | |
def get_public_export_urls(bucket_name, prefix=""):
    """
    Lists the URLs of items in a *public* Google Storage bucket without using a GCloud login. Filters only to
    items whose key starts with the specified prefix.

    The bucket listing is paginated by the server, so this follows the listing until it is no longer marked
    as truncated rather than reading only the first page.

    :param bucket_name: name of the public bucket to list
    :param prefix: A prefix to use to filter items in the bucket - only URLs where the path matches this prefix will be returned - defaults to all files
    :return: list of urls
    :raises requests.HTTPError: if the listing request fails (e.g. the bucket does not exist or is not public)
    """
    base_url = "https://storage.googleapis.com/"
    request_url = f"{base_url}{bucket_name}/"

    # The listing comes back as XML - we don't need a full XML parse, just the values of a few elements.
    # Raw strings avoid the invalid-escape warnings that "\<" and "\/" trigger in normal string literals.
    # NOTE(review): keys containing XML-special characters (e.g. "&") will come back entity-escaped - confirm
    # whether that matters for the buckets this is used with.
    key_pattern = re.compile(r"<Key>(.*?)</Key>")
    truncated_pattern = re.compile(r"<IsTruncated>\s*true\s*</IsTruncated>", re.IGNORECASE)
    next_marker_pattern = re.compile(r"<NextMarker>(.*?)</NextMarker>")

    urls = []
    marker = None
    while True:
        params = {}
        if prefix:
            # let the server do the filtering so matching items beyond the first page aren't missed
            params["prefix"] = prefix
        if marker is not None:
            params["marker"] = marker

        # get the content of the bucket (it needs to be public)
        response = requests.get(request_url, params=params)
        # fail loudly instead of treating an error document as an empty listing
        response.raise_for_status()
        listing = response.text

        keys = key_pattern.findall(listing)
        # make them into full URLs with the bucket URL at the front; the startswith check is kept as a
        # safety net in case the server-side prefix filter is not honoured
        urls.extend(f"{request_url}{key}" for key in keys if key.startswith(prefix))

        if not truncated_pattern.search(listing):
            break

        # continue after the marker the server hands back, falling back to the last key we saw
        marker_match = next_marker_pattern.search(listing)
        if marker_match:
            next_marker = marker_match.group(1)
        else:
            next_marker = keys[-1] if keys else None

        # stop if we can't make progress, so a malformed response can't loop forever
        if not next_marker or next_marker == marker:
            break
        marker = next_marker

    return urls
def download_public_export(bucket_name, output_folder, prefix=""):
    """
    Downloads items from a *public* Google Storage bucket into a local folder. Only items whose key starts
    with the specified prefix are downloaded.

    Each item is saved under the last path segment of its key, so items in different "folders" of the bucket
    that share a file name will overwrite each other in the output folder.

    :param bucket_name: name of the public bucket to download from
    :param output_folder: folder to write the files into - created if it does not exist
    :param prefix: A prefix to use to filter items in the bucket - defaults to all files
    :return: None
    :raises requests.HTTPError: if a download request returns an error status
    """
    output_dir = Path(output_folder)
    output_dir.mkdir(parents=True, exist_ok=True)

    # get the urls of items in the bucket with the specified prefix
    urls = get_public_export_urls(bucket_name, prefix)
    for url in urls:
        filename = url.split("/")[-1]  # get the filename
        if not filename:
            # keys ending in "/" are folder placeholders - there is no file to write, and the output
            # path would resolve to the output folder itself
            continue

        output_path = output_dir / filename  # construct the output path

        # stream the body to disk in chunks so large items don't need to fit in RAM
        with requests.get(url, stream=True) as response:
            # don't save an error document as if it were the requested file
            response.raise_for_status()
            with open(output_path, "wb") as output_file:
                for chunk in response.iter_content(chunk_size=1024 * 1024):
                    output_file.write(chunk)
  
    Sign up for free
    to join this conversation on GitHub.
    Already have an account?
    Sign in to comment