Skip to content

Instantly share code, notes, and snippets.

@madhurprash
Created April 2, 2025 17:15
Show Gist options
  • Save madhurprash/1e6e8ea3f86b5d18c81c4d7374c79ea0 to your computer and use it in GitHub Desktop.
Save madhurprash/1e6e8ea3f86b5d18c81c4d7374c79ea0 to your computer and use it in GitHub Desktop.
This gist helps you get insights into the S3 URIs that the knowledge base has ingested as part of its data source.
import boto3
import json
from typing import Dict, List, Optional
import re
def get_detailed_s3_uris(
    knowledge_base_id: str,
    data_source_id: str
) -> Dict:
    """
    Gets comprehensive S3 URI information for a specific data source in a
    knowledge base, including any sub-URIs and file paths.

    The data source is looked up through the ``bedrock-agent`` client, its S3
    configuration is read, and every object found under the configured
    inclusion prefixes is listed with ``list_objects_v2``. When the data
    source has a bucket but no inclusion prefixes, the whole bucket is
    treated as the single base URI and listed (empty prefix), instead of
    silently returning no objects.

    Args:
        knowledge_base_id: The unique identifier of the knowledge base
        data_source_id: The unique identifier of the data source

    Returns:
        Dictionary with the keys:
            'main_info': data source name and ID, bucket name and ARN, the
                configured inclusion prefixes, and the derived base URIs.
            'sub_uris': one entry per listed object with 'uri', 'size' and
                'last_modified' (ISO-8601 string, or None when missing).
            'errors': present only when listing a prefix failed; a list of
                human-readable error messages.

    Raises:
        Any exception raised while creating the clients or calling
        ``get_data_source`` propagates to the caller. Failures while listing
        objects are recorded under 'errors' rather than raised.
    """
    # Look up the data source definition
    bedrock_client = boto3.client('bedrock-agent')
    response = bedrock_client.get_data_source(
        knowledgeBaseId=knowledge_base_id,
        dataSourceId=data_source_id
    )

    # Drill down to the S3 configuration; every level may be absent
    data_source = response.get('dataSource', {})
    data_source_config = data_source.get('dataSourceConfiguration', {})
    s3_config = data_source_config.get('s3Configuration', {})

    # The bucket name is the last ':'-separated field of the bucket ARN
    bucket_arn = s3_config.get('bucketArn', '')
    bucket_name = bucket_arn.split(':')[-1] if bucket_arn else ""
    inclusion_prefixes = s3_config.get('inclusionPrefixes') or []

    # No configured prefixes means the whole bucket is in scope, which is
    # expressed to S3 as the empty prefix. Without a bucket there is nothing
    # to list at all.
    if not bucket_name:
        prefixes_to_list = []
    elif inclusion_prefixes:
        prefixes_to_list = list(inclusion_prefixes)
    else:
        prefixes_to_list = ['']

    result = {
        'main_info': {
            'data_source_name': data_source.get('name', ''),
            'data_source_id': data_source_id,
            'bucket_name': bucket_name,
            'bucket_arn': bucket_arn,
            'inclusion_prefixes': inclusion_prefixes,
            'base_uris': [f"s3://{bucket_name}/{prefix}" for prefix in prefixes_to_list]
        },
        'sub_uris': []
    }

    if not prefixes_to_list:
        return result

    # One client and one paginator serve every prefix
    s3_client = boto3.client('s3')
    paginator = s3_client.get_paginator('list_objects_v2')

    for prefix in prefixes_to_list:
        try:
            for page in paginator.paginate(Bucket=bucket_name, Prefix=prefix):
                for obj in page.get('Contents', []):
                    key = obj.get('Key', '')
                    if not key or key == prefix:
                        # Skip the prefix itself
                        continue
                    last_modified = obj.get('LastModified')
                    result['sub_uris'].append({
                        'uri': f"s3://{bucket_name}/{key}",
                        'size': obj.get('Size', 0),
                        'last_modified': last_modified.isoformat() if last_modified else None
                    })
        except Exception as e:
            # Best effort: record the failure and continue with the next prefix
            result.setdefault('errors', []).append(
                f"Error listing objects for prefix {prefix}: {str(e)}"
            )

    return result
# Example usage: fill in the two identifiers below, then run this file directly
if __name__ == "__main__":
    kb_id = ""  # Your knowledge base ID
    ds_id = ""  # Your data source ID from the previous output
    try:
        # Fetch the report for the configured data source
        uri_info = get_detailed_s3_uris(kb_id, ds_id)
        summary = uri_info['main_info']

        # Header: which data source and bucket the report is about
        print(f"Data Source: {summary['data_source_name']} (ID: {summary['data_source_id']})")
        print(f"Bucket ARN: {summary['bucket_arn']}")
        print("Base S3 URIs:")
        for base_uri in summary['base_uris']:
            print(f" {base_uri}")

        # Numbered list of every object that was found
        print("\nSub-URIs:")
        for position, entry in enumerate(uri_info['sub_uris'], start=1):
            print(f"{position}. {entry['uri']}")
            print(f" Size: {entry['size']} bytes")
            print(f" Last Modified: {entry['last_modified']}")

        # Listing problems are reported last, only when there were any
        if 'errors' in uri_info:
            print("\nErrors encountered:")
            for message in uri_info['errors']:
                print(f"- {message}")
    except Exception as exc:
        print(f"Error retrieving S3 URI information: {exc}")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment