Created
April 2, 2025 17:15
-
-
Save madhurprash/1e6e8ea3f86b5d18c81c4d7374c79ea0 to your computer and use it in GitHub Desktop.
This gist helps get insights into the S3 URIs that the knowledge base has ingested as part of its data source.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json
import re
from typing import Any, Dict, List, Optional

import boto3
def get_detailed_s3_uris(
    knowledge_base_id: str,
    data_source_id: str,
    *,
    bedrock_client: Optional[Any] = None,
    s3_client: Optional[Any] = None,
) -> Dict[str, Any]:
    """
    Gets comprehensive S3 URI information for a specific data source in a knowledge base,
    including any sub-URIs and file paths.

    Args:
        knowledge_base_id: The unique identifier of the knowledge base
        data_source_id: The unique identifier of the data source
        bedrock_client: Optional pre-built 'bedrock-agent' client (e.g. bound to a
            specific region or profile). A default client is created when omitted.
        s3_client: Optional pre-built 's3' client. A default client is created
            when omitted, and only if a bucket is actually configured.

    Returns:
        Dictionary containing detailed S3 URI information:
            'main_info': data source name and ID, bucket name and ARN, the
                configured inclusion prefixes, and the base S3 URIs.
            'sub_uris': one entry per listed object with 'uri', 'size' and
                'last_modified' (ISO-8601 string, or None when unavailable).
            'errors': present only when listing failed for at least one prefix.

    Raises:
        Any exception raised by the get_data_source call propagates to the
        caller. Failures while listing S3 objects are recorded under 'errors'
        instead of being raised.
    """
    # Initialize the Bedrock client
    if bedrock_client is None:
        bedrock_client = boto3.client('bedrock-agent')

    # Make the API call to get data source information
    response = bedrock_client.get_data_source(
        knowledgeBaseId=knowledge_base_id,
        dataSourceId=data_source_id,
    )

    # Extract S3 configuration information
    data_source = response.get('dataSource', {})
    data_source_config = data_source.get('dataSourceConfiguration', {})
    s3_config = data_source_config.get('s3Configuration', {})

    # Get bucket information (the bucket name is the last ':'-separated ARN field)
    bucket_arn = s3_config.get('bucketArn', '')
    bucket_name = bucket_arn.split(':')[-1] if bucket_arn else ""
    inclusion_prefixes = s3_config.get('inclusionPrefixes', [])

    # A data source without inclusion prefixes covers the whole bucket, so fall
    # back to the empty prefix instead of reporting nothing.
    listing_prefixes = list(inclusion_prefixes) if inclusion_prefixes else ['']

    # Initialize result structure
    result: Dict[str, Any] = {
        'main_info': {
            'data_source_name': data_source.get('name', ''),
            'data_source_id': data_source_id,
            'bucket_name': bucket_name,
            'bucket_arn': bucket_arn,
            'inclusion_prefixes': inclusion_prefixes,
            'base_uris': (
                [f"s3://{bucket_name}/{prefix}" for prefix in listing_prefixes]
                if bucket_name
                else []
            ),
        },
        'sub_uris': [],
    }

    # Nothing to list when the data source has no S3 bucket configured
    if not bucket_name:
        return result

    # Create S3 client to list objects
    if s3_client is None:
        s3_client = boto3.client('s3')

    # Get sub-URIs (objects within the prefixes)
    for prefix in listing_prefixes:
        try:
            # List objects with this prefix
            paginator = s3_client.get_paginator('list_objects_v2')
            for page in paginator.paginate(Bucket=bucket_name, Prefix=prefix):
                for obj in page.get('Contents', []):
                    key = obj.get('Key', '')
                    if not key:
                        continue
                    # Skip only the folder placeholder of the prefix itself; a
                    # prefix that names a real object must still be reported.
                    if key == prefix and key.endswith('/'):
                        continue
                    last_modified = obj.get('LastModified')
                    # Add to sub-URIs list
                    result['sub_uris'].append({
                        'uri': f"s3://{bucket_name}/{key}",
                        'size': obj.get('Size', 0),
                        'last_modified': last_modified.isoformat() if last_modified else None,
                    })
        except Exception as e:
            # Best effort: record the failure and continue with the next prefix
            result.setdefault('errors', []).append(
                f"Error listing objects for prefix {prefix}: {e}"
            )

    return result
# Example usage for your knowledge base
if __name__ == "__main__":
    kb_id = ""  # Your knowledge base ID
    ds_id = ""  # Your data source ID from the previous output

    # Fail fast with a clear message instead of sending empty placeholder IDs
    # to the API and surfacing a parameter-validation error.
    if not kb_id or not ds_id:
        raise SystemExit(
            "Set kb_id and ds_id to your knowledge base ID and data source ID before running."
        )

    try:
        # Get detailed S3 URI information
        uri_info = get_detailed_s3_uris(kb_id, ds_id)
    except Exception as e:
        # Top-level boundary of the example script: report the failure and stop
        print(f"Error retrieving S3 URI information: {str(e)}")
    else:
        main_info = uri_info['main_info']

        # Print main information
        print(f"Data Source: {main_info['data_source_name']} (ID: {main_info['data_source_id']})")
        print(f"Bucket ARN: {main_info['bucket_arn']}")
        print("Base S3 URIs:")
        for uri in main_info['base_uris']:
            print(f" {uri}")

        # Print sub-URIs
        print("\nSub-URIs:")
        for i, uri_data in enumerate(uri_info['sub_uris'], 1):
            print(f"{i}. {uri_data['uri']}")
            print(f" Size: {uri_data['size']} bytes")
            print(f" Last Modified: {uri_data['last_modified']}")

        # Print any errors that occurred ('errors' is only present on failure)
        if 'errors' in uri_info:
            print("\nErrors encountered:")
            for error in uri_info['errors']:
                print(f"- {error}")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment