Last active
April 4, 2024 23:18
-
-
Save rmhrisk/2334a56d4a7034cd23562e1ada253585 to your computer and use it in GitHub Desktop.
Look at CA distribution by region and country
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import requests | |
from cryptography import x509 | |
from cryptography.hazmat.backends import default_backend | |
from cryptography.hazmat.primitives import hashes | |
from io import StringIO | |
from datetime import datetime, timezone | |
import time | |
import matplotlib.pyplot as plt | |
# Two-letter country codes grouped by region. Each set is matched against the
# country ('C') attribute read from certificate subjects to compute the
# per-region share of CA owners.
european_countries = {
    'AT', 'BE', 'BG', 'CY', 'CZ', 'DE', 'DK', 'EE', 'ES',
    'FI', 'FR', 'GR', 'HR', 'HU', 'IE', 'IT', 'LT', 'LU',
    'LV', 'MT', 'NL', 'PL', 'PT', 'RO', 'SE', 'SI', 'SK',
}
north_america_countries = {'US', 'CA', 'MX'}
apac_countries = {
    'AU', 'CN', 'JP', 'IN', 'NZ', 'SG',
    'KR', 'TH', 'ID', 'MY', 'PH', 'VN',
}
latin_america_countries = {
    'AR', 'BR', 'CL', 'CO', 'PE', 'VE', 'PR', 'UY', 'EC',
    'GT', 'CU', 'BO', 'HN', 'PY', 'SV', 'NI', 'CR', 'PA',
}
africa_countries = {
    'ZA', 'NG', 'EG', 'DZ', 'MA', 'KE', 'ET', 'GH', 'TZ', 'CI',
    'UG', 'TN', 'ML', 'ZW', 'SD', 'AO', 'LY', 'CM', 'NA', 'BF',
}
def download_csv(url):
    """Download a CSV document and return it as an in-memory text buffer.

    Args:
        url: Address of the CSV resource to fetch.

    Returns:
        A ``StringIO`` wrapping the response body on success, or ``None``
        when the request cannot be completed or the server answers with an
        error status. Failures are reported on stdout.
    """
    print(f"Downloading CSV from {url}...")
    try:
        # Without a timeout a stalled server would hang the script forever.
        response = requests.get(url, timeout=60)
    except requests.RequestException as e:
        # Connection errors and timeouts are reported the same way as HTTP
        # errors so callers only have to check for None.
        print(f"Failed to download CSV: {e}")
        return None
    if not response.ok:
        print(f"Failed to download CSV. Status code: {response.status_code}")
        return None
    print("Download complete.")
    return StringIO(response.text)
def compute_fingerprint(pem_data):
    """Return the SHA-256 fingerprint of a PEM certificate as uppercase hex.

    Any failure (bad input type, unparseable PEM, ...) is printed and turned
    into a ``None`` return value instead of being raised.
    """
    try:
        pem_bytes = pem_data.encode()
        certificate = x509.load_pem_x509_certificate(pem_bytes, default_backend())
        digest = certificate.fingerprint(hashes.SHA256())
    except Exception as e:
        print(f"Error computing fingerprint: {e}")
        return None
    return digest.hex().upper()
def extract_country_from_certificate(pem_data):
    """Return the first country attribute of a PEM certificate's subject.

    Returns ``"Not Available"`` when the subject has no country attribute and
    ``"Error"`` (after printing the cause) when the certificate cannot be
    processed.
    """
    try:
        certificate = x509.load_pem_x509_certificate(pem_data.encode(), default_backend())
        country_attributes = certificate.subject.get_attributes_for_oid(
            x509.NameOID.COUNTRY_NAME
        )
        if not country_attributes:
            return "Not Available"
        return country_attributes[0].value
    except Exception as e:
        print(f"Error extracting country from certificate: {e}")
        return "Error"
def search_gleif(ca_owner_name, country_code):
    """Look up a CA owner in the GLEIF LEI-records API.

    Only the first record returned by the API is used.

    Args:
        ca_owner_name: Entity name to search for.
        country_code: Country code used to filter on the legal address, or
            the string ``"Unknown"`` to search without a country filter.

    Returns:
        A 4-tuple ``(status, lei, legal_entity_name, complete_address)``.
        ``status`` is ``"Found"``, ``"Not Found"`` or ``"Failed"``; the other
        three items are ``None`` unless the status is ``"Found"``.
    """
    if country_code == "Unknown":
        print(f"Searching GLEIF for '{ca_owner_name}' without specifying country...")
    else:
        print(f"Searching GLEIF for '{ca_owner_name}' in country '{country_code}'...")

    url = "https://api.gleif.org/api/v1/lei-records"
    params = {
        "page[size]": 1,
        "page[number]": 1,
        "filter[entity.names]": ca_owner_name,
    }
    if country_code != "Unknown":
        params["filter[entity.legalAddress.country]"] = country_code
    headers = {"Accept": "application/vnd.api+json"}

    try:
        # The timeout keeps one stalled lookup from blocking the whole run.
        response = requests.get(url, headers=headers, params=params, timeout=30)
    except requests.RequestException as e:
        print(f"Failed to search GLEIF: {e}")
        return "Failed", None, None, None

    if response.status_code != 200:
        print(f"Failed to search GLEIF. Status code: {response.status_code}")
        return "Failed", None, None, None

    try:
        payload = response.json()
    except ValueError as e:
        # A 200 response with a non-JSON body is treated as a failed lookup.
        print(f"Failed to search GLEIF: invalid JSON response ({e})")
        return "Failed", None, None, None

    records = payload.get('data')
    if not records:
        return "Not Found", None, None, None

    record = records[0]
    lei = record['id']
    entity = record['attributes']['entity']
    legal_entity_name = entity['legalName']['name']
    legal_address = entity['legalAddress']
    # `or []` tolerates an explicit null for addressLines in the response.
    address_lines = ", ".join(
        line for line in (legal_address.get('addressLines') or []) if line
    )
    address_parts = [
        address_lines,
        legal_address.get('city', ''),
        legal_address.get('region', ''),
        legal_address.get('country', ''),
        legal_address.get('postalCode', ''),
    ]
    # Empty or missing components are dropped so no stray separators appear.
    complete_address = ", ".join(filter(None, address_parts))
    return "Found", lei, legal_entity_name, complete_address
def is_certificate_valid(pem_data):
    """Report whether a PEM certificate has not yet expired.

    Only the end of the validity period is checked. Input that cannot be
    parsed is reported on stdout and treated as not valid, matching the
    best-effort behaviour of the other certificate helpers in this file.

    Args:
        pem_data: PEM-encoded certificate as a string.

    Returns:
        ``True`` if the current UTC time is before the certificate's
        not-valid-after time, otherwise ``False``.
    """
    try:
        cert = x509.load_pem_x509_certificate(pem_data.encode(), default_backend())
        # not_valid_after_utc is timezone-aware but only exists in newer
        # cryptography releases; older ones expose a naive UTC datetime.
        expires_at = getattr(cert, "not_valid_after_utc", None)
        if expires_at is None:
            expires_at = cert.not_valid_after.replace(tzinfo=timezone.utc)
        return datetime.now(timezone.utc) < expires_at
    except Exception as e:
        print(f"Error checking certificate validity: {e}")
        return False
def calculate_region_percentage(country_data, region_countries):
    """Return the percentage of entries in `country_data` that belong to a region.

    Args:
        country_data: Sized collection of country codes (duplicates count).
        region_countries: Container of the country codes making up the region.

    Returns:
        The share as a float between 0 and 100, or ``0`` when `country_data`
        is empty.
    """
    total_entries = len(country_data)
    if total_entries <= 0:
        return 0
    matching_entries = 0
    for code in country_data:
        if code in region_countries:
            matching_entries += 1
    return (matching_entries / total_entries) * 100
def generate_pie_chart(data, title, labels, explode=None):
    """Display a pie chart with a legend listing each label's share.

    Args:
        data: Numeric values, one per wedge.
        title: Chart title.
        labels: Wedge labels, in the same order as `data`.
        explode: Optional per-wedge offsets passed through to ``ax.pie``.

    Returns:
        None. When the values sum to zero (including empty `data`) there is
        nothing meaningful to draw, so a message is printed and no chart is
        shown.
    """
    # Computed once: it is both the zero guard and the legend denominator.
    total = sum(data)
    if not total:
        print(f"No data to plot for '{title}'.")
        return

    fig, ax = plt.subplots(figsize=(12, 7))
    wedges, _texts, _autotexts = ax.pie(
        data, explode=explode, labels=labels, startangle=90, autopct='%1.1f%%'
    )
    ax.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.
    plt.title(title)

    # Legend entries repeat each label with its share, truncated to a whole percent.
    legend_labels = [
        f"{label} - {int(value / total * 100)}%" for label, value in zip(labels, data)
    ]
    # Anchor the legend just outside the right edge of the axes.
    plt.legend(
        wedges,
        legend_labels,
        title="Certificate Authorities",
        loc="center left",
        bbox_to_anchor=(1, 0.5),
    )
    # Reserve space on the right so the legend is not cut off.
    plt.tight_layout(rect=[0, 0, 0.85, 1])
    plt.show()
def generate_ca_address_table_from_url(ca_url, roots_url):
    """Correlate CA owners with GLEIF records and report their distribution.

    Downloads both CSVs and, for every certificate in the roots CSV that has
    not expired, looks up the owning CA in the certificate-records CSV by
    SHA-256 fingerprint. Each distinct (owner, country) pair is searched once
    in GLEIF. The results are printed as a markdown table, followed by region
    and country pie charts and summary statistics.

    Args:
        ca_url: URL of a CSV with 'SHA-256 Fingerprint' and 'CA Owner' columns.
        roots_url: URL of a CSV with a 'PEM' column holding the certificates.

    Returns:
        None. Returns early if either download fails.
    """
    ca_csv_data = download_csv(ca_url)
    if ca_csv_data is None:
        return
    ca_data = pd.read_csv(ca_csv_data)

    roots_csv_data = download_csv(roots_url)
    if roots_csv_data is None:
        return
    roots_data = pd.read_csv(roots_csv_data)

    roots_data.columns = roots_data.columns.str.strip()
    roots_data['Computed Fingerprint'] = roots_data['PEM'].apply(compute_fingerprint)
    roots_data['CERT:COUNTRY'] = roots_data['PEM'].apply(extract_country_from_certificate)
    roots_data['IsValid'] = roots_data['PEM'].apply(is_certificate_valid)

    # One fingerprint -> owner mapping replaces a full DataFrame scan per row;
    # keeping the first duplicate matches the previous `.iloc[0]` behaviour.
    owner_by_fingerprint = (
        ca_data.drop_duplicates(subset='SHA-256 Fingerprint')
        .set_index('SHA-256 Fingerprint')['CA Owner']
        .to_dict()
    )

    ca_owner_search_results = []
    searched_combinations = set()
    country_data = []
    # Owners, not (owner, country) pairs, so the LEI percentage cannot exceed 100%.
    owners_with_lei = set()

    for _, row in roots_data.iterrows():
        if not row['IsValid']:
            continue

        fingerprint = row['Computed Fingerprint']
        ca_owner = owner_by_fingerprint.get(fingerprint)
        if pd.isna(ca_owner):
            # No matching record (or no owner recorded): nothing to search for.
            print(f"No CA owner found for fingerprint {fingerprint}; skipping.")
            continue

        # Both sentinels from extract_country_from_certificate mean that no
        # usable country code is available.
        country_code = row['CERT:COUNTRY']
        if country_code in ("Not Available", "Error"):
            country_code = "Unknown"

        search_key = (ca_owner, country_code)
        if search_key in searched_combinations:
            continue
        searched_combinations.add(search_key)

        status, lei, legal_entity_name, complete_address = search_gleif(ca_owner, country_code)
        ca_owner_search_results.append({
            "CCADB:OWNER": ca_owner,
            "CERT:COUNTRY": country_code if country_code != "Unknown" else None,
            "GLEIF:STATUS": status,
            "GLEIF:legalEntityName": legal_entity_name,
            "GLEIF:LEI": lei,
            "GLEIF:legalAddress": complete_address,
        })
        if country_code != "Unknown":
            country_data.append(country_code)
        if lei:
            owners_with_lei.add(ca_owner)
        time.sleep(1)  # Respectful delay for API calls

    print(pd.DataFrame(ca_owner_search_results).to_markdown(index=False))

    region_sets = {
        "Europe": european_countries,
        "North America": north_america_countries,
        "APAC": apac_countries,
        "Latin America": latin_america_countries,
        "Africa": africa_countries,
    }
    region_counts = {
        region: calculate_region_percentage(country_data, countries)
        for region, countries in region_sets.items()
    }

    # Pie chart for regions. Countries outside the five region sets are not
    # represented, so the wedges show each region's share among these regions.
    region_data = list(region_counts.values())
    regions = ['European Union', 'North America', 'APAC', 'Latin America', 'Africa']
    if any(region_data):
        generate_pie_chart(region_data, 'CA Owners by Region', regions)

    # Pie chart for countries.
    if country_data:
        country_counts = (
            pd.Series(country_data)
            .value_counts()
            .rename_axis('Country')
            .reset_index(name='Counts')
        )
        labels = country_counts['Country'].tolist()
        data = country_counts['Counts'].tolist()
        generate_pie_chart(data, 'CA Owner By Country', labels)

    print("\nRegional Distribution of CAs:")
    for region, percentage in region_counts.items():
        print(f"{region}: {percentage:.2f}%")

    # Share of the countries listed in the region sets that appear at least once.
    sovereign_nations_with_ca = len(set(country_data))
    total_sovereign_nations = sum(len(countries) for countries in region_sets.values())
    percentage_sovereign_nations = (sovereign_nations_with_ca / total_sovereign_nations) * 100
    print(f"\nPercentage of Sovereign Nations with a CA Owner: {percentage_sovereign_nations:.2f}%")

    # LEI summary, counted per unique CA owner.
    lei_count = len(owners_with_lei)
    total_cas = len({result['CCADB:OWNER'] for result in ca_owner_search_results})
    lei_percentage = (lei_count / total_cas) * 100 if total_cas > 0 else 0
    print(f"\nNumber of CAs with an LEI: {lei_count}")
    print(f"Percentage of CAs with an LEI: {lei_percentage:.2f}%")
# CCADB CSV exports used as input: the certificate records export (maps
# SHA-256 fingerprints to CA owners) and, judging by its URL, the roots
# included in Mozilla's program with the "Websites" trust bit.
ca_url = 'https://ccadb.my.salesforce-sites.com/ccadb/AllCertificateRecordsCSVFormatv2'
roots_url = 'https://ccadb.my.salesforce-sites.com/mozilla/IncludedRootsDistrustTLSSSLPEMCSV?TrustBitsInclude=Websites'

# Only run the analysis when executed as a script, so importing this module
# does not trigger downloads, API calls or plotting.
if __name__ == "__main__":
    generate_ca_address_table_from_url(ca_url, roots_url)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
I wanted to examine the distribution of CAs by country and region. However, the CCADB does not contain the necessary information, so I decided to see if I could correlate the data from CCADB with LEI entries, since they offer an open API.
The method I am using is flawed because I have not devised a reliable way to match a CA owner to a legal entity and, ultimately, to its physical address in GLEIF. Despite this, there is still some intriguing data here. For instance, I am using the 'C' field in the intermediate certificates to determine both the country and the region. This also provides a straightforward view into the countries associated with the intermediate certificates linked to a CA owner.
The layout in the pie charts is also not great but still valuable.
Next, I think I will try to extract the addresses from the CP/CPS documents using an LLM; I believe CCADB has the necessary links to make this analysis possible.
Here is an example output from this script:
Regional Distribution of CAs:
European Union: 38.18%
North America: 25.45%
APAC: 18.18%
Latin America: 0.00%
Africa: 1.82%
Percentage of Sovereign Nations with a CA Owner: 32.50%