Gets short numeric links for all CHAOSS metrics
# I created a super hacky Jupyter Notebook that looks at
# https://chaoss.community/kbtopic/all-metrics/ which
# contains a list of all of our metrics, and then uses
# BeautifulSoup + hackiness to extract the short link
# from the WordPress pages. I was in a hurry, so I
# printed the csv output to the screen instead of to
# a csv file.
import urllib.request as urllib2
from bs4 import BeautifulSoup

# Read and parse the page that lists all of the metrics
response = urllib2.urlopen('https://chaoss.community/kbtopic/all-metrics/')
html_doc = response.read()
soup = BeautifulSoup(html_doc, 'html.parser')

# Find the html links for all of the individual metrics pages
headings = soup.find_all("h2", {"class": "mkb-entry-title"})

# Loop through those links and read / parse each metrics page
for heading in headings:
    anchors = heading.find_all('a')
    for a in anchors:
        try:
            # Get the human-readable URL and use it to read the page
            metric_url = a['href']

            # Read / parse the metrics page
            metric_response = urllib2.urlopen(metric_url)
            metric_html_doc = metric_response.read()
            metric_soup = BeautifulSoup(metric_html_doc, 'html.parser')

            # Find the shortlink (aka permalink)
            short_link = metric_soup.find("link", {"rel": "shortlink"})['href']

            # Hacky way to grab the focus area and GitHub URL out of some
            # debug markup that is in the html pages, converted to a string
            # so it can be sliced.
            debug = metric_soup.find("div", {"class": "wpb_row"})
            debug_str = str(debug)

            # Find where the focus areas are mentioned and grab the next
            # 500 characters after that point.
            index = debug_str.find('focus-areas')
            index_end = index + 500

            # Split that slice on single quotes and grab the first piece,
            # which contains the actual focus area
            focus_str = debug_str[index:index_end].split("'")[0]

            # Split on single quotes again and take the piece at index 2,
            # dropping the first 15 characters to remove "Debug Objects: "
            # from the front of the GitHub URL
            gh_url = debug_str[index:index_end].split("'")[2][15:].split('"')[0]

            # As a quick way to get the csv, print each line in csv format
            # to the screen to be copied into a csv file. Hacky, but quick.
            line = metric_url + ',' + short_link + ',' + gh_url + ',' + focus_str
            print(line)
        except Exception:
            # Print a message if something goes wrong instead of crashing
            # the script
            print(a['href'], 'not parsed')
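Since the script prints the csv output to the screen instead of a file, one small variant would collect the fields and write them out with Python's built-in csv module. This is only a sketch: the metrics.csv filename and the header names are assumptions, not part of the original script, and the rows.append() call would replace print(line) inside the loop.

import csv

# Collect each row as a list of fields while looping, e.g. in place of print(line):
#     rows.append([metric_url, short_link, gh_url, focus_str])
rows = []

# Write the collected rows to a csv file (hypothetical filename and headers)
with open('metrics.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['metric_url', 'short_link', 'github_url', 'focus_area'])
    writer.writerows(rows)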