Skip to content

Instantly share code, notes, and snippets.

@JPBM135
Last active May 24, 2024 03:08
Show Gist options
  • Save JPBM135/8bde43a2f8af915ae93b91a6d4ff8f01 to your computer and use it in GitHub Desktop.
Save JPBM135/8bde43a2f8af915ae93b91a6d4ff8f01 to your computer and use it in GitHub Desktop.
Parser for CSM draws
from bs4 import BeautifulSoup
from urllib.request import urlopen
import json
import os
import re
# Site root; the page path and the JSON data path are appended to it.
BASE_URL = "https://www.canada.ca/"
# Location of the Express Entry "rounds of invitations" page, relative to BASE_URL.
HTML_PAGE_PATH_SUFFIX = "en/immigration-refugees-citizenship/services/immigrate-canada/express-entry/submit-profile/rounds-invitations.html"
# Skeleton of the generated report: main() appends one section per round to the
# "container-fluid" div.  The CDN URLs name the releases the integrity hashes
# are believed to belong to (Bootstrap 5.3.2, Popper 2.11.8); a URL/hash
# mismatch makes browsers refuse the asset.
INPUT_HTML = """
<!DOCTYPE html>
<html lang="en">
<head>
<title>Title</title>
<!-- Required meta tags -->
<meta charset="utf-8" />
<meta
name="viewport"
content="width=device-width, initial-scale=1, shrink-to-fit=no"
/>
<!-- Bootstrap CSS v5.2.1 -->
<link
href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.2/dist/css/bootstrap.min.css"
rel="stylesheet"
integrity="sha384-T3c6CoIi6uLrA9TneNEoa7RxnatzjcDSCmG1MXxSR1GAsXEV/Dwwykc2MPK8M2HN"
crossorigin="anonymous"
/>
</head>
<body>
<header>
<!-- place navbar here -->
</header>
<main>
<div class="container-fluid"></div>
</main>
<footer>
<!-- place footer here -->
</footer>
<!-- Bootstrap JavaScript Libraries -->
<script
src="https://cdn.jsdelivr.net/npm/@popperjs/core@2.11.8/dist/umd/popper.min.js"
integrity="sha384-I7E8VVD/ismYTF4hNIPjVp/Zjvgyol6VFvRkX/vR+Vc4jQkC+hVqc2pM8ODewa9r"
crossorigin="anonymous"
></script>
<script
src="https://cdn.jsdelivr.net/npm/bootstrap@5.3.2/dist/js/bootstrap.min.js"
integrity="sha384-BBtl+eGJRgqQAUMxJ7pMwbEyER4l1g+O15P+16Ep7Q9Q+zqX6gSbd85u4mG4QzX+"
crossorigin="anonymous"
></script>
</body>
</html>
"""
def getJsonFile(bs4Soup):
    """Download and decode the JSON data file referenced by the rounds page.

    The first element carrying a ``data-wb-jsonmanager`` attribute is used;
    the attribute value is a JSON object whose ``"url"`` member locates the
    data file relative to ``BASE_URL``.

    Args:
        bs4Soup: Parsed rounds page (BeautifulSoup document).

    Returns:
        The decoded JSON document.

    Raises:
        IndexError: If the page has no ``data-wb-jsonmanager`` element.
    """
    from urllib.parse import urljoin  # local import: only needed here

    managers = bs4Soup.find_all(attrs={"data-wb-jsonmanager": True})
    if not managers:
        raise IndexError("page has no element with a data-wb-jsonmanager attribute")
    data_path = json.loads(managers[0]["data-wb-jsonmanager"])["url"]
    # urljoin avoids a doubled slash when the path is already root-relative.
    # The context manager closes the HTTP response once it has been decoded.
    with urlopen(urljoin(BASE_URL, data_path)) as response:
        return json.load(response)
def createColDiv(text, emptySoup):
    """Return a new ``<div class="col">`` whose content is *text*.

    Args:
        text: String placed inside the div.
        emptySoup: Document used as the tag factory (``new_tag``).
    """
    column = emptySoup.new_tag("div")
    column["class"] = "col"
    column.string = text
    return column
def createHeader(headerData, emptySoup):
    """Build the heading block for one round.

    The block is a ``div.crs-container.col`` holding an ``<h1>`` with the
    round number, followed by a row of three columns: invitation count,
    date/time and lowest CRS score.

    Args:
        headerData: Mapping with string values under ``drawNumber``,
            ``drawSize``, ``drawDateFull`` and ``drawCRS``.
        emptySoup: Document used as the tag factory.

    Returns:
        The assembled container tag.
    """
    container = emptySoup.new_tag("div")
    container["class"] = "crs-container col"

    title = emptySoup.new_tag("h1")
    title.string = "Express Entry Round " + headerData["drawNumber"]

    summary_row = emptySoup.new_tag("div")
    summary_row["class"] = "row justify-content-center align-items-center g-2"
    for label, field in (
        ("Number of Invitation: ", "drawSize"),
        ("Date and time: ", "drawDateFull"),
        ("Lowest CRS score: ", "drawCRS"),
    ):
        summary_row.append(createColDiv(label + headerData[field], emptySoup))

    container.append(title)
    container.append(summary_row)
    return container
def createTableHeader(emptySoup):
    """Return a ``<thead>`` with one row of two ``<th><h3>`` heading cells.

    The headings are "CRS Score" and "Number of invitations", in that order.

    Args:
        emptySoup: Document used as the tag factory.
    """
    header_row = emptySoup.new_tag("tr")
    for title in ("CRS Score", "Number of invitations"):
        heading = emptySoup.new_tag("h3")
        heading.string = title
        cell = emptySoup.new_tag("th")
        cell.append(heading)
        header_row.append(cell)

    thead = emptySoup.new_tag("thead")
    thead.append(header_row)
    return thead
def createTableBody(jsonData, emptySoup):
    """Build the ``<tbody>`` listing every score bucket of one round.

    One row is emitted per key of the exact form ``dd<digits>`` (the same
    pattern the parser uses), in the mapping's own order.  Each row has two
    cells: the text stored under ``<key>_interval`` and the value stored
    under ``<key>``.  When ``<key>_has_bold`` is truthy both cells are wrapped
    in ``<strong>``.

    Args:
        jsonData: One parsed round, as produced by parseHtmlAndCreateData.
        emptySoup: Document used as the tag factory.

    Returns:
        The populated ``<tbody>`` tag.

    Raises:
        KeyError: If a bucket key lacks its ``_interval`` / ``_has_bold``
            companion entries.
    """
    tbody = emptySoup.new_tag("tbody")
    # fullmatch keeps unrelated keys that merely end in "dd<digits>" out.
    bucket_keys = [key for key in jsonData if re.fullmatch(r"dd\d+", key)]
    for key in bucket_keys:
        row = emptySoup.new_tag("tr")
        is_bold = jsonData[key + "_has_bold"]
        for cell_key in (key + "_interval", key):
            value = jsonData[cell_key]
            # The parser stores None for values missing from a round; render
            # those as an empty cell instead of failing inside the tag setter.
            text = "" if value is None else str(value)
            cell = emptySoup.new_tag("td")
            if is_bold:
                strong = emptySoup.new_tag("strong")
                strong.string = text
                cell.append(strong)
            else:
                cell.string = text
            row.append(cell)
        tbody.append(row)
    return tbody
def createTable(jsonData, emptySoup):
    """Build the score-distribution table for one round.

    The ``<table class="table table-primary">`` gets its header from
    createTableHeader and its body from createTableBody, and is placed inside
    a ``<div class="table-responsive">`` wrapper.  The original code created
    that wrapper but then discarded it and returned the bare table.

    Args:
        jsonData: One parsed round, passed through to createTableBody.
        emptySoup: Document used as the tag factory.

    Returns:
        The wrapper ``<div>`` containing the table.
    """
    table = emptySoup.new_tag("table")
    table["class"] = "table table-primary"
    table.append(createTableHeader(emptySoup))
    table.append(createTableBody(jsonData, emptySoup))

    wrapper_div = emptySoup.new_tag("div")
    wrapper_div["class"] = "table-responsive"
    wrapper_div.append(table)
    return wrapper_div
def _drawIntervalInfo(element):
    """Return ``(interval_text, is_bold)`` for the row that holds *element*.

    Starts from the element's grandparent.  If that node contains a
    ``<strong>``, the label is the first ``<strong>`` of the node one level
    further up; otherwise the grandparent's own text is the label.  A missing
    ancestor or an empty label falls back to "Total".
    """
    parent = element.find_parent()
    holder = parent.find_parent() if parent is not None else None
    if holder is None:
        # NOTE(review): the original raised AttributeError here; treated like
        # the existing "no ancestor" edge case below - confirm this is wanted.
        return "Total", True

    strong = holder.find("strong")
    if strong:
        # With a <strong> wrapper the row sits one level further up.
        holder = holder.find_parent()
        if not holder:
            return "Total", True
        label_node = holder.find("strong")
    else:
        label_node = holder
    return label_node.text.strip() or "Total", strong is not None


def parseHtmlAndCreateData(soup, data):
    """Combine the page's placeholders with the JSON rounds.

    Every element with a ``data-json-replace`` attribute names a field: the
    last ``/``-separated segment of the attribute value.  For each round in
    ``data["rounds"]`` a dict is built mapping each such field to the round's
    value (``None`` when the round lacks it).  Fields of the form
    ``dd<digits>`` additionally get ``<field>_interval`` (row label text) and
    ``<field>_has_bold`` (whether the row is rendered in ``<strong>``).

    Args:
        soup: Parsed rounds page.
        data: Decoded JSON document with a ``"rounds"`` list of dicts.

    Returns:
        A list with one dict per round, in the order of ``data["rounds"]``.
    """
    json_elements = soup.find_all(attrs={"data-json-replace": True})
    rounds = data["rounds"]
    draw_key_pattern = re.compile(r"^dd\d+$")

    # The page layout is the same for every round, so inspect each
    # placeholder once instead of once per round.
    placeholders = []
    for element in json_elements:
        last_key = element["data-json-replace"].split("/")[-1]
        extras = {}
        if draw_key_pattern.search(last_key):
            interval, is_bold = _drawIntervalInfo(element)
            extras[last_key + "_interval"] = interval
            extras[last_key + "_has_bold"] = is_bold
        placeholders.append((last_key, extras))

    draws_list = []
    for data_round in rounds:
        data_to_append = {}
        for last_key, extras in placeholders:
            data_to_append[last_key] = data_round.get(last_key)
            data_to_append.update(extras)
        draws_list.append(data_to_append)
    return draws_list
def main():
    """Scrape the rounds page and write ``output.json`` and ``output.html``.

    Both files are written to the directory of this script.  ``output.json``
    holds the parsed rounds; ``output.html`` is INPUT_HTML with one header and
    one table per round appended to its ``container-fluid`` div.
    """
    # Close the HTTP response as soon as the page has been read.
    with urlopen(BASE_URL + HTML_PAGE_PATH_SUFFIX) as response:
        soup = BeautifulSoup(response.read(), "html.parser")
    data = getJsonFile(soup)
    data_parsed_list = parseHtmlAndCreateData(soup, data)

    script_dir = os.path.dirname(__file__)
    with open(
        os.path.join(script_dir, "output.json"), "w", encoding="utf-8"
    ) as file:
        json.dump(data_parsed_list, file, indent=4)

    empty_soup = BeautifulSoup(INPUT_HTML, "html.parser")
    main_div = empty_soup.find("div", {"class": "container-fluid"})
    for data_parsed_to_append in data_parsed_list:
        div = empty_soup.new_tag("div")
        div["class"] = "row justify-content-center align-items-center g-2"
        div.append(createHeader(data_parsed_to_append, empty_soup))
        div.append(createTable(data_parsed_to_append, empty_soup))
        main_div.append(div)

    with open(
        os.path.join(script_dir, "output.html"), "w", encoding="utf-8"
    ) as file:
        file.write(empty_soup.prettify())
# Run the scraper only when executed as a script, so importing this module
# does not trigger network requests or file writes.
if __name__ == "__main__":
    main()
<!DOCTYPE html>
<html lang="en">
<head>
<title>Title</title>
<!-- Required meta tags -->
<meta charset="utf-8" />
<meta
name="viewport"
content="width=device-width, initial-scale=1, shrink-to-fit=no"
/>
<!-- Bootstrap CSS v5.2.1 -->
<link
href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.2/dist/css/bootstrap.min.css"
rel="stylesheet"
integrity="sha384-T3c6CoIi6uLrA9TneNEoa7RxnatzjcDSCmG1MXxSR1GAsXEV/Dwwykc2MPK8M2HN"
crossorigin="anonymous"
/>
</head>
<body>
<header>
<!-- place navbar here -->
</header>
<main>
<div class="container-fluid"></div>
</main>
<footer>
<!-- place footer here -->
</footer>
<!-- Bootstrap JavaScript Libraries -->
<script
src="https://cdn.jsdelivr.net/npm/@popperjs/core@2.11.8/dist/umd/popper.min.js"
integrity="sha384-I7E8VVD/ismYTF4hNIPjVp/Zjvgyol6VFvRkX/vR+Vc4jQkC+hVqc2pM8ODewa9r"
crossorigin="anonymous"
></script>
<script
src="https://cdn.jsdelivr.net/npm/bootstrap@5.3.2/dist/js/bootstrap.min.js"
integrity="sha384-BBtl+eGJRgqQAUMxJ7pMwbEyER4l1g+O15P+16Ep7Q9Q+zqX6gSbd85u4mG4QzX+"
crossorigin="anonymous"
></script>
</body>
</html>
from bs4 import BeautifulSoup
from urllib.request import urlopen
import json
import os
import re
# Site root; the page path and the JSON data path are appended to it.
BASE_URL = "https://www.canada.ca/"
# Location of the Express Entry "rounds of invitations" page, relative to BASE_URL.
HTML_PAGE_PATH_SUFFIX = (
    "en/immigration-refugees-citizenship/services/immigrate-canada/"
    "express-entry/submit-profile/rounds-invitations.html"
)
def getJsonFile(bs4Soup):
    """Download and decode the JSON data file referenced by the rounds page.

    The first element carrying a ``data-wb-jsonmanager`` attribute is used;
    the attribute value is a JSON object whose ``"url"`` member locates the
    data file relative to ``BASE_URL``.

    Args:
        bs4Soup: Parsed rounds page (BeautifulSoup document).

    Returns:
        The decoded JSON document.

    Raises:
        IndexError: If the page has no ``data-wb-jsonmanager`` element.
    """
    from urllib.parse import urljoin  # local import: only needed here

    managers = bs4Soup.find_all(attrs={"data-wb-jsonmanager": True})
    if not managers:
        raise IndexError("page has no element with a data-wb-jsonmanager attribute")
    data_path = json.loads(managers[0]["data-wb-jsonmanager"])["url"]
    # urljoin avoids a doubled slash when the path is already root-relative.
    # The context manager closes the HTTP response once it has been decoded.
    with urlopen(urljoin(BASE_URL, data_path)) as response:
        return json.load(response)
def createColDiv(text, emptySoup):
    """Return a new ``<div class="col">`` whose content is *text*.

    Args:
        text: String placed inside the div.
        emptySoup: Document used as the tag factory (``new_tag``).
    """
    column = emptySoup.new_tag("div")
    column["class"] = "col"
    column.string = text
    return column
def createHeader(headerData, emptySoup):
    """Build the heading block for one round.

    The block is a ``div.crs-container.col`` holding an ``<h1>`` with the
    round number, followed by a row of three columns: invitation count,
    date/time and lowest CRS score.

    Args:
        headerData: Mapping with string values under ``drawNumber``,
            ``drawSize``, ``drawDateFull`` and ``drawCRS``.
        emptySoup: Document used as the tag factory.

    Returns:
        The assembled container tag.
    """
    container = emptySoup.new_tag("div")
    container["class"] = "crs-container col"

    title = emptySoup.new_tag("h1")
    title.string = "Express Entry Round " + headerData["drawNumber"]

    summary_row = emptySoup.new_tag("div")
    summary_row["class"] = "row justify-content-center align-items-center g-2"
    for label, field in (
        ("Number of Invitation: ", "drawSize"),
        ("Date and time: ", "drawDateFull"),
        ("Lowest CRS score: ", "drawCRS"),
    ):
        summary_row.append(createColDiv(label + headerData[field], emptySoup))

    container.append(title)
    container.append(summary_row)
    return container
def createTableHeader(emptySoup):
    """Return a ``<thead>`` with one row of two ``<th><h3>`` heading cells.

    The headings are "CRS Score" and "Number of invitations", in that order.

    Args:
        emptySoup: Document used as the tag factory.
    """
    header_row = emptySoup.new_tag("tr")
    for title in ("CRS Score", "Number of invitations"):
        heading = emptySoup.new_tag("h3")
        heading.string = title
        cell = emptySoup.new_tag("th")
        cell.append(heading)
        header_row.append(cell)

    thead = emptySoup.new_tag("thead")
    thead.append(header_row)
    return thead
def createTableBody(jsonData, emptySoup):
    """Build the ``<tbody>`` listing every score bucket of one round.

    One row is emitted per key of the exact form ``dd<digits>`` (the same
    pattern the parser uses), in the mapping's own order.  Each row has two
    cells: the text stored under ``<key>_interval`` and the value stored
    under ``<key>``.  When ``<key>_has_bold`` is truthy both cells are wrapped
    in ``<strong>``.

    Args:
        jsonData: One parsed round, as produced by parseHtmlAndCreateData.
        emptySoup: Document used as the tag factory.

    Returns:
        The populated ``<tbody>`` tag.

    Raises:
        KeyError: If a bucket key lacks its ``_interval`` / ``_has_bold``
            companion entries.
    """
    tbody = emptySoup.new_tag("tbody")
    # fullmatch keeps unrelated keys that merely end in "dd<digits>" out.
    bucket_keys = [key for key in jsonData if re.fullmatch(r"dd\d+", key)]
    for key in bucket_keys:
        row = emptySoup.new_tag("tr")
        is_bold = jsonData[key + "_has_bold"]
        for cell_key in (key + "_interval", key):
            value = jsonData[cell_key]
            # The parser stores None for values missing from a round; render
            # those as an empty cell instead of failing inside the tag setter.
            text = "" if value is None else str(value)
            cell = emptySoup.new_tag("td")
            if is_bold:
                strong = emptySoup.new_tag("strong")
                strong.string = text
                cell.append(strong)
            else:
                cell.string = text
            row.append(cell)
        tbody.append(row)
    return tbody
def createTable(jsonData, emptySoup):
    """Build the score-distribution table for one round.

    The ``<table class="table table-primary">`` gets its header from
    createTableHeader and its body from createTableBody, and is placed inside
    a ``<div class="table-responsive">`` wrapper.  The original code created
    that wrapper but then discarded it and returned the bare table.

    Args:
        jsonData: One parsed round, passed through to createTableBody.
        emptySoup: Document used as the tag factory.

    Returns:
        The wrapper ``<div>`` containing the table.
    """
    table = emptySoup.new_tag("table")
    table["class"] = "table table-primary"
    table.append(createTableHeader(emptySoup))
    table.append(createTableBody(jsonData, emptySoup))

    wrapper_div = emptySoup.new_tag("div")
    wrapper_div["class"] = "table-responsive"
    wrapper_div.append(table)
    return wrapper_div
def _drawIntervalInfo(element):
    """Return ``(interval_text, is_bold)`` for the row that holds *element*.

    Starts from the element's grandparent.  If that node contains a
    ``<strong>``, the label is the first ``<strong>`` of the node one level
    further up; otherwise the grandparent's own text is the label.  A missing
    ancestor or an empty label falls back to "Total".
    """
    parent = element.find_parent()
    holder = parent.find_parent() if parent is not None else None
    if holder is None:
        # NOTE(review): the original raised AttributeError here; treated like
        # the existing "no ancestor" edge case below - confirm this is wanted.
        return "Total", True

    strong = holder.find("strong")
    if strong:
        # With a <strong> wrapper the row sits one level further up.
        holder = holder.find_parent()
        if not holder:
            return "Total", True
        label_node = holder.find("strong")
    else:
        label_node = holder
    return label_node.text.strip() or "Total", strong is not None


def parseHtmlAndCreateData(soup, data):
    """Combine the page's placeholders with the JSON rounds.

    Every element with a ``data-json-replace`` attribute names a field: the
    last ``/``-separated segment of the attribute value.  For each round in
    ``data["rounds"]`` a dict is built mapping each such field to the round's
    value (``None`` when the round lacks it).  Fields of the form
    ``dd<digits>`` additionally get ``<field>_interval`` (row label text) and
    ``<field>_has_bold`` (whether the row is rendered in ``<strong>``).

    Args:
        soup: Parsed rounds page.
        data: Decoded JSON document with a ``"rounds"`` list of dicts.

    Returns:
        A list with one dict per round, in the order of ``data["rounds"]``.
    """
    json_elements = soup.find_all(attrs={"data-json-replace": True})
    rounds = data["rounds"]
    draw_key_pattern = re.compile(r"^dd\d+$")

    # The page layout is the same for every round, so inspect each
    # placeholder once instead of once per round.
    placeholders = []
    for element in json_elements:
        last_key = element["data-json-replace"].split("/")[-1]
        extras = {}
        if draw_key_pattern.search(last_key):
            interval, is_bold = _drawIntervalInfo(element)
            extras[last_key + "_interval"] = interval
            extras[last_key + "_has_bold"] = is_bold
        placeholders.append((last_key, extras))

    draws_list = []
    for data_round in rounds:
        data_to_append = {}
        for last_key, extras in placeholders:
            data_to_append[last_key] = data_round.get(last_key)
            data_to_append.update(extras)
        draws_list.append(data_to_append)
    return draws_list
def main():
    """Scrape the rounds page and write ``output.json`` and ``output.html``.

    All files live in the directory of this script.  ``output.json`` holds
    the parsed rounds; ``output.html`` is the ``input.html`` template with one
    header and one table per round appended to its ``container-fluid`` div.
    """
    # Close the HTTP response as soon as the page has been read.
    with urlopen(BASE_URL + HTML_PAGE_PATH_SUFFIX) as response:
        soup = BeautifulSoup(response.read(), "html.parser")
    data = getJsonFile(soup)
    data_parsed_list = parseHtmlAndCreateData(soup, data)

    script_dir = os.path.dirname(__file__)
    with open(
        os.path.join(script_dir, "output.json"), "w", encoding="utf-8"
    ) as file:
        json.dump(data_parsed_list, file, indent=4)

    input_base_html_file = os.path.join(script_dir, "input.html")
    with open(input_base_html_file, "r", encoding="utf-8") as file:
        base_html = file.read()

    empty_soup = BeautifulSoup(base_html, "html.parser")
    main_div = empty_soup.find("div", {"class": "container-fluid"})
    for data_parsed_to_append in data_parsed_list:
        div = empty_soup.new_tag("div")
        div["class"] = "row justify-content-center align-items-center g-2"
        div.append(createHeader(data_parsed_to_append, empty_soup))
        div.append(createTable(data_parsed_to_append, empty_soup))
        main_div.append(div)

    with open(
        os.path.join(script_dir, "output.html"), "w", encoding="utf-8"
    ) as file:
        file.write(empty_soup.prettify())
# Run the scraper only when executed as a script, so importing this module
# does not trigger network requests or file writes.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment