Last active
May 24, 2024 03:08
-
-
Save JPBM135/8bde43a2f8af915ae93b91a6d4ff8f01 to your computer and use it in GitHub Desktop.
Parser for CSM draws
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from bs4 import BeautifulSoup | |
from urllib.request import urlopen | |
import json | |
import os | |
import re | |
# Root URL that every request in this script is built on.
BASE_URL = "https://www.canada.ca/"
# Path, appended to BASE_URL, of the HTML page that main() downloads and parses.
HTML_PAGE_PATH_SUFFIX = "en/immigration-refugees-citizenship/services/immigrate-canada/express-entry/submit-profile/rounds-invitations.html"
# Skeleton of the generated report. main() parses this markup and appends one
# block per draw to the <div class="container-fluid"> element.
# The CDN URLs pin the releases that the integrity (SRI) hashes belong to:
# Bootstrap 5.3.2 and Popper 2.11.8.
INPUT_HTML = """
<!DOCTYPE html>
<html lang="en">
  <head>
    <title>Title</title>
    <!-- Required meta tags -->
    <meta charset="utf-8" />
    <meta
      name="viewport"
      content="width=device-width, initial-scale=1, shrink-to-fit=no"
    />
    <!-- Bootstrap CSS v5.3.2 -->
    <link
      href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.2/dist/css/bootstrap.min.css"
      rel="stylesheet"
      integrity="sha384-T3c6CoIi6uLrA9TneNEoa7RxnatzjcDSCmG1MXxSR1GAsXEV/Dwwykc2MPK8M2HN"
      crossorigin="anonymous"
    />
  </head>
  <body>
    <header>
      <!-- place navbar here -->
    </header>
    <main>
      <div class="container-fluid"></div>
    </main>
    <footer>
      <!-- place footer here -->
    </footer>
    <!-- Bootstrap JavaScript Libraries -->
    <script
      src="https://cdn.jsdelivr.net/npm/@popperjs/core@2.11.8/dist/umd/popper.min.js"
      integrity="sha384-I7E8VVD/ismYTF4hNIPjVp/Zjvgyol6VFvRkX/vR+Vc4jQkC+hVqc2pM8ODewa9r"
      crossorigin="anonymous"
    ></script>
    <script
      src="https://cdn.jsdelivr.net/npm/bootstrap@5.3.2/dist/js/bootstrap.min.js"
      integrity="sha384-BBtl+eGJRgqQAUMxJ7pMwbEyER4l1g+O15P+16Ep7Q9Q+zqX6gSbd85u4mG4QzX+"
      crossorigin="anonymous"
    ></script>
  </body>
</html>
"""
def getJsonFile(bs4Soup):
    """Download and decode the JSON document referenced by the parsed page.

    The first element carrying a ``data-wb-jsonmanager`` attribute is used.
    Its attribute value is decoded as JSON and the ``"url"`` entry is resolved
    against ``BASE_URL`` and fetched.

    Args:
        bs4Soup: Parsed page; only ``find_all(attrs=...)`` is called on it.

    Returns:
        The decoded JSON payload.

    Raises:
        IndexError: If no element has a ``data-wb-jsonmanager`` attribute.
        KeyError: If the attribute's JSON has no ``"url"`` entry.
    """
    # Imported locally so this function does not rely on a file-level import.
    from urllib.parse import urljoin

    managers = bs4Soup.find_all(attrs={"data-wb-jsonmanager": True})
    if not managers:
        raise IndexError(
            "no element with a data-wb-jsonmanager attribute was found in the page"
        )

    data_json_suffix = json.loads(managers[0]["data-wb-jsonmanager"])["url"]

    # urljoin avoids a doubled slash when the suffix is root-relative and
    # leaves an already-absolute URL untouched.
    with urlopen(urljoin(BASE_URL, data_json_suffix)) as response:
        return json.load(response)
def createColDiv(text, emptySoup):
    """Return a new ``<div class="col">`` tag whose content is ``text``.

    Args:
        text: String placed inside the div.
        emptySoup: Document object used as the tag factory (``new_tag``).

    Returns:
        The newly created tag; it is not attached to the document.
    """
    column = emptySoup.new_tag("div")
    column.string = text
    column["class"] = "col"
    return column
def createHeader(headerData, emptySoup):
    """Build the heading block for one draw.

    The block is a ``<div class="crs-container col">`` holding an ``<h1>``
    with the round number, followed by a row of three columns: invitation
    count, draw date and lowest CRS score.

    Args:
        headerData: Mapping with the keys ``drawNumber``, ``drawSize``,
            ``drawDateFull`` and ``drawCRS``.
        emptySoup: Document object used as the tag factory (``new_tag``).

    Returns:
        The assembled, unattached container tag.

    Raises:
        KeyError: If one of the four keys is absent from ``headerData``.
    """

    def _text(value):
        # parseHtmlAndCreateData stores None for fields a round does not
        # provide; concatenating None to a str would raise TypeError.
        return "" if value is None else str(value)

    container = emptySoup.new_tag("div")
    container["class"] = "crs-container col"

    title = emptySoup.new_tag("h1")
    title.string = "Express Entry Round " + _text(headerData["drawNumber"])
    container.append(title)

    summary_row = emptySoup.new_tag("div")
    summary_row["class"] = "row justify-content-center align-items-center g-2"
    for label, key in (
        ("Number of Invitation: ", "drawSize"),
        ("Date and time: ", "drawDateFull"),
        ("Lowest CRS score: ", "drawCRS"),
    ):
        summary_row.append(createColDiv(label + _text(headerData[key]), emptySoup))
    container.append(summary_row)

    return container
def createTableHeader(emptySoup):
    """Return a ``<thead>`` with one row of two ``<th><h3>`` heading cells.

    The headings are "CRS Score" and "Number of invitations", in that order.

    Args:
        emptySoup: Document object used as the tag factory (``new_tag``).

    Returns:
        The unattached ``<thead>`` tag.
    """
    header_row = emptySoup.new_tag("tr")
    for title in ("CRS Score", "Number of invitations"):
        heading = emptySoup.new_tag("h3")
        heading.string = title
        cell = emptySoup.new_tag("th")
        cell.append(heading)
        header_row.append(cell)

    table_head = emptySoup.new_tag("thead")
    table_head.append(header_row)
    return table_head
def createTableBody(jsonData, emptySoup):
    """Build the ``<tbody>`` with one row per ``dd<N>`` entry of a draw.

    For every key of the exact form ``dd<digits>`` a row with two cells is
    emitted: the value of ``<key>_interval`` followed by the value of
    ``<key>``. When ``<key>_has_bold`` is truthy both cell texts are wrapped
    in ``<strong>``.

    Args:
        jsonData: Mapping for one draw, as produced by
            ``parseHtmlAndCreateData``.
        emptySoup: Document object used as the tag factory (``new_tag``).

    Returns:
        The unattached ``<tbody>`` tag.

    Raises:
        KeyError: If a ``dd<N>`` key lacks its ``_interval`` or ``_has_bold``
            companion.
    """
    tbody = emptySoup.new_tag("tbody")

    # Anchored at both ends, matching the pattern parseHtmlAndCreateData uses,
    # so unrelated keys that merely end in "dd<digits>" are not picked up.
    draw_keys = [key for key in jsonData if re.search(r"^dd\d+$", key)]

    for key in draw_keys:
        row = emptySoup.new_tag("tr")
        is_bold = jsonData[key + "_has_bold"]
        for source_key in (key + "_interval", key):
            value = jsonData[source_key]
            # A round may lack a value (stored as None) or carry a non-string.
            text = "" if value is None else str(value)
            cell = emptySoup.new_tag("td")
            if is_bold:
                strong = emptySoup.new_tag("strong")
                strong.string = text
                cell.append(strong)
            else:
                cell.string = text
            row.append(cell)
        tbody.append(row)

    return tbody
def createTable(jsonData, emptySoup):
    """Build the table for one draw, wrapped in a responsive container.

    The result is ``<div class="table-responsive">`` containing a
    ``<table class="table table-primary">`` with the header from
    ``createTableHeader`` and the body from ``createTableBody``.

    Args:
        jsonData: Mapping for one draw, passed through to ``createTableBody``.
        emptySoup: Document object used as the tag factory (``new_tag``).

    Returns:
        The unattached wrapper ``<div>`` with the table inside it.
    """
    table = emptySoup.new_tag("table")
    table["class"] = "table table-primary"
    table.append(createTableHeader(emptySoup))
    table.append(createTableBody(jsonData, emptySoup))

    # The wrapper used to be created and then dropped; the table has to sit
    # inside it for the "table-responsive" class to have any effect.
    wrapper = emptySoup.new_tag("div")
    wrapper["class"] = "table-responsive"
    wrapper.append(table)
    return wrapper
def parseHtmlAndCreateData(soup, data):
    """Combine the page's placeholders with the JSON rounds into flat records.

    Every element with a ``data-json-replace`` attribute names a field: the
    last ``/``-separated segment of the attribute value. For each entry of
    ``data["rounds"]`` one dict is produced that maps every such field to the
    round's value, or ``None`` when the round has no such key.

    Fields named exactly ``dd<digits>`` get two extra entries taken from the
    surrounding markup: ``<field>_interval`` (a label text, "Total" when none
    is found) and ``<field>_has_bold`` (whether a ``<strong>`` was found).

    Args:
        soup: Parsed source page.
        data: Decoded JSON with a ``"rounds"`` list of mappings.

    Returns:
        A list with one dict per round, in the order of ``data["rounds"]``.
    """
    placeholders = soup.find_all(attrs={"data-json-replace": True})
    records = []

    for round_entry in data["rounds"]:
        record = {}
        for node in placeholders:
            field = node["data-json-replace"].split("/")[-1]
            record[field] = round_entry[field] if field in round_entry else None

            # Only fields named exactly "dd<digits>" carry interval metadata.
            if not re.search(r"^dd\d+$", field):
                continue

            # Start two levels above the placeholder.
            container = node.find_parent().find_parent()
            strong_tag = container.find("strong")
            if strong_tag:
                # With a <strong> present the label lives one level further up.
                container = container.find_parent()

            if not container:
                # No ancestor at that level: fall back to the fixed label.
                record[field + "_interval"] = "Total"
                record[field + "_has_bold"] = True
                continue

            # Bold rows: text of the first <strong> under the container;
            # otherwise the container's own text.
            label_node = container.find("strong") if strong_tag else container
            record[field + "_interval"] = label_node.text.strip() or "Total"
            record[field + "_has_bold"] = strong_tag is not None

        records.append(record)

    return records
def main():
    """Fetch the rounds page, extract the draws and write both output files.

    Steps:
      1. Download and parse the page at ``BASE_URL + HTML_PAGE_PATH_SUFFIX``.
      2. Fetch the JSON it references and flatten it with
         ``parseHtmlAndCreateData``.
      3. Write the records to ``output.json`` next to this script.
      4. Render one header and one table per draw into the ``INPUT_HTML``
         skeleton and write the result to ``output.html`` next to this script.
    """
    script_dir = os.path.dirname(__file__)

    # Read the body inside a context manager so the HTTP response is closed.
    with urlopen(BASE_URL + HTML_PAGE_PATH_SUFFIX) as response:
        soup = BeautifulSoup(response.read(), "html.parser")

    data = getJsonFile(soup)
    data_parsed_list = parseHtmlAndCreateData(soup, data)

    with open(
        os.path.join(script_dir, "output.json"), "w", encoding="utf-8"
    ) as file:
        json.dump(data_parsed_list, file, indent=4)

    empty_soup = BeautifulSoup(INPUT_HTML, "html.parser")
    main_div = empty_soup.find("div", {"class": "container-fluid"})
    for data_parsed_to_append in data_parsed_list:
        div = empty_soup.new_tag("div")
        div["class"] = "row justify-content-center align-items-center g-2"
        div.append(createHeader(data_parsed_to_append, empty_soup))
        div.append(createTable(data_parsed_to_append, empty_soup))
        main_div.append(div)

    with open(
        os.path.join(script_dir, "output.html"), "w", encoding="utf-8"
    ) as file:
        file.write(empty_soup.prettify())


# Run only when executed as a script, so importing the module has no side effects.
if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<!DOCTYPE html>
<!-- Report skeleton: generated draw blocks are appended to div.container-fluid. -->
<html lang="en">
  <head>
    <title>Title</title>
    <!-- Required meta tags -->
    <meta charset="utf-8" />
    <meta
      name="viewport"
      content="width=device-width, initial-scale=1, shrink-to-fit=no"
    />
    <!-- Bootstrap CSS v5.3.2 -->
    <link
      href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.2/dist/css/bootstrap.min.css"
      rel="stylesheet"
      integrity="sha384-T3c6CoIi6uLrA9TneNEoa7RxnatzjcDSCmG1MXxSR1GAsXEV/Dwwykc2MPK8M2HN"
      crossorigin="anonymous"
    />
  </head>
  <body>
    <header>
      <!-- place navbar here -->
    </header>
    <main>
      <div class="container-fluid"></div>
    </main>
    <footer>
      <!-- place footer here -->
    </footer>
    <!-- Bootstrap JavaScript Libraries -->
    <script
      src="https://cdn.jsdelivr.net/npm/@popperjs/core@2.11.8/dist/umd/popper.min.js"
      integrity="sha384-I7E8VVD/ismYTF4hNIPjVp/Zjvgyol6VFvRkX/vR+Vc4jQkC+hVqc2pM8ODewa9r"
      crossorigin="anonymous"
    ></script>
    <script
      src="https://cdn.jsdelivr.net/npm/bootstrap@5.3.2/dist/js/bootstrap.min.js"
      integrity="sha384-BBtl+eGJRgqQAUMxJ7pMwbEyER4l1g+O15P+16Ep7Q9Q+zqX6gSbd85u4mG4QzX+"
      crossorigin="anonymous"
    ></script>
  </body>
</html>
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from bs4 import BeautifulSoup | |
from urllib.request import urlopen | |
import json | |
import os | |
import re | |
# Root URL that every request in this script is built on.
BASE_URL = "https://www.canada.ca/"
# Path, appended to BASE_URL, of the HTML page that main() downloads and parses.
HTML_PAGE_PATH_SUFFIX = "en/immigration-refugees-citizenship/services/immigrate-canada/express-entry/submit-profile/rounds-invitations.html"
def getJsonFile(bs4Soup):
    """Download and decode the JSON document referenced by the parsed page.

    The first element carrying a ``data-wb-jsonmanager`` attribute is used.
    Its attribute value is decoded as JSON and the ``"url"`` entry is resolved
    against ``BASE_URL`` and fetched.

    Args:
        bs4Soup: Parsed page; only ``find_all(attrs=...)`` is called on it.

    Returns:
        The decoded JSON payload.

    Raises:
        IndexError: If no element has a ``data-wb-jsonmanager`` attribute.
        KeyError: If the attribute's JSON has no ``"url"`` entry.
    """
    # Imported locally so this function does not rely on a file-level import.
    from urllib.parse import urljoin

    managers = bs4Soup.find_all(attrs={"data-wb-jsonmanager": True})
    if not managers:
        raise IndexError(
            "no element with a data-wb-jsonmanager attribute was found in the page"
        )

    data_json_suffix = json.loads(managers[0]["data-wb-jsonmanager"])["url"]

    # urljoin avoids a doubled slash when the suffix is root-relative and
    # leaves an already-absolute URL untouched.
    with urlopen(urljoin(BASE_URL, data_json_suffix)) as response:
        return json.load(response)
def createColDiv(text, emptySoup):
    """Return a new ``<div class="col">`` tag whose content is ``text``.

    Args:
        text: String placed inside the div.
        emptySoup: Document object used as the tag factory (``new_tag``).

    Returns:
        The newly created tag; it is not attached to the document.
    """
    column = emptySoup.new_tag("div")
    column.string = text
    column["class"] = "col"
    return column
def createHeader(headerData, emptySoup):
    """Build the heading block for one draw.

    The block is a ``<div class="crs-container col">`` holding an ``<h1>``
    with the round number, followed by a row of three columns: invitation
    count, draw date and lowest CRS score.

    Args:
        headerData: Mapping with the keys ``drawNumber``, ``drawSize``,
            ``drawDateFull`` and ``drawCRS``.
        emptySoup: Document object used as the tag factory (``new_tag``).

    Returns:
        The assembled, unattached container tag.

    Raises:
        KeyError: If one of the four keys is absent from ``headerData``.
    """

    def _text(value):
        # parseHtmlAndCreateData stores None for fields a round does not
        # provide; concatenating None to a str would raise TypeError.
        return "" if value is None else str(value)

    container = emptySoup.new_tag("div")
    container["class"] = "crs-container col"

    title = emptySoup.new_tag("h1")
    title.string = "Express Entry Round " + _text(headerData["drawNumber"])
    container.append(title)

    summary_row = emptySoup.new_tag("div")
    summary_row["class"] = "row justify-content-center align-items-center g-2"
    for label, key in (
        ("Number of Invitation: ", "drawSize"),
        ("Date and time: ", "drawDateFull"),
        ("Lowest CRS score: ", "drawCRS"),
    ):
        summary_row.append(createColDiv(label + _text(headerData[key]), emptySoup))
    container.append(summary_row)

    return container
def createTableHeader(emptySoup):
    """Return a ``<thead>`` with one row of two ``<th><h3>`` heading cells.

    The headings are "CRS Score" and "Number of invitations", in that order.

    Args:
        emptySoup: Document object used as the tag factory (``new_tag``).

    Returns:
        The unattached ``<thead>`` tag.
    """
    header_row = emptySoup.new_tag("tr")
    for title in ("CRS Score", "Number of invitations"):
        heading = emptySoup.new_tag("h3")
        heading.string = title
        cell = emptySoup.new_tag("th")
        cell.append(heading)
        header_row.append(cell)

    table_head = emptySoup.new_tag("thead")
    table_head.append(header_row)
    return table_head
def createTableBody(jsonData, emptySoup):
    """Build the ``<tbody>`` with one row per ``dd<N>`` entry of a draw.

    For every key of the exact form ``dd<digits>`` a row with two cells is
    emitted: the value of ``<key>_interval`` followed by the value of
    ``<key>``. When ``<key>_has_bold`` is truthy both cell texts are wrapped
    in ``<strong>``.

    Args:
        jsonData: Mapping for one draw, as produced by
            ``parseHtmlAndCreateData``.
        emptySoup: Document object used as the tag factory (``new_tag``).

    Returns:
        The unattached ``<tbody>`` tag.

    Raises:
        KeyError: If a ``dd<N>`` key lacks its ``_interval`` or ``_has_bold``
            companion.
    """
    tbody = emptySoup.new_tag("tbody")

    # Anchored at both ends, matching the pattern parseHtmlAndCreateData uses,
    # so unrelated keys that merely end in "dd<digits>" are not picked up.
    draw_keys = [key for key in jsonData if re.search(r"^dd\d+$", key)]

    for key in draw_keys:
        row = emptySoup.new_tag("tr")
        is_bold = jsonData[key + "_has_bold"]
        for source_key in (key + "_interval", key):
            value = jsonData[source_key]
            # A round may lack a value (stored as None) or carry a non-string.
            text = "" if value is None else str(value)
            cell = emptySoup.new_tag("td")
            if is_bold:
                strong = emptySoup.new_tag("strong")
                strong.string = text
                cell.append(strong)
            else:
                cell.string = text
            row.append(cell)
        tbody.append(row)

    return tbody
def createTable(jsonData, emptySoup):
    """Build the table for one draw, wrapped in a responsive container.

    The result is ``<div class="table-responsive">`` containing a
    ``<table class="table table-primary">`` with the header from
    ``createTableHeader`` and the body from ``createTableBody``.

    Args:
        jsonData: Mapping for one draw, passed through to ``createTableBody``.
        emptySoup: Document object used as the tag factory (``new_tag``).

    Returns:
        The unattached wrapper ``<div>`` with the table inside it.
    """
    table = emptySoup.new_tag("table")
    table["class"] = "table table-primary"
    table.append(createTableHeader(emptySoup))
    table.append(createTableBody(jsonData, emptySoup))

    # The wrapper used to be created and then dropped; the table has to sit
    # inside it for the "table-responsive" class to have any effect.
    wrapper = emptySoup.new_tag("div")
    wrapper["class"] = "table-responsive"
    wrapper.append(table)
    return wrapper
def parseHtmlAndCreateData(soup, data):
    """Combine the page's placeholders with the JSON rounds into flat records.

    Every element with a ``data-json-replace`` attribute names a field: the
    last ``/``-separated segment of the attribute value. For each entry of
    ``data["rounds"]`` one dict is produced that maps every such field to the
    round's value, or ``None`` when the round has no such key.

    Fields named exactly ``dd<digits>`` get two extra entries taken from the
    surrounding markup: ``<field>_interval`` (a label text, "Total" when none
    is found) and ``<field>_has_bold`` (whether a ``<strong>`` was found).

    Args:
        soup: Parsed source page.
        data: Decoded JSON with a ``"rounds"`` list of mappings.

    Returns:
        A list with one dict per round, in the order of ``data["rounds"]``.
    """
    placeholders = soup.find_all(attrs={"data-json-replace": True})
    records = []

    for round_entry in data["rounds"]:
        record = {}
        for node in placeholders:
            field = node["data-json-replace"].split("/")[-1]
            record[field] = round_entry[field] if field in round_entry else None

            # Only fields named exactly "dd<digits>" carry interval metadata.
            if not re.search(r"^dd\d+$", field):
                continue

            # Start two levels above the placeholder.
            container = node.find_parent().find_parent()
            strong_tag = container.find("strong")
            if strong_tag:
                # With a <strong> present the label lives one level further up.
                container = container.find_parent()

            if not container:
                # No ancestor at that level: fall back to the fixed label.
                record[field + "_interval"] = "Total"
                record[field + "_has_bold"] = True
                continue

            # Bold rows: text of the first <strong> under the container;
            # otherwise the container's own text.
            label_node = container.find("strong") if strong_tag else container
            record[field + "_interval"] = label_node.text.strip() or "Total"
            record[field + "_has_bold"] = strong_tag is not None

        records.append(record)

    return records
def main():
    """Fetch the rounds page, extract the draws and write both output files.

    Steps:
      1. Download and parse the page at ``BASE_URL + HTML_PAGE_PATH_SUFFIX``.
      2. Fetch the JSON it references and flatten it with
         ``parseHtmlAndCreateData``.
      3. Write the records to ``output.json`` next to this script.
      4. Load ``input.html`` from the script's directory, render one header
         and one table per draw into its ``<div class="container-fluid">``,
         and write the result to ``output.html`` next to this script.

    Raises:
        ValueError: If ``input.html`` has no ``<div class="container-fluid">``.
    """
    script_dir = os.path.dirname(__file__)

    # Read the body inside a context manager so the HTTP response is closed.
    with urlopen(BASE_URL + HTML_PAGE_PATH_SUFFIX) as response:
        soup = BeautifulSoup(response.read(), "html.parser")

    data = getJsonFile(soup)
    data_parsed_list = parseHtmlAndCreateData(soup, data)

    with open(
        os.path.join(script_dir, "output.json"), "w", encoding="utf-8"
    ) as file:
        json.dump(data_parsed_list, file, indent=4)

    input_base_html_file = os.path.join(script_dir, "input.html")
    with open(input_base_html_file, "r", encoding="utf-8") as file:
        base_html = file.read()

    empty_soup = BeautifulSoup(base_html, "html.parser")
    main_div = empty_soup.find("div", {"class": "container-fluid"})
    if main_div is None:
        # The template is an external file; report a missing mount point
        # directly rather than failing later on a None attribute access.
        raise ValueError(
            f'{input_base_html_file} has no <div class="container-fluid"> element'
        )

    for data_parsed_to_append in data_parsed_list:
        div = empty_soup.new_tag("div")
        div["class"] = "row justify-content-center align-items-center g-2"
        div.append(createHeader(data_parsed_to_append, empty_soup))
        div.append(createTable(data_parsed_to_append, empty_soup))
        main_div.append(div)

    with open(
        os.path.join(script_dir, "output.html"), "w", encoding="utf-8"
    ) as file:
        file.write(empty_soup.prettify())


# Run only when executed as a script, so importing the module has no side effects.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment