Extract just the Policy and Environment items from McSweeney's running list of Trump administration atrocities and render them to a PDF.
import requests
from bs4 import BeautifulSoup
from collections import defaultdict
# from markdownify import markdownify as md
from weasyprint import HTML
from tqdm import tqdm
from enum import Enum
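# third-party packages (PyPI names): requests, beautifulsoup4, html5lib (needed
# at runtime by the BeautifulSoup parser below), weasyprint, tqdm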
class AtrocityType(Enum):
    A = "Sexual Misconduct, Harassment, & Bullying"
    B = "White Supremacy, Racism, Homophobia, Transphobia, & Xenophobia"
    C = "Public Statements / Tweets"
    D = "Collusion with Russia & Obstruction of Justice"
    E = "Trump Staff & Administration"
    F = "Trump Family Business Dealings"
    G = "Policy"
    H = "Environment"
# map from the little circle image next to each item to the type of atrocity it marks
tag_map: dict[str, AtrocityType] = {
    "https://edge.mcsw.net/mcsweeneys/o8h1z4tnek7t3922u5kbhs7jilt6": AtrocityType.A,
    "https://edge.mcsw.net/mcsweeneys/8pgw1xt7bge7vimpktzrvtduynnw": AtrocityType.B,
    "https://edge.mcsw.net/mcsweeneys/40hzdg0fji351ky6f82mljdxy97w": AtrocityType.C,
    "https://edge.mcsw.net/mcsweeneys/f77tzm9u9n7bcka1p80b31fde24k": AtrocityType.D,
    "https://edge.mcsw.net/mcsweeneys/xxy64xaw69iuxhf0ky8jvilh12e3": AtrocityType.E,
    "https://edge.mcsw.net/mcsweeneys/8nk0d98xc5l10xhzfx229dz69k8r": AtrocityType.F,
    "https://edge.mcsw.net/mcsweeneys/6na60r5qxopwx1faxay2eg851o3u": AtrocityType.G,
    "https://edge.mcsw.net/mcsweeneys/b0vxtek7212calkzs1i6kcbxu7lm": AtrocityType.H,
}
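# one hedged way to re-derive or sanity-check the urls above if the page ever
# changes: dump the set of tag-image srcs found inside the article's <ol>, e.g.
#   soup = BeautifulSoup(requests.get(url).text, 'html5lib')
#   print({img['src'] for img in soup.find('ol').find_all('img')})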
# map from each atrocity type to a list of all the atrocities of that type,
# each stored as (original index in the article, item html)
atrocity_map: dict[AtrocityType, list[tuple[int, str]]] = defaultdict(list)
def main():
    url = 'https://www.mcsweeneys.net/articles/the-complete-listing-atrocities-1-1-056'

    # Step 1: fetch the article
    response = requests.get(url)

    # Step 2: parse it with the html5lib parser, because response.text is not well-formed
    soup = BeautifulSoup(response.text, 'html5lib')

    # Step 3: find the ordered list (ol), then extract its top-level list items (li)
    ol_element = soup.find('ol')
    li_items = ol_element.find_all('li', recursive=False)  # recursive=False skips any nested lists
    # Step 4: loop through each list item, classify it by its tag images, and bucket it
    print('Extracting each item from webpage...')
    for i, item in enumerate(tqdm(li_items)):
        # save the original index of the item since they will be re-ordered
        # item.attrs['value'] = str(i+1)  # doesn't work with weasyprint
        img_tags = item.find_all('img')
        img_urls = [img['src'] for img in img_tags]
        types = [tag_map[src] for src in img_urls]
        # remove the tag images from the item so they don't end up in the PDF
        for img in img_tags:
            img.extract()

        # add the item's html to the appropriate atrocity type(s)
        for atrocity_type in types:
            atrocity_map[atrocity_type].append((i, str(item)))
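    # note: an item tagged with more than one circle is appended under every
    # matching type, so it can appear in multiple sections of the output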
    # Step 5: build an html document from the selected categories and render it to pdf
    print('Creating PDF...')
    doc_chunks: list[str] = []
    # for atrocity_type, items in atrocity_map.items():  # use this line instead to emit every category
    for atrocity_type in [AtrocityType.G, AtrocityType.H]:  # just Policy and Environment
        items = atrocity_map[atrocity_type]
        doc_chunks.append(f'<h1>{atrocity_type.value}</h1><ol>')
        for _, item in items:  # original article index is kept in the tuple but not rendered
            doc_chunks.append(item)
        doc_chunks.append('</ol><br><br>')
    doc = ''.join(doc_chunks)
    html = HTML(string=doc)
    html.write_pdf('output.pdf')
    print('Done!')

if __name__ == '__main__':
    main()
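To run this as a script (assuming it's saved as, say, extract_atrocities.py — a hypothetical name, since the gist's filename isn't shown here), install the third-party packages and invoke it directly:

    pip install requests beautifulsoup4 html5lib weasyprint tqdm
    python extract_atrocities.py

The PDF is written to output.pdf in the working directory. Note that WeasyPrint also depends on system libraries (Pango and friends), so the pip install alone may not be enough on every platform.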