Skip to content

Instantly share code, notes, and snippets.

@david-andrew
Created September 22, 2024 20:28
Show Gist options
  • Save david-andrew/d992ef49ab68057e852c85958a11330d to your computer and use it in GitHub Desktop.
extract just policy and environment
import requests
from bs4 import BeautifulSoup
from collections import defaultdict
# from markdownify import markdownify as md
from weasyprint import HTML
from tqdm import tqdm
from enum import Enum
import pdb
class AtrocityType(Enum):
    """The category labels McSweeney's assigns to each atrocity entry.

    Each member's value is the human-readable heading used on the page
    (and as the section title in the generated PDF).
    """
    A = "Sexual Misconduct, Harassment, & Bullying"
    B = "White Supremacy, Racism, Homophobia, Transphobia, & Xenophobia"
    C = "Public Statements / Tweets"
    D = "Collusion with Russia & Obstruction of Justice"
    E = "Trump Staff & Administration"
    F = "Trump Family Business Dealings"
    G = "Policy"
    H = "Environment"
# Map from the small circle icon displayed next to each list item to the
# atrocity category that icon denotes.
tag_map: dict[str, AtrocityType] = {
    "https://edge.mcsw.net/mcsweeneys/o8h1z4tnek7t3922u5kbhs7jilt6": AtrocityType.A,
    "https://edge.mcsw.net/mcsweeneys/8pgw1xt7bge7vimpktzrvtduynnw": AtrocityType.B,
    "https://edge.mcsw.net/mcsweeneys/40hzdg0fji351ky6f82mljdxy97w": AtrocityType.C,
    "https://edge.mcsw.net/mcsweeneys/f77tzm9u9n7bcka1p80b31fde24k": AtrocityType.D,
    "https://edge.mcsw.net/mcsweeneys/xxy64xaw69iuxhf0ky8jvilh12e3": AtrocityType.E,
    "https://edge.mcsw.net/mcsweeneys/8nk0d98xc5l10xhzfx229dz69k8r": AtrocityType.F,
    "https://edge.mcsw.net/mcsweeneys/6na60r5qxopwx1faxay2eg851o3u": AtrocityType.G,
    "https://edge.mcsw.net/mcsweeneys/b0vxtek7212calkzs1i6kcbxu7lm": AtrocityType.H,
}

# Map from each atrocity category to every entry filed under it, as
# (index from the original article, item HTML) tuples.
atrocity_map: dict[AtrocityType, list[tuple[int, str]]] = defaultdict(list)
def main():
    """Scrape the McSweeney's "complete listing" article and render the
    Policy and Environment categories to ``output.pdf``.

    Side effects: performs one HTTP GET, fills the module-level
    ``atrocity_map``, and writes ``output.pdf`` in the working directory.

    Raises:
        requests.HTTPError: if the article page cannot be fetched.
        KeyError: if an item carries an icon URL missing from ``tag_map``.
    """
    url = 'https://www.mcsweeneys.net/articles/the-complete-listing-atrocities-1-1-056'
    response = requests.get(url)
    # Fail fast on HTTP errors instead of silently parsing an error page.
    response.raise_for_status()
    # Use the html5lib parser because response.text is not well-formed.
    soup = BeautifulSoup(response.text, 'html5lib')

    # The article body is one big ordered list; take only its direct items.
    ol_element = soup.find('ol')
    li_items = ol_element.find_all('li', recursive=False)

    print('Extracting each item from webpage...')
    # Keep the 1-based index from the original article, since items get
    # re-grouped by category below (the old code stored a 0-based index
    # even though the intent — see str(i+1) — was the article numbering).
    for index, item in enumerate(li_items, start=1):
        # The little circle images mark which categories the item belongs to.
        img_tags = item.find_all('img')
        types = [tag_map[img['src']] for img in img_tags]
        # Strip the icons from the markup before saving the item's HTML.
        for img in img_tags:
            img.extract()
        # An item may carry several icons; file it under every category.
        for atrocity_type in types:
            atrocity_map[atrocity_type].append((index, str(item)))

    print('Creating PDF...')
    doc_chunks: list[str] = []
    # Only the Policy and Environment sections go into the PDF.
    for atrocity_type in (AtrocityType.G, AtrocityType.H):
        doc_chunks.append(f'<h1>{atrocity_type.value}</h1><ol>')
        doc_chunks.extend(item_html for _, item_html in atrocity_map[atrocity_type])
        doc_chunks.append('</ol><br><br>')
    HTML(string=''.join(doc_chunks)).write_pdf('output.pdf')
    print('Done!')


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment