Extract just the Policy and Environment items from McSweeney's running list of Trump administration atrocities and render them to a PDF.
import requests
from bs4 import BeautifulSoup
from collections import defaultdict
# from markdownify import markdownify as md
from weasyprint import HTML
from tqdm import tqdm
from enum import Enum
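# third-party packages (PyPI names): requests, beautifulsoup4, html5lib (needed
# at runtime by the BeautifulSoup parser below), weasyprint, tqdm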
class AtrocityType(Enum):
    A = "Sexual Misconduct, Harassment, & Bullying"
    B = "White Supremacy, Racism, Homophobia, Transphobia, & Xenophobia"
    C = "Public Statements / Tweets"
    D = "Collusion with Russia & Obstruction of Justice"
    E = "Trump Staff & Administration"
    F = "Trump Family Business Dealings"
    G = "Policy"
    H = "Environment"
# map from the little circle image next to each item to the type of atrocity it marks
tag_map: dict[str, AtrocityType] = {
    "https://edge.mcsw.net/mcsweeneys/o8h1z4tnek7t3922u5kbhs7jilt6": AtrocityType.A,
    "https://edge.mcsw.net/mcsweeneys/8pgw1xt7bge7vimpktzrvtduynnw": AtrocityType.B,
    "https://edge.mcsw.net/mcsweeneys/40hzdg0fji351ky6f82mljdxy97w": AtrocityType.C,
    "https://edge.mcsw.net/mcsweeneys/f77tzm9u9n7bcka1p80b31fde24k": AtrocityType.D,
    "https://edge.mcsw.net/mcsweeneys/xxy64xaw69iuxhf0ky8jvilh12e3": AtrocityType.E,
    "https://edge.mcsw.net/mcsweeneys/8nk0d98xc5l10xhzfx229dz69k8r": AtrocityType.F,
    "https://edge.mcsw.net/mcsweeneys/6na60r5qxopwx1faxay2eg851o3u": AtrocityType.G,
    "https://edge.mcsw.net/mcsweeneys/b0vxtek7212calkzs1i6kcbxu7lm": AtrocityType.H,
}
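# one hedged way to re-derive or sanity-check the urls above if the page ever
# changes: dump the set of tag-image srcs found inside the article's <ol>, e.g.
#   soup = BeautifulSoup(requests.get(url).text, 'html5lib')
#   print({img['src'] for img in soup.find('ol').find_all('img')})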
# map from each atrocity type to a list of all the atrocities of that type,
# each stored as (original index in the article, item html)
atrocity_map: dict[AtrocityType, list[tuple[int, str]]] = defaultdict(list)
def main():
    url = 'https://www.mcsweeneys.net/articles/the-complete-listing-atrocities-1-1-056'

    # Step 1: fetch the article
    response = requests.get(url)

    # Step 2: parse it with the html5lib parser, because response.text is not well-formed
    soup = BeautifulSoup(response.text, 'html5lib')

    # Step 3: find the ordered list (ol), then extract its top-level list items (li)
    ol_element = soup.find('ol')
    li_items = ol_element.find_all('li', recursive=False)  # recursive=False skips any nested lists
    # Step 4: loop through each list item, classify it by its tag images, and bucket it
    print('Extracting each item from webpage...')
    for i, item in enumerate(tqdm(li_items)):
        # save the original index of the item since they will be re-ordered
        # item.attrs['value'] = str(i+1)  # doesn't work with weasyprint
        img_tags = item.find_all('img')
        img_urls = [img['src'] for img in img_tags]
        types = [tag_map[src] for src in img_urls]
        # remove the tag images from the item so they don't end up in the PDF
        for img in img_tags:
            img.extract()

        # add the item's html to the appropriate atrocity type(s)
        for atrocity_type in types:
            atrocity_map[atrocity_type].append((i, str(item)))
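    # note: an item tagged with more than one circle is appended under every
    # matching type, so it can appear in multiple sections of the output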
    # Step 5: build an html document from the selected categories and render it to pdf
    print('Creating PDF...')
    doc_chunks: list[str] = []
    # for atrocity_type, items in atrocity_map.items():  # use this line instead to emit every category
    for atrocity_type in [AtrocityType.G, AtrocityType.H]:  # just Policy and Environment
        items = atrocity_map[atrocity_type]
        doc_chunks.append(f'<h1>{atrocity_type.value}</h1><ol>')
        for _, item in items:  # original article index is kept in the tuple but not rendered
            doc_chunks.append(item)
        doc_chunks.append('</ol><br><br>')
    doc = ''.join(doc_chunks)
    html = HTML(string=doc)
    html.write_pdf('output.pdf')
    print('Done!')

if __name__ == '__main__':
    main()
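To run this as a script (assuming it's saved as, say, extract_atrocities.py — a hypothetical name, since the gist's filename isn't shown here), install the third-party packages and invoke it directly:

    pip install requests beautifulsoup4 html5lib weasyprint tqdm
    python extract_atrocities.py

The PDF is written to output.pdf in the working directory. Note that WeasyPrint also depends on system libraries (Pango and friends), so the pip install alone may not be enough on every platform.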