# This was originally a Jupyter Notebook so the code might look a bit weird, my condolences.
import requests
from bs4 import BeautifulSoup

MAX_PRV_LEN = 160
PARSER = 'html.parser'
HR_REPLACEMENT = '-----'

def normalize_txt(s: str):
    # Drop non-breaking spaces (U+00A0), then re-space the text so every
    # single newline becomes a blank-line paragraph break.
    s = s.replace(chr(160), '')
    s = s.replace('\n\n', '\n')
    s = s.replace('\n', '\n\n')
    # Runs of 10 or 11 em-dashes (U+2014) act as separators in the source
    # text; normalize them to a plain ten-dash rule.
    rep = '\n' + (chr(8212) * 10) + '\n'
    s = s.replace(rep, '\n----------\n')
    rep = '\n' + (chr(8212) * 11) + '\n'
    s = s.replace(rep, '\n----------\n')
    return s.strip()
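
# For illustration (assumed input, not from the original gist):
# normalize_txt('A\xa0line\n\nnext') drops the NBSP and re-spaces the
# paragraphs, returning 'Aline\n\nnext'.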

def preview(s: str, max_len=MAX_PRV_LEN):
    # Return the first line of s, truncated to max_len characters.
    s = s.strip()
    nl = s.find('\n')  # find returns -1 when there is no newline
    if nl == -1:
        nl = len(s)
    prv_len = min(nl, max_len)
    ext = '...' if max_len < nl else ''
    return s[:prv_len].strip() + ext
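
# For illustration (assumed input, not from the original gist):
# preview('Chapter 18 - Foo\nbody text') keeps only the first line,
# returning 'Chapter 18 - Foo'; a first line longer than max_len is cut
# at max_len and suffixed with '...'.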

def get_chp(url):
    # Get the page and parse it into a BeautifulSoup
    page = requests.get(url=url).text
    soup = BeautifulSoup(page, PARSER)
    soup_prv = preview(soup.text)
    print(soup_prv)
    # Get a convenient tag that contains the chapter
    article_tag = soup.find(
        name='div',
        attrs={'class': 'entry-content'}
    )
    # Replace hr tags with the HR_REPLACEMENT string for ease of
    # post-processing
    for hr_tag in article_tag.find_all('hr'):
        hr_tag.replace_with(HR_REPLACEMENT)
    # Extract the chapter text and end notes
    article = article_tag.text
    main, _, rest = article.rpartition(r'<< | TOC | >>')
    notes = rest.rpartition(r'Related')[0].strip(' -\n')
    main_txt = main.strip()
    print(preview(main_txt))
    print(preview(notes))
    # Massage the text to get a well-formatted output without junk text.
    main_parts = main_txt.split(HR_REPLACEMENT, maxsplit=2)
    chp_txt = '# ' + (main_parts[-1].lstrip()).rstrip(' -')
    return (soup_prv, chp_txt, notes)

# Test
# test_url = r'http://www.scarletmadness.org/2017/12/07/stos-chapter-17/'
# test_url = r'http://www.scarletmadness.org/2017/11/30/stos-chapter-16/'
test_url = r'http://www.scarletmadness.org/2017/12/14/stos-chapter-18/'
url = test_url
prv, chp, notes = get_chp(url)
print(normalize_txt(chp))
print(normalize_txt(notes))

# Save Result
import codecs

# Use the first three words of the page preview as the output file name.
s = prv.find(' ')
s = prv.find(' ', s + 1)
s = prv.find(' ', s + 1)
# print(prv[:s])
FILE = './' + prv[:s] + '.md'
# FILE = './16.md'
chp = normalize_txt(chp)
notes = normalize_txt(notes)
with codecs.open(FILE, 'w', 'utf-8') as f:
    f.write(chp)
    f.write('\n\n')
    f.write(notes)
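
# A minimal sketch (not part of the original gist) of how the steps above
# could be wrapped to fetch and save several chapters in one go. It reuses
# get_chp/normalize_txt and the three-word file-naming scheme; the URLs in
# the commented loop are the test URLs from above.
def save_chapter(url):
    prv, chp, notes = get_chp(url)
    s = prv.find(' ')
    s = prv.find(' ', s + 1)
    s = prv.find(' ', s + 1)
    with codecs.open('./' + prv[:s] + '.md', 'w', 'utf-8') as f:
        f.write(normalize_txt(chp))
        f.write('\n\n')
        f.write(normalize_txt(notes))

# for url in [
#     r'http://www.scarletmadness.org/2017/11/30/stos-chapter-16/',
#     r'http://www.scarletmadness.org/2017/12/07/stos-chapter-17/',
#     r'http://www.scarletmadness.org/2017/12/14/stos-chapter-18/',
# ]:
#     save_chapter(url)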