toraritte · May 4, 2020 02:43
diff --git a/the-union.js b/the-union.js

 /*
 Returns the first element under `parent` that matches `selector`.

 Throws an Error that names the selector when nothing matches, so a
 change in the site's markup is reported in plain words instead of as
 a TypeError on `null` somewhere further down.
 */
 function requireElement(parent, selector) {
   const found = parent.querySelector(selector);
   if (found === null) {
     throw new Error('Element not found: ' + selector);
   }
   return found;
 }

 /*
 This is the most appalling part of this website, utterly disregarding
 the HTML standard and common sense: article subheadings are just
 regular paragraphs (<p> elements) that are styled with CSS to look
 like headings, instead of using the correct element (<h2> in this
 case).

 Not only does this look ugly and unprofessional, but blind people who
 use screen readers won't be able to navigate the article as they use
 the hierarchical headings (<h1> - <h6>) to jump around the page.

 So this replaces one such fake heading with its semantic HTML
 counterpart. The text is assigned through `textContent` rather than
 spliced into an HTML string, so characters such as `<` and `&` in a
 subheading stay literal text instead of being parsed as markup.

 `fakeHeading` is the element to replace; `doc` is the document used
 to create the new <h2>.
 */
 function promoteToHeading(fakeHeading, doc) {
   const heading = doc.createElement('h2');
   heading.textContent = fakeHeading.textContent;
   fakeHeading.replaceWith(heading);
 }

 /*
 Splits the value of a `datetime` attribute on 'T' and 'Z' and returns
 `{ date, text }`: `date` is the part before the 'T', and `text` is the
 date followed by a space and the part after the 'T'. A value with no
 time part yields just the date (instead of "<date> undefined").
 */
 function formatTimestamp(datetime) {
   const parts = datetime.split(/[TZ]/);
   const date = parts[0];
   const clock = parts[1];
   return { date: date, text: clock ? `${date} ${clock}` : date };
 }

 /*
 In Chrome (and I presume that in other browsers as well) the print
 dialog uses the page's title (set by <title> in <head>) for the
 filename by default.

 This is not optimal because it doesn't have the date or the name of
 the publication, so this builds a title that has both - and so one
 won't have to rename the file manually each time an article is saved.

 The article slug is the last non-empty segment of the URL's path, cut
 down to its first five hyphen-separated words (shorter slugs are kept
 whole). Reading it from the parsed path means a missing trailing
 slash no longer selects the parent segment.
 */
 function buildPrintTitle(date, url) {
   const segments = new URL(url).pathname.split('/').filter(Boolean);
   const slug = segments.length > 0 ? segments[segments.length - 1] : '';
   const shortSlug = slug.replace(/^(\w+-\w+-\w+-\w+-\w+).*$/, '$1');
   return `nevada-city-grass-valley-union-${date}-${shortSlug}`;
 }

 /*
 Reduces the current article page to its headline, meta line, byline
 and body text, then opens the print dialog.

 `doc` is the page's document and `win` its window. Everything the
 script depends on is looked up before the page is modified, so a
 markup change aborts with an Error while the page is still intact.
 */
 function printArticle(doc, win) {
   /*
   There are about 12 <article> HTML tags on each news article page,
   and none of them are explicitly marked as the main one (which would
   make sense because the URL of this page is explicitly for this
   article; the rest are just offered for extra navigation).

   Therefore the below assignment is on shaky legs, because I'm just
   assuming that the first <article> will always be the main article
   based on other article links. (`querySelector` always returns the
   first element even if there are more.)
   */
   const article = requireElement(doc, 'main article');

   /*
   Grabbing the main <article> tag alone also doesn't suffice because
   they crammed a lot of stuff that semantically shouldn't belong
   there, such as the "Support Local Journalism" section, the comment
   section, and other articles by The Union. One could argue that the
   comment section is related to the article, but that just means that
   the database in the backend should have a relationship between
   them, and it is not an intrinsic part of it.

   So this can also break at any moment because parts of the article
   are grabbed by HTML attributes that may change in the future.
   */
   const headline = requireElement(article, 'h1');
   const meta = requireElement(article, '#article-meta');
   const byline = requireElement(article, '#article-byline');
   const content = requireElement(article, '.p402_premium');

   /*
   Under the article title it always says how old the article is, and
   if it's recent then it will show a relative time (e.g., 23h ago).
   Because we save the pages for readers, and they may not be read for
   days, these values will have to be converted to absolute values.
   Fortunately the HTML elements contain the absolute timestamp, and
   only a client-side script is making them relative.
   */
   const time = requireElement(meta, 'time');
   const datetime = time.getAttribute('datetime');
   if (!datetime) {
     throw new Error('<time> element has no datetime attribute');
   }
   const stamp = formatTimestamp(datetime);

   /*
   Ads and other unrelated content are usually just interspersed with
   the text in the article (and they shouldn't be), and so these will
   need to be filtered out. (Asked how this could be done properly on
   Stackoverflow, I may be wrong;
   https://stackoverflow.com/questions/61584157 )
   */
   Array.from(content.querySelectorAll(':scope > div')).forEach((div) => content.removeChild(div));

   Array.from(content.querySelectorAll('.STND-STND.BodyText.Subhead')).forEach((subhead) => promoteToHeading(subhead, doc));

   time.textContent = stamp.text;

   /*
   This is a barbaric move on my part, and will need to be adjusted,
   but I just needed to present the harvested (and normalized) content
   so that it could be printed into PDF.
   */
   const root = doc.body.parentNode;
   root.removeChild(doc.body);
   [headline, meta, byline, content].forEach((elem) => root.insertAdjacentElement('beforeend', elem));

   doc.title = buildPrintTitle(stamp.date, doc.URL);
   win.print();
 }

 /*
 Run only where a page exists; this keeps the helpers above loadable
 outside a browser.
 */
 if (typeof document !== 'undefined') {
   printArticle(document, window);
 }

	/*
	Returns the first element under `parent` that matches `selector`.

	Throws an Error that names the selector when nothing matches, so a
	change in the site's markup is reported in plain words instead of as
	a TypeError on `null` somewhere further down.
	*/
	function requireElement(parent, selector) {
		const found = parent.querySelector(selector);
		if (found === null) {
			throw new Error('Element not found: ' + selector);
		}
		return found;
	}

	/*
	This is the most appalling part of this website, utterly disregarding
	the HTML standard and common sense: article subheadings are just
	regular paragraphs (<p> elements) that are styled with CSS to look
	like headings, instead of using the correct element (<h2> in this
	case).

	Not only does this look ugly and unprofessional, but blind people who
	use screen readers won't be able to navigate the article as they use
	the hierarchical headings (<h1> - <h6>) to jump around the page.

	So this replaces one such fake heading with its semantic HTML
	counterpart. The text is assigned through `textContent` rather than
	spliced into an HTML string, so characters such as `<` and `&` in a
	subheading stay literal text instead of being parsed as markup.

	`fakeHeading` is the element to replace; `doc` is the document used
	to create the new <h2>.
	*/
	function promoteToHeading(fakeHeading, doc) {
		const heading = doc.createElement('h2');
		heading.textContent = fakeHeading.textContent;
		fakeHeading.replaceWith(heading);
	}

	/*
	Splits the value of a `datetime` attribute on 'T' and 'Z' and returns
	`{ date, text }`: `date` is the part before the 'T', and `text` is the
	date followed by a space and the part after the 'T'. A value with no
	time part yields just the date (instead of "<date> undefined").
	*/
	function formatTimestamp(datetime) {
		const parts = datetime.split(/[TZ]/);
		const date = parts[0];
		const clock = parts[1];
		return { date: date, text: clock ? `${date} ${clock}` : date };
	}

	/*
	In Chrome (and I presume that in other browsers as well) the print
	dialog uses the page's title (set by <title> in <head>) for the
	filename by default.

	This is not optimal because it doesn't have the date or the name of
	the publication, so this builds a title that has both - and so one
	won't have to rename the file manually each time an article is saved.

	The article slug is the last non-empty segment of the URL's path, cut
	down to its first five hyphen-separated words (shorter slugs are kept
	whole). Reading it from the parsed path means a missing trailing
	slash no longer selects the parent segment.
	*/
	function buildPrintTitle(date, url) {
		const segments = new URL(url).pathname.split('/').filter(Boolean);
		const slug = segments.length > 0 ? segments[segments.length - 1] : '';
		const shortSlug = slug.replace(/^(\w+-\w+-\w+-\w+-\w+).*$/, '$1');
		return `nevada-city-grass-valley-union-${date}-${shortSlug}`;
	}

	/*
	Reduces the current article page to its headline, meta line, byline
	and body text, then opens the print dialog.

	`doc` is the page's document and `win` its window. Everything the
	script depends on is looked up before the page is modified, so a
	markup change aborts with an Error while the page is still intact.
	*/
	function printArticle(doc, win) {
		/*
		There are about 12 <article> HTML tags on each news article page,
		and none of them are explicitly marked as the main one (which would
		make sense because the URL of this page is explicitly for this
		article; the rest are just offered for extra navigation).

		Therefore the below assignment is on shaky legs, because I'm just
		assuming that the first <article> will always be the main article
		based on other article links. (`querySelector` always returns the
		first element even if there are more.)
		*/
		const article = requireElement(doc, 'main article');

		/*
		Grabbing the main <article> tag alone also doesn't suffice because
		they crammed a lot of stuff that semantically shouldn't belong
		there, such as the "Support Local Journalism" section, the comment
		section, and other articles by The Union. One could argue that the
		comment section is related to the article, but that just means that
		the database in the backend should have a relationship between
		them, and it is not an intrinsic part of it.

		So this can also break at any moment because parts of the article
		are grabbed by HTML attributes that may change in the future.
		*/
		const headline = requireElement(article, 'h1');
		const meta = requireElement(article, '#article-meta');
		const byline = requireElement(article, '#article-byline');
		const content = requireElement(article, '.p402_premium');

		/*
		Under the article title it always says how old the article is, and
		if it's recent then it will show a relative time (e.g., 23h ago).
		Because we save the pages for readers, and they may not be read for
		days, these values will have to be converted to absolute values.
		Fortunately the HTML elements contain the absolute timestamp, and
		only a client-side script is making them relative.
		*/
		const time = requireElement(meta, 'time');
		const datetime = time.getAttribute('datetime');
		if (!datetime) {
			throw new Error('<time> element has no datetime attribute');
		}
		const stamp = formatTimestamp(datetime);

		/*
		Ads and other unrelated content are usually just interspersed with
		the text in the article (and they shouldn't be), and so these will
		need to be filtered out. (Asked how this could be done properly on
		Stackoverflow, I may be wrong;
		https://stackoverflow.com/questions/61584157 )
		*/
		Array.from(content.querySelectorAll(':scope > div')).forEach((div) => content.removeChild(div));

		Array.from(content.querySelectorAll('.STND-STND.BodyText.Subhead')).forEach((subhead) => promoteToHeading(subhead, doc));

		time.textContent = stamp.text;

		/*
		This is a barbaric move on my part, and will need to be adjusted,
		but I just needed to present the harvested (and normalized) content
		so that it could be printed into PDF.
		*/
		const root = doc.body.parentNode;
		root.removeChild(doc.body);
		[headline, meta, byline, content].forEach((elem) => root.insertAdjacentElement('beforeend', elem));

		doc.title = buildPrintTitle(stamp.date, doc.URL);
		win.print();
	}

	/*
	Run only where a page exists; this keeps the helpers above loadable
	outside a browser.
	*/
	if (typeof document !== 'undefined') {
		printArticle(document, window);
	}