amrakm · April 29, 2022 11:34
diff --git a/clean_html.py b/clean_html.py
 import re

 # Both patterns are compiled once at import time rather than on every call.
 _TAG_RE = re.compile(r"<.*?>")
 # Tags plus character references, which are not enclosed in angle brackets:
 # named (&nbsp;), decimal (&#160;) and hexadecimal (&#xA0;).
 _TAG_OR_ENTITY_RE = re.compile(
     r"<.*?>|&(?:[a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});",
     re.IGNORECASE,
 )


 def cleanhtml(raw_html, *, strip_entities=False):
     """Return *raw_html* with its HTML tags removed.

     Args:
         raw_html: Text that may contain HTML markup.
         strip_entities: When true, character references such as
             ``&nbsp;``, ``&#160;`` or ``&#xA0;`` are removed as well.
             The default leaves them in place, so existing callers see
             no change.

     Returns:
         The input with every ``<...>`` span (and, optionally, every
         character reference) deleted.

     Note:
         ``.`` does not match a newline here, so a tag that is split
         across several lines is left untouched.
     """
     pattern = _TAG_OR_ENTITY_RE if strip_entities else _TAG_RE
     return pattern.sub("", raw_html)
	import re

	# Both patterns are compiled once at import time rather than on every call.
	_TAG_RE = re.compile(r"<.*?>")
	# Tags plus character references, which are not enclosed in angle brackets:
	# named (&nbsp;), decimal (&#160;) and hexadecimal (&#xA0;).
	_TAG_OR_ENTITY_RE = re.compile(
		r"<.*?>|&(?:[a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});",
		re.IGNORECASE,
	)


	def cleanhtml(raw_html, *, strip_entities=False):
		"""Return *raw_html* with its HTML tags removed.

		Args:
			raw_html: Text that may contain HTML markup.
			strip_entities: When true, character references such as
				``&nbsp;``, ``&#160;`` or ``&#xA0;`` are removed as well.
				The default leaves them in place and strips tags only.

		Returns:
			The input with every ``<...>`` span (and, optionally, every
			character reference) deleted.

		Note:
			``.`` does not match a newline here, so a tag that is split
			across several lines is left untouched.
		"""
		pattern = _TAG_OR_ENTITY_RE if strip_entities else _TAG_RE
		return pattern.sub("", raw_html)
No results found