davidqo · July 4, 2017 03:31
diff --git a/example.py b/example.py
 from bs4 import BeautifulSoup, Comment

 # Minimum length (in characters) of the text extracted from a node before
 # mine() accepts it as a text area; below this, mine() moves to the parent.
 MIN_TEXT_SIZE = 1000

 def mine(page):
    """Split an HTML page into text areas grown around its largest paragraphs.

    Repeatedly picks the ``<p>`` element with the longest text and climbs to
    its ancestors until the text extracted from the current node reaches
    ``MIN_TEXT_SIZE`` characters or there is no parent left. The text is then
    recorded and the node is removed from the tree. This continues until no
    ``<p>`` elements remain.

    :param page: HTML markup handed to ``BeautifulSoup`` with the "lxml" parser.
    :return: list of ``(pure_text, tag_count)`` tuples in extraction order,
        where ``tag_count`` is ``count_tags`` of the node that was removed.
    """
    soup = BeautifulSoup(page, "lxml")
    text_areas_list = []
    section_list = soup.find_all('p')
    while section_list:
        # max() yields the first longest paragraph, the same element a stable
        # descending sort would put first, without sorting the whole list.
        section = max(section_list, key=lambda p: len(p.text))
        while True:
            pure_text = extract_text(section.contents)
            parent_section = section.parent
            # Stop once enough text is collected, or when there is no
            # ancestor left to widen the search to.
            if len(pure_text) >= MIN_TEXT_SIZE or parent_section is None:
                break
            section = parent_section
        # Count the tags first, then drop the node from the tree so that its
        # paragraphs are not picked again on the next pass.
        tag_count = count_tags(section)
        section.decompose()
        text_areas_list.append((pure_text, tag_count))
        # Query again: the removed node may have contained other paragraphs.
        section_list = soup.find_all('p')
    return text_areas_list

 def extract_text(content):
    """Recursively collect the text held by ``content``.

    Works on both Python 2 and Python 3.

    :param content: ``None``, a list of nodes, a string node, or a tag-like
        object exposing ``name`` and ``contents``.
    :return: every non-blank text fragment, stripped and terminated by a
        newline, concatenated in order. Returns ``u''`` for ``None``,
        comments, ``script`` elements and nodes without contents.
    """
    if content is None:
        return u''
    if isinstance(content, list):
        # join() instead of repeated += keeps long node lists linear.
        return u''.join(extract_text(node) for node in content)
    # Comments are tested before the generic string check because bs4
    # Comment objects are themselves strings and would otherwise be kept.
    if isinstance(content, Comment):
        return u''
    # type(u'') is ``unicode`` on Python 2 and ``str`` on Python 3, so this
    # covers both string kinds without naming ``unicode`` directly. Strings
    # are handled before ``.name`` is read, since a plain str has no ``name``.
    if isinstance(content, (str, type(u''))):
        return prepare_string(content)
    if content.name == "script":
        return u''
    # An empty ``contents`` list yields u'' through the list branch above.
    return extract_text(content.contents)

 def prepare_string(string):
    """Trim surrounding whitespace and newline-terminate non-empty text.

    :param string: text to normalise.
    :return: the stripped text followed by a single newline character, or the
        empty stripped text when nothing but whitespace was given.
    """
    trimmed = string.strip()
    return trimmed + '\n' if trimmed else trimmed

 def count_tags(section):
    """Return the number of tags found below ``section``.

    :param section: a Beautiful Soup element providing ``find_all``.
    :return: length of the result of ``section.find_all()`` called without
        any filter.
    """
    # find_all is the current name; findAll is the Beautiful Soup 3 alias.
    return len(section.find_all())
	from bs4 import BeautifulSoup, Comment

	# Minimum length (in characters) of the text extracted from a node before
	# mine() accepts it as a text area; below this, mine() moves to the parent.
	MIN_TEXT_SIZE = 1000

	def mine(page):
	    """Split an HTML page into text areas grown around its largest paragraphs.

	    Repeatedly picks the ``<p>`` element with the longest text and climbs to
	    its ancestors until the text extracted from the current node reaches
	    ``MIN_TEXT_SIZE`` characters or there is no parent left. The text is then
	    recorded and the node is removed from the tree. This continues until no
	    ``<p>`` elements remain.

	    :param page: HTML markup handed to ``BeautifulSoup`` with the "lxml" parser.
	    :return: list of ``(pure_text, tag_count)`` tuples in extraction order,
	        where ``tag_count`` is ``count_tags`` of the node that was removed.
	    """
	    soup = BeautifulSoup(page, "lxml")
	    text_areas_list = []
	    section_list = soup.find_all('p')
	    while section_list:
	        # max() yields the first longest paragraph, the same element a stable
	        # descending sort would put first, without sorting the whole list.
	        section = max(section_list, key=lambda p: len(p.text))
	        while True:
	            pure_text = extract_text(section.contents)
	            parent_section = section.parent
	            # Stop once enough text is collected, or when there is no
	            # ancestor left to widen the search to.
	            if len(pure_text) >= MIN_TEXT_SIZE or parent_section is None:
	                break
	            section = parent_section
	        # Count the tags first, then drop the node from the tree so that its
	        # paragraphs are not picked again on the next pass.
	        tag_count = count_tags(section)
	        section.decompose()
	        text_areas_list.append((pure_text, tag_count))
	        # Query again: the removed node may have contained other paragraphs.
	        section_list = soup.find_all('p')
	    return text_areas_list

	def extract_text(content):
	    """Recursively collect the text held by ``content``.

	    Works on both Python 2 and Python 3.

	    :param content: ``None``, a list of nodes, a string node, or a tag-like
	        object exposing ``name`` and ``contents``.
	    :return: every non-blank text fragment, stripped and terminated by a
	        newline, concatenated in order. Returns ``u''`` for ``None``,
	        comments, ``script`` elements and nodes without contents.
	    """
	    if content is None:
	        return u''
	    if isinstance(content, list):
	        # join() instead of repeated += keeps long node lists linear.
	        return u''.join(extract_text(node) for node in content)
	    # Comments are tested before the generic string check because bs4
	    # Comment objects are themselves strings and would otherwise be kept.
	    if isinstance(content, Comment):
	        return u''
	    # type(u'') is ``unicode`` on Python 2 and ``str`` on Python 3, so this
	    # covers both string kinds without naming ``unicode`` directly. Strings
	    # are handled before ``.name`` is read, since a plain str has no ``name``.
	    if isinstance(content, (str, type(u''))):
	        return prepare_string(content)
	    if content.name == "script":
	        return u''
	    # An empty ``contents`` list yields u'' through the list branch above.
	    return extract_text(content.contents)

	def prepare_string(string):
	    """Trim surrounding whitespace and newline-terminate non-empty text.

	    :param string: text to normalise.
	    :return: the stripped text followed by a single newline character, or the
	        empty stripped text when nothing but whitespace was given.
	    """
	    trimmed = string.strip()
	    return trimmed + '\n' if trimmed else trimmed

	def count_tags(section):
	    """Return the number of tags found below ``section``.

	    :param section: a Beautiful Soup element providing ``find_all``.
	    :return: length of the result of ``section.find_all()`` called without
	        any filter.
	    """
	    # find_all is the current name; findAll is the Beautiful Soup 3 alias.
	    return len(section.find_all())
No results found