Skip to content

Instantly share code, notes, and snippets.

@davidqo
Created July 4, 2017 03:31
Show Gist options
  • Select an option

  • Save davidqo/559f3914d80d6f5467560e25d3cdc75f to your computer and use it in GitHub Desktop.

Select an option

Save davidqo/559f3914d80d6f5467560e25d3cdc75f to your computer and use it in GitHub Desktop.
from bs4 import BeautifulSoup, Comment
# Minimum number of characters a candidate area must accumulate before
# it is accepted as a text block by mine().
MIN_TEXT_SIZE = 1000
def mine(page):
    """Extract the large text areas of an HTML *page*.

    Strategy: repeatedly take the <p> element with the most text and
    climb to its ancestors until at least MIN_TEXT_SIZE characters of
    pure text are gathered (or the document root is reached).  The
    winning subtree is then decomposed (removed from the soup) so that
    subsequent iterations discover fresh areas.

    :param page: HTML document (string/bytes) parsed with the lxml parser.
    :returns: list of ``(pure_text, tag_count)`` tuples, where
        ``tag_count`` is the number of tags inside the extracted subtree.
    """
    soup = BeautifulSoup(page, "lxml")
    text_areas_list = []
    section_list = soup.findAll('p')
    while section_list:
        # max() with a key is O(n); the original sorted the whole list
        # only to take its first element.
        biggest_section = max(section_list, key=lambda p: len(p.text))
        # Climb towards the root until enough text is accumulated.
        while True:
            pure_text = extract_text(biggest_section.contents)
            parent_section = biggest_section.parent
            if len(pure_text) >= MIN_TEXT_SIZE or parent_section is None:
                # Enough text collected (or nowhere left to climb):
                # record the subtree's tag count and remove it from the
                # soup so the next pass finds a different area.
                tag_count = count_tags(biggest_section)
                biggest_section.decompose()
                break
            # Not enough text yet — widen the search to the parent.
            biggest_section = parent_section
        text_areas_list.append((pure_text, tag_count))
        # Re-query: decompose() above removed the consumed <p> elements.
        section_list = soup.findAll('p')
    return text_areas_list
def extract_text(content):
    """Recursively collect the visible text beneath *content*.

    *content* may be ``None``, a list of nodes (as in ``tag.contents``),
    a plain/unicode string, a bs4 Comment, or a tag.  Script bodies and
    HTML comments contribute nothing.  Returns a unicode string in which
    each non-empty fragment ends with a newline (see prepare_string).
    """
    if content is None:
        return u''
    # A list of nodes: concatenate the text of every child.
    if isinstance(content, list):
        return u''.join(extract_text(node) for node in content)
    # HTML comments carry no visible text.  Comment subclasses the
    # string type, so it must be tested before the plain-string case.
    if isinstance(content, Comment):
        return u''
    # Text nodes / plain strings: normalise and return.  Checked before
    # the .name access below so a bare string can never raise
    # AttributeError there.
    if isinstance(content, (str, unicode)):
        return prepare_string(content)
    # <script> bodies are code, not readable text.
    if content.name == "script":
        return u''
    # An ordinary tag: recurse into its children (empty contents
    # naturally yields u'' via the list branch above).
    return extract_text(content.contents)
def prepare_string(string):
    """Trim surrounding whitespace; terminate non-empty results with a newline.

    Whitespace-only input collapses to the empty string (no newline added).
    """
    trimmed = string.strip()
    return trimmed + '\n' if trimmed else trimmed
def count_tags(section):
    """Return how many descendant tags *section* contains."""
    descendants = section.findAll()
    return len(descendants)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment