Created
July 4, 2017 03:31
-
-
Save davidqo/559f3914d80d6f5467560e25d3cdc75f to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from bs4 import BeautifulSoup, Comment | |
| # Minimum length, in characters, of the text extracted for one area; mine() climbs to the parent element while the collected text is shorter than this. | |
| MIN_TEXT_SIZE = 1000 | |
def mine(page):
    """Split an HTML page into text areas grown around its longest paragraphs.

    The page is parsed with BeautifulSoup (``lxml`` parser). While the parsed
    tree still contains ``<p>`` elements, the one with the longest text is
    taken as a starting point. If the text extracted from it is shorter than
    ``MIN_TEXT_SIZE``, the search moves to its parent, and so on, until enough
    text has been collected or there is no parent left. The element reached
    is then removed from the tree so that later iterations do not see it.

    Args:
        page: HTML markup handed to ``BeautifulSoup``.

    Returns:
        A list of ``(text, tag_count)`` tuples in extraction order, where
        ``text`` is the result of ``extract_text`` for the chosen element and
        ``tag_count`` is the result of ``count_tags`` for it, taken before the
        element is removed.
    """
    soup = BeautifulSoup(page, "lxml")
    text_areas_list = []
    paragraphs = soup.find_all('p')
    while paragraphs:
        # max() keeps the first of several equally long paragraphs, which is
        # the same element a stable descending sort would put in front.
        section = max(paragraphs, key=lambda p: len(p.text))
        pure_text = extract_text(section.contents)
        # Not enough text collected on this level: go to the parent element
        # for as long as one exists.
        while len(pure_text) < MIN_TEXT_SIZE and section.parent is not None:
            section = section.parent
            pure_text = extract_text(section.contents)
        # Count the tags first, decompose() destroys the element's subtree.
        tag_count = count_tags(section)
        section.decompose()
        text_areas_list.append((pure_text, tag_count))
        # The removed element was a <p> or one of its ancestors, so this list
        # shrinks on every pass and the loop terminates.
        paragraphs = soup.find_all('p')
    return text_areas_list
def extract_text(content):
    """Collect the text found in a BeautifulSoup node or a list of nodes.

    Args:
        content: ``None``, a list of nodes, a text node or plain string, or a
            tag whose ``contents`` are walked recursively.

    Returns:
        The text fragments, each one passed through ``prepare_string``,
        concatenated in document order. ``None``, comments and ``<script>``
        elements contribute an empty string.
    """
    if content is None:
        return u''
    if isinstance(content, list):
        # A list of sibling nodes: concatenate the text of every sub-node.
        return u''.join(extract_text(node) for node in content)
    # Comments have to be filtered out before the generic string check,
    # because bs4 comment nodes are string objects themselves.
    if isinstance(content, Comment):
        return u''
    # type(u'') is ``unicode`` on Python 2 and ``str`` on Python 3, so the
    # check works on both without naming the Python 2-only builtin. Strings
    # are handled before ``content.name`` is touched, so a plain string that
    # has no ``name`` attribute is accepted as well.
    if isinstance(content, (str, type(u''))):
        return prepare_string(content)
    if content.name == "script":
        return u''
    # Any other node: descend into its children (an empty list yields u'').
    return extract_text(content.contents)
def prepare_string(string):
    """Trim surrounding whitespace and terminate non-empty text with a newline.

    Args:
        string: The text fragment to normalise.

    Returns:
        The stripped text followed by ``'\\n'``, or the empty stripped text
        unchanged when nothing but whitespace was passed in.
    """
    trimmed = string.strip()
    if not trimmed:
        return trimmed
    return trimmed + '\n'
def count_tags(section):
    """Return the number of tags nested anywhere inside *section*.

    Args:
        section: A BeautifulSoup element providing ``find_all``.

    Returns:
        The length of the result of ``section.find_all()``; called without a
        filter it matches every descendant tag, so *section* itself is not
        counted.
    """
    # find_all is the current bs4 name for the BS3-era ``findAll`` alias.
    return len(section.find_all())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment