Text extraction from websites, or web mining, is a major area of interest in today's connected world. The snippets below extract text content from web pages with Python tools. Some of the tools mentioned here take a more focused approach, pulling out only the main article text, whereas others return all of the visible text on a page.
1. Using BoilerPy3, a native Python port of Christian Kohlschütter's Boilerpipe, for automatically extracting the main textual content of a webpage based on shallow text features. It gives us the page title and its contents (a minimal sketch follows this list).
3. Using the google library to look up result links, along with the requests package to download each HTML page, and then bs4 to scrape the pages for content (SSL verification warnings are suppressed).
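For approach 1, a minimal BoilerPy3 sketch looks like this (the URL is a placeholder; ArticleExtractor returns a document object exposing the title and main content):

from boilerpy3 import extractors

extractor = extractors.ArticleExtractor()
doc = extractor.get_doc_from_url('https://example.com/article')  # placeholder URL
print(doc.title)    # page title
print(doc.content)  # main textual content

The rest of this section implements approach 3 end to end.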
from googlesearch import search
import requests
from requests.packages.urllib3.exceptions import InsecureRequestWarning
from bs4 import BeautifulSoup
from bs4.element import Comment
import numpy as np

# Disable displaying SSL verification warnings
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
# Utility function to pick a random user-agent from a local file
def get_random_ua():
    random_ua = None
    ua_file = 'ua_file.txt'
    try:
        # On Google Colab, the file contents can come from files.upload() instead:
        # ua_file_text = uploaded['ua_file.txt'].decode("utf-8")
        # lines = ua_file_text.split('\n')
        with open(ua_file) as f:
            lines = [line.strip() for line in f if line.strip()]
        if lines:
            random_ua = lines[np.random.randint(len(lines))]
    except Exception:
        pass
    finally:
        return random_ua
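get_random_ua assumes ua_file.txt holds one user-agent string per line; hypothetical example contents:

Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36
Mozilla/5.0 (X11; Linux x86_64; rv:95.0) Gecko/20100101 Firefox/95.0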
# Utility function to pick a random delay (in seconds) between requests
def get_random_delay():
    delay = 2.0
    try:
        random_num = np.random.uniform(2, 3)
        delay = round(random_num, 4)
    except Exception:
        pass
    finally:
        return delay
# Get the top result URLs from Google for a query
def google_search(query, num_results=None):
    def empty():  # Empty generator, used as a safe fallback
        yield from ()
    results = empty()
    try:
        results = search(
            query, tld='co.in', lang='en',
            start=0, stop=num_results,
            pause=get_random_delay(),
            user_agent=get_random_ua()
        )
    except Exception:
        pass
    finally:
        return results
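A quick way to sanity-check google_search on its own (results come live from Google, so they vary and may be empty if requests are throttled):

for url in google_search("python web scraping", num_results=3):
    print(url)  # prints up to three result URLs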
# Helper function to download the HTML page of a site
def download_site(url, session):
    html_page = None
    headers = {'user-agent': get_random_ua()}
    try:
        with session.get(url, headers=headers,
                         timeout=3.5, verify=False) as response:
            if response.status_code == 200:
                html_page = response.text
    except requests.exceptions.RequestException:
        pass
    finally:
        return html_page
# Retrieve HTML responses from all sites, reusing one session
def download_all_sites(sites):
    all_html_pages = []
    with requests.Session() as session:
        for url in sites:
            html_page = download_site(url, session)
            if html_page is not None:
                all_html_pages.append(html_page)
    return all_html_pages
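download_all_sites fetches pages sequentially. As a hedged alternative (not part of the original code), the downloads can be parallelized with a thread pool; each worker opens its own session here, since requests.Session is not guaranteed to be thread-safe:

from concurrent.futures import ThreadPoolExecutor

def download_all_sites_parallel(sites, max_workers=4):
    def fetch(url):  # Fetch one URL with a dedicated session
        with requests.Session() as session:
            return download_site(url, session)
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        pages = executor.map(fetch, list(sites))
    return [page for page in pages if page is not None]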
# Helper function to extract the visible text from a web page's source code
def text_from_html(html):
    try:
        soup = BeautifulSoup(html, 'lxml')
    except Exception:
        return None

    def tag_visible(element):  # Helper function to filter out non-content HTML tags
        blacklist = ['style', 'label', '[document]', 'embed', 'img', 'object',
                     'noscript', 'header', 'html', 'iframe', 'audio', 'picture',
                     'meta', 'title', 'aside', 'footer', 'svg', 'base', 'figure',
                     'form', 'nav', 'head', 'link', 'button', 'source', 'canvas',
                     'br', 'input', 'script', 'wbr', 'video', 'param', 'hr']
        if element.parent.name in blacklist:
            return False
        if isinstance(element, Comment):
            return False
        return True

    if soup.body is None:  # Pages without a <body> yield no text
        return None
    texts = soup.body.find_all(string=True)
    visible_texts = filter(tag_visible, texts)
    return " ".join(t.strip() for t in visible_texts)
# Extract textual content from all pages
def extract_text(html_pages):
    textual_contents = []
    for page_html in html_pages:
        page_text = text_from_html(page_html)
        if page_text is not None:
            textual_contents.append(page_text)
    return textual_contents
# Get a list of relevant text documents for the input query
def fetch_text_results(query):
    text_results = []
    sites = google_search(query, num_results=5)  # Obtain the top 5 URLs
    html_pages = download_all_sites(sites)       # Get HTML from the URLs
    if html_pages:
        text_results = extract_text(html_pages)  # Extract texts from the HTML
    return text_results
# Get an agglomerated string from multiple relevant documents for a query
def fetch_merged_results(query):
    results_list = fetch_text_results(query)
    results_string = "\n".join(results_list)  # Join the list into one string
    return results_string
query_string = "Who is the Prime Minister of India"
results = fetch_merged_results(query_string)
if not results:
    print("NA")
else:
    print(results)