import re
from collections import defaultdict

import lxml.html.clean
import tldextract
from bs4 import BeautifulSoup


def extract_body_information(data, url):
    """
    Takes raw HTML data and the final URL of the response as input and
    returns the plain text, headings, social media accounts and internal
    links found on the page.
    """
    # Use lxml's built-in cleaner to strip javascript code, unsafe attributes
    # and other HTML irregularities
    clean_html = lxml.html.clean.clean_html(data)
    soup = BeautifulSoup(clean_html, 'html.parser')
    # Extract headings from the cleaned HTML
    headings = defaultdict(list)
    for tag in ["h1", "h2", "h3", "h4"]:
        for match in soup.find_all(tag):
            headings[tag].append(match.text)
    # Strip the remaining tags to get the plain text of the page
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, '', clean_html)
    # Extract the domain from the URL to identify inward-pointing links
    td = tldextract.extract(url)
    domain = td.registered_domain
    sm_sites = ['twitter.com', 'facebook.com', 'linkedin.com']
    # Create a soup of the raw data for collecting links
    soup = BeautifulSoup(data, 'html.parser')
    all_links = soup.find_all('a', href=True)
    social_media_accounts = defaultdict(list)
    links_to_follow = []
    for link in all_links:
        href = link.attrs['href']
        # Links pointing to social media addresses are collected per site
        for sm_site in sm_sites:
            if sm_site in href:
                social_media_accounts[sm_site].append(href)
        # Links containing the domain of the page are kept for further crawling
        if domain in href:
            links_to_follow.append(href)
    return cleantext, headings, social_media_accounts, links_to_follow
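
# A minimal usage sketch, assuming the page is fetched with requests; the URL
# below is a placeholder and the fetch itself is not part of the original gist.
import requests

response = requests.get("https://example.com")
text, headings, social_media_accounts, links_to_follow = extract_body_information(
    response.text, response.url
)
print(headings["h1"])
print(social_media_accounts)
print(links_to_follow[:10])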