Last active
September 20, 2022 14:38
-
-
Save arifsuhan/be172e9d131904639c1dd491f8d49c1d to your computer and use it in GitHub Desktop.
Newspaper parser using Beautiful Soup
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json
import re
import sys
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup
class Parse_Soup:
    """Download a web page and parse it into a BeautifulSoup tree."""

    def __init__(self, url, timeout=10):
        """Remember which page to fetch.

        Args:
            url: Address of the page to download.
            timeout: Seconds ``requests`` waits for the server before giving
                up. Without a timeout a stalled server blocks the caller
                indefinitely.
        """
        self.url = url
        self.timeout = timeout
        # Empty string until get_soup() has run; afterwards the parsed tree.
        self.soup = ""

    def get_soup(self):
        """Fetch ``self.url`` and return the parsed document.

        The response body is parsed with the built-in ``html.parser``
        backend and the resulting tree is also kept on ``self.soup``.

        Returns:
            The BeautifulSoup tree for the downloaded page.

        Raises:
            requests.RequestException: If the request fails or times out.
        """
        page = requests.get(self.url, timeout=self.timeout)
        self.soup = BeautifulSoup(page.content, 'html.parser')
        return self.soup
class Parse_News:
    """Extract headline links and titles from a news listing page.

    The scraper is driven by a ``tags`` mapping with these entries:

    * ``url``     - address of the listing page.
    * ``base``    - tag spec selecting one element per headline.
    * ``details`` - mapping whose ``href`` and ``title`` entries describe
      where the link and the title live inside each headline element.

    A *tag spec* is a list ``[name]`` or ``[name, attrs]``; a bare string
    name is accepted as well.
    """

    def __init__(self, tags):
        """Store the configuration; ``tags['url']`` must be present."""
        self.url = tags['url']
        self.tags = tags

    def get_all(self, soup, tag):
        """Return every element under *soup* that matches the tag spec."""
        # A bare string has a length too, so it must be ruled out before the
        # length test or "li" would be split into name "l" and attrs "i".
        if isinstance(tag, str) or len(tag) <= 1:
            return soup.find_all(tag)
        return soup.find_all(tag[0], attrs=tag[1])

    def get_one(self, soup, tag):
        """Return the first element under *soup* matching the tag spec."""
        if isinstance(tag, str) or len(tag) <= 1:
            return soup.find(tag)
        return soup.find(tag[0], attrs=tag[1])

    def get_details(self, obj):
        """Build the ``link``/``title`` record for one headline element.

        Args:
            obj: Element selected by the ``base`` tag spec.

        Returns:
            A dict with ``link`` and ``title`` keys. Extraction is
            best-effort: when an element or config key is missing, the keys
            collected up to that point are returned (possibly none).
        """
        details = {}
        spec = self.tags["details"]
        try:
            href_spec = spec['href']
            if href_spec["find"]:
                # Pass the spec through unchanged so an [name, attrs] spec
                # keeps its attribute filter.
                anchor = self.get_one(obj, href_spec['tag'])
                # urljoin keeps the scheme and copes with both relative and
                # already-absolute hrefs.
                details['link'] = urljoin(self.url, anchor['href'])
            else:
                # The headline element itself carries the href.
                details['link'] = obj['href']

            title_spec = spec['title']
            if title_spec['type'] == 'text':
                details['title'] = self.get_one(obj, title_spec['tag']).text
        except (KeyError, IndexError, TypeError, AttributeError):
            # Missing config key, missing attribute, or no matching element
            # (find() returned None): keep whatever was gathered so far.
            pass
        return details

    def get_headlines(self, data):
        """Return one details record per element in *data*."""
        return [self.get_details(element) for element in data]

    def run(self):
        """Download the listing page and return its headline records."""
        soup = Parse_Soup(self.url).get_soup()
        elements = self.get_all(soup, self.tags['base'])
        return self.get_headlines(elements)
def read_json(filename):
    """Load and return the JSON document stored in *filename*.

    The file is read as UTF-8 rather than the platform's locale encoding.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        The decoded JSON value, or an empty list when the file cannot be
        opened or does not contain valid UTF-8 JSON.
    """
    try:
        with open(filename, 'r', encoding='utf-8') as file:
            return json.load(file)
    except (OSError, ValueError):
        # OSError: missing or unreadable file. ValueError covers both
        # json.JSONDecodeError and UnicodeDecodeError.
        return []
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from glob import glob

# Collect the scraper configs. glob() returns matches in arbitrary order, so
# sort them to make the choice of the "first" config reproducible.
files = sorted(glob("folder/*.json"))

# read_json() yields an empty list for an unreadable or invalid file, which
# Parse_News cannot index by key; treat that the same as a missing config.
tag = read_json(files[0]) if files else []
if not tag:
    raise SystemExit("No usable JSON config found in folder/")

# Scrape the configured listing page and show the extracted headlines.
obj = Parse_News(tag)
data = obj.run()
print(data)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"url" : "https://www.bd-pratidin.com/online/todaynews", | |
"base" : ["li", {"class" : "bi-caret-right-fill"}], | |
"details" : { | |
"href" : { | |
"find" : true, | |
"tag" : ["a"] | |
}, | |
"title" :{ | |
"tag" : ["a"], | |
"type" : "text" | |
} | |
} | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"url" : "https://www.kalerkantho.com/recent", | |
"base" : ["li", {"class" : "col-xs-6"}], | |
"details" : { | |
"href" : { | |
"find" : true, | |
"tag" : ["a"], | |
"call" : ["href"] | |
}, | |
"title" :{ | |
"tag" : ["a"], | |
"type" : "text" | |
}, | |
"time": { | |
"tag" : ["small"], | |
"type" : "text" | |
} | |
} | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"url" : "https://www.prothomalo.com/collection/latest", | |
"base" : ["a", {"class" : "card-with-image-zoom"}], | |
"details" : { | |
"href" : { | |
"find" : false, | |
"call" : "href" | |
}, | |
"title" :{ | |
"tag" : ["span", {"class":"tilte-no-link-parent"}], | |
"type" : "text" | |
}, | |
"time": { | |
"tag" : ["time", {"class":"published-time"}], | |
"type" : "text" | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment