Skip to content

Instantly share code, notes, and snippets.

@arifsuhan
Last active September 20, 2022 14:38
Show Gist options
  • Save arifsuhan/be172e9d131904639c1dd491f8d49c1d to your computer and use it in GitHub Desktop.
Save arifsuhan/be172e9d131904639c1dd491f8d49c1d to your computer and use it in GitHub Desktop.
Newspaper parser using Beautiful Soup
import re, json
import requests, sys
from bs4 import BeautifulSoup
from urllib.parse import urlparse
class Parse_Soup:
    """Download a web page and parse it into a BeautifulSoup tree.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before ``requests`` gives
            up. Defaults to 10.
    """

    def __init__(self, url, timeout=10):
        self.url = url
        # Placeholder attribute; nothing in this class reassigns it.
        self.soup = ""
        self.timeout = timeout

    def get_soup(self):
        """Fetch ``self.url`` and return the parsed document.

        Returns:
            A ``BeautifulSoup`` object built from the raw response body
            using the ``html.parser`` backend.

        Raises:
            requests.exceptions.RequestException: If the request fails or
                the server does not answer within ``self.timeout`` seconds.
        """
        # Without an explicit timeout, requests waits indefinitely on an
        # unresponsive server and the whole script hangs.
        page = requests.get(self.url, timeout=self.timeout)
        return BeautifulSoup(page.content, 'html.parser')
class Parse_News:
    """Extract headline links and titles from a news listing page.

    The scraper is driven by a ``tags`` mapping. The methods below read
    these keys from it:

    * ``url`` - page to download.
    * ``base`` - selector of the element that wraps one headline.
    * ``details`` - mapping with an ``href`` entry (``find`` flag plus a
      ``tag`` selector) and a ``title`` entry (``tag`` selector plus
      ``type``).

    A selector is a bare tag name, a one-element list, or a
    ``[name, attrs]`` pair.
    """

    def __init__(self, tags):
        self.url = tags['url']
        self.tags = tags

    @staticmethod
    def _has_attrs(tag):
        """Return True when ``tag`` is a ``[name, attrs]`` style selector."""
        # A plain string is a tag name, not a sequence to index into:
        # "li" must not be read as name "l" with attrs "i".
        return not isinstance(tag, str) and len(tag) > 1

    def get_all(self, soup, tag):
        """Return every element under ``soup`` that matches selector ``tag``."""
        if self._has_attrs(tag):
            return soup.find_all(tag[0], attrs=tag[1])
        return soup.find_all(tag)

    def get_one(self, soup, tag):
        """Return the first element under ``soup`` matching ``tag``.

        Whatever ``soup.find`` returns is passed through unchanged.
        """
        if self._has_attrs(tag):
            return soup.find(tag[0], attrs=tag[1])
        return soup.find(tag)

    def get_details(self, obj):
        """Collect the link and title of one headline element.

        Args:
            obj: Element matched by the ``base`` selector.

        Returns:
            A dict that may contain ``link`` and ``title``. Extraction is
            best-effort: if an element, attribute or config key is missing,
            the fields gathered up to that point are returned.
        """
        from urllib.parse import urljoin

        temp = {}
        temp_tag = self.tags["details"]
        try:
            # href
            href_cfg = temp_tag['href']
            if href_cfg["find"]:
                # Pass the selector as configured (same as the title branch)
                # so an attrs filter in the config is honoured.
                anchor = self.get_one(obj, href_cfg['tag'])
                # urljoin keeps the scheme, resolves relative paths against
                # the page URL and leaves absolute hrefs untouched.
                temp['link'] = urljoin(self.url, anchor['href'])
            else:
                temp['link'] = obj['href']

            # title
            title_cfg = temp_tag['title']
            if title_cfg['type'] == 'text':
                temp['title'] = self.get_one(obj, title_cfg['tag']).text
        except (AttributeError, IndexError, KeyError, TypeError):
            # A missing element (None), attribute or config key ends the
            # extraction for this headline; keep what was already found.
            pass
        return temp

    def get_headlines(self, data):
        """Return ``get_details`` applied to every element in ``data``."""
        return [self.get_details(x) for x in data]

    def run(self):
        """Download the configured page and return its headline dicts."""
        soup = Parse_Soup(self.url).get_soup()
        data = self.get_all(soup, self.tags['base'])
        return self.get_headlines(data)
def read_json(filename):
    """Load a JSON document from ``filename``.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        The decoded JSON value, or an empty list when the file cannot be
        opened or does not contain valid JSON.
    """
    try:
        # Decode as UTF-8 explicitly so the result does not depend on the
        # platform's default encoding.
        with open(filename, 'r', encoding='utf-8') as file:
            return json.load(file)
    except (OSError, ValueError):
        # Best-effort read: unreadable files (OSError) and malformed or
        # undecodable content (JSONDecodeError and UnicodeDecodeError are
        # both ValueError subclasses) fall back to an empty list.
        return []
from glob import glob

# Driver: scrape the site described by the first JSON config in ``folder/``.
# glob() makes no ordering guarantee, so sort to pick the same config on
# every run.
files = sorted(glob("folder/*.json"))
if not files:
    # Indexing files[0] on an empty list would raise IndexError.
    print("No JSON config files found in folder/", file=sys.stderr)
else:
    tag = read_json(files[0])
    if not tag:
        # read_json returns [] when the file is unreadable or invalid;
        # Parse_News cannot be built from that.
        print("Could not load a scraper config from " + files[0], file=sys.stderr)
    else:
        obj = Parse_News(tag)
        data = obj.run()
        print(data)
{
"url" : "https://www.bd-pratidin.com/online/todaynews",
"base" : ["li", {"class" : "bi-caret-right-fill"}],
"details" : {
"href" : {
"find" : true,
"tag" : ["a"]
},
"title" :{
"tag" : ["a"],
"type" : "text"
}
}
}
{
"url" : "https://www.kalerkantho.com/recent",
"base" : ["li", {"class" : "col-xs-6"}],
"details" : {
"href" : {
"find" : true,
"tag" : ["a"],
"call" : ["href"]
},
"title" :{
"tag" : ["a"],
"type" : "text"
},
"time": {
"tag" : ["small"],
"type" : "text"
}
}
}
{
"url" : "https://www.prothomalo.com/collection/latest",
"base" : ["a", {"class" : "card-with-image-zoom"}],
"details" : {
"href" : {
"find" : false,
"call" : "href"
},
"title" :{
"tag" : ["span", {"class":"tilte-no-link-parent"}],
"type" : "text"
},
"time": {
"tag" : ["time", {"class":"published-time"}],
"type" : "text"
}
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment