Skip to content

Instantly share code, notes, and snippets.

@manashmandal
Created August 14, 2016 15:33
Show Gist options
  • Save manashmandal/09eef2925d738865dcb5296219e349e9 to your computer and use it in GitHub Desktop.
Save manashmandal/09eef2925d738865dcb5296219e349e9 to your computer and use it in GitHub Desktop.
# -*- coding: utf-8 -*-
"""
Created on Sun Aug 14 20:37:16 2016
@author: Manash
"""
from bs4 import BeautifulSoup
import urllib2
import unicodedata
from datetime import datetime
# Scrape the Prothom Alo (English) archive page, open the first article linked
# from it, and print: the article URL, its publication time in 12-hour and
# 24-hour form, and the article body reduced to plain ASCII.
#
# NOTE: this is a Python 2 script (urllib2). The single-argument print(...)
# calls below behave identically to the print statement under Python 2.

# Archive listing page and the prefix prepended to each href found on it.
news_paper_url = "http://en.prothom-alo.com/archive"
baseurl = 'http://en.prothom-alo.com/'

# Fetch and parse the archive page; close the HTTP response once parsed.
prothom_alo_request = urllib2.urlopen(news_paper_url)
try:
    prothomalo_soup = BeautifulSoup(prothom_alo_request, 'lxml')
finally:
    prothom_alo_request.close()

# Collect every link inside the archive list container.
archive_list = prothomalo_soup.find('div', {'class' : 'list list_square mb20'})
if archive_list is None:
    # Fail with a readable message instead of AttributeError on None.
    raise RuntimeError("Archive link list not found on " + news_paper_url)
# href=True skips anchors that carry no href attribute (would raise KeyError).
# NOTE(review): hrefs are presumably relative without a leading slash, since
# they are concatenated directly onto baseurl -- confirm against the site.
news_links = [baseurl + link['href'] for link in archive_list.findAll('a', href=True)]
if not news_links:
    raise RuntimeError("No news links found on " + news_paper_url)

# Select the first news item and show its URL.
first_news = news_links[0]
print(first_news)

# Fetch and parse the first article. The parser is named explicitly so both
# pages are parsed the same way and bs4 does not pick one per environment.
news_request = urllib2.urlopen(first_news)
try:
    first_news_soup = BeautifulSoup(news_request, 'lxml')
finally:
    news_request.close()

# Extract the article body and the publication stamp via their itemprop tags.
article_body = first_news_soup.find(itemprop="articleBody")
date_published = first_news_soup.find(itemprop="datePublished")
if article_body is None or date_published is None:
    raise RuntimeError("Article body or publication date not found on " + first_news)
news_text = article_body.get_text()
news_update = date_published.get_text()

# The first five characters of the stamp are parsed as a 24-hour HH:MM time
# (assumes the site's text starts with the time -- strptime raises ValueError
# otherwise) and re-rendered as a 12-hour clock with AM/PM.
time = datetime.strptime(news_update[:5], "%H:%M")
convtime = time.strftime("%I:%M %p")
print(convtime)
print(news_update[:5])

# NFKD-decompose the text, then drop every character that is not ASCII.
print(unicodedata.normalize('NFKD', news_text).encode('ascii','ignore'))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment