Skip to content

Instantly share code, notes, and snippets.

@manashmandal
Created July 23, 2016 06:07
Show Gist options
  • Save manashmandal/c527ac7e89fd63985c38165fd83a2bc5 to your computer and use it in GitHub Desktop.
Save manashmandal/c527ac7e89fd63985c38165fd83a2bc5 to your computer and use it in GitHub Desktop.
from bs4 import BeautifulSoup
import urllib2
class News(object):
    """Container for the fields of one news article.

    Every field may be passed to the constructor or assigned later through
    the matching ``set_*`` method.  A bare ``News()`` starts with ``id`` as
    ``None`` and every other field as an empty string.
    """

    def __init__(self, id=None, title='', subtitle='', reporter='',
                 published_date='', updated_date='', article=''):
        """Store the given field values; all arguments are optional."""
        self.id = id
        self.title = title
        self.subtitle = subtitle
        self.reporter = reporter
        self.published_date = published_date
        self.updated_date = updated_date
        self.article = article

    def __repr__(self):
        # Only id and title are shown so the repr stays on one line.
        return '%s(id=%r, title=%r)' % (
            type(self).__name__, self.id, self.title)

    def set_id(self, id):
        """Replace the ``id`` attribute."""
        self.id = id

    def set_subtitle(self, subtitle):
        """Replace the ``subtitle`` attribute."""
        self.subtitle = subtitle

    def set_title(self, title):
        """Replace the ``title`` attribute."""
        self.title = title

    def set_reporter(self, reporter):
        """Replace the ``reporter`` attribute."""
        self.reporter = reporter

    def set_published_date(self, published_date):
        """Replace the ``published_date`` attribute."""
        self.published_date = published_date

    def set_article(self, article):
        """Replace the ``article`` attribute."""
        self.article = article

    def set_updated_date(self, updated_date):
        """Replace the ``updated_date`` attribute."""
        self.updated_date = updated_date
# Text file read below, one link per line.  Raw string: the backslashes are
# literal path separators, and without the r prefix "\U" is a syntax error
# on Python 3.
file_link = r'C:\Users\Manash\Desktop\dataset_garment.txt'

# Collect the links without their line terminators (a trailing newline would
# otherwise end up inside the requested URL); blank lines are skipped.
with open(file_link) as links_file:
    news_links = [line.strip() for line in links_file if line.strip()]

if not news_links:
    # Same exception type as indexing an empty list, with a usable message.
    raise IndexError('no links found in %s' % file_link)

# Only the first link is fetched.
testlink = news_links[0]
print(testlink)

site = str(testlink)
# Send a browser-like User-Agent header with the request.
hdr = {'User-Agent': 'Mozilla/5.0'}
req = urllib2.Request(site, headers=hdr)
page = urllib2.urlopen(req)
try:
    soup = BeautifulSoup(page, 'lxml')
finally:
    # Release the HTTP connection even if parsing fails.
    page.close()
print(soup)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment