Skip to content

Instantly share code, notes, and snippets.

@manashmandal
Created July 23, 2016 06:36
Show Gist options
  • Save manashmandal/61845b5c411445fe3ab24f9fd1f661fc to your computer and use it in GitHub Desktop.
Save manashmandal/61845b5c411445fe3ab24f9fd1f661fc to your computer and use it in GitHub Desktop.
from bs4 import BeautifulSoup
import urllib2
from newspaper import Article
import csv
# Module-level list; presumably meant to hold the scraped News objects.
# NOTE(review): confirm that the scraping loop actually populates it.
scraped_news_list = []
class News(object):
    """Plain record describing one scraped news article.

    Inherits from ``object`` so that it is a new-style class on Python 2
    as well as on Python 3.

    Attributes:
        id: Identifier assigned by the caller; ``None`` until set.
        title: Headline text; empty string until set.
        subtitle: Sub-headline text; empty string until set.
        reporter: Reporter / author name; empty string until set.
        published_date: Publication date; empty string until set.
        updated_date: Last-update date; empty string until set.
        article: Body text of the article; empty string until set.
    """

    def __init__(self):
        """Create an empty record with every field at its default value."""
        self.id = None
        self.title = ''
        self.subtitle = ''
        self.reporter = ''
        self.published_date = ''
        self.updated_date = ''
        self.article = ''

    def __repr__(self):
        """Return a short debugging representation (id and title only)."""
        return 'News(id=%r, title=%r)' % (self.id, self.title)

    # The setters below are thin wrappers around plain attribute assignment.
    # They are kept so that existing callers continue to work.

    def set_id(self, id):
        """Store *id* as the record identifier.

        The parameter name shadows the ``id`` builtin; it is left unchanged
        so that keyword callers (``set_id(id=...)``) are not broken.
        """
        self.id = id

    def set_subtitle(self, subtitle):
        """Store the sub-headline text."""
        self.subtitle = subtitle

    def set_title(self, title):
        """Store the headline text."""
        self.title = title

    def set_reporter(self, reporter):
        """Store the reporter / author name."""
        self.reporter = reporter

    def set_published_date(self, published_date):
        """Store the publication date."""
        self.published_date = published_date

    def set_article(self, article):
        """Store the body text of the article."""
        self.article = article

    def set_updated_date(self, updated_date):
        """Store the last-update date."""
        self.updated_date = updated_date
# Path to a text file holding one article URL per line.  A raw string keeps
# the Windows backslashes literal; a plain '\U...' is a syntax error on
# Python 3 and only works by accident on Python 2.
file_link = r'C:\Users\Manash\Desktop\dataset_garment.txt'

# Get news links: one URL per line.  Strip the trailing newline and skip
# blank lines so Article() never receives an empty or whitespace-padded URL.
news_links = []
with open(file_link) as link_file:
    for line in link_file:
        url = line.strip()
        if url:
            news_links.append(url)

# Download and parse every article, storing each result as a News record.
for i, link in enumerate(news_links):
    article = Article(link)
    article.download()
    article.parse()

    news = News()
    news.set_title(article.title)
    news.set_article(article.text)
    news.set_id(i)

    # Results belong in scraped_news_list; appending them to news_links
    # would mix News objects into the list of URL strings.
    scraped_news_list.append(news)

    # Single-argument print(...) behaves identically on Python 2 and 3.
    print(news.title)
    print("")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment