import urllib2
import re
import datetime
import time

from bs4 import BeautifulSoup
from pymongo import MongoClient
from pymongo.errors import ConnectionFailure, DuplicateKeyError
DB_NAME = 'news_aggregator'

try:
    # MongoClient connects lazily in pymongo 3+, so force one round trip here
    # to fail fast if the server is unreachable (the timeout is arbitrary).
    client = MongoClient(serverSelectionTimeoutMS=2000)
    client.server_info()
    DATABASE = client[DB_NAME]
except ConnectionFailure:
    print "Connection Error"
    raise
# BASE_URL = [
#     "http://www.datatau.com/x?fnid=bbwdy1hKwE",
#     "http://www.datatau.com/x?fnid=3W6vX6uCp9",
#     "http://www.datatau.com/x?fnid=yrjPOamlb3",
#     "http://www.datatau.com/x?fnid=NC98ffV5Lx",
#     "http://www.datatau.com/x?fnid=3Pv8lcbswJ",
#     "http://www.datatau.com/x?fnid=LZLi7SprVG",
#     "http://www.datatau.com/"
# ]

URL = "http://www.datatau.com"

def news_scraper():
    # for url in BASE_URL:
    page = urllib2.urlopen(URL).read()
    # Name the parser explicitly (lxml, which the gist already depends on)
    # so bs4 does not warn and parsing is deterministic.
    soup = BeautifulSoup(page, 'lxml')
    # DataTau (like HN) renders two 'title' cells per story row: the rank
    # number first, then the cell that actually holds the link.
    news = soup('td', {'class': 'title'})
    links = []
    for i in xrange(len(news)):
        if i % 2 == 1:  # odd-indexed cells hold the anchor
            href = news[i].a.get('href').encode('utf-8')
            # Collapse runs of characters outside this allow-list into a space.
            title = re.sub(r"[^A-Za-z0-9:+*%$#@!&?=/,|_.'\"()\[\]-]+", ' ',
                           news[i].a.string.encode('utf-8'))
            links.append([href, title])

    count = 1
    # links.reverse()
    for link in links:
        try:
            print "\n"
            print "Description --> {}".format(link[1])
            print "Link ---------> {}".format(link[0])
            choice = raw_input("Do you want to save the link? (1 = yes) : ")
            if choice.strip() == '1':
                category = raw_input("Enter the category : ")
                data = {
                    # Using the URL as _id makes re-runs skip already-saved
                    # links via DuplicateKeyError.
                    "_id": link[0],
                    "description": link[1],
                    "category": category,
                    "createdAt": datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'),
                    "score": 1,
                    "likes": 1,
                    "user": "admin"
                }
                DATABASE.articles.insert_one(data)
        except DuplicateKeyError:
            print "{}. {} was not inserted into the DB".format(count, link[1])
        count = count + 1

if __name__ == '__main__':
    news_scraper()
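
A minimal sketch of reading the saved links back out for review, assuming the same local MongoDB and the news_aggregator/articles names used above (the list_saved helper is hypothetical, not part of the gist):

from pymongo import MongoClient

def list_saved(category=None):
    # Fetch saved articles, newest first, optionally filtered by category.
    # Sorting on the 'createdAt' string works because the format above
    # ('%Y-%m-%d %H:%M:%S') orders lexicographically.
    db = MongoClient()['news_aggregator']
    query = {'category': category} if category is not None else {}
    for doc in db.articles.find(query).sort('createdAt', -1):
        print "{} -> {}".format(doc['description'], doc['_id'])

if __name__ == '__main__':
    list_saved()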