Last active
September 30, 2018 18:33
-
-
Save AnthonyBloomer/75b5b0208e2ea046486c14fd77ee2f32 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from slideshare import Slideshare | |
from pymongo import MongoClient | |
import argparse | |
if __name__ == '__main__':
    # Command-line entry point: scrape successive result pages for the given
    # topic and insert one MongoDB document per slideshow.
    parser = argparse.ArgumentParser()
    parser.add_argument('topic')
    args = parser.parse_args()

    client = MongoClient('mongodb://localhost:27017/')
    db = client.slideshare

    has_pages = True
    page = 1
    while has_pages:
        # Number of documents stored from the current page; used to detect an
        # empty page so the loop cannot spin forever.
        stored = 0
        try:
            slideshare = Slideshare()
            slideshows = slideshare.scrape(topic=args.topic, page_num=page)
            for slideshow in slideshows:
                ss = {
                    'title': slideshow.title(),
                    'description': slideshow.description(),
                    'publish_date': slideshow.publish_date(),
                    'views': slideshow.views(),
                    'favourites': slideshow.favourites(),
                    'author': slideshow.author(),
                    'comments': slideshow.comments(),
                    'categories': slideshow.categories(),
                }
                db.slideshare.insert_one(ss)
                stored += 1
        except Exception as exc:
            # Any failure ends the crawl, as before, but Ctrl-C / SystemExit
            # now propagate and the cause is reported instead of being hidden.
            # NOTE(review): presumably scrape() raises once page_num runs past
            # the last page -- confirm against the Slideshare scraper.
            print('Stopping at page %d: %r' % (page, exc))
            has_pages = False
        else:
            # A page that yielded nothing means there is nothing left to fetch.
            has_pages = stored > 0
        page += 1

    # Parenthesized single-argument form prints identically on Python 2 and 3.
    print('Finished!')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class Slideshow(object):
    """Read-only accessors over the parsed markup of one slideshow page.

    ``soup`` is any object exposing ``find`` and ``find_all`` whose results
    carry a ``text`` attribute (the calls match the Beautiful Soup API).
    Every accessor returns ``None`` instead of raising when the element it
    looks for is absent.
    """

    def __init__(self, soup):
        self.soup = soup

    def _text(self, *find_args):
        """Return the raw text of the first element matching *find_args*.

        The arguments are forwarded unchanged to ``self.soup.find``. Returns
        ``None`` when nothing matches (or ``soup`` itself is ``None``).
        """
        try:
            return self.soup.find(*find_args).text
        except AttributeError:
            return None

    def title(self):
        """Return the stripped title text, or ``None`` if not found."""
        text = self._text('span', {'class': 'j-title-breadcrumb'})
        return text.strip() if text is not None else None

    def description(self):
        """Return the description with whitespace runs collapsed, or ``None``."""
        text = self._text('p', {'id': 'slideshow-description-paragraph'})
        return " ".join(text.split()) if text is not None else None

    def author(self):
        """Return the stripped author name, or ``None`` if not found."""
        text = self._text('a', {'class': 'j-author-name'})
        return text.strip() if text is not None else None

    def favourites(self):
        """Return the first token of the favourites text without commas.

        The value is returned as a string. Returns ``None`` when the element
        is missing or its text is blank.
        """
        text = self._text('span', {'class': 'j-favs-count'})
        tokens = text.split() if text is not None else []
        if not tokens:
            # Blank text has no first token to index.
            return None
        return tokens[0].replace(',', '')

    def views(self):
        """Return the stripped view-count text without commas, or ``None``."""
        text = self._text('div', {'class': 'stat-value'})
        return text.strip().replace(',', '') if text is not None else None

    def publish_date(self):
        """Return the stripped text of the first ``time`` element, or ``None``."""
        text = self._text('time')
        return text.strip() if text is not None else None

    def categories(self):
        """Return the stripped text of each link in the categories container.

        Returns ``None`` when the container element is not found.
        """
        try:
            container = self.soup.find('div', {'class': 'categories-container'})
            return [link.text.strip() for link in container.find_all('a')]
        except AttributeError:
            return None

    def comments(self):
        """Return comment texts with whitespace runs collapsed.

        The first matching element is always skipped.
        NOTE(review): presumably that first element is not a user comment --
        confirm against the page markup.
        Returns an empty list when there are no matching elements, and
        ``None`` when ``soup`` cannot be searched.
        """
        try:
            found = self.soup.find_all('div', {'class': 'commentText'})
            # Slicing tolerates an empty result, unlike advancing an iterator.
            return [" ".join(item.text.split()) for item in list(found)[1:]]
        except AttributeError:
            return None
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment