Created
July 25, 2015 15:07
-
-
Save pjha1994/f662ea5b49f128333426 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
from bs4 import BeautifulSoup | |
from bs4 import SoupStrainer | |
import os | |
import httplib2 | |
# Module-level counter, initialised to zero; nothing in the visible code
# ever changes its value.
c = 0
# Loose "does this look like a URL?" heuristic: a scheme, a "www." prefix, or
# one of a few domain suffixes anywhere in the string. The dots are escaped
# so they match a literal "." rather than any character.
_URL_HINT_RE = re.compile(r'https://|http://|www\.|\.com|\.in|\.org|gov\.in')


def make_soup(s):
    """Fetch *s* over HTTP and return the parsed page.

    Args:
        s: Candidate URL string.

    Returns:
        A ``BeautifulSoup`` document built from the response body, or
        ``None`` when *s* does not look like a URL (no request is made in
        that case).
    """
    if not _URL_HINT_RE.search(s):
        return None
    http = httplib2.Http()
    # httplib2 hands back a (response, content) pair; only the body is parsed.
    _response, content = http.request(s)
    # Name the parser explicitly so the resulting tree does not depend on
    # which optional parsers happen to be installed.
    return BeautifulSoup(content, 'html.parser')
def is_a_valid_link(href):
    """Return True when *href* looks like an /r/news comment-thread URL.

    A link qualifies when it contains an ``http://`` or ``https://`` scheme,
    contains the ``/r/news/comments/`` path, and does not contain
    ``modpost``. All three checks are substring tests anywhere in *href*.

    Args:
        href: The link target to classify.

    Returns:
        ``True`` or ``False`` (always a bool).
    """
    has_scheme = 'http://' in href or 'https://' in href
    return (
        has_scheme
        and '/r/news/comments/' in href
        and 'modpost' not in href
    )
def parse(s):
    """Fetch the page at *s* and print the timestamp tags found in each title link.

    The page is loaded with ``make_soup``. For every ``<a>`` selected by the
    class filter below, the ``<time class="live-timestamp">`` tags that carry
    both ``datetime`` and ``title`` attributes are collected and the
    resulting list is printed.

    Args:
        s: URL of the page to scrape.

    Returns:
        None. Nothing is printed when ``make_soup`` returns ``None``.
    """
    soup = make_soup(s)
    if soup is None:
        return

    # NOTE(review): ' may-blank' has a leading space; bs4 compares against
    # whitespace-split class tokens, so this entry presumably never matches.
    # Left as written -- confirm the intended class filter.
    title_links = soup.find_all(
        'a', attrs={'class': ['title', ' may-blank', 'loggedin']}
    )
    for link in title_links:
        # Search from the current anchor. ``soup.link`` would instead resolve
        # to the document's first <link> tag (or None), independent of the loop.
        # NOTE(review): if the timestamp sits in a sibling element rather than
        # inside the anchor, this needs link.find_next(...) -- confirm against
        # the page markup.
        x = link.find_all(
            'time', class_='live-timestamp', datetime=True, title=True
        )
        print(x)
def read_reddit_images():
    """Scrape the /r/news front page and print what ``parse`` extracts from it."""
    # parse() downloads the page itself, so fetching it here as well would
    # only issue a second, discarded request.
    parse('https://www.reddit.com/r/news/')
# Kick off the scrape as soon as the module is loaded.
read_reddit_images()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment