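"""Reddit listing scraper.

Gist pjha1994/05a852430427738f44c0, created December 10, 2015.

Walks the listing pages (hot, new, top, rising, controversial, gilded) of a
fixed set of subreddits and appends each post's title, link, timestamp,
author and score to a per-subreddit text file. Requires httplib2 and
beautifulsoup4. Note that the CSS class names targeted below (e.g. "thing",
"entry", "score unvoted") reflect Reddit's 2015-era listing markup and will
break if the HTML changes.
"""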
import re
import httplib2
from datetime import datetime
from bs4 import BeautifulSoup
from bs4 import SoupStrainer

c = 0  # running count of prompts written across all pages
def make_soup(s):
    # Fetch s and parse only its <div> tags to keep the tree small.
    # Dots in the URL pattern are escaped so they match literally
    # instead of acting as regex wildcards.
    match = re.compile(r'https://|http://|www\.|\.com|\.in|\.org|gov\.in')
    if match.search(s):
        http = httplib2.Http()
        response, content = http.request(s)
        return BeautifulSoup(content, "html.parser", parse_only=SoupStrainer('div'))
    else:
        return None
def test_internet():
    # Block until a request to google.com succeeds, i.e. until the
    # internet connection is available again.
    while True:
        try:
            make_soup("https://www.google.com")
            break
        except Exception:
            continue
def parse1(s):
    # Pull title, link, timestamp, author and score out of each post <div>
    # on the listing page and append a numbered PROMPT record to f.
    global c
    soup = make_soup(s)
    if soup is None:
        return
    for div in soup.find_all('div', class_=["thing", "id-t3_3ua12m", "linkflair",
                                            "linkflair-normal", "odd", "link"]):
        try:
            if div.p is None or div.p.next_sibling is None or div.p.next_sibling.next_sibling is None:
                continue
            node = div.p.next_sibling.next_sibling.next_sibling
            x = node['class']
            if x[0] == 'entry':
                element = '\nPROMPT ' + str(c + 1) + '\n'
                if node.p is not None and node.p.a is not None:
                    element += node.p.a.string + '\n'
                    element += node.p.a['href'] + '\n'
                tagline = node.find('p', {'class': 'tagline'})
                if tagline is not None and tagline.time is not None:
                    element += tagline.time['datetime'] + '\t'
                    element += tagline.time['title'] + '\t'
                    element += tagline.time.string + '\n'
                if tagline is not None and tagline.a is not None:
                    element += tagline.a.string + '\n'
                    element += tagline.text + '\n'
                if div.div.find('div', {'class': 'score likes'}) is not None:
                    element += 'score likes ' + div.div.find('div', {'class': 'score likes'}).string + '\t'
                    element += 'score dislikes ' + div.div.find('div', {'class': 'score dislikes'}).string + '\t'
                    element += 'score unvoted ' + div.div.find('div', {'class': 'score unvoted'}).string + '\n\n'
                f.write(element)
                c = c + 1
            elif x[0] == 'thumbnail':
                element = '\nPROMPT ' + str(c + 1) + '\n'
                entry = div.find('div', {'class': 'entry unvoted'})
                if entry is not None and entry.p is not None and entry.p.a is not None and entry.p.a.string is not None:
                    element += entry.p.a.string + '\n'
                    element += entry.p.a['href'] + '\n'
                    tagline = entry.find('p', {'class': 'tagline'})
                    if tagline is not None and tagline.time is not None:
                        element += tagline.time['datetime'] + '\t'
                        element += tagline.time['title'] + '\t'
                        element += tagline.time.string + '\n'
                    if tagline is not None and tagline.a is not None:
                        element += tagline.a.string + '\n'
                        element += tagline.text + '\n'
                scores = div.p.next_sibling.next_sibling
                if (scores.find('div', {'class': 'score likes'}) is not None
                        and scores.find('div', {'class': 'score dislikes'}) is not None
                        and scores.find('div', {'class': 'score unvoted'}) is not None):
                    element += 'score likes ' + scores.find('div', {'class': 'score likes'}).string + '\t\t'
                    element += 'score dislikes ' + scores.find('div', {'class': 'score dislikes'}).string + '\t\t'
                    element += 'score unvoted ' + scores.find('div', {'class': 'score unvoted'}).string + '\n'
                f.write(element)
                c = c + 1
        except Exception:
            # Malformed posts are skipped rather than aborting the page.
            print('ERROR')
            continue
def count_next_of_current(s, m):
    # Return the href of the "next page" link on page s, or None.
    test_internet()
    soup = make_soup(s)
    if soup is None:
        return None
    for link in soup.find_all('a', {'rel': ['next']}):
        return link['href']
    return None
def read_reddit_images(change_file_number, m, tab):
    # Scrape the first page plus up to maximum_number_of_next_pages "next"
    # pages of reddit.com/r/<m>/<tab> and append records to <m>_<tab>.txt.
    test_internet()
    global f
    f = open(m + '_' + tab + '.txt', 'a', encoding='utf-8')
    FORMAT = '%d-%m-%Y %H:%M:%S'
    f.write('\n\n\n\niteration number ' + str(change_file_number) + ' '
            + datetime.now().strftime(FORMAT) + '\n\n')
    maximum_number_of_next_pages = 7
    s = 'https://www.reddit.com/r/' + m + '/' + tab
    parse1(s)
    count = 0
    print('for ' + m + ' ' + tab + ' current page number is\n' + str(count))
    while count < maximum_number_of_next_pages:
        test_internet()
        s = count_next_of_current(s, m)
        if s is None:
            break
        parse1(s)
        count = count + 1
        print(count)
    f.write('\n\niteration number ' + str(change_file_number) + ' '
            + datetime.now().strftime(FORMAT) + '\n\n')
    f.close()
def maincall(m, i):
    # Scrape every listing tab of subreddit m.
    for tab in ('hot', 'new', 'top', 'rising', 'controversial', 'gilded'):
        read_reddit_images(i, m, tab)
def subs(b):
    # Bump the persistent run counter stored in mytext.txt (the file must
    # already contain an integer), then scrape every subreddit in b.
    test_internet()
    t = open('mytext.txt', 'r')
    i = t.read()
    t.close()
    t = open('mytext.txt', 'w')
    t.write(str(int(i) + 1))
    t.close()
    for k in b:
        test_internet()
        maincall(k, i)
def main():
    test_internet()
    # Major list, scraped once a week:
    b = ['24hoursupport','3amjokes','ADHD','AMA','AcademicPhilosophy','AcademicPsychology','Aerospace','Android','AndroidQuestions','Anger','Anxiety',
         'AskAnthropology','AskComputerScience','AskElectronics','AskEngineers','AskHR','AskHistorians','AskMen','AskPhysics','AskReddit','AskScienceDiscussion',
         'AskScienceFiction','AskSocialScience','AskWomen','Ask_Politics','Bash','BehavioralEconomics','BigDataJobs','BipolarReddit','CAD','C_Programming',
         'ComputerScience','Confession','CoverTheWorld','Cplusplus','CppForbeginners','CrappyDesign','CrazyIdeas','DIY','DIYCompSci','DailyProgrammer','DeadBedrooms',
         'DebateReligion','DecidingToBeBetter','DigitalNomad','DoesNotTranslate','ECE','Economics','EngineeringStudents','Entrepreneur','ExNoContact','FEA','FE_Exam',
         'Feminism','FluidMechanics','Foodforthought','FoundWords','Freethought','GetMotivated','GetStudying','GraphicsProgramming','HITsWorthTurkingFor','HTMLBattles',
         'HomeworkHelp','HowsYourJob','IAmA','IOPsychology','InternetIsBeautiful','LaTeX','LanguageLearning','LearnANewLanguage','LearnJava','LearnJavaScript',
         'LifeProTips','LinguisticsHumor','LongDistance','MachineLearning','Manufacturing','MathHelp','Meditation','NetworkingJobs','Neuropsychology','NoStupidQuestions',
         'ObjectiveC','PCMasterRace','PLC','PhilosophyofScience','PhsychologicalTricks','PoliticalDiscussion','Polyamory','PrintedCircuitBoard','Progether',
         'ProgrammerHumor','Proofreading','Python','RapeCounseling','RetailManagement','STEMdents','SWORDS','SWResources','SampleSize','SanctionedSuicide','Seduction',
         'SiblingSupport','Statistics','SuicideWatch','Swift','SysadminJobs','TechNews','ThermalPerformance','Tinder','TinyCode','TowerOfBabel','TrueAskReddit',
         'TrueReddit','Unix','VentureBiotech','WeMetOnline','Web_Development','WhatsTheWord','YoungJobs','academicpsychology','academicpublishing','accounting','advice',
         'androiddev','translator','answers','asklinguistics','askmath','askphotography','askreddit','askscience','assistance','astronomy','audiology','autism','badcode',
         'badlinguistics','beermoney','behavioralmedicine','behaviortherapy','bestof','bestofTLDR','bioengineering','biology','biotech','bodybuilding','bookquotes',
         'books','breadboard','bugs','buildapc','business','careerguidance','cfd','changemyview','chemicalengineering','chipdesign','civilengineering','cloudcomputing',
         'coding','coffeescript','cogneuro','cogneurocogsci','cognitivelinguistics','cogsci','compilers','complexsystems','compling','compression','compsci',
         'computerforensics','computers','computerscience','conlangs','conspiracy','construction','cosmology','coursearea','cpp','cpp_questions','crypto','cryptography',
         'cs50','csbooks','cscareerquestions','csharp','css','dae','dailyprogrammer','dailyscripts','darkinternet','dataisbeautiful','datamining','dementia','depression',
         'diy','documentaries','dotnet','downsyndrome','dyslexia','economics','education','eebooks','electricalengineering','electronics','engineering',
         'engineeringtechnology','entrepreneur','epidemiology','etymology','eurodiversity','everythingscience','evolution','evopsych','explainlikeimfive','favors',
         'finance','financialindependence','findareddit','forhire','forth','freelance','freelanceUK','freelanceWriters','funny','gadgets','genetics','getdisciplined',
         'getemployed','getmotivated','getting_over_it','goldredditsays','grammar','grammarwriting','graphic_design','hacking','hardware','history','holdmybeer',
         'homeworkhelp','html','htmlbasics','humanism','hwstartups','hypotheticalsituation','iWantToLearn','ideasfortheadmins','illegaltorrents','improvevocab','india',
         'ineedafavor','intel','intelligence','interview','inventions','iwantoutjobs','java','javaTIL','javacodegeeks','javahelp','javascript','jobbit','jobsearchhacks',
         'jokes','jquery','languagetechnology','learnjava','learnjavascript','learnmath','learnprogramming','learnpython','lectures','lifehacks','linguistics','linux',
         'linux4noobs','linuxquestions','literature','logic','machinelearning','marketing','masculism','math','mathbooks','mathematics','mathpsych','matlab',
         'mechanicalengineering','medicine','meditation','mentalhealth','mentors','metalworking','microsoft','mmfb','motivation','movies','music','mysql','needadvice',
         'networking','neuro','neurodiversity','neurophilosophy','neuropsychology','newproducts','news','newtoreddit','nonprofit_jobs','nootropics','obvious',
         'occupationaltherapy','ocd','offmychest','opengl','osdev','parkrangers','perl','philosophy','philosophyofScience','philosophyofscience','php','physics','pics',
         'politics','privacy','product_design','productivity','programbattles','programming','programmingbuddies','programmingchallenges','psychiatry','psychology',
         'psychopharmacology','psychotherapy','psychscience','puzzles','python','quotes','rage','rational','reasonstolive','rehabtherapy','relationship_advice',
         'relationships','resumes','riddles','robotics','ruby','saneorpsycho','schizophrenia','science','scientificresearch','self','selfhelp','selfimprovement','sex',
         'shittyaskscience','shittyideas','shittyprogramming','showerthoughts','simpleliving','slp','socialism','socialmedia','socialskills','sociology','software',
         'softwarearchitecture','softwaredevelopment','softwaregore','solotravel','space','specialed','startups','stopselfharm','suicidology','sysadmin','systems',
         'talesfromtechsupport','technology','techsupport','teenagers','testimonials','themixednuts','thisismyjob','tipofmytongue','todayilearned','tr',
         'translationstudies','travel','tutor','ultralight','undelete','undeleteShadow','undergraduateresearch','uniqueminds','visualbasic','web_programming','webdev',
         'whatisthis','whatstheword','windows','windowsazure','womenEngineers','words','work','workonline','worldnews','writingprompts']
    # Regular list, scraped every day:
    # b = ['AskAnthropology','AskScienceDiscussion',
    #      'AskScienceFiction','AskSocialScience','Ask_Politics','ECE','Economics',
    #      'Freethought',
    #      'GetMotivated','GetStudying','GraphicsProgramming','Neuropsychology','NoStupidQuestions','PhsychologicalTricks',
    #      'PoliticalDiscussion','Web_Development','badcode',
    #      'biology','books','bugs','buildapc','compilers',
    #      'computers','computerscience','crypto',
    #      'cryptography','cs50','csbooks','cscareerquestions','dailyprogrammer',
    #      'dailyscripts','electronics','explainlikeimfive','grammar','hacking',
    #      'history','linux',
    #      'linux4noobs','linuxquestions','logic','mysql','networking',
    #      'opengl','philosophy','philosophyofScience','politics',
    #      'productivity','programmingchallenges',
    #      'shittyaskscience','shittyideas','shittyprogramming','showerthoughts',
    #      'socialism','socialskills','software']
    # Basic list, scraped at long intervals:
    b = ['AskAnthropology','AskScienceDiscussion',
         'AskSocialScience',
         'Ask_Politics','badcode','biology','compilers',
         'computers','computerscience','crypto',
         'cryptography','cs50','csbooks',
         'cscareerquestions','dailyprogrammer','electronics','history',
         'linux','linux4noobs','linuxquestions','logic','psychology']
    b = sorted(set(b))  # drop duplicates, keep alphabetical order
    subs(b)
if __name__ == '__main__':
    main()