Last active
December 26, 2024 23:31
Revisions
-
jamaljeantobias revised this gist
Mar 8, 2017 — 1 changed file with 1 addition and 1 deletion. There are no files selected for viewing.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -1,4 +1,4 @@ from selenium import webdriver from time import sleep from bs4 import BeautifulSoup from random import randint -
jamaljeantobias revised this gist
Mar 8, 2017 — No changes. There are no files selected for viewing.
-
jamaljeantobias created this gist
Mar 8, 2017. There are no files selected for viewing.
from selenium import webdriver  # fixed: original created-gist had "rom selenium" (missing the "f")
from time import sleep
from bs4 import BeautifulSoup
from random import randint
import requests
import re
import csv


class SoundCloudEmail(object):
    """Scrape e-mail addresses from the profile pages a SoundCloud user follows.

    Workflow (all driven from ``__init__``, which is deliberately
    side-effect heavy — constructing the object runs the whole scrape):

      1. open the target "following" page in Chrome via Selenium,
      2. scroll until the infinite list stops growing,
      3. dump the rendered page source to disk,
      4. parse the cached HTML for profile links and harvest e-mails
         from each linked profile, writing the results to ``test.csv``.
    """

    # Loose e-mail matcher.  The original pattern was r'[w\w.-]+@[\w\.-]+':
    # the literal "w" inside the first class is redundant (already covered
    # by \w) and "\." needs no escape inside a character class — this
    # pattern matches exactly the same strings.
    EMAIL_RE = re.compile(r'[\w.-]+@[\w.-]+')

    def __init__(self,
                 Target_get='<Enter soundcloud Username>: ex: generichouse-1/following',
                 Html_source='Cache_email_scraper.html',
                 driver_path='/Users/jamaljean-tobias/Downloads/chromedriver'):
        """
        :param Target_get: path portion of the SoundCloud URL to crawl,
            e.g. ``/generichouse-1/following``.
        :param Html_source: file name the rendered page source is cached to.
        :param driver_path: location of the chromedriver binary.  Was
            hard-coded in the original; now a parameter whose default is
            the original value, so existing callers are unaffected.
        """
        self.Base_url = 'https://soundcloud.com'
        self.Target_get = Target_get
        self.Html_source = Html_source
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/39.0.2171.95 Safari/537.36'
        }
        self.driver = webdriver.Chrome(driver_path)
        self.driver.get(self.Base_url + Target_get)
        sleep(3)  # let the initial page render before scrolling
        try:
            self.infinite_scroller()
            self.data_html_text()
            self.email_parser()
        finally:
            # Always release the browser, even if scraping raises
            # (the original leaked the Chrome process on any error).
            self.driver.close()

    def infinite_scroller(self):
        """Scroll to the bottom of the page until its height stops growing.

        SoundCloud lazy-loads roughly 24 users per scroll, hence the
        ``count * 24`` progress estimate.  The original re-assigned a dead
        ``lastHeight`` variable (wrong capitalisation vs ``Lastheight``);
        the height is now simply re-read at the top of each pass, which is
        what the original effectively did anyway.
        """
        count = 0
        while True:
            last_height = self.driver.execute_script('return document.body.scrollHeight')
            self.driver.execute_script("window.scrollTo(0,document.body.scrollHeight);")
            count += 1
            sleep(randint(5, 8))  # randomized delay to look less bot-like
            new_height = self.driver.execute_script('return document.body.scrollHeight')
            print(new_height, last_height)
            print("{}: Scroller Height {} Last Scroller Height {}: Number of users loaded to cache : {}".format(
                count, new_height, last_height, count * 24))
            if new_height == last_height:
                break

    def data_html_text(self):
        """Cache the fully rendered page source to ``self.Html_source``."""
        print("Downloading Page Source of {} to disk......".format(self.Base_url + self.Target_get))
        soundcloud_page_source = self.driver.page_source
        with open(self.Html_source, 'w') as file:
            file.write(soundcloud_page_source)

    def email_parser(self):
        """Extract profile links from the cached HTML and harvest e-mails.

        Only the first e-mail found on each profile page is kept; results
        are de-duplicated, stripped of one trailing ``.`` (the loose regex
        can swallow sentence-ending punctuation), printed, and written one
        per row to ``test.csv``.
        """
        # Original opened this file and never closed it.
        with open(self.Html_source) as file:
            soup = BeautifulSoup(file.read(), 'lxml')

        all_divs = soup.find_all('li', class_='badgeList__item')
        scrapper_links = [self.Base_url + a_href.div.div.a['href'] for a_href in all_divs]
        for count, link in enumerate(scrapper_links, 1):
            print("{} ------> {}".format(count, link))

        data = []
        count = 0
        for s_link in scrapper_links:
            user_page = requests.get(s_link, headers=self.headers)
            inner_page = user_page.content.decode()
            all_emails = self.EMAIL_RE.findall(inner_page)
            if all_emails:
                count += 1
                print("{} Scraping Emails: {}".format(count, all_emails[0]))
                data.append(all_emails[0])

        # De-duplicate, then strip a single trailing dot from each address.
        data1 = [re.sub(r'[.]$', '', x) for x in set(data)]
        print(data1)
        with open('test.csv', "w") as output:
            writer = csv.writer(output, lineterminator='\n')
            for val in data1:
                writer.writerow([val])


if __name__ == '__main__':
    # The original re-bound the class name to the instance
    # (SoundCloudEmail = SoundCloudEmail()); use a distinct name so the
    # class remains referable after construction.
    scraper = SoundCloudEmail()