Last active
December 26, 2024 23:31
Revisions
-
jamaljeantobias revised this gist
Mar 8, 2017 — 1 changed file with 1 addition and 1 deletion. There are no files selected for viewing.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -1,4 +1,4 @@ from selenium import webdriver from time import sleep from bs4 import BeautifulSoup from random import randint -
jamaljeantobias revised this gist
Mar 8, 2017 — No changes. There are no files selected for viewing.
-
jamaljeantobias created this gist
Mar 8, 2017. There are no files selected for viewing.
from selenium import webdriver  # fixed: original created-gist had "rom selenium" (missing the "f")
from time import sleep
from bs4 import BeautifulSoup
from random import randint
import requests
import re
import csv


class SoundCloudEmail(object):
    """Scrape e-mail addresses from the profile pages a SoundCloud user follows.

    Workflow (all driven from ``__init__``, which is deliberately
    side-effect heavy — constructing the object runs the whole scrape):

      1. open the target "following" page in Chrome via Selenium,
      2. scroll until the infinite list stops growing,
      3. dump the rendered page source to disk,
      4. parse the cached HTML for profile links and harvest e-mails
         from each linked profile, writing the results to ``test.csv``.
    """

    # Loose e-mail matcher.  The original pattern was r'[w\w.-]+@[\w\.-]+':
    # the literal "w" inside the first class is redundant (already covered
    # by \w) and "\." needs no escape inside a character class — this
    # pattern matches exactly the same strings.
    EMAIL_RE = re.compile(r'[\w.-]+@[\w.-]+')

    def __init__(self,
                 Target_get='<Enter soundcloud Username>: ex: generichouse-1/following',
                 Html_source='Cache_email_scraper.html',
                 driver_path='/Users/jamaljean-tobias/Downloads/chromedriver'):
        """
        :param Target_get: path portion of the SoundCloud URL to crawl,
            e.g. ``/generichouse-1/following``.
        :param Html_source: file name the rendered page source is cached to.
        :param driver_path: location of the chromedriver binary.  Was
            hard-coded in the original; now a parameter whose default is
            the original value, so existing callers are unaffected.
        """
        self.Base_url = 'https://soundcloud.com'
        self.Target_get = Target_get
        self.Html_source = Html_source
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/39.0.2171.95 Safari/537.36'
        }
        self.driver = webdriver.Chrome(driver_path)
        self.driver.get(self.Base_url + Target_get)
        sleep(3)  # let the initial page render before scrolling
        try:
            self.infinite_scroller()
            self.data_html_text()
            self.email_parser()
        finally:
            # Always release the browser, even if scraping raises
            # (the original leaked the Chrome process on any error).
            self.driver.close()

    def infinite_scroller(self):
        """Scroll to the bottom of the page until its height stops growing.

        SoundCloud lazy-loads roughly 24 users per scroll, hence the
        ``count * 24`` progress estimate.  The original re-assigned a dead
        ``lastHeight`` variable (wrong capitalisation vs ``Lastheight``);
        the height is now simply re-read at the top of each pass, which is
        what the original effectively did anyway.
        """
        count = 0
        while True:
            last_height = self.driver.execute_script('return document.body.scrollHeight')
            self.driver.execute_script("window.scrollTo(0,document.body.scrollHeight);")
            count += 1
            sleep(randint(5, 8))  # randomized delay to look less bot-like
            new_height = self.driver.execute_script('return document.body.scrollHeight')
            print(new_height, last_height)
            print("{}: Scroller Height {} Last Scroller Height {}: Number of users loaded to cache : {}".format(
                count, new_height, last_height, count * 24))
            if new_height == last_height:
                break

    def data_html_text(self):
        """Cache the fully rendered page source to ``self.Html_source``."""
        print("Downloading Page Source of {} to disk......".format(self.Base_url + self.Target_get))
        soundcloud_page_source = self.driver.page_source
        with open(self.Html_source, 'w') as file:
            file.write(soundcloud_page_source)

    def email_parser(self):
        """Extract profile links from the cached HTML and harvest e-mails.

        Only the first e-mail found on each profile page is kept; results
        are de-duplicated, stripped of one trailing ``.`` (the loose regex
        can swallow sentence-ending punctuation), printed, and written one
        per row to ``test.csv``.
        """
        # Original opened this file and never closed it.
        with open(self.Html_source) as file:
            soup = BeautifulSoup(file.read(), 'lxml')

        all_divs = soup.find_all('li', class_='badgeList__item')
        scrapper_links = [self.Base_url + a_href.div.div.a['href'] for a_href in all_divs]
        for count, link in enumerate(scrapper_links, 1):
            print("{} ------> {}".format(count, link))

        data = []
        count = 0
        for s_link in scrapper_links:
            user_page = requests.get(s_link, headers=self.headers)
            inner_page = user_page.content.decode()
            all_emails = self.EMAIL_RE.findall(inner_page)
            if all_emails:
                count += 1
                print("{} Scraping Emails: {}".format(count, all_emails[0]))
                data.append(all_emails[0])

        # De-duplicate, then strip a single trailing dot from each address.
        data1 = [re.sub(r'[.]$', '', x) for x in set(data)]
        print(data1)
        with open('test.csv', "w") as output:
            writer = csv.writer(output, lineterminator='\n')
            for val in data1:
                writer.writerow([val])


if __name__ == '__main__':
    # The original re-bound the class name to the instance
    # (SoundCloudEmail = SoundCloudEmail()); use a distinct name so the
    # class remains referable after construction.
    scraper = SoundCloudEmail()