jamaljeantobias · December 26, 2024 23:31 · currentsound · Feb 3, 2021 · currentsound · Feb 4, 2021
diff --git a/SoundcloudEmailScraper.py b/SoundcloudEmailScraper.py
 from selenium import webdriver
 from time import sleep
 from bs4 import BeautifulSoup
 from random import randint
 import requests
 import re
 import csv

 class SoundCloudEmail(object):
    """Collect e-mail addresses from the profiles listed on a SoundCloud page.

    Constructing an instance runs the whole pipeline: open the target page in
    Chrome, scroll until the page stops growing, save the page source to
    ``Html_source``, fetch every profile linked from a ``badgeList__item``
    entry, and write the first e-mail-looking string found on each profile to
    ``Output_csv`` (one address per row).

    Args:
        Target_get: Path appended to ``https://soundcloud.com``, for example
            ``generichouse-1/following``. A leading slash is optional.
        Html_source: File the scrolled page source is cached in.
        Driver_path: Location of the chromedriver executable.
        Output_csv: File the collected addresses are written to.
    """

    # Same matches as the original '[w\w.-]+@[\w\.-]+'; the extra 'w' and the
    # escaped dot inside the character classes were redundant.
    _EMAIL_RE = re.compile(r'[\w.-]+@[\w.-]+')

    def __init__(self,
                 Target_get='<Enter soundcloud Username>: ex: generichouse-1/following',
                 Html_source='Cache_email_scraper.html',
                 Driver_path='/Users/jamaljean-tobias/Downloads/chromedriver',
                 Output_csv='test.csv'):
        self.Base_url = 'https://soundcloud.com'
        self.Target_get = Target_get
        self.Html_source = Html_source
        self.Output_csv = Output_csv
        self.headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
        # NOTE(review): passing the driver path positionally is the older
        # Selenium calling convention -- confirm against the installed version.
        self.driver = webdriver.Chrome(Driver_path)
        try:
            self.driver.get(self._join_url(self.Base_url, Target_get))
            sleep(3)  # give the first batch of entries time to render
            self.infinite_scroller()
            self.data_html_text()
            self.email_parser()
        finally:
            # Always release the browser and its driver process, even when a
            # step above raises.
            self.driver.quit()

    @staticmethod
    def _join_url(base, path):
        """Join *base* and *path* with exactly one slash between them."""
        return base.rstrip('/') + '/' + path.lstrip('/')

    @staticmethod
    def _clean_emails(emails):
        """Strip trailing dots and drop duplicates, keeping first-seen order."""
        # Strip before de-duplicating so 'a@b.com.' and 'a@b.com' collapse.
        return list(dict.fromkeys(email.rstrip('.') for email in emails))

    def infinite_scroller(self):
        """Scroll to the bottom repeatedly until the page height stops changing."""
        count = 0

        while True:
            Lastheight = self.driver.execute_script('return document.body.scrollHeight')
            self.driver.execute_script("window.scrollTo(0,document.body.scrollHeight);")
            count += 1
            # Randomised pause so newly requested entries can load.
            sleep(randint(5, 8))
            newHeight = self.driver.execute_script('return document.body.scrollHeight')
            print(newHeight, Lastheight)
            # count*24: presumably 24 users are loaded per scroll -- TODO confirm.
            print("{}: Scroller Height {} Last Scroller Height {}: Number of users loaded to cache : {}".format(count, newHeight, Lastheight, count * 24))
            if newHeight == Lastheight:
                break

    def data_html_text(self):
        """Write the driver's current page source to ``self.Html_source``."""
        print("Downloading Page Source of {} to disk......".format(self._join_url(self.Base_url, self.Target_get)))
        soundcloud_page_source = self.driver.page_source
        # Explicit encoding: the platform default may not cover every
        # character in the page.
        with open(self.Html_source, 'w', encoding='utf-8') as file:
            file.write(soundcloud_page_source)

    def email_parser(self):
        """Fetch each cached profile link and save the e-mails found to CSV.

        Reads ``self.Html_source``, builds a profile URL from every
        ``li.badgeList__item`` entry, downloads each profile and keeps the
        first regex match per profile. The cleaned, de-duplicated addresses
        are printed and written to ``self.Output_csv``.
        """
        with open(self.Html_source, encoding='utf-8') as file:
            soup = BeautifulSoup(file.read(), 'lxml')
        all_items = soup.find_all('li', class_='badgeList__item')
        scrapper_links = [self._join_url(self.Base_url, item.div.div.a['href'])
                          for item in all_items]

        for number, link in enumerate(scrapper_links, 1):
            print("{} ------> {}".format(number, link))

        count = 0
        found = []
        for s_link in scrapper_links:
            try:
                user_page = requests.get(s_link, headers=self.headers, timeout=30)
            except requests.RequestException as error:
                # One unreachable profile should not discard what was
                # collected so far.
                print("Skipping {}: {}".format(s_link, error))
                continue
            # errors='replace' keeps a page with stray non-UTF-8 bytes from
            # aborting the run.
            inner_page = user_page.content.decode('utf-8', errors='replace')
            all_emails = self._EMAIL_RE.findall(inner_page)
            if all_emails:
                count += 1
                print("{} Scraping Emails: {}".format(count, all_emails[0]))
                found.append(all_emails[0])

        # Works for an empty list too, so a run with no matches still
        # produces an (empty) CSV instead of failing.
        emails = self._clean_emails(found)
        print(emails)

        with open(self.Output_csv, "w") as output:
            writer = csv.writer(output, lineterminator='\n')
            writer.writerows([email] for email in emails)


 if __name__ == '__main__':
    # Run the scraper with its default arguments. The instance gets its own
    # name so the SoundCloudEmail class is not shadowed.
    scraper = SoundCloudEmail()
	from selenium import webdriver
	from time import sleep
	from bs4 import BeautifulSoup
	from random import randint
	import requests
	import re
	import csv

	class SoundCloudEmail(object):
		"""Collect e-mail addresses from the profiles listed on a SoundCloud page.

		Constructing an instance runs the whole pipeline: open the target page in
		Chrome, scroll until the page stops growing, save the page source to
		``Html_source``, fetch every profile linked from a ``badgeList__item``
		entry, and write the first e-mail-looking string found on each profile to
		``Output_csv`` (one address per row).

		Args:
			Target_get: Path appended to ``https://soundcloud.com``, for example
				``generichouse-1/following``. A leading slash is optional.
			Html_source: File the scrolled page source is cached in.
			Driver_path: Location of the chromedriver executable.
			Output_csv: File the collected addresses are written to.
		"""

		# Same matches as the original '[w\w.-]+@[\w\.-]+'; the extra 'w' and the
		# escaped dot inside the character classes were redundant.
		_EMAIL_RE = re.compile(r'[\w.-]+@[\w.-]+')

		def __init__(self,
				Target_get='<Enter soundcloud Username>: ex: generichouse-1/following',
				Html_source='Cache_email_scraper.html',
				Driver_path='/Users/jamaljean-tobias/Downloads/chromedriver',
				Output_csv='test.csv'):
			self.Base_url = 'https://soundcloud.com'
			self.Target_get = Target_get
			self.Html_source = Html_source
			self.Output_csv = Output_csv
			self.headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
			# NOTE(review): passing the driver path positionally is the older
			# Selenium calling convention -- confirm against the installed version.
			self.driver = webdriver.Chrome(Driver_path)
			try:
				self.driver.get(self._join_url(self.Base_url, Target_get))
				sleep(3)  # give the first batch of entries time to render
				self.infinite_scroller()
				self.data_html_text()
				self.email_parser()
			finally:
				# Always release the browser and its driver process, even when a
				# step above raises.
				self.driver.quit()

		@staticmethod
		def _join_url(base, path):
			"""Join *base* and *path* with exactly one slash between them."""
			return base.rstrip('/') + '/' + path.lstrip('/')

		@staticmethod
		def _clean_emails(emails):
			"""Strip trailing dots and drop duplicates, keeping first-seen order."""
			# Strip before de-duplicating so 'a@b.com.' and 'a@b.com' collapse.
			return list(dict.fromkeys(email.rstrip('.') for email in emails))

		def infinite_scroller(self):
			"""Scroll to the bottom repeatedly until the page height stops changing."""
			count = 0

			while True:
				Lastheight = self.driver.execute_script('return document.body.scrollHeight')
				self.driver.execute_script("window.scrollTo(0,document.body.scrollHeight);")
				count += 1
				# Randomised pause so newly requested entries can load.
				sleep(randint(5, 8))
				newHeight = self.driver.execute_script('return document.body.scrollHeight')
				print(newHeight, Lastheight)
				# count*24: presumably 24 users are loaded per scroll -- TODO confirm.
				print("{}: Scroller Height {} Last Scroller Height {}: Number of users loaded to cache : {}".format(count, newHeight, Lastheight, count * 24))
				if newHeight == Lastheight:
					break

		def data_html_text(self):
			"""Write the driver's current page source to ``self.Html_source``."""
			print("Downloading Page Source of {} to disk......".format(self._join_url(self.Base_url, self.Target_get)))
			soundcloud_page_source = self.driver.page_source
			# Explicit encoding: the platform default may not cover every
			# character in the page.
			with open(self.Html_source, 'w', encoding='utf-8') as file:
				file.write(soundcloud_page_source)

		def email_parser(self):
			"""Fetch each cached profile link and save the e-mails found to CSV.

			Reads ``self.Html_source``, builds a profile URL from every
			``li.badgeList__item`` entry, downloads each profile and keeps the
			first regex match per profile. The cleaned, de-duplicated addresses
			are printed and written to ``self.Output_csv``.
			"""
			with open(self.Html_source, encoding='utf-8') as file:
				soup = BeautifulSoup(file.read(), 'lxml')
			all_items = soup.find_all('li', class_='badgeList__item')
			scrapper_links = [self._join_url(self.Base_url, item.div.div.a['href'])
					for item in all_items]

			for number, link in enumerate(scrapper_links, 1):
				print("{} ------> {}".format(number, link))

			count = 0
			found = []
			for s_link in scrapper_links:
				try:
					user_page = requests.get(s_link, headers=self.headers, timeout=30)
				except requests.RequestException as error:
					# One unreachable profile should not discard what was
					# collected so far.
					print("Skipping {}: {}".format(s_link, error))
					continue
				# errors='replace' keeps a page with stray non-UTF-8 bytes from
				# aborting the run.
				inner_page = user_page.content.decode('utf-8', errors='replace')
				all_emails = self._EMAIL_RE.findall(inner_page)
				if all_emails:
					count += 1
					print("{} Scraping Emails: {}".format(count, all_emails[0]))
					found.append(all_emails[0])

			# Works for an empty list too, so a run with no matches still
			# produces an (empty) CSV instead of failing.
			emails = self._clean_emails(found)
			print(emails)

			with open(self.Output_csv, "w") as output:
				writer = csv.writer(output, lineterminator='\n')
				writer.writerows([email] for email in emails)


	if __name__ == '__main__':
		# Run the scraper with its default arguments. The instance gets its own
		# name so the SoundCloudEmail class is not shadowed.
		scraper = SoundCloudEmail()