Skip to content

Instantly share code, notes, and snippets.

@jamaljeantobias
Last active December 26, 2024 23:31
Show Gist options
  • Save jamaljeantobias/cef4c58593c7ac1896e6aa51399ce604 to your computer and use it in GitHub Desktop.
Save jamaljeantobias/cef4c58593c7ac1896e6aa51399ce604 to your computer and use it in GitHub Desktop.
Scrape emails into a CSV from a SoundCloud user's followers and following lists
from selenium import webdriver
from time import sleep
from bs4 import BeautifulSoup
from random import randint
import requests
import re
import csv
class SoundCloudEmail(object):
    """Scrape e-mail addresses from a SoundCloud followers/following page.

    Instantiating the class runs the whole pipeline: open the target page in
    Chrome through Selenium, scroll until the page stops growing, cache the
    page source on disk, visit every listed profile with ``requests`` and
    write the first e-mail address found on each profile to ``test.csv``.
    """

    # chromedriver location tried first when no ``Driver_path`` is supplied.
    DEFAULT_DRIVER_PATH = '/Users/jamaljean-tobias/Downloads/chromedriver'
    # Loose e-mail matcher: word characters, dots and dashes around an "@".
    EMAIL_PATTERN = re.compile(r'[\w.-]+@[\w.-]+')
    # Seconds to wait for a single profile page before giving up on it.
    REQUEST_TIMEOUT = 30

    def __init__(self, Target_get='<Enter soundcloud Username>: ex: generichouse-1/following', Html_source='Cache_email_scraper.html', Driver_path=None):
        """Run the scrape.

        Args:
            Target_get: path below ``https://soundcloud.com`` to scrape, e.g.
                ``generichouse-1/following``; a leading slash is optional.
            Html_source: file the scrolled page source is cached in.
            Driver_path: optional path to the chromedriver executable. When
                omitted, ``DEFAULT_DRIVER_PATH`` is used if that file exists,
                otherwise Selenium is left to locate the driver itself.
        """
        self.Base_url = 'https://soundcloud.com'
        self.Target_get = Target_get
        self.Html_source = Html_source
        self.headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
        # Join with exactly one slash so 'user/following' and
        # '/user/following' both produce a valid URL.
        self.Target_url = self._join_url(self.Base_url, Target_get)
        self.driver = self._start_driver(Driver_path)
        try:
            self.driver.get(self.Target_url)
            sleep(3)
            self.infinite_scroller()
            self.data_html_text()
        finally:
            # Always end the browser session, even when a step above fails.
            self.driver.quit()
        # Parsing reads the cached HTML file, so the browser is not needed.
        self.email_parser()

    @staticmethod
    def _join_url(base, path):
        """Return ``base`` and ``path`` joined by exactly one ``/``."""
        return base.rstrip('/') + '/' + path.lstrip('/')

    @classmethod
    def _first_email(cls, text):
        """Return the first e-mail-like token in ``text``, or ``None``."""
        match = cls.EMAIL_PATTERN.search(text)
        return match.group(0) if match else None

    @staticmethod
    def _clean_emails(emails):
        """Strip trailing dots, then drop duplicates keeping first-seen order."""
        # Strip before de-duplicating so 'a@b.com.' and 'a@b.com' collapse.
        return list(dict.fromkeys(email.rstrip('.') for email in emails))

    def _start_driver(self, driver_path=None):
        """Create the Chrome WebDriver used for scrolling the target page."""
        import os.path

        if driver_path:
            # An explicit path is honoured as given; Selenium reports errors.
            # NOTE(review): newer Selenium releases may expect a Service
            # object instead of a positional path - confirm for your version.
            return webdriver.Chrome(driver_path)
        if os.path.isfile(self.DEFAULT_DRIVER_PATH):
            return webdriver.Chrome(self.DEFAULT_DRIVER_PATH)
        # No usable path known: let Selenium look the driver up itself.
        return webdriver.Chrome()

    def infinite_scroller(self):
        """Scroll to the bottom repeatedly until the page height stops changing."""
        count = 0
        while True:
            last_height = self.driver.execute_script('return document.body.scrollHeight')
            self.driver.execute_script("window.scrollTo(0,document.body.scrollHeight);")
            count += 1
            # Random pause so newly requested entries have time to load.
            sleep(randint(5, 8))
            new_height = self.driver.execute_script('return document.body.scrollHeight')
            print(new_height, last_height)
            # count*24 is a progress estimate; presumably 24 users load per
            # scroll - TODO confirm against the live page.
            print("{}: Scroller Height {} Last Scroller Height {}: Number of users loaded to cache : {}".format(count, new_height, last_height, count * 24))
            if new_height == last_height:
                break

    def data_html_text(self):
        """Write the current page source of the driver to ``self.Html_source``."""
        print("Downloading Page Source of {} to disk......".format(self.Target_url))
        soundcloud_page_source = self.driver.page_source
        # Explicit UTF-8 so the cache does not depend on the platform default.
        with open(self.Html_source, 'w', encoding='utf-8') as file:
            file.write(soundcloud_page_source)

    def email_parser(self):
        """Collect one e-mail per listed profile and write them to ``test.csv``."""
        with open(self.Html_source, encoding='utf-8') as file:
            soup = BeautifulSoup(file.read(), 'lxml')
        all_divs = soup.find_all('li', class_='badgeList__item')
        # Assumes each list item nests its profile link as div > div > a.
        scrapper_links = [self._join_url(self.Base_url, item.div.div.a['href']) for item in all_divs]
        for count, link in enumerate(scrapper_links, 1):
            print("{} ------> {}".format(count, link))

        count = 0
        data = []
        for s_link in scrapper_links:
            try:
                user_page = requests.get(s_link, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
            except requests.RequestException as err:
                # One unreachable profile should not discard the whole run.
                print("Skipping {}: {}".format(s_link, err))
                continue
            # Replace undecodable bytes instead of aborting on a bad page.
            inner_page = user_page.content.decode('utf-8', errors='replace')
            email = self._first_email(inner_page)
            if email:
                count += 1
                print("{} Scraping Emails: {}".format(count, email))
                data.append(email)

        data1 = self._clean_emails(data)
        print(data1)
        # newline='' is what the csv module expects of files it writes to.
        with open('test.csv', "w", newline='', encoding='utf-8') as output:
            writer = csv.writer(output, lineterminator='\n')
            writer.writerows([val] for val in data1)
if __name__ == '__main__':
    # Script entry point: constructing the object runs the whole scrape.
    # A distinct variable name keeps the class from being shadowed by its
    # own instance.
    scraper = SoundCloudEmail()
@currentsound
Copy link

currentsound commented Feb 3, 2021

Hi, I get this error in Windows 10. Can you please help troubleshoot?

C:\Users\mail\emailscraper>python SoundcloudEmailScraper.py
Traceback (most recent call last):
File "C:\Users\mail\AppData\Local\Programs\Python\Python38\lib\site-packages\selenium\webdriver\common\service.py", line 72, in start
self.process = subprocess.Popen(cmd, env=self.env,
File "C:\Users\mail\AppData\Local\Programs\Python\Python38\lib\subprocess.py", line 854, in __init__
self._execute_child(args, executable, preexec_fn, close_fds,
File "C:\Users\mail\AppData\Local\Programs\Python\Python38\lib\subprocess.py", line 1307, in _execute_child
hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
FileNotFoundError: [WinError 2] The system cannot find the file specified

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
File "SoundcloudEmailScraper.py", line 90, in <module>
SoundCloudEmail = SoundCloudEmail()
File "SoundcloudEmailScraper.py", line 17, in __init__
self.driver = webdriver.Chrome('/Users/jamaljean-tobias/Downloads/chromedriver')
File "C:\Users\mail\AppData\Local\Programs\Python\Python38\lib\site-packages\selenium\webdriver\chrome\webdriver.py", line 73, in __init__
self.service.start()
File "C:\Users\mail\AppData\Local\Programs\Python\Python38\lib\site-packages\selenium\webdriver\common\service.py", line 81, in start
raise WebDriverException(
selenium.common.exceptions.WebDriverException: Message: 'chromedriver' executable needs to be in PATH. Please see https://sites.google.com/a/chromium.org/chromedriver/home

@currentsound
Copy link

Lol. OK, but people would like to know how to use your code. Could you just tell us what needs to be installed to use it? I installed chromedriver from that link, but the error still persists, so this is not straightforward.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment