Last active
December 26, 2024 23:31
-
-
Save jamaljeantobias/cef4c58593c7ac1896e6aa51399ce604 to your computer and use it in GitHub Desktop.
Scrape emails into a CSV from a SoundCloud user's followers and following lists
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv
import os
import re
from random import randint
from time import sleep

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
class SoundCloudEmail(object):
    """Scrape e-mail addresses from a SoundCloud followers/following list.

    Instantiating the class runs the whole pipeline:

    1. open ``Base_url`` + ``Target_get`` in Chrome via Selenium,
    2. scroll until the page height stops growing,
    3. save the rendered page source to ``Html_source``,
    4. request every profile linked from a ``li.badgeList__item`` element
       and keep the first e-mail-looking string found on each page,
    5. write the unique addresses to ``test.csv`` (one per row).

    Args:
        Target_get: Path of the list to scrape, relative to soundcloud.com,
            e.g. ``'generichouse-1/following'``. A leading slash is optional.
            The default value is only a placeholder and must be replaced.
        Html_source: File the rendered page source is cached in.
        Driver_path: Optional path to the chromedriver executable. When
            omitted, the legacy hard-coded location is used if that file
            exists; otherwise chromedriver is looked up by Selenium itself
            (i.e. it must be on PATH).

    Raises:
        FileNotFoundError: If ``Driver_path`` is given but is not a file.
    """

    # Original author's chromedriver location; kept for backward compatibility.
    DEFAULT_DRIVER_PATH = '/Users/jamaljean-tobias/Downloads/chromedriver'
    # Loose "something@something" matcher; may capture trailing punctuation.
    EMAIL_PATTERN = re.compile(r'[\w.-]+@[\w.-]+')
    OUTPUT_CSV = 'test.csv'
    # Seconds to wait for a single profile page before giving up on it.
    REQUEST_TIMEOUT = 30

    def __init__(self, Target_get='<Enter soundcloud Username>: ex: generichouse-1/following', Html_source='Cache_email_scraper.html', Driver_path=None):
        self.Base_url = 'https://soundcloud.com'
        self.Target_get = Target_get
        self.Html_source = Html_source
        self.headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
        self.driver = self._start_driver(Driver_path)
        try:
            self.driver.get(self._absolute_url(Target_get))
            sleep(3)  # give the first batch of users time to render
            self.infinite_scroller()
            self.data_html_text()
            self.email_parser()
        finally:
            # quit() (not close()) so the browser and the chromedriver
            # process are released even when a step above raises.
            self.driver.quit()

    def _start_driver(self, driver_path=None):
        """Return a Chrome WebDriver, preferring an explicit chromedriver path."""
        if driver_path is not None:
            if not os.path.isfile(driver_path):
                raise FileNotFoundError(
                    "chromedriver not found at {!r}".format(driver_path))
            # NOTE(review): a positional executable path is the calling
            # convention the original code used; recent Selenium releases
            # may require a Service object instead - confirm against the
            # installed version.
            return webdriver.Chrome(driver_path)
        if os.path.isfile(self.DEFAULT_DRIVER_PATH):
            return webdriver.Chrome(self.DEFAULT_DRIVER_PATH)
        # No usable explicit path: let Selenium locate chromedriver itself.
        return webdriver.Chrome()

    def _absolute_url(self, path):
        """Join ``path`` onto ``Base_url`` with exactly one separating slash."""
        if path.startswith(('http://', 'https://')):
            return path
        return self.Base_url.rstrip('/') + '/' + path.lstrip('/')

    @classmethod
    def _first_email(cls, text):
        """Return the first e-mail-like match in ``text`` or ``None``.

        Trailing dots are removed because the pattern also swallows
        sentence-ending punctuation ("mail me at a@b.com.").
        """
        found = cls.EMAIL_PATTERN.search(text)
        if found is None:
            return None
        return found.group(0).rstrip('.')

    @staticmethod
    def _dedupe(items):
        """Return ``items`` without duplicates, keeping first-seen order."""
        return list(dict.fromkeys(items))

    def infinite_scroller(self):
        """Scroll to the bottom repeatedly until the page stops growing."""
        count = 0
        while True:
            last_height = self.driver.execute_script('return document.body.scrollHeight')
            self.driver.execute_script("window.scrollTo(0,document.body.scrollHeight);")
            count += 1
            # Randomised pause: lets the next batch load between scrolls.
            sleep(randint(5, 8))
            new_height = self.driver.execute_script('return document.body.scrollHeight')
            print(new_height, last_height)
            print("{}: Scroller Height {} Last Scroller Height {}: Number of users loaded to cache : {}".format(count, new_height, last_height, count * 24))
            if new_height == last_height:
                break

    def data_html_text(self):
        """Write the driver's current page source to ``Html_source``."""
        print("Downloading Page Source of {} to disk......".format(self.Base_url + self.Target_get))
        soundcloud_page_source = self.driver.page_source
        # Explicit UTF-8: the platform default codec can fail on page text.
        with open(self.Html_source, 'w', encoding='utf-8') as file:
            file.write(soundcloud_page_source)

    def email_parser(self):
        """Visit every cached profile link and write found e-mails to CSV."""
        with open(self.Html_source, encoding='utf-8') as file:
            soup = BeautifulSoup(file.read(), 'lxml')

        scrapper_links = []
        for item in soup.find_all('li', class_='badgeList__item'):
            try:
                href = item.div.div.a['href']
            except (AttributeError, KeyError, TypeError):
                # Badge without the expected div > div > a[href] nesting.
                continue
            scrapper_links.append(self._absolute_url(href))

        for count, link in enumerate(scrapper_links, start=1):
            print("{} ------> {}".format(count, link))

        count = 0
        data = []
        for s_link in scrapper_links:
            try:
                user_page = requests.get(s_link, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
            except requests.RequestException as err:
                # One unreachable profile should not abort the whole run.
                print("Skipping {}: {}".format(s_link, err))
                continue
            inner_page = user_page.content.decode('utf-8', errors='replace')
            email = self._first_email(inner_page)
            if email:
                count += 1
                print("{} Scraping Emails: {}".format(count, email))
                data.append(email)

        # Addresses are already dot-stripped, so de-duplicating here also
        # merges "a@b.com." with "a@b.com".
        data1 = self._dedupe(data)
        print(data1)
        # newline='' as the csv module expects; lineterminator controls EOLs.
        with open(self.OUTPUT_CSV, "w", newline='', encoding='utf-8') as output:
            writer = csv.writer(output, lineterminator='\n')
            for val in data1:
                writer.writerow([val])
if __name__ == '__main__':
    # Constructing the object runs the whole scrape. Bind it to its own
    # name instead of overwriting the SoundCloudEmail class.
    scraper = SoundCloudEmail()
Lol. OK, but people would like to know how to use your code. Could you just tell us what to install to use it? I installed chromedriver from that link, but the error still persists, so this is not straightforward.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hi, I get this error in Windows 10. Can you please help troubleshoot?
C:\Users\mail\emailscraper>python SoundcloudEmailScraper.py
Traceback (most recent call last):
File "C:\Users\mail\AppData\Local\Programs\Python\Python38\lib\site-packages\selenium\webdriver\common\service.py", line 72, in start
self.process = subprocess.Popen(cmd, env=self.env,
File "C:\Users\mail\AppData\Local\Programs\Python\Python38\lib\subprocess.py", line 854, in __init__
self._execute_child(args, executable, preexec_fn, close_fds,
File "C:\Users\mail\AppData\Local\Programs\Python\Python38\lib\subprocess.py", line 1307, in _execute_child
hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
FileNotFoundError: [WinError 2] The system cannot find the file specified
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "SoundcloudEmailScraper.py", line 90, in <module>
SoundCloudEmail = SoundCloudEmail()
File "SoundcloudEmailScraper.py", line 17, in __init__
self.driver = webdriver.Chrome('/Users/jamaljean-tobias/Downloads/chromedriver')
File "C:\Users\mail\AppData\Local\Programs\Python\Python38\lib\site-packages\selenium\webdriver\chrome\webdriver.py", line 73, in __init__
self.service.start()
File "C:\Users\mail\AppData\Local\Programs\Python\Python38\lib\site-packages\selenium\webdriver\common\service.py", line 81, in start
raise WebDriverException(
selenium.common.exceptions.WebDriverException: Message: 'chromedriver' executable needs to be in PATH. Please see https://sites.google.com/a/chromium.org/chromedriver/home