Scrape emails into a CSV from a SoundCloud user's followers and following
from selenium import webdriver
from time import sleep
from bs4 import BeautifulSoup
from random import randint
import requests
import re
import csv
class SoundCloudEmail(object):
    def __init__(self, target_get='/generichouse-1/following', html_source='Cache_email_scraper.html'):
        # target_get is the path to a user's followers or following page,
        # e.g. '/generichouse-1/following' or '/generichouse-1/followers'.
        self.base_url = 'https://soundcloud.com'
        self.target_get = target_get
        self.html_source = html_source
        self.headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
        # Selenium 3 style: pass the path to your own ChromeDriver binary.
        self.driver = webdriver.Chrome('/Users/jamaljean-tobias/Downloads/chromedriver')
        self.driver.get(self.base_url + self.target_get)
        sleep(3)
        self.infinite_scroller()
        self.data_html_text()
        self.email_parser()
        self.driver.close()
    def infinite_scroller(self):
        # Scroll until the page height stops growing, so the full
        # followers/following list is loaded (each scroll loads ~24 users).
        count = 0
        last_height = self.driver.execute_script('return document.body.scrollHeight')
        while True:
            self.driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
            count += 1
            sleep(randint(5, 8))  # random delay between scrolls
            new_height = self.driver.execute_script('return document.body.scrollHeight')
            print("{}: Scroller height {} Last scroller height {}: Number of users loaded to cache: {}".format(count, new_height, last_height, count * 24))
            if new_height == last_height:
                break
            last_height = new_height
    def data_html_text(self):
        # Save the fully scrolled page source to disk.
        print("Downloading page source of {} to disk...".format(self.base_url + self.target_get))
        with open(self.html_source, 'w') as file:
            file.write(self.driver.page_source)
    def email_parser(self):
        # Pull each profile link out of the cached list page, fetch the
        # profile, and keep the first email-like string found on it.
        with open(self.html_source) as file:
            soup = BeautifulSoup(file.read(), 'lxml')
        all_items = soup.find_all('li', class_='badgeList__item')
        scraped_links = [self.base_url + item.div.div.a['href'] for item in all_items]
        for count, link in enumerate(scraped_links, 1):
            print("{} ------> {}".format(count, link))
        count = 0
        data = []
        for s_link in scraped_links:
            user_page = requests.get(s_link, headers=self.headers)
            inner_page = user_page.content.decode()
            all_emails = re.findall(r'[\w.-]+@[\w.-]+', inner_page)
            if all_emails:
                count += 1
                print("{} Scraping emails: {}".format(count, all_emails[0]))
                data.append(all_emails[0])
        # De-duplicate and strip any trailing period the regex picked up.
        emails = [re.sub(r'[.]$', '', x) for x in set(data)]
        print(emails)
        with open('test.csv', 'w') as output:
            writer = csv.writer(output, lineterminator='\n')
            for val in emails:
                writer.writerow([val])
if __name__ == '__main__':
    scraper = SoundCloudEmail()
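A minimal usage sketch, assuming Python 3 with the selenium, beautifulsoup4, lxml, and requests packages installed (e.g. pip install selenium beautifulsoup4 lxml requests) and a ChromeDriver binary that matches your installed Chrome version. The username path below is a placeholder, not a real account:

# Scrape the 'following' list of a hypothetical user; results are written
# to test.csv in the working directory. Adjust the webdriver.Chrome(...)
# path in __init__ to point at your own ChromeDriver first.
scraper = SoundCloudEmail(target_get='/some-user/following',
                          html_source='Cache_email_scraper.html')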
Lol. OK, but people would like to know how to use your code. Can you just tell us what to install to run it? I installed ChromeDriver from that link but the error still persists, so this is not straightforward.