Skip to content

Instantly share code, notes, and snippets.

@jamaljeantobias
Last active December 26, 2024 23:31

Revisions

  1. jamaljeantobias revised this gist Mar 8, 2017. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion SoundcloudEmailScraper.py
    Original file line number Diff line number Diff line change
    @@ -1,4 +1,4 @@
    rom selenium import webdriver
    from selenium import webdriver
    from time import sleep
    from bs4 import BeautifulSoup
    from random import randint
  2. jamaljeantobias revised this gist Mar 8, 2017. No changes.
  3. jamaljeantobias created this gist Mar 8, 2017.
    90 changes: 90 additions & 0 deletions SoundcloudEmailScraper.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,90 @@
    import csv
    import re
    from random import randint
    from time import sleep

    import requests
    from bs4 import BeautifulSoup
    from selenium import webdriver

    class SoundCloudEmail(object):
        """Scrape e-mail addresses from the profiles listed on a SoundCloud page.

        Constructing an instance runs the whole pipeline: open the target page
        in Chrome, scroll until no more content loads, save the page source to
        disk, visit every profile linked from the saved page, and write the
        first e-mail-like string found on each profile to a CSV file.
        """

        # Candidate addresses: runs of word characters, dots and dashes around "@".
        EMAIL_PATTERN = re.compile(r'[\w.-]+@[\w.-]+')

        # Seconds to wait for a profile page before giving up on it.
        REQUEST_TIMEOUT = 30

        # Multiplier used only for the progress message; the original code
        # assumes 24 profiles per scroll -- an estimate, not a measured value.
        USERS_PER_SCROLL = 24

        def __init__(self, Target_get='<Enter soundcloud Username>: ex: generichouse-1/following',
                     Html_source='Cache_email_scraper.html',
                     driver_path='/Users/jamaljean-tobias/Downloads/chromedriver',
                     output_csv='test.csv'):
            """Run the scraper end to end.

            Args:
                Target_get: Path below the SoundCloud base URL, e.g.
                    ``generichouse-1/following`` (a leading slash is optional).
                Html_source: File the rendered page source is cached in.
                driver_path: Location of the chromedriver executable.
                output_csv: File the collected addresses are written to.
            """
            self.Base_url = 'https://soundcloud.com'
            self.Target_get = Target_get
            self.Html_source = Html_source
            self.driver_path = driver_path
            self.output_csv = output_csv
            self.headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
            # NOTE(review): the positional driver path matches the call the
            # original code made; newer Selenium releases may expect a Service
            # object instead -- confirm against the installed version.
            self.driver = webdriver.Chrome(self.driver_path)
            try:
                self.driver.get(self._target_url())
                sleep(3)  # give the first batch of content time to render
                self.infinite_scroller()
                self.data_html_text()
                self.email_parser()
            finally:
                # Always end the browser session, even when a step above fails.
                self.driver.quit()

        def _target_url(self):
            """Return the page URL, joining base and target with exactly one slash."""
            return '{}/{}'.format(self.Base_url.rstrip('/'), self.Target_get.lstrip('/'))

        def infinite_scroller(self):
            """Scroll to the bottom repeatedly until the page height stops growing."""
            count = 0

            while True:
                last_height = self.driver.execute_script('return document.body.scrollHeight')
                self.driver.execute_script("window.scrollTo(0,document.body.scrollHeight);")
                count += 1
                # Randomised pause so lazily loaded content has time to arrive.
                sleep(randint(5, 8))
                new_height = self.driver.execute_script('return document.body.scrollHeight')
                print("{}: Scroller Height {} Last Scroller Height {}: Number of users loaded to cache : {}".format(
                    count, new_height, last_height, count * self.USERS_PER_SCROLL))
                if new_height == last_height:
                    break

        def data_html_text(self):
            """Write the browser's current page source to ``self.Html_source``."""
            print("Downloading Page Source of {} to disk......".format(self._target_url()))
            soundcloud_page_source = self.driver.page_source
            # Explicit UTF-8 so non-ASCII profile names do not depend on the
            # platform's default encoding.
            with open(self.Html_source, 'w', encoding='utf-8') as cache_file:
                cache_file.write(soundcloud_page_source)

        def email_parser(self):
            """Visit every cached profile link and save the addresses found.

            Prints the links, then each address as it is found, then the final
            cleaned list, and writes that list to ``self.output_csv`` with one
            address per row.
            """
            links = self._collect_profile_links()
            for number, link in enumerate(links, 1):
                print("{} ------> {}".format(number, link))

            found = []
            for link in links:
                email = self._fetch_first_email(link)
                if email is not None:
                    found.append(email)
                    print("{} Scraping Emails: {}".format(len(found), email))

            cleaned = self._clean_emails(found)
            print(cleaned)
            self._write_csv(cleaned)

        def _collect_profile_links(self):
            """Return absolute profile URLs parsed from the cached HTML file."""
            with open(self.Html_source, encoding='utf-8', errors='replace') as cache_file:
                soup = BeautifulSoup(cache_file.read(), 'lxml')

            links = []
            for item in soup.find_all('li', class_='badgeList__item'):
                try:
                    links.append(self.Base_url + item.div.div.a['href'])
                except (AttributeError, TypeError, KeyError):
                    # List item without the expected div > div > a[href] nesting.
                    continue
            return links

        def _fetch_first_email(self, link):
            """Download ``link`` and return its first address-like string, or None."""
            try:
                response = requests.get(link, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
            except requests.RequestException as error:
                # One unreachable profile should not discard the other results.
                print("Skipping {}: {}".format(link, error))
                return None
            page_text = response.content.decode('utf-8', errors='replace')
            return self._find_first_email(page_text)

        @classmethod
        def _find_first_email(cls, text):
            """Return the first ``EMAIL_PATTERN`` match in ``text``, or None."""
            match = cls.EMAIL_PATTERN.search(text)
            return match.group(0) if match else None

        @staticmethod
        def _clean_emails(emails):
            """Strip trailing dots, then drop duplicates, keeping first-seen order.

            Stripping happens before de-duplication so that ``a@b.com.`` and
            ``a@b.com`` collapse into a single entry.
            """
            seen = set()
            unique = []
            for email in emails:
                email = email.rstrip('.')
                if email not in seen:
                    seen.add(email)
                    unique.append(email)
            return unique

        def _write_csv(self, emails):
            """Write ``emails`` to ``self.output_csv``, one address per row."""
            # newline='' leaves line endings to the csv writer.
            with open(self.output_csv, 'w', newline='', encoding='utf-8') as output:
                writer = csv.writer(output, lineterminator='\n')
                writer.writerows([email] for email in emails)


    if __name__ == '__main__':
        # Running the file as a script starts the scraper; all of the work
        # happens inside the constructor. The instance gets its own name so the
        # SoundCloudEmail class is not rebound to an instance.
        scraper = SoundCloudEmail()