Skip to content

Instantly share code, notes, and snippets.

@vijayanandrp
Last active February 22, 2017 07:56
Show Gist options
  • Save vijayanandrp/f6c8fc3423d48b931496c1f9ce1d9d9f to your computer and use it in GitHub Desktop.
Save vijayanandrp/f6c8fc3423d48b931496c1f9ce1d9d9f to your computer and use it in GitHub Desktop.
Python. (Web Scraping) Tries.
#!/usr/bin/env python3.5
# encoding: utf-8
import requests
from bs4 import BeautifulSoup
class PhishTankUrlScrapper(object):
    """Scrape verified, active phishing URLs from phishtank.com search pages.

    Usage: ``PhishTankUrlScrapper().scrap(page_range=N)`` returns a dict
    mapping each phish-detail page URL to the phishing URL extracted from it.
    """

    def __init__(self):
        # Base site and the paginated search endpoint (active + verified only).
        self.url_home = 'https://www.phishtank.com/'
        self.url_search = self.url_home + 'phish_search.php?page={:d}&active=y&verified=y'
        # Accumulates {phish-detail page URL: extracted phishing URL}.
        self.phish_url_dict = {}

    @staticmethod
    def check_if_page_up_or_down(url=None):
        """Return the page body text if *url* answers 200 OK, else None."""
        page_request = requests.get(url=url)
        if page_request.status_code == 200:
            return page_request.text
        return None

    @staticmethod
    def grab_phish_url(target_url):
        """Extract the phishing URL from a phish-detail page.

        Returns the URL string, or None when the page is down or no
        candidate URL is present.
        """
        page_response = PhishTankUrlScrapper.check_if_page_up_or_down(target_url)
        if not page_response:
            return None
        page = BeautifulSoup(page_response, 'html.parser')
        all_links = page.find_all('b')
        # BUG FIX: link.string is None for <b> tags containing nested markup;
        # the original crashed with TypeError on `'http' in None`.
        phish_url = [link.string for link in all_links
                     if link.string and 'http' in link.string]
        if not phish_url:
            # BUG FIX: the original raised IndexError (phish_url[0]) when no
            # candidate was found. Return None so callers can skip it.
            return None
        if len(phish_url) == 1:
            return ''.join(phish_url)
        return phish_url[0]

    def grab_phish_ids(self, page_response):
        """Collect phish-detail links from a search page and resolve each.

        Stores successful resolutions in ``self.phish_url_dict``.
        Returns None when the page contains no detail links.
        """
        phish_ids = []
        page = BeautifulSoup(page_response, 'html.parser')
        print(page.title.string)
        all_links = page.find_all(name='a')
        for link in all_links:
            href = link.get('href')
            # BUG FIX: anchors without an href attribute yield None; the
            # original crashed with TypeError on the `in` test.
            if href and "phish_detail.php?phish_id=" in href:
                phish_ids.append(self.url_home + href)
        if not phish_ids:
            return None
        print("Phish Ids found in the page: ")
        for number, phish_link in enumerate(phish_ids):
            phish_url = PhishTankUrlScrapper.grab_phish_url(phish_link)
            if phish_url:
                print(number + 1, ' - ', phish_link, ' - ', phish_url)
                self.phish_url_dict[phish_link] = phish_url

    def scrap(self, page_range=10):
        """Scrape *page_range* consecutive search pages (0-based page ids).

        Returns the accumulated {detail URL: phishing URL} dict.
        """
        for page_id in range(page_range):
            print('*_*_' * 24)
            target_url = self.url_search.format(page_id)
            page_response = PhishTankUrlScrapper.check_if_page_up_or_down(target_url)
            if not page_response:
                # Page down or non-200: skip the closing separator too.
                continue
            self.grab_phish_ids(page_response)
            print('*==*=' * 24)
        return self.phish_url_dict
if __name__ == '__main__':
    # page_range = number of PhishTank search pages to harvest phish URLs from.
    grabbed = PhishTankUrlScrapper().scrap(page_range=5)
    total = len(grabbed)
    print("List of phish urls(total grabbed - {:d}) are {}".format(total, grabbed.values()))
#!/usr/bin/env python3.5
# encoding: utf-8
import sys
import requests
import os
import re
from bs4 import BeautifulSoup
# Scrape a StarMusiQ movie page and download every song into Music/<movie>/.
base_url = 'http://starmusiq.com/'
movie_path = 'tamil_movie_songs_listen_download.asp?MovieId='
# FIX: input() already returns str in Python 3 — the str() wrapper was redundant.
movie_id = input('Enter the MovieId: ')
download_path = 'Music'

response = requests.get(base_url + movie_path + movie_id).text
page = BeautifulSoup(response, "html.parser")
print(page.title)

movie_name = page.find(text=re.compile("Movie :"))
if movie_name is None:
    # BUG FIX: the original crashed with AttributeError (.split on None)
    # when the "Movie :" marker was absent from the page.
    print('Movie name not found')
    sys.exit()
print(movie_name)
# Text looks like "Movie : <name>"; keep the part after the colon.
movie_name = movie_name.split(':')[1].strip()

new_dir = os.path.join(download_path, movie_name)
if os.path.isdir(download_path):
    if not os.path.isdir(new_dir):
        os.mkdir(new_dir)
    else:
        print('{} directory already found'.format(movie_name))
else:
    print('Check the download_path..{}'.format(download_path))
    sys.exit()

download_links = []
for link in page.find_all('a'):
    href = link.get('href')
    # BUG FIX: anchors without an href attribute yield None; the original
    # crashed with TypeError on the `in` test.
    if href and 'download/?songID' in href:
        download_links.append(base_url + href)

if not download_links:
    print('Download Link not found')
    sys.exit()

for download_link in download_links:
    req = requests.get(download_link)
    # Server advertises the filename via the Content-Disposition header.
    file_text = req.headers['content-disposition']
    file_name = re.findall("filename=(.+)", file_text)[0]
    print('Downloading ... {}'.format(file_name))
    # Strip the site's branding suffix before saving.
    target = os.path.join(new_dir, file_name.replace('-StarMusiQ.Com', ''))
    with open(target, 'wb') as song_file:
        song_file.write(req.content)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment