Python. (Web Scraping) Tries.
#!/usr/bin/env python3.5
# encoding: utf-8
import requests
from bs4 import BeautifulSoup


class PhishTankUrlScrapper(object):
    """ URL configurations """

    def __init__(self):
        self.url_home = 'https://www.phishtank.com/'
        self.url_search = self.url_home + 'phish_search.php?page={:d}&active=y&verified=y'
        self.phish_url_dict = {}

    @staticmethod
    def check_if_page_up_or_down(url=None):
        """ Make sure the page is up before grabbing its content """
        page_request = requests.get(url=url)
        if page_request.status_code == 200:
            return page_request.text
        return None

    @staticmethod
    def grab_phish_url(target_url):
        """ Grab the phish URL alone from a detail page """
        page_response = PhishTankUrlScrapper.check_if_page_up_or_down(target_url)
        if not page_response:
            return None
        page = BeautifulSoup(page_response, 'html.parser')
        all_links = page.find_all('b')
        # link.string can be None for nested tags, so guard before the substring test
        phish_urls = [link.string for link in all_links
                      if link.string and 'http' in link.string]
        if not phish_urls:
            return None
        return phish_urls[0]

    def grab_phish_ids(self, page_response):
        """ Grab the PhishTank detail-page links (phish ids) from a search page """
        phish_ids = []
        page = BeautifulSoup(page_response, 'html.parser')
        print(page.title.string)
        all_links = page.find_all(name='a')
        for link in all_links:
            link = link.get('href')
            # link can be None for anchors without an href attribute
            if link and "phish_detail.php?phish_id=" in link:
                phish_ids.append(self.url_home + link)
        if not phish_ids:
            return None
        print("Phish ids found in the page: ")
        for number, phish_link in enumerate(phish_ids):
            phish_url = PhishTankUrlScrapper.grab_phish_url(phish_link)
            if phish_url:
                print(number + 1, ' - ', phish_link, ' - ', phish_url)
                self.phish_url_dict[phish_link] = phish_url

    def scrap(self, page_range=10):
        """ Main function driving the automated process """
        for page_id in range(page_range):
            print('*_*_' * 24)
            target_url = self.url_search.format(page_id)
            page_response = PhishTankUrlScrapper.check_if_page_up_or_down(target_url)
            if page_response:
                self.grab_phish_ids(page_response)
        print('*==*=' * 24)
        return self.phish_url_dict


if __name__ == '__main__':
    # page_range is the number of PhishTank search pages to scan for phish URLs
    phish_tank = PhishTankUrlScrapper().scrap(page_range=5)
    print("List of phish urls (total grabbed - {:d}): {}".format(len(phish_tank), list(phish_tank.values())))
#!/usr/bin/env python3.5
# encoding: utf-8
import sys
import os
import re
import requests
from bs4 import BeautifulSoup

base_url = 'http://starmusiq.com/'
movie_path = 'tamil_movie_songs_listen_download.asp?MovieId='
movie_id = input('Enter the MovieId: ')
download_path = 'Music'

response = requests.get(base_url + movie_path + movie_id).text
page = BeautifulSoup(response, "html.parser")
print(page.title)

# The movie name appears on the page as "Movie : <name>"
movie_name = page.find(text=re.compile("Movie :"))
print(movie_name)
if not movie_name:
    print('Movie name not found on the page')
    sys.exit()
movie_name = movie_name.split(':')[1].strip()

new_dir = os.path.join(download_path, movie_name)
if os.path.isdir(download_path):
    if not os.path.isdir(new_dir):
        os.mkdir(new_dir)
    else:
        print('{} directory already found'.format(movie_name))
else:
    print('Check the download_path.. {}'.format(download_path))
    sys.exit()

all_links = page.find_all('a')
download_links = []
for link in all_links:
    link = link.get('href')
    # link can be None for anchors without an href attribute
    if link and 'download/?songID' in link:
        download_links.append(base_url + link)

if not download_links:
    print('Download link not found')
    sys.exit()

for download_link in download_links:
    req = requests.get(download_link)
    file_text = req.headers['content-disposition']
    file_name = re.findall("filename=(.+)", file_text)[0]
    print('Downloading ... {}'.format(file_name))
    with open(os.path.join(new_dir, file_name.replace('-StarMusiQ.Com', '')), 'wb') as song_file:
        song_file.write(req.content)
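For large files it may be safer to stream the response instead of buffering the whole body in req.content. A sketch of the same download step using requests' stream=True and iter_content, under the same naming conventions as above (download_song is a hypothetical helper; the unknown.mp3 fallback is an assumption for servers that omit the content-disposition header):

import os
import re
import requests

def download_song(download_link, target_dir):
    """Stream one song to disk in chunks instead of buffering the whole body."""
    with requests.get(download_link, stream=True) as req:
        req.raise_for_status()
        file_text = req.headers.get('content-disposition', '')
        match = re.search(r'filename=(.+)', file_text)
        # Fall back to a placeholder name if the server omits the header
        file_name = match.group(1).replace('-StarMusiQ.Com', '') if match else 'unknown.mp3'
        with open(os.path.join(target_dir, file_name), 'wb') as song_file:
            for chunk in req.iter_content(chunk_size=8192):
                song_file.write(chunk)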