Created September 21, 2015 09:49
Save iKlotho/140cc8fd4535ea1285a3 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: iso-8859-1 -*- | |
import requests, time, re, sys, json, urllib2 | |
from downloadDM import downloadDM | |
from BeautifulSoup import BeautifulSoup | |
class lequipeParse(downloadDM): | |
def __init__(self): | |
self.base_url = "http://video.lequipe.fr/morevideos/48/1" | |
self.main_url = "http://video.lequipe.fr" | |
self.dm_url = "http://www.dailymotion.com/video" | |
self.links = [] | |
self.proxies = {'https': 'https://94.23.196.68:3128'} | |
self.headers = {'User-agent': | |
'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:40.0) Gecko/20100101 Firefox/40.0'} | |
self.jsonCollect = [] | |
self.orginalVideoLinks = [] | |
self.dmCollect = {'title': '', 'description': '', 'link': '', 'tags': '', 'file_id': ''} | |
self.collectLinks() | |
self.getDailymotionLinks() | |
self.getDownloadLinks() | |
self.downloadAndJsonIt() | |
with open('deneme.json', 'w') as f: | |
f.write(str(self.jsonCollect)) | |
f.close() | |
def collectLinks(self): | |
print "Linkler toplaniyor" | |
r = requests.get(self.base_url) | |
soup = BeautifulSoup(r.content) | |
for i in soup.findAll('li', 'items_last_vids'): # str(time.strftime("%d") | |
if str(i.a.find('div', 'date').text[:2]) == '21': | |
self.links.append(i.a['href']) | |
def getDailymotionLinks(self): | |
for link in self.links: | |
key = '' | |
response = requests.get(self.main_url + link) | |
soup = BeautifulSoup(response.content) | |
holder = str(soup.findAll('div', id='laVideo')[0].iframe['src']) | |
holder = holder[holder.rfind('/'):holder.rfind('?')] | |
self.dmCollect['link'] = str(self.dm_url + holder) | |
self.dmCollect['title'] = str(soup.findAll('div', 'haut borderbas')[0].h1.text.encode('ascii', 'ignore')) | |
self.dmCollect['description'] = str(soup.findAll('p', 'desc')[0].text.encode('ascii', 'ignore')) | |
for i in soup.findAll('div', 'brique briqM')[0].findAll('a'): | |
key = key + ' ' + i.text.encode('ascii', 'ignore') | |
self.dmCollect['tags'] = str(key + ' ') | |
self.jsonCollect.append(self.dmCollect) | |
def getDownloadLinks(self): | |
for q in range(len(self.jsonCollect)): | |
self.jsonCollect[q]['link'] = self._findLinks(self.jsonCollect[q]['link']) | |
def downloadAndJsonIt(self): | |
for queue in range(len(self.jsonCollect)): | |
self.jsonCollect[queue]['file_id'] = self._downloadLinks(self.jsonCollect[queue]['title'], | |
self.jsonCollect[queue]['link']) | |
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.