Last active
September 2, 2015 08:13
-
-
Save kmmbvnr/86fcd890956c424b7e4d to your computer and use it in GitHub Desktop.
Divanoparser
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Compiled Python bytecode
*.pyc
# PKCS#12 key files (the script reads its Google key from google.p12)
*.p12
# tox virtualenvs and caches
.tox
# Local settings module holding the auth values imported by the script
settings.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
from __future__ import unicode_literals | |
import click | |
import requests | |
import time | |
from datetime import datetime, timedelta | |
from goose import Goose | |
from pyquery import PyQuery as pq | |
from settings import SID, UID, GCAL_EMAIL, GCAL_CALEDAR_ID | |
# URLs of the posts to process, in the order they were collected.
# Section labels are translated from the original Russian comments.
# NOTE(review): nothing in this file iterates the list; presumably each URL is
# passed to insert_post_links() by hand from the interactive shell - confirm.
posts = [
    # There used to be food
    'http://leprosorium.ru/comments/1683540/',
    'http://leprosorium.ru/comments/1689446/',
    'http://leprosorium.ru/comments/1692536/',
    'http://leprosorium.ru/comments/1696511/',
    'http://leprosorium.ru/comments/1700837/',
    'http://leprosorium.ru/comments/1710845/',
    'https://leprosorium.ru/comments/1724975/',
    'https://leprosorium.ru/comments/1730215/',
    'https://leprosorium.ru/comments/1752452/',
    'https://leprosorium.ru/comments/1757101/',
    # Up to 40
    'https://leprosorium.ru/comments/1762503/',
    'https://leprosorium.ru/comments/1767249/',
    'https://leprosorium.ru/comments/1770989/',
    'https://leprosorium.ru/comments/1774264/',
    'https://leprosorium.ru/comments/1778938/',
    'https://leprosorium.ru/comments/1781072/',
    'https://leprosorium.ru/comments/1782294/',
    # Up to 45
    'https://leprosorium.ru/comments/1783908/',
    'https://leprosorium.ru/comments/1785694/',
    'https://leprosorium.ru/comments/1789480/',
    'https://leprosorium.ru/comments/1791878/',
    'https://leprosorium.ru/comments/1793150/',
    'https://leprosorium.ru/comments/1794050/',
    # Up to 50
    'https://leprosorium.ru/comments/1794831/',
    'https://leprosorium.ru/comments/1795246/',
    'https://leprosorium.ru/comments/1796205/',
    'https://leprosorium.ru/comments/1797267/',
    'https://leprosorium.ru/comments/1799447/',
    'https://leprosorium.ru/comments/1801537/',
    'https://leprosorium.ru/comments/1803974/',
    'https://leprosorium.ru/comments/1804781/',
    'https://leprosorium.ru/comments/1805524/',
    # Up to 60
    'https://leprosorium.ru/comments/1808194/',
    'https://leprosorium.ru/comments/1809200/',
    'https://leprosorium.ru/comments/1811039/',
    'https://leprosorium.ru/comments/1812103/',
    # Up to 80
    'https://leprosorium.ru/comments/1813325/',
    'https://leprosorium.ru/comments/1814359/',
    # Touched 85
    'https://leprosorium.ru/comments/1814601/',
    'https://leprosorium.ru/comments/1814942/',
    'https://leprosorium.ru/comments/1815251/',
    # Back to 60
    'https://leprosorium.ru/comments/1815681/',
    'https://leprosorium.ru/comments/1815899/',
    'https://leprosorium.ru/comments/1816508/',
    'https://leprosorium.ru/comments/1817547/',
    'https://leprosorium.ru/comments/1818543/',
    # New Year binge '15
    'https://leprosorium.ru/comments/1820085/',
    'https://leprosorium.ru/comments/1821968/',
    'https://leprosorium.ru/comments/1824380/',
    # Up to 70
    'https://leprosorium.ru/comments/1826850/',
    'https://leprosorium.ru/comments/1827696/',
    'https://leprosorium.ru/comments/1829075/',
    'https://leprosorium.ru/comments/1830576/',
    'https://leprosorium.ru/comments/1832435/',
    'https://leprosorium.ru/comments/1833812/',
    # BB+
    'https://leprosorium.ru/comments/1835005/',
    'http://smsbogu.com/divane.html',  # 'https://leprosorium.ru/comments/1835086/',
    'https://leprosorium.ru/comments/1838559/',
    'https://leprosorium.ru/comments/1840807/',
    'https://leprosorium.ru/comments/1841994/',
    # Up to 65
    'https://leprosorium.ru/comments/1845316/',
    'https://leprosorium.ru/comments/1847690/',
    'https://leprosorium.ru/comments/1851104/',
    'https://leprosorium.ru/comments/1856697/',
    # 'https://leprosorium.ru/comments/1861956/',
    'https://leprosorium.ru/comments/1862232/',
    'https://leprosorium.ru/comments/1866158/',
    # The great rebound and stability
    'https://leprosorium.ru/comments/1869917/',
    'https://leprosorium.ru/comments/1879350/',
    'https://leprosorium.ru/comments/1884075/',
    'https://leprosorium.ru/comments/1887653/',
    'https://leprosorium.ru/comments/1894805/',
    'https://leprosorium.ru/comments/1905700/',
    'https://leprosorium.ru/comments/1913916/',
    'https://leprosorium.ru/comments/1924447/',
    'https://leprosorium.ru/comments/1930888/',
    # Up to 70
    'https://leprosorium.ru/comments/1936498/',
    'https://leprosorium.ru/comments/1941213/',
    'https://leprosorium.ru/comments/1943944/',
    'https://leprosorium.ru/comments/1946180/',
    'https://leprosorium.ru/comments/1950065/',
    'https://leprosorium.ru/comments/1950210/',
    'https://leprosorium.ru/comments/1951123/',
    'https://leprosorium.ru/comments/1951939/',
    'https://leprosorium.ru/comments/1953174/',
]
def link_data(post_url):
    """
    Extract external links from post

    Downloads ``post_url``, walks every ``.comment`` element and yields one
    dict per outgoing link found in the comment body:

    * ``href`` - link target, with a leading ``https://`` rewritten to
      ``http://``
    * ``date`` - naive local ``datetime`` built from the comment's
      ``data-epoch_date`` attribute
    * ``text`` - full text of the comment body the link appeared in

    Links whose URL contains ``leprosorium`` are skipped, as are anchors
    without an ``href`` and comments without a parseable timestamp.
    Comments by user ``catap`` carrying more than 10 links are skipped whole.

    Raises whatever ``requests.get`` raises (including a timeout after
    10 seconds).
    """
    # Only hand the session cookies to the Lepra site itself: the post list
    # also contains a third-party URL, which must not receive credentials.
    host = post_url.split('://', 1)[-1].split('/', 1)[0]
    cookies = {'sid': SID, 'uid': UID} if host.endswith('leprosorium.ru') else None

    response = requests.get(post_url, cookies=cookies, timeout=10)
    doc = pq(response.text)

    for element in doc('.comment'):
        comment = pq(element)

        # A comment without a usable epoch cannot be dated; skip it rather
        # than abort the whole generator.
        epoch = comment.find('.c_footer span.js-date').attr['data-epoch_date']
        try:
            date = datetime.fromtimestamp(int(epoch))
        except (TypeError, ValueError):
            continue

        links = comment.find('.c_body a')
        text = comment.find('.c_body').text()

        if len(links) > 10 and comment.find('.c_user').text() == 'catap':
            continue

        for link in links:
            # Anchors may lack an href entirely (e.g. named anchors).
            href = link.attrib.get('href')
            if not href or 'leprosorium' in href:
                continue

            # Rewrite only the scheme prefix; an 'https://' embedded later in
            # the URL (e.g. an archived-page link) must stay intact.
            if href.startswith('https://'):
                href = 'http://' + href[len('https://'):]

            yield {'href': href, 'date': date, 'text': text}
# URL suffixes that denote non-HTML payloads; for these the file name itself
# is used as the title instead of downloading and parsing the target.
_BINARY_EXTENSIONS = (
    '.pdf', '.mp4', '.mp3', '.zip', '.csv', '.avi', '.jpg', '.gif', '.png',
)


def article_title(article_url):
    """
    Return a human-readable title for ``article_url``

    * URLs ending in one of the known binary/media extensions (compared
      case-insensitively) return the last path segment, without any
      network access.
    * URLs containing ``.flv?`` return the fixed label ``'tvrain'``.
    * Anything else is downloaded (10 second timeout) and the title is
      taken from Goose's article extraction.

    Raises whatever ``requests`` or Goose raise for the download/extraction
    branch.
    """
    if article_url.lower().endswith(_BINARY_EXTENSIONS):
        return article_url.rsplit('/', 1)[-1]
    if '.flv?' in article_url:
        return 'tvrain'

    response = requests.get(article_url, timeout=10)
    # The markup is round-tripped through pyquery before extraction, as the
    # original code did.
    doc = pq(response.text)
    article = Goose().extract(raw_html=doc.outerHtml())
    return article.title
def calendar_client():
    """
    Build the Google Calendar service object

    Reads the key bytes from ``google.p12`` in the current directory and
    returns the ``service`` attribute of an ``AuthorizedService`` subclass
    configured for the ``calendar`` ``v3`` API with ``GCAL_EMAIL``.
    """
    with open('google.p12', 'rb') as p12_file:
        p12_bytes = p12_file.read()

    # Imported lazily so the rest of the module works without the Google
    # client libraries installed.
    import httplib2
    from service_account_auth import AuthorizedService
    from oauth2client.client import SignedJwtAssertionCredentials

    class CalendarAuthorizedService(AuthorizedService):
        def _get_authorized_http(self):
            # Same credentials flow as the base class hook expects, but
            # requesting the calendar scope.
            jwt_credentials = SignedJwtAssertionCredentials(
                self.email,
                self.key,
                scope='https://www.googleapis.com/auth/calendar')
            return jwt_credentials.authorize(httplib2.Http())

    return CalendarAuthorizedService(
        project_id='charged-atlas-853',
        service_name='calendar',
        service_version='v3',
        email=GCAL_EMAIL,
        key=p12_bytes).service
def insert_event(service, title, link, text, event_datetime,
                 utc_offset='+03:00', duration_minutes=10):
    """
    Insert one calendar event describing a link

    :param service: calendar service object exposing ``events().insert()``
    :param title: used as the event summary and as the source title
    :param link: used as the source URL and as the first line of the
        description
    :param text: appended to the description after a blank line
    :param event_datetime: naive ``datetime`` of the event start
    :param utc_offset: offset suffix appended verbatim to both timestamps
        (default ``'+03:00'``, the value previously hard-coded)
    :param duration_minutes: event length in minutes (default 10, the value
        previously hard-coded)
    :returns: the result of executing the insert request

    NOTE(review): ``event_datetime`` is expected to be naive; an aware
    datetime would already carry an offset in ``isoformat()`` and the
    appended suffix would make the timestamp invalid - confirm callers.
    """
    end_datetime = event_datetime + timedelta(minutes=duration_minutes)

    # isoformat() already uses 'T' as separator; passing b'T' explicitly
    # raises TypeError on Python 3.
    request = service.events().insert(
        calendarId=GCAL_CALEDAR_ID,
        body={
            'summary': title,
            'source': {
                'title': title,
                'url': link
            },
            'description': "{}\n\n{}".format(link, text),
            'start': {
                'dateTime': event_datetime.isoformat() + utc_offset,
            },
            'end': {
                'dateTime': end_datetime.isoformat() + utc_offset
            }
        }
    )
    return request.execute()
def insert_post_links(post_url):
    """
    Create one calendar event per distinct external link of a post

    Collects the links yielded by ``link_data(post_url)``, keeps the first
    occurrence of every ``href``, resolves a title for each through
    ``article_title`` and inserts the result with ``insert_event``.

    Links whose title cannot be resolved are reported and skipped. A failed
    insert is retried once after a 3 second pause; if the retry fails too,
    the error is printed and processing continues with the next link.
    Progress is reported on stdout; nothing is returned.
    """
    print('Fetching {}'.format(post_url))

    links = {}
    for link in link_data(post_url):
        href = link['href']
        if href in links:
            continue
        try:
            print('Getting title for {}'.format(href))
            title = article_title(href)
        except Exception as ex:
            # Best effort: a single unreachable or unparsable page must not
            # stop the run, but say so instead of dropping it silently.
            # KeyboardInterrupt/SystemExit are deliberately not caught.
            print('Skipping link, title lookup failed:')
            print(ex)
            continue
        links[href] = {
            'link': href,
            'event_datetime': link['date'],
            'title': title,
            'text': link['text']
        }

    print('{} links found'.format(len(links)))
    if not links:
        # Nothing to insert: avoid reading the key file and authenticating.
        return

    service = calendar_client()
    for link in links.values():
        try:
            insert_event(service, **link)
        except Exception:
            # One retry after a short pause.
            time.sleep(3)
            try:
                insert_event(service, **link)
            except Exception as ex:
                print(ex)
                continue
        # Reported outside the try blocks so that a failing print can never
        # trigger a second insert of an event that was already created.
        print('Created {}'.format(link['link']))
# Root click command group; it does no work itself and only serves as the
# parent that subcommands register on via @calendar.command().
@click.group()
def calendar():
    pass
# `cli` subcommand: drops into an embedded IPython shell.
# IPython is imported lazily so it is only required when this command runs.
@calendar.command()
def cli():
    import IPython
    IPython.embed()
# Script entry point: dispatch to the click command group.
if __name__ == '__main__':
    calendar()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Lepra auth: values sent as the 'sid' and 'uid' cookies when fetching posts.
SID = ''
UID = ''

# google calendar
# GCAL_EMAIL is passed as the account email when building the calendar client.
GCAL_EMAIL = ""
# Calendar that events are inserted into. The name is misspelled ("CALEDAR"),
# but it must stay as is because the script imports it under this spelling.
GCAL_CALEDAR_ID = ""
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Single Python 2.7 environment; the project is a plain script, so no sdist
# is built.
[tox]
envlist = py27
skipsdist = True

[testenv]
deps = repoze.lru
    requests
    pyquery
    goose-extractor
    click
    google-api-python-client
    GoogleCalendarV3
    gclient-service-account-auth
    ipdb
    ipython
# Arguments given after `tox --` replace the default `cli` subcommand.
commands = python news_calendar.py {posargs:cli}
setenv =
    PYTHONPATH={toxinidir}
    PATH={envdir}/bin:{env:PATH}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment