Skip to content

Instantly share code, notes, and snippets.

@kmmbvnr
Last active September 2, 2015 08:13
Show Gist options
  • Save kmmbvnr/86fcd890956c424b7e4d to your computer and use it in GitHub Desktop.
Save kmmbvnr/86fcd890956c424b7e4d to your computer and use it in GitHub Desktop.
Divanoparser
*.pyc
*.p12
.tox
settings.py
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import click
import requests
import time
from datetime import datetime, timedelta
from goose import Goose
from pyquery import PyQuery as pq
from settings import SID, UID, GCAL_EMAIL, GCAL_CALEDAR_ID
# Post URLs to process, grouped into sections by the comments below
# (translated from Russian).
# NOTE(review): nothing visible in this file iterates over ``posts``;
# presumably entries are passed to insert_post_links() by hand from the
# IPython shell -- confirm.
# NOTE(review): the numbers in the section comments look like an exchange-rate
# level -- this is an assumption, verify with the author.
posts = [
# There was food
'http://leprosorium.ru/comments/1683540/',
'http://leprosorium.ru/comments/1689446/',
'http://leprosorium.ru/comments/1692536/',
'http://leprosorium.ru/comments/1696511/',
'http://leprosorium.ru/comments/1700837/',
'http://leprosorium.ru/comments/1710845/',
'https://leprosorium.ru/comments/1724975/',
'https://leprosorium.ru/comments/1730215/',
'https://leprosorium.ru/comments/1752452/',
'https://leprosorium.ru/comments/1757101/',
# Up to 40
'https://leprosorium.ru/comments/1762503/',
'https://leprosorium.ru/comments/1767249/',
'https://leprosorium.ru/comments/1770989/',
'https://leprosorium.ru/comments/1774264/',
'https://leprosorium.ru/comments/1778938/',
'https://leprosorium.ru/comments/1781072/',
'https://leprosorium.ru/comments/1782294/',
# Up to 45
'https://leprosorium.ru/comments/1783908/',
'https://leprosorium.ru/comments/1785694/',
'https://leprosorium.ru/comments/1789480/',
'https://leprosorium.ru/comments/1791878/',
'https://leprosorium.ru/comments/1793150/',
'https://leprosorium.ru/comments/1794050/',
# Up to 50
'https://leprosorium.ru/comments/1794831/',
'https://leprosorium.ru/comments/1795246/',
'https://leprosorium.ru/comments/1796205/',
'https://leprosorium.ru/comments/1797267/',
'https://leprosorium.ru/comments/1799447/',
'https://leprosorium.ru/comments/1801537/',
'https://leprosorium.ru/comments/1803974/',
'https://leprosorium.ru/comments/1804781/',
'https://leprosorium.ru/comments/1805524/',
# Up to 60
'https://leprosorium.ru/comments/1808194/',
'https://leprosorium.ru/comments/1809200/',
'https://leprosorium.ru/comments/1811039/',
'https://leprosorium.ru/comments/1812103/',
# Up to 80
'https://leprosorium.ru/comments/1813325/',
'https://leprosorium.ru/comments/1814359/',
# Probed 85
'https://leprosorium.ru/comments/1814601/',
'https://leprosorium.ru/comments/1814942/',
'https://leprosorium.ru/comments/1815251/',
# Back to 60
'https://leprosorium.ru/comments/1815681/',
'https://leprosorium.ru/comments/1815899/',
'https://leprosorium.ru/comments/1816508/',
'https://leprosorium.ru/comments/1817547/',
'https://leprosorium.ru/comments/1818543/',
# New Year binge '15
'https://leprosorium.ru/comments/1820085/',
'https://leprosorium.ru/comments/1821968/',
'https://leprosorium.ru/comments/1824380/',
# Up to 70
'https://leprosorium.ru/comments/1826850/',
'https://leprosorium.ru/comments/1827696/',
'https://leprosorium.ru/comments/1829075/',
'https://leprosorium.ru/comments/1830576/',
'https://leprosorium.ru/comments/1832435/',
'https://leprosorium.ru/comments/1833812/',
# BB+
'https://leprosorium.ru/comments/1835005/',
'http://smsbogu.com/divane.html', # 'https://leprosorium.ru/comments/1835086/',
'https://leprosorium.ru/comments/1838559/',
'https://leprosorium.ru/comments/1840807/',
'https://leprosorium.ru/comments/1841994/',
# Up to 65
'https://leprosorium.ru/comments/1845316/',
'https://leprosorium.ru/comments/1847690/',
'https://leprosorium.ru/comments/1851104/',
'https://leprosorium.ru/comments/1856697/',
# 'https://leprosorium.ru/comments/1861956/',
'https://leprosorium.ru/comments/1862232/',
'https://leprosorium.ru/comments/1866158/',
# The great rebound and stability
'https://leprosorium.ru/comments/1869917/',
'https://leprosorium.ru/comments/1879350/',
'https://leprosorium.ru/comments/1884075/',
'https://leprosorium.ru/comments/1887653/',
'https://leprosorium.ru/comments/1894805/',
'https://leprosorium.ru/comments/1905700/',
'https://leprosorium.ru/comments/1913916/',
'https://leprosorium.ru/comments/1924447/',
'https://leprosorium.ru/comments/1930888/',
# Up to 70
'https://leprosorium.ru/comments/1936498/',
'https://leprosorium.ru/comments/1941213/',
'https://leprosorium.ru/comments/1943944/',
'https://leprosorium.ru/comments/1946180/',
'https://leprosorium.ru/comments/1950065/',
'https://leprosorium.ru/comments/1950210/',
'https://leprosorium.ru/comments/1951123/',
'https://leprosorium.ru/comments/1951939/',
'https://leprosorium.ru/comments/1953174/',
]
def link_data(post_url):
    """
    Extract external links from the comments of a post.

    Fetches ``post_url`` with the ``sid``/``uid`` session cookies and walks
    every ``.comment`` element of the returned page.

    :param post_url: URL of the post page to scan.
    :returns: generator of dicts with the keys ``href`` (link target, a
        leading ``https://`` rewritten to ``http://``), ``date`` (naive
        local-time ``datetime`` built from the comment's epoch timestamp)
        and ``text`` (full text of the comment body the link came from).
    :raises: whatever ``requests`` raises on network errors or timeout.
    """
    # Same timeout as article_title(); without one a stalled server would
    # block the whole run indefinitely.
    response = requests.get(
        post_url, cookies={'sid': SID, 'uid': UID}, timeout=10)
    doc = pq(response.text)

    for element in doc('.comment'):
        comment = pq(element)
        date = datetime.fromtimestamp(
            int(comment.find('.c_footer span.js-date').attr['data-epoch_date']))
        links = comment.find('.c_body a')
        text = comment.find('.c_body').text()

        # Link-heavy comments from this one user are ignored entirely.
        # NOTE(review): presumably these are digest/round-up comments --
        # confirm with the author.
        if len(links) > 10 and comment.find('.c_user').text() == 'catap':
            continue

        for link in links:
            # Anchors without an href (e.g. named anchors) carry no target.
            href = link.attrib.get('href')
            if not href:
                continue
            # Links back to the site itself are not external.
            if 'leprosorium' in href:
                continue
            # Normalise the scheme so http/https variants of one URL compare
            # equal; only the leading scheme is rewritten, never a URL
            # embedded later in the string (e.g. in a query parameter).
            if href.startswith('https://'):
                href = 'http://' + href[len('https://'):]
            yield {'href': href, 'date': date, 'text': text}
def article_title(article_url):
    """
    Return a title for a linked resource.

    Direct links to media/archive files get their file name as the title,
    links containing ``.flv?`` get the fixed title ``'tvrain'``; anything
    else is downloaded and its title is extracted with Goose.

    :param article_url: URL of the linked resource.
    :returns: title string.
    :raises: whatever ``requests`` / Goose raise on network or parse errors.
    """
    file_suffixes = (
        '.pdf', '.mp4', '.mp3', '.zip', '.csv',
        '.avi', '.jpg', '.gif', '.png',
    )
    # Compare case-insensitively: a link such as ``PHOTO.JPG`` is still a
    # binary file and must not be downloaded and fed to the HTML extractor.
    if article_url.lower().endswith(file_suffixes):
        return article_url.rsplit('/', 1)[-1]
    if '.flv?' in article_url:
        return 'tvrain'

    goose = Goose()
    response = requests.get(article_url, timeout=10)
    doc = pq(response.text)
    # The page is passed through PyQuery first and its serialised HTML is
    # what Goose parses.
    article = goose.extract(raw_html=doc.outerHtml())
    return article.title
def calendar_client():
    """
    Build and return the Google Calendar (v3) service object.

    The service-account key is read from ``google.p12`` in the current
    working directory; authorization is requested for the calendar scope
    on behalf of ``GCAL_EMAIL``.
    """
    with open('google.p12', 'rb') as p12_file:
        key_bytes = p12_file.read()

    # These libraries are only loaded when a client is actually requested.
    import httplib2
    from service_account_auth import AuthorizedService
    from oauth2client.client import SignedJwtAssertionCredentials

    class CalendarAuthorizedService(AuthorizedService):
        # NOTE(review): presumably invoked by AuthorizedService when it
        # needs an authorized transport -- confirm against that library.
        def _get_authorized_http(self):
            jwt_credentials = SignedJwtAssertionCredentials(
                self.email,
                self.key,
                scope='https://www.googleapis.com/auth/calendar')
            return jwt_credentials.authorize(httplib2.Http())

    authorized = CalendarAuthorizedService(
        project_id='charged-atlas-853',
        service_name='calendar',
        service_version='v3',
        email=GCAL_EMAIL,
        key=key_bytes)
    return authorized.service
def insert_event(service, title, link, text, event_datetime,
                 utc_offset='+03:00'):
    """
    Create a 10-minute calendar event describing a link.

    :param service: calendar service object (see calendar_client()).
    :param title: event summary; also used as the source title.
    :param link: URL stored as the event source and placed at the top of
        the description.
    :param text: comment text appended to the description after the link.
    :param event_datetime: naive ``datetime`` of the event start.
    :param utc_offset: offset suffix appended to both timestamps; defaults
        to the previously hard-coded ``'+03:00'``.
    :returns: the result of executing the insert request.
    """
    def stamp(moment):
        # 'T' is already isoformat()'s default separator. The explicit
        # b'T' used before raises TypeError on Python 3, so it is dropped.
        return moment.isoformat() + utc_offset

    request = service.events().insert(
        calendarId=GCAL_CALEDAR_ID,
        body={
            'summary': title,
            'source': {
                'title': title,
                'url': link
            },
            'description': "{}\n\n{}".format(link, text),
            'start': {
                'dateTime': stamp(event_datetime),
            },
            'end': {
                'dateTime': stamp(event_datetime + timedelta(minutes=10))
            }
        }
    )
    return request.execute()
def _insert_with_retry(service, event, delay=3):
    """Insert one event, retrying once after ``delay`` seconds on failure."""
    try:
        return insert_event(service, **event)
    except Exception:
        time.sleep(delay)
        return insert_event(service, **event)


def insert_post_links(post_url):
    """
    Create a calendar event for every distinct external link of a post.

    Links are de-duplicated by URL (the first comment mentioning a link
    wins). Links whose title cannot be determined are skipped, and events
    that still fail after one retry are reported and skipped, so a single
    bad link never aborts the run.

    :param post_url: URL of the post whose comments are scanned.
    """
    print('Fetching {}'.format(post_url))

    links = {}
    for link in link_data(post_url):
        href = link['href']
        if href in links:
            continue
        try:
            print('Getting title for {}'.format(href))
            title = article_title(href)
        # ``Exception`` rather than a bare except, so Ctrl-C and SystemExit
        # still stop the run.
        except Exception as ex:
            # repr() keeps this message printable whatever the URL or the
            # error text contains.
            print('Skipping {!r}: {!r}'.format(href, ex))
            continue
        links[href] = {
            'link': href,
            'event_datetime': link['date'],
            'title': title,
            'text': link['text']
        }

    print('{} links found'.format(len(links)))

    service = calendar_client()
    for event in links.values():
        try:
            _insert_with_retry(service, event)
            # Reported for first-try and retried inserts alike. A failure
            # of this print no longer triggers a second (duplicate) insert.
            print('Created {}'.format(event['link']))
        except Exception as ex:
            print(ex)
# Root click command group; subcommands are attached with
# ``@calendar.command()``.
# Documented with comments on purpose: click shows a function docstring as
# the command's help text, so adding one would change ``--help`` output.
@click.group()
def calendar():
    pass
# ``cli`` subcommand: opens an interactive IPython shell.
# (Comment instead of docstring: click would surface a docstring as help.)
@calendar.command()
def cli():
    # Local import: IPython is loaded only when this command runs.
    import IPython
    IPython.embed()
# Dispatch to the click command group when the file is run as a script.
if __name__ == '__main__':
    calendar()
# Lepra auth: sent as the ``sid`` / ``uid`` cookies when a post page is fetched.
SID = ''
UID = ''
# Google Calendar: account email used for authorization, and the id of the
# calendar that events are inserted into.
# NOTE(review): "CALEDAR" is a misspelling of "CALENDAR"; the name is imported
# by the main script, so a rename has to change both files together.
GCAL_EMAIL = ""
GCAL_CALEDAR_ID = ""
[tox]
envlist = py27
skipsdist = True
[testenv]
deps = repoze.lru
       requests
       pyquery
       goose-extractor
       click
       google-api-python-client
       GoogleCalendarV3
       gclient-service-account-auth
       ipdb
       ipython
commands = python news_calendar.py {posargs:cli}
setenv =
    PYTHONPATH={toxinidir}
    PATH={envdir}/bin:{env:PATH}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment