Created
January 8, 2017 22:35
-
-
Save anonymous/0bd76f5679e99b9eb73729fe0e291417 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
duolingo.com | |
""" | |
import json | |
import re | |
import urllib | |
from difflib import SequenceMatcher | |
from calibre.web.feeds.news import BasicNewsRecipe | |
__license__ = 'GPL v3' | |
__copyright__ = '2008, Kovid Goyal [email protected]' | |
__docformat__ = 'restructuredtext en' | |
""" | |
Download all Duolingo lesson tips and notes for your language | |
and turn them into a handy reference book! | |
USAGE: | |
1) you must have calibre installed. Download it from https://calibre-ebook.com/ | |
Calibre is free, cross-platform e-book creation / conversion / management software. | |
2) go to https://www.duolingo.com/ , log in, | |
and make sure you have switched to the language you want to download | |
3) from the command line, type | |
ebook-convert duolingo.recipe <outputfilename>.<ext> --username <myduolingousername> --password <myduolingopassword> -vv --test | |
where <ext> is the book format you want, that is, epub azw3 mobi pdf | |
if you omit <outputfilename> it will use the same name as input, i.e., duolingo | |
Example: | |
ebook-convert duolingo.recipe duo_french.epub --username bob --password mary123 -vv --test | |
4) This creates a test book called duo_french.epub with only 2 lessons maximum.
Open it. If everything looks good, run the command again without | |
the --test this time, i.e., | |
ebook-convert duolingo.recipe duo_french.epub --username bob --password mary123 -vv | |
Notes: | |
a) The -vv tells it to spit out some possibly useful debug info. | |
b) Once you have the epub, you can convert it to other formats without running | |
the script (and downloading everything) again, like this: | |
ebook-convert duo_french.epub .mobi --username bob --password mary123 -vv | |
ebook-convert duo_french.epub .azw3 --username bob --password mary123 -vv | |
ebook-convert duo_french.epub .pdf --username bob --password mary123 -vv | |
Good luck! Contact heybart on reddit if you have a problem | |
""" | |
class DuolingoLessons(BasicNewsRecipe):
    """Calibre recipe that turns Duolingo lesson tips and notes into a book.

    ``parse_index`` lists the skills of the language the user is currently
    learning, ``print_version`` maps every skill page URL to the JSON skills
    endpoint, and ``preprocess_raw_html`` converts that JSON into the HTML
    of one article.
    """

    # A few customizable options

    # how book title will appear
    # {} will be replaced with language name
    title_with_vocabs = u'{} grammar and vocabulary with Duolingo'
    title_no_vocabs = u'Learning {} with Duolingo'

    # URL of image to use as the book cover
    # set to a web address or local file
    # example: 'http://i.imgur.com/KDslMRP.jpg' or 'c:/pics/owl.png'
    # Windows users: use forward slashes / instead of backslashes \
    # set to 'auto' to use the default picture of the duolingo owl
    cover_url = 'auto'
    # cover_url = 'https://duolingo-images.s3.amazonaws.com/avatars/15224667/q13kbDuwyI/xlarge' # American English
    # cover_url = 'http://d2rhekw5qr4gcj.cloudfront.net/img/400sqf/from/uploads/course_photos/8947308000160730232415.png' # spanish
    # cover_url = 'http://i.imgur.com/ZUIigS0.png' # german
    # cover_url = 'http://i.imgur.com/KDslMRP.jpg' #french
    # cover_url = 'http://d2rhekw5qr4gcj.cloudfront.net/img/400sqf/from/uploads/course_photos/2656444000150612030056.png' # norwegian

    # description for your book, shows up in metadata
    description = 'Duolingo Lesson Tips and Notes'

    # include lesson words? either True or False
    include_vocabs = False
    # (if include_vocabs = True) include definitions of lessons words?
    include_defs = True
    # (if include_defs = True) put definitions inline instead of popup footnotes?
    inline_defs = False
    definition_color = '#222'

    # symbol to indicate lesson has tips & notes
    dagger = u'\u2020'

    # ------------------------
    # don't mess with rest of this stuff unless you know what you're doing :)
    # see https://manual.calibre-ebook.com/news_recipe.html
    # for documentation on calibre recipe API
    # -------------------------
    __author__ = 'heybart on reddit'
    __version__ = '0.16.3a'

    # there should be no reason to change this
    index_url = 'https://www.duolingo.com'
    login_url = 'https://www.duolingo.com/login'

    max_articles_per_feed = 150
    no_stylesheets = False
    no_javascript = True
    needs_subscription = True

    extra_css = (
        '.calibre_navbar {display:none} '
        'a.sup, a sup { text-decoration: none !important; } '
        'sup.invis { color: white !important; } '
        'table { border-bottom: 1px solid #888; } '
        '.vocabs { border-bottom: 1px dotted #888; } '
        'li.vocab { margin-top: 3px; margin-bottom: 3px; } '
        '.vocab_word { font-size: 1em; font-weight: bold } '
        '.vocab_def { font-size: 0.90em; color: ' + definition_color + '; } '
        'th { border-bottom: 1px dotted #aaa; } td, th {padding: 5px; } '
        '.footnotes { page-break-before: always;} '
        'h1 { font-size: 1.3em; border-bottom: 1px solid #aaa;} '
        'h2 { font-size: 1.2em; } h3 { font-size: 1.15em; } '
        'h4 { font-size: 1.10em; } h5,h6 { font-size: 1.05em; } '
    )

    # green duolingo owl, no specific language
    default_cover_url = 'http://65.media.tumblr.com/5fd6b3ccc4e8c978c87f469b236558ad/tumblr_inline_mwkqv1OuOg1ss97ol.png'

    # both are filled in by parse_index()
    learning_language = None
    learning_language_id = None

    # indices to use for <a name=...> to ensure uniqueness
    # (one running counter per anchor prefix, see make_anchor)
    a_indices = {}

    def get_browser(self):
        """Return a browser after posting the credentials to ``login_url``.

        The login request is skipped when no username/password was given.
        """
        # function-scope import so the recipe works on Python 2 and 3
        try:
            from urllib.parse import urlencode
        except ImportError:
            from urllib import urlencode
        br = BasicNewsRecipe.get_browser(self)
        if self.username is not None and self.password is not None:
            data = urlencode(
                {'login': self.username, 'password': self.password})
            # urlencode output is pure ASCII; send it as bytes
            br.open(self.login_url, data.encode('ascii'))
        return br

    def get_raw(self, url):
        """Fetch ``url`` and return the raw response body.

        A fresh browser from ``BasicNewsRecipe.get_browser`` is used, i.e.
        not the one that went through the login in ``get_browser``.
        """
        br = BasicNewsRecipe.get_browser(self)
        return br.open(url).read()

    def get_json(self, url):
        """Fetch ``url`` and return its body parsed as JSON."""
        return json.loads(self.get_raw(url))

    def lookup_definitions(self, learning_lang, from_lang, words):
        """
        use API call to look up definition of words

        learning_lang := language to translate to
        from_lang := language to translate from
        words := list of lists of words (one list per lesson)

        example:
            learning_lang = "fr"
            from_lang = "en"
            words = [["me","femme"],["pays"]]
            https://d2.duolingo.com/api/1/dictionary/hints/fr/en?tokens=["me","femme","pays"]
        returns
            {"me": ["me", "myself"], "pays": ["country", "countries", "land", "region", "village"], "femme": ["woman", "wife"]}
        """
        # flatten the per-lesson lists into one token list
        tokens = [word for lesson_words in words for word in lesson_words]
        params = json.dumps(tokens, separators=(',', ':'))
        url = 'https://d2.duolingo.com/api/1/dictionary/hints/{}/{}?tokens={}' \
            .format(learning_lang, from_lang, urlquote(params))
        return self.get_json(url)

    def make_anchor(self, prefix, name=''):
        """Return a unique anchor name of the form <prefix><counter><name>.

        A separate running counter (starting at 1) is kept per ``prefix`` in
        ``a_indices``; every character of ``name`` that is not a word
        character is dropped.
        """
        idx = self.a_indices.get(prefix, 1)
        self.a_indices[prefix] = idx + 1
        return '{}{}{}'.format(prefix, idx, re.sub(r'[^\w\d]', '', name))

    def preprocess_raw_html(self, raw_html, url):
        """
        extract article title ('name') and the tips and notes ('explanation')
        and optionally vocabulary words ('lessonWords')
        from json and return result as html

        The article is aborted (via ``abort_article``) when the JSON does
        not have the expected shape or the skill has neither notes nor
        vocabulary.
        """

        def get_vocabs(lang_data):
            """return (vocab list html, footnotes html) built from lang_data

            Both strings are empty when vocabularies are disabled or the
            skill has no lesson words.
            """
            if not self.include_vocabs:
                return ('', '')
            # list of list of words, one list for each lesson
            word_lists = lang_data.get('lessonWords')
            if not word_lists:
                return ('', '')

            vocab_section = self.make_anchor('voc_')
            defs = {}
            if self.include_defs:
                defs = self.lookup_definitions(
                    self.learning_language,
                    lang_data['fromLanguage'], word_lists)

            lessons = []
            footnotes = []
            for words in word_lists:
                if not self.include_defs:
                    lessons.append(u'<li>{}</li>'.format(u', '.join(words)))
                    continue
                entries = []
                for word in words:
                    # the API may omit a word; show an empty definition then
                    meaning = u', '.join(defs.get(word, []))
                    if self.inline_defs:
                        entries.append((
                            u'♦ <span class="vocab_word">{}</span>: '
                            u'<span class="vocab_def"> {}</span>').format(
                                word, meaning))
                    else:
                        # kindle formats require a superscripted link from noteref
                        # to the footnote and a link back from footnote to noteref
                        # to make popup footnote work
                        fn = self.make_anchor('fn_', word)  # name for footnote
                        ref = self.make_anchor('ref_', word)  # name for noteref
                        entries.append((
                            u'{2}<a id="{0}" href="#{1}" epub:type="noteref">'
                            u'<sup>*</sup></a>').format(ref, fn, word))
                        footnotes.append((
                            u'<p id="{0}" epub:type="footnote">'
                            u'<a href="#{3}">{1}</a>: {2}</p>').format(
                                fn, word, meaning, ref))
                lessons.append(u'<li>{}</li>'.format(u', '.join(entries)))

            vocabs = u'<div class="vocabs" id="{}"><ol>{}</ol></div>'.format(
                vocab_section, u''.join(lessons))
            endnotes = ''
            if footnotes:
                # the .footnotes style forces a page break, so only emit the
                # section when there is something to put in it
                endnotes = (
                    u'<aside epub:type="footnotes" class="footnotes">{}</aside>'
                ).format(u''.join(footnotes))
            return (vocabs, endnotes)

        def get_notes(lang_data):
            """process and return tips & notes from lang_data['explanation']

            Explanations shorter than 200 characters are treated as
            "no notes" and yield an empty string.
            """
            notes = lang_data.get('explanation') or ''
            if len(notes) < 200:
                return ''
            # if notes contains a heading similar to title, remove it
            # because we'll add the title ourselves
            hreg = re.compile(r'^\s*\<h\d\>(.+?)\<\/h\d\>')
            m = hreg.match(notes)
            if m and similar(m.group(1), lang_data['name']):
                notes = hreg.sub('', notes)
            # strip out extraneous "<hr /> blah blah" near bottom
            notes = re.sub(
                r'\<hr \/\>.{,5}a href(.+)?$', '', notes,
                count=1, flags=re.DOTALL)
            # strip out extraneous "blah blah <hr />" near top
            notes = re.sub(
                r'^.{,100}\<hr \/\>', '', notes, count=1, flags=re.DOTALL)
            return notes

        try:
            lang_data = json.loads(raw_html).get('skills')[0]
            heading = lang_data['name']
        except (ValueError, AttributeError, TypeError, IndexError, KeyError):
            self.abort_article('Unexpected json data')
            return None

        notes = get_notes(lang_data)
        (vocabs, endnotes) = get_vocabs(lang_data)
        if not (vocabs or notes):
            self.abort_article(heading + ' has no notes or vocabs.')
            return None

        class_name = ''
        # if has both notes and vocabs add a link to the heading
        # so you can skip over the vocabs and jump to the notes
        if notes and vocabs:
            anchor = self.make_anchor('notes_', heading)
            heading += u' <a href="#{}">{}</a>'.format(anchor, self.dagger)
            notes = u'<a name="{0}" id="{0}"></a>{1}'.format(anchor, notes)
            # picked up again by populate_article_metadata()
            class_name = 'has_notes'
        heading = u'<h1 class="{}">{}</h1>'.format(class_name, heading)
        return (
            u'<?xml version="1.0" encoding="utf-8"?>'
            u'<html xmlns:epub="http://www.idpf.org/2007/ops">'
            u'<head><title></title></head><body>{}{}{}{}</body></html>'
        ).format(heading, vocabs, notes, endnotes)

    def print_version(self, url):
        """
        change user facing url
            .../skill/<language>/<topic>
        to
            .../2016-04-13/skills?learningLanguage=<learning_lang>&urlName=<topic>
        this gives us the json data we really want
        """
        # fall back to the plain language code if the id was never resolved
        lang_id = self.learning_language_id or self.learning_language or ''
        return re.sub(
            r'/skill/[^/]+/',
            '/2016-04-13/skills?learningLanguage=' + lang_id + '&urlName=',
            url, count=1)

    def populate_article_metadata(self, article, soup, first):
        """add dagger to title if h1 class == has_notes
        """
        h = soup.find('h1')
        if h is None:
            return
        classes = h.get('class') or []
        # depending on the parser version the class attribute is either a
        # plain string or a list of class names
        if not isinstance(classes, (list, tuple)):
            classes = classes.split()
        if 'has_notes' in classes:
            article.title = article.title + ' ' + self.dagger

    def get_learning_language_id(self):
        """
        look up the learning language id needed for the skill API call
        from the javascript struct duo.available_languages
        downloaded from home page. (Don't know of a more elegant way of getting this)
        usually it is the same as learning lang abbreviation but not always
        e.g., for Norwegian, language abbrev = nb, learning language id = no-BO

        The result is stored in ``learning_language_id``; when nothing can
        be found the plain ``learning_language`` is used instead.
        """
        if self.learning_language_id is not None:
            return

        raw_str = self.get_raw(self.index_url)
        if isinstance(raw_str, bytes):
            raw_str = raw_str.decode('utf-8', 'replace')
        m = re.match(
            r'.+duo\.available_languages\s*=\s*(\[(.+?)\])',
            raw_str, re.DOTALL)
        langs = []
        if m is not None:
            try:
                langs = json.loads(m.group(1))
            except ValueError:
                # not the JSON we hoped for; use the fallback below
                langs = []
        for lang in langs:
            if not isinstance(lang, dict):
                continue
            if lang.get('key') == self.learning_language:
                self.learning_language_id = lang.get('learning_language_id')
                self.log('learning_language_id: ', self.learning_language_id)
                break

        # find nothing? well, let's hope for the best!
        if self.learning_language_id is None:
            self.learning_language_id = self.learning_language

    def parse_index(self):
        """
        get user data from which we get list of skills (i.e., lessons)
        as well as learning language, then build article list

        Returns a single feed: ``[(book title, [{'title':..., 'url':...}])]``.
        Also sets ``title``, ``learning_language``, ``learning_language_id``
        and resolves ``cover_url`` when it is 'auto'.
        """
        user_data = self.get_json(
            'https://www.duolingo.com/users/' + self.username)
        self.learning_language = user_data.get('learning_language')
        if not self.learning_language:
            self.abort_recipe_processing('Failed to get learning_language')
        self.log('learning_language detected: ', self.learning_language)
        self.get_learning_language_id()

        lang_data = user_data['language_data'][self.learning_language]
        lang_str = lang_data['language_string']
        skills = lang_data['skills']

        if self.include_vocabs:
            self.title = self.title_with_vocabs.format(lang_str)
        else:
            self.title = self.title_no_vocabs.format(lang_str)
        if self.cover_url == 'auto':
            self.cover_url = self.default_cover_url

        articles = []
        # skills needs to be sorted by y coord (position in tree), then x coord
        for skill in sorted(skills, key=lambda x: (x['coords_y'], x['coords_x'])):
            # user facing url; print_version() turns it into the json url
            url = '{}/skill/{}/{}'.format(
                self.index_url, self.learning_language,
                urlquote(skill['url_title']))
            articles.append({'title': skill['title'], 'url': url})
        return [(self.title, articles)]
def similar(a, b):
    """Tell whether strings a and b are similar.

    The comparison is case-insensitive, treats spaces, digits and a few
    punctuation characters as junk, and counts the strings as similar when
    the SequenceMatcher ratio is >= 0.75.
    """
    junk_chars = ' -:_12345678890:/()[]?!'

    def is_junk(ch):
        return ch in junk_chars

    matcher = SequenceMatcher(is_junk, a.lower(), b.lower())
    return matcher.ratio() >= 0.75
def urlquote(params):
    """safely quote url params with UTF-8 encoding

    ``params`` is a text string (it is encoded as UTF-8 first) or an
    already encoded byte string. Returns the quoted string with spaces
    turned into '+'.
    """
    # function-scope import: quote_plus lives in urllib.parse on Python 3
    # and directly in urllib on Python 2
    try:
        from urllib.parse import quote_plus
    except ImportError:
        from urllib import quote_plus
    if not isinstance(params, bytes):
        params = params.encode('UTF-8')
    return quote_plus(params)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment