Python Snippets
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# converts csv to libsvm format
# usage: python convert_to_libsvm.py (after having modified input and output filename)
import codecs

# change input filename here
filename = "features.csv"
fh = codecs.open(filename, "r", "utf-8")
lines = fh.readlines()
fh.close()

# change output filename here
out = "features.libsvm"
fh = codecs.open(out, "w", "utf-8")
for l in lines:
    l = l.split(";")
    # the first column holds the class label
    lineout = l[0] + " "
    # the last two columns of each row are dropped
    values = l[1:-2]
    for i in range(len(values)):
        # libsvm feature indices are 1-based by convention, not 0-based
        lineout += str(i+1) + ":" + values[i] + " "
    lineout += "\n"
    fh.write(lineout)
fh.close()
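For reference, a minimal sketch of the transformation on a single hypothetical input line:

line = u"1;0.5;0.25;0.1;id42;\n"
cols = line.split(";")
print cols[0] + " " + " ".join(str(i + 1) + ":" + v for i, v in enumerate(cols[1:-2]))
# prints: 1 1:0.5 2:0.25 3:0.1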
# download the free audio trainer German / French from Deutsche Welle (Python 2)
import urllib

# lessons are numbered 001 ... 100 in the URLs, so zero-pad the counter
for i in range(1, 101):
    lesson = str(i).zfill(3)
    urllib.urlretrieve('http://radio-download.dw.de/Events/dwelle/deutschkurse/audiotrainer/fra/Audiotrainer_Franzoesisch_Lektion' + lesson + '_dwdownload.mp3',
                       '/home/me/filename_' + str(i) + '.mp3')
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import json
import urllib

# performs a Google search with the title and returns the URLs of the first query results
def search_web(title):
    # urlencode escapes spaces and special characters itself
    query = urllib.urlencode({'q': title})
    # the AJAX Google API only returns the first few results, but that is enough for our purpose
    response = urllib.urlopen('http://ajax.googleapis.com/ajax/services/search/web?v=1.0&' + query).read()
    json_response = json.loads(response)
    if json_response and json_response['responseData']:
        results = json_response['responseData']['results']
    else:
        return []
    urls = [result['url'] for result in results]
    print('Found ' + str(len(urls)) + ' URLs.')
    return urls
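A quick usage sketch (Google has since retired this AJAX search API, so treat it as illustrative only):

urls = search_web('python unicode csv')
if urls:
    print urls[0]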
# mapping of language names to jw.org language codes (the variable name is added here for convenience)
language_codes = {'Estonian': 'et', 'Bicol': 'bcl', 'Zande': 'zne', 'Uruund': 'rnd', 'Khoekhoegowab': 'naq', u'Newari': 'new', 'Krio': 'kri', 'Nuer': 'nus', 'Cinyanja': 'nya', u'Bulgarian': 'bg', 'Norwegian': 'no', u'Yoruba': 'yo', u'French': 'fr', u'Otomi': 'oto', 'Ateso': 'teo', 'Iloko': 'ilo', 'Wolaita': 'wal', 'Tsonga': 'ts', 'Tzotzil': 'tzo', u'Tamil': 'ta', u'Haitian': 'ht', 'Samoan': 'sm', 'Aukan': 'djk', 'Finnish': 'fi', 'Rutoro': 'ttj', 'Dangme': 'ada', 'Albanian': 'sq', 'Mbunda': 'mck', 'Solomon': 'pis', 'Hiligaynon': 'hil', 'Tagalog': 'tl', u'Serbian': 'sr_latn', u'Efik': 'efi', 'Pangasinan': 'pag', 'Italian': 'it', 'Miskito': 'miq', 'Lhukonzo': 'koo', 'Lamba': 'lam', u'Kongo': 'kg', 'Mazatec': 'mau', u'Tarascan': 'tsz', u'Amharic': 'am', u'Czech': 'cs', u'Papiamento': 'pap', u'Nahuatl': 'ncj', 'Ga': 'gaa', 'Polish': 'pl', 'Tongan': 'to', 'Xhosa': 'xh', 'Swedish': 'sv', u'Marathi': 'mr', 'Luganda': 'lg', u'Slovenian': 'sl', 'Ewe': 'ee', u'Azerbaijani': 'az_cyrl', u'Kikuyu': 'ki', 'Luo': 'luo', 'Tankarana': 'xmv', 'Danish': 'da', 'Indonesian': 'id', 'Frafra': 'gur', 'Zulu': 'zu', 'Lenje': 'leh', 'Cakchiquel': 'cak', u'Georgian': 'ka', 'Mayangna': 'yan', 'Tetum': 'tdt', u'Tigrinya': 'ti', 'Nzema': 'nzi', 'Niuean': 'niu', u'Slovak': 'sk', u'Thai': 'th', 'Afrikaans': 'af', u'Lahu': 'lhu', u'Guarani': 'gug', 'Sidama': 'sid', u'Punjabi': 'pa', 'Kalenjin': 'kln', 'Herero': 'hz', u'Kekchi': 'kek', 'Kisonge': 'sop', u'Latvian': 'lv', 'English': 'en', 'Mambwe-Lungu': 'mgr', 'Lingala': 'ln', u'Faeroese': 'fo', u'Chinese': 'zh_hant', 'Wayuunaiki': 'guc', 'Quichua': 'qus', 'Huave': 'huv', u'Tatar': 'tt', 'Kabyle': 'kab', 'Chin': 'cnh', u'Quiche': 'quc', 'Rapa': 'rap', 'Venda': 've', 'Tojolabal': 'toj', 'Swahili': 'sw', u'Icelandic': 'is', u'Turkish': 'tr', 'Kalanga': 'kck', 'Twi': 'tw', 'Waray-Waray': 'war', u'Kirghiz': 'ky', 'Guna': 'cuk', u'Gujarati': 'gu', u'Hindi': 'hi', 'Zapotec': 'zpg', u'Korean': 'ko', 'Malagasy': 'mg', 'Hungarian': 'hu', 'Igbo': 'ig', u'Lithuanian': 'lt', 'Greenlandic': 'kl', 'Tzeltal': 'tzh', 'Acholi': 'ach', u'Russian': 'ru', 'Romany': 'rmn', 'Croatian': 'hr', u'Kazakh': 'kk_cyrl', 'Tiv': 'tiv', 'Cebuano': 'ceb', u'Armenian': 'hy_armn', 'Sarnami': 'hns', 'Kikamba': 'kam', 'Toba': 'tob', 'Chol': 'ctu', 'Luvale': 'lue', 'Sepedi': 'nso', 'Mixe': 'mco', u'Greek': 'el', 'Sesotho': 'st', 'Hausa': 'ha', 'Isoko': 'iso', 'Irish': 'ga', 'Seychelles': 'crs', 'German': 'de', 'Runyankore': 'nyn', 'Kwanyama': 'kj', u'Macedonian': 'mk', u'Mongolian': 'mn', 'Aymara': 'ay', u'Mapudungun': 'arn', u'Sinhala': 'si', 'Ndonga': 'ng', u'Vietnamese': 'vi', u'Romanian': 'ro', 'Shona': 'sn', 'Dutch': 'nl', 'Swati': 'ss', 'Somali': 'so', 'Garifuna': 'cab', u'Nepali': 'ne', 'Tokelauan': 'tkl', 'Maya': 'yua', u'Ukrainian': 'uk', 'Welsh': 'cy', u'Mauritian': 'mfe', u'Mayo': 'mfy', 'Kisi': 'kiz', 'Tahitian': 'ty', u'Baoule': 'bci', u'Pilag\xe1': 'plg', 'Rarotongan': 'rar', 'Maltese': 'mt', 'Mam': 'mam', u'Cambodian': 'km', u'Kurdish': 'kmr_cyrl', u'Spanish': 'es', 'Tswana': 'tn', 'Kikaonde': 'kqn', 'Sango': 'sg', 'Oromo': 'om', u'Portuguese': 'pt', u'Huastec': 'hus', u'Myanmar': 'mya', u'Saramaccan': 'srm', 'Sranantongo': 'srn', 'Kiluba': 'lub', u'Japanese': 'ja', 'Kinyarwanda': 'rw', 'Lugbara': 'lgg', 'Ndebele': 'nr', 'Quechua': 'que', 'Kwangali': 'kwn', u'Tajiki': 'tg', u'Ossetian': 'os'}
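A small sketch of how such a mapping might be used (language_codes is the name introduced above; assumes the codes are unique):

print language_codes['German']  # 'de'
codes_to_names = dict((v, k) for k, v in language_codes.items())
print codes_to_names['de']      # 'German'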
# loads the HTML content of a URL, extracts the text and returns it as a unicode string
import urllib2
from bs4 import BeautifulSoup
import nltk

def get_text_from_url(url):
    opener = urllib2.build_opener()
    request = urllib2.Request(url)
    # some sites block the default Python user agent, so pretend to be a browser
    request.add_header('User-Agent', 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:19.0) Gecko/20100101 Firefox/19.0')
    html = opener.open(request).read()
    # pass the raw bytes to BeautifulSoup first: it determines the character encoding of the website and decodes to unicode
    soup = BeautifulSoup(html)
    html_unicode = unicode(soup)
    # note: nltk.clean_html was removed in NLTK 3; this snippet targets the old NLTK 2 API
    text_unicode = nltk.clean_html(html_unicode)
    return text_unicode
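Usage sketch (the URL is arbitrary):

text = get_text_from_url('http://www.example.com/')
print text[:200]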
#!/usr/bin/python
# -*- coding: utf-8 -*-
import sys
import time
import string
import logging
import urllib
import requests
import codecs
from bs4 import BeautifulSoup

# schema of URLs:
# http://www.jw.org/de/publikationen/bibel/nwt/bibelbuecher/prediger/4/#v21004007
# replace this dict and adapt baseurl to receive texts in another language
biblebooks = {"1. Korinther": 46, "Habakuk": 35, "Epheser": 49, "2. Timotheus": 55, "Haggai": 37, "1. Samuel": 9, "Johannes": 43, "Jona": 32, "Daniel": 27, "Zephanja": 36, "1. Petrus": 60, "2. Chronika": 14, "Ruth": 8, "Judas": 65, "1. Mose": 1, "Esther": 17, "Jakobus": 59, "Maleachi": 39, "1. Johannes": 62, "Klagelieder": 25, "2. Mose": 2, "Kolosser": 51, "2. Korinther": 47, "1. Könige": 11, "Prediger": 21, "Micha": 33, "Philipper": 50, "Galater": 48, "Josua": 6, "Markus": 41, "Joel": 29, "Lukas": 42, "Hohes Lied": 22, "Jeremia": 24, "Hosea": 28, "Hiob": 18, "1. Timotheus": 54, "Psalm": 19, "2. Thessalonicher": 53, "Nehemia": 16, "5. Mose": 5, "Amos": 30, "Obadja": 31, "Apostelgeschichte": 44, "1. Chronika": 13, "Richter": 7, "4. Mose": 4, "Nahum": 34, "Matthäus": 40, "Römer": 45, "Sprüche": 20, "3. Johannes": 64, "Jesaja": 23, "Hesekiel": 26, "Hebräer": 58, "Sacharja": 38, "Titus": 56, "Philemon": 57, "Esra": 15, "Offenbarung": 66, "2. Könige": 12, "3. Mose": 3, "2. Johannes": 63, "2. Samuel": 10, "2. Petrus": 61, "1. Thessalonicher": 52}
one_chapter_books = ['Obadja', 'Philemon', 'Judas', '2. Johannes', '3. Johannes']
# make sure all dict keys are unicode strings
for k in biblebooks.keys():
    biblebooks[unicode(k.decode('utf-8'))] = biblebooks.pop(k)

# for German bible texts only
baseurl = 'http://www.jw.org/de/publikationen/bibel/nwt/bibelbuecher/'

def get_bible_text_german(text):
    """Takes as input the text's "address" as string or unicode and returns the text's content as unicode, downloaded from
    the German New World Translation published by JW on jw.org.
    Example call: nwt_textfinder.get_bible_text_german(u'Sprüche 18:10')
    Returns: u'10\xa0\xa0Der Name Jehovas ist ein starker Turm.+ Der Gerechte l\xe4uft hinein und wird besch\xfctzt.*+\n\n'
    Also downloads multiple texts like "1. Mose 1:1, 2" or "1. Mose 1:1-3"
    """
    text = unicode(text.strip())
    text = text.replace(u'\xa0', u' ')
    # strip trailing non-digits (e.g. footnote markers) so the reference ends in a verse number
    while not text[-1].isdigit():
        text = text[:-1]
    logging.debug('Now starting to look up this text: %s' % text)
    if ':' in text:
        book_chapter, verse = text.split(':', 1)
        try:
            book, chapter = book_chapter.rsplit(' ', 1)
        except ValueError:
            logging.error('No valid bible text - there must be a space between book and chapter: %s' % text)
            return ''
    else:
        book_found = False
        for b in one_chapter_books:
            if b in text:
                logging.warning('This is a text from a book with one chapter: %s' % text)
                try:
                    book, verse = text.rsplit(' ', 1)
                except ValueError:
                    logging.error('No valid bible text - there must be a space between book and verse: %s' % text)
                    return ''
                chapter = '1'
                book_found = True
        if not book_found:
            logging.error('No valid text! There must be a colon between the chapter and the verse: %s' % text)
            return ''
    logging.debug('Book: %s' % book)
    logging.debug('Chapter: %s' % chapter)
    logging.debug('Verse: %s' % verse)
    if '-' in verse:
        start_verse, end_verse = verse.split('-')
        verselist = range(int(start_verse.strip()), int(end_verse.strip()) + 1)
        verselist = [str(v) for v in verselist]
    elif ',' in verse:
        a, b = verse.split(',')
        verselist = [a.strip(), b.strip()]
    else:
        verselist = [verse.replace(';', '').replace(',', '')]
    verselist = [v for v in verselist if v]
    logging.debug('Will now look up these verse(s): %s' % str(verselist))
    book = book.replace(u'\xa0', ' ')
    texturl = book.replace('. ', '-') + '/' + chapter + '/'
    r = requests.get(baseurl + texturl)
    soup = BeautifulSoup(r.content)
    try:
        bookid = str(biblebooks[book])
    except KeyError as err:
        logging.error('Book %s was not found! %s' % (book, err))
        return ''
    result = ''
    for v in verselist:
        # element IDs follow the pattern v<book id><chapter, zero-padded to 3><verse, zero-padded to 3>
        textid = 'v' + bookid + (3 - len(chapter)) * '0' + chapter + (3 - len(v)) * '0' + v
        logging.debug('ID of the texts element: %s' % textid)
        souptext = soup.find(id=textid)
        if souptext:
            result += souptext.text + '\n'
        else:
            logging.error('No text found for this verse: %s' % v)
    return result
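A usage sketch based on the docstring example:

print get_bible_text_german(u'Sprüche 18:10')
print get_bible_text_german(u'1. Mose 1:1-3')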
# various Python code snippets
#!/usr/bin/env python
# scrape a Wikipedia table and export it as a .csv file
import urllib
import unicodecsv as csv
from BeautifulSoup import BeautifulSoup

def main(url, table_names):
    all_data = list()
    fh = urllib.urlopen(url)
    content = fh.read()
    soup = BeautifulSoup(content)
    tables = soup.findAll('table', 'wikitable')  # excludes the vertical navigation box with the article summary
    for table_no, table in enumerate(tables[:-1]):
        if table_no == 0:
            heads = [th.text for th in table.findAll('th')]
            all_data.append(heads + [u'Continent'])
        for row in table.findAll('tr'):
            row_data = []
            if row.findAll('th'):  # skip header rows
                continue
            for table_data in row.findAll('td'):
                if table_data.a:
                    row_data.append(table_data.a.text)
                else:
                    row_data.append(table_data.text)
            row_data.append(table_names[table_no])
            all_data.append(row_data)
    # Python 2's stdlib csv module cannot handle unicode, hence unicodecsv, which encodes each row on write
    out = open('all_data.csv', 'w')
    writer = csv.writer(out, dialect='excel', encoding='utf-8')
    for row in all_data:
        print row
        writer.writerow(row)
    out.close()

if __name__ == '__main__':
    url = r"https://en.wikipedia.org/wiki/Jehovah's_Witnesses_by_country"
    # TODO
    table_names = [u'Africa', u'North America', u'Caribbean', u'South America', u'Asia', u'Europe', u'Oceania', u'Other']
    main(url, table_names)
'''Send mobi files to a Kindle email address with python's email package'''
import os
import smtplib
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from email.mime.application import MIMEApplication
from email.utils import formatdate

# smtpserver, port, sender_email, sender_password and kindles_email must be defined before use

def send2kindle(f):
    msg = MIMEMultipart()
    msg['Subject'] = "Neue Zeitschriftenlieferung!"  # "New magazine delivery!"
    # From/To headers reconstructed here to match the envelope in sendmail() below
    msg['From'] = sender_email
    msg['To'] = kindles_email
    msg['Date'] = formatdate(localtime=True)
    message = "Neue Zeitschriften im Anhang"  # "New magazines attached"
    msg.attach(MIMEText(message))
    # MIMEApplication prepends 'application/' itself, so only the subtype is given
    part = MIMEApplication(open(f, 'rb').read(), _subtype='x-mobipocket-ebook')
    part.add_header('Content-Disposition', 'attachment; filename="%s"' % os.path.basename(f))
    msg.attach(part)
    # connect to Google's SMTP server
    mailserver = smtplib.SMTP(smtpserver, port)
    mailserver.ehlo()
    mailserver.starttls()
    mailserver.ehlo()
    mailserver.login(sender_email, sender_password)
    mailserver.sendmail(sender_email, kindles_email, msg.as_string())
    mailserver.close()
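A minimal usage sketch; the addresses are placeholders, and Gmail's SMTP server is smtp.gmail.com on port 587 for STARTTLS:

smtpserver, port = 'smtp.gmail.com', 587
sender_email, sender_password = 'me@gmail.com', 'app-password'
kindles_email = 'me_123@kindle.com'
send2kindle('/home/me/magazine.mobi')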
# choose a key from a dictionary randomly - the probability of a key getting chosen is proportional to its value
# d is a dictionary mapping keys to counts
import random

def simpleProbDist(d):
    i = 1
    d2 = dict()
    # assign each key a half-open interval [i, i + count) on the number line
    for k in d.keys():
        r = (i, i + d[k])
        i += d[k]
        d2[k] = r
    # draw a point and return the key whose interval contains it
    x = random.randint(1, i - 1)
    for k in d2.keys():
        if x in range(*d2[k]):
            chosen = k
            break
    return chosen
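A quick sanity check of the weighting, with hypothetical counts:

from collections import Counter
counts = Counter(simpleProbDist({'a': 1, 'b': 3}) for _ in range(10000))
print counts  # 'b' should come up roughly three times as often as 'a'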
# remove punctuation (Python 2: string.maketrans and the two-argument str.translate)
import string
table = string.maketrans("", "")

def rem_punct(s):
    return s.translate(table, string.punctuation)

# lemmatizing of nouns and verbs with the Nodebox Linguistics library and WordNet
# perform POS tagging before to improve precision
import en  # the Nodebox Linguistics module

def lemmatize(words):
    lemmas = []
    for w in words:
        if en.is_verb(w):
            lemmas.append(en.verb.infinitive(w.lower()))
        elif en.is_noun(w):
            lemmas.append(en.noun.singular(w.lower()))
        else:
            lemmas.append(w.lower())
    return lemmas
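Usage sketch (the exact output depends on the Nodebox word lists):

words = rem_punct("The cats were running, fast!").split()
print lemmatize(words)  # roughly: ['the', 'cat', 'be', 'run', 'fast']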
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# hopefully I won't forget these anymore now that I have written them down here :-)

# current timestamp as a string (needs: import datetime)
str(datetime.datetime.now())

# standard main guard
if __name__ == '__main__':

# print to stderr (Python 2 only, not 3; needs: from sys import stderr)
except IOError as e:
    print >> stderr, "IOError: %s" % e.strerror

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)

# execute a bash command (needs: import subprocess)
subprocess.Popen(['/bin/bash', '-c', cmd])
# get the output of a bash command in python
output = subprocess.check_output(['/bin/bash', '-c', cmd])
# capture stderr as well
output = subprocess.check_output('ls', stderr=subprocess.STDOUT)

# Pylab / NumPy etc.
data = np.genfromtxt('all_absolute.csv', delimiter=';')
data = np.array([[1, 2, 3], [4, 5, 6]])
plt.imshow(data, interpolation='nearest', cmap=plt.cm.bone)
import wikipedia  # https://github.com/goldsmith/Wikipedia
import urllib
import re

msg = 'Error!'

# originally a fragment; wrapped in a function here so the early returns are valid,
# with the reconstructed control flow falling back to scraping the pageid when the title lookup fails
def lookup_page(page_title, page_link, baseurl):
    try:
        p = wikipedia.page(page_title)
    except wikipedia.DisambiguationError as err:
        print err
        # if you know only the page_title and url but not the pageid, you have to
        # find it in the html code of the wikipedia page before you can use
        # the API functions
        fh = urllib.urlopen(baseurl + page_link)
        if fh:
            html = fh.read()
            r = re.search('"wgArticleId":(\d+),', html)
            groups = r.groups() if r else None
            if groups:
                pageid = groups[0]
                p = wikipedia.page(pageid=pageid)
            else:
                print msg
                return
        else:
            print msg
            return
    return p
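Hypothetical call (the function name and parameters come from the wrapping above):

p = lookup_page('Python', '/wiki/Python_(programming_language)', 'https://en.wikipedia.org')
if p:
    print p.title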
#!/usr/bin/python
# Creates and saves a word2vec model from a big (some MB) file of raw text.
# Before starting the script, install Cython with `pip install cython` to use gensim's optimized word2vec training (70x speedup).
import codecs
import string
import re
import logging
import nltk
import gensim
import en  # the Nodebox Linguistics module

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

table = string.maketrans("", "")

def remove_punct(s):
    return s.translate(table, string.punctuation)

def lemmatize(words):
    lemmas = []
    for w in words:
        if en.is_verb(w):
            lemmas.append(en.verb.infinitive(w.lower()))
        elif en.is_noun(w):
            lemmas.append(en.noun.singular(w.lower()))
        else:
            lemmas.append(w.lower())
    return lemmas

class WordPreparer(object):
    """Iterable over the sentences of a text file, yielding lists of lemmatized, punctuation-free tokens."""
    def __init__(self, fname):
        self.fname = fname
        fh = codecs.open(fname, mode='r', encoding='utf-8')
        text = fh.read()
        fh.close()
        self.sents = nltk.sent_tokenize(text)
        # str.translate with a deletion table needs byte strings in Python 2
        self.sents = [remove_punct(s.encode('utf-8')) for s in self.sents]

    def __iter__(self):
        for s in self.sents:
            yield lemmatize(s.split())

if __name__ == '__main__':
    words = WordPreparer('raw_text.txt')
    model = gensim.models.Word2Vec(words, min_count=3, workers=2)
    model.save('word2vec_model')
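Once trained, the saved model can be loaded and queried, e.g. (the query word is arbitrary):

model = gensim.models.Word2Vec.load('word2vec_model')
print model.most_similar('king', topn=5)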