Python Snippets
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# converts csv to libsvm format
# usage: python convert_to_libsvm.py (after having modified input and output filename)
import codecs

# change input filename here
filename = "features.csv"
fh = codecs.open(filename, "r", "utf-8")
lines = fh.readlines()
fh.close()

# change output filename here
out = "features.libsvm"
fh = codecs.open(out, "w", "utf-8")
for l in lines:
    l = l.split(";")
    # the first column holds the class label
    lineout = l[0] + " "
    # the last two columns of each row are dropped
    values = l[1:-2]
    for i in range(len(values)):
        # libsvm feature indices are 1-based by convention, not 0-based
        lineout += str(i+1) + ":" + values[i] + " "
    lineout += "\n"
    fh.write(lineout)
fh.close()
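For reference, a minimal sketch of the transformation on a single hypothetical input line:

line = u"1;0.5;0.25;0.1;id42;\n"
cols = line.split(";")
print cols[0] + " " + " ".join(str(i + 1) + ":" + v for i, v in enumerate(cols[1:-2]))
# prints: 1 1:0.5 2:0.25 3:0.1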
# download the free audio trainer German / French from Deutsche Welle (Python 2)
import urllib

# lessons are numbered 001 ... 100 in the URLs, so zero-pad the counter
for i in range(1, 101):
    lesson = str(i).zfill(3)
    urllib.urlretrieve('http://radio-download.dw.de/Events/dwelle/deutschkurse/audiotrainer/fra/Audiotrainer_Franzoesisch_Lektion' + lesson + '_dwdownload.mp3',
                       '/home/me/filename_' + str(i) + '.mp3')
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import json
import urllib

# performs a Google search with the title and returns the URLs of the first query results
def search_web(title):
    # urlencode escapes spaces and special characters itself
    query = urllib.urlencode({'q': title})
    # the AJAX Google API only returns the first few results, but that is enough for our purpose
    response = urllib.urlopen('http://ajax.googleapis.com/ajax/services/search/web?v=1.0&' + query).read()
    json_response = json.loads(response)
    if json_response and json_response['responseData']:
        results = json_response['responseData']['results']
    else:
        return []
    urls = [result['url'] for result in results]
    print('Found ' + str(len(urls)) + ' URLs.')
    return urls
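A quick usage sketch (Google has since retired this AJAX search API, so treat it as illustrative only):

urls = search_web('python unicode csv')
if urls:
    print urls[0]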
# mapping of language names to jw.org language codes (the variable name is added here for convenience)
language_codes = {'Estonian': 'et', 'Bicol': 'bcl', 'Zande': 'zne', 'Uruund': 'rnd', 'Khoekhoegowab': 'naq', u'Newari': 'new', 'Krio': 'kri', 'Nuer': 'nus', 'Cinyanja': 'nya', u'Bulgarian': 'bg', 'Norwegian': 'no', u'Yoruba': 'yo', u'French': 'fr', u'Otomi': 'oto', 'Ateso': 'teo', 'Iloko': 'ilo', 'Wolaita': 'wal', 'Tsonga': 'ts', 'Tzotzil': 'tzo', u'Tamil': 'ta', u'Haitian': 'ht', 'Samoan': 'sm', 'Aukan': 'djk', 'Finnish': 'fi', 'Rutoro': 'ttj', 'Dangme': 'ada', 'Albanian': 'sq', 'Mbunda': 'mck', 'Solomon': 'pis', 'Hiligaynon': 'hil', 'Tagalog': 'tl', u'Serbian': 'sr_latn', u'Efik': 'efi', 'Pangasinan': 'pag', 'Italian': 'it', 'Miskito': 'miq', 'Lhukonzo': 'koo', 'Lamba': 'lam', u'Kongo': 'kg', 'Mazatec': 'mau', u'Tarascan': 'tsz', u'Amharic': 'am', u'Czech': 'cs', u'Papiamento': 'pap', u'Nahuatl': 'ncj', 'Ga': 'gaa', 'Polish': 'pl', 'Tongan': 'to', 'Xhosa': 'xh', 'Swedish': 'sv', u'Marathi': 'mr', 'Luganda': 'lg', u'Slovenian': 'sl', 'Ewe': 'ee', u'Azerbaijani': 'az_cyrl', u'Kikuyu': 'ki', 'Luo': 'luo', 'Tankarana': 'xmv', 'Danish': 'da', 'Indonesian': 'id', 'Frafra': 'gur', 'Zulu': 'zu', 'Lenje': 'leh', 'Cakchiquel': 'cak', u'Georgian': 'ka', 'Mayangna': 'yan', 'Tetum': 'tdt', u'Tigrinya': 'ti', 'Nzema': 'nzi', 'Niuean': 'niu', u'Slovak': 'sk', u'Thai': 'th', 'Afrikaans': 'af', u'Lahu': 'lhu', u'Guarani': 'gug', 'Sidama': 'sid', u'Punjabi': 'pa', 'Kalenjin': 'kln', 'Herero': 'hz', u'Kekchi': 'kek', 'Kisonge': 'sop', u'Latvian': 'lv', 'English': 'en', 'Mambwe-Lungu': 'mgr', 'Lingala': 'ln', u'Faeroese': 'fo', u'Chinese': 'zh_hant', 'Wayuunaiki': 'guc', 'Quichua': 'qus', 'Huave': 'huv', u'Tatar': 'tt', 'Kabyle': 'kab', 'Chin': 'cnh', u'Quiche': 'quc', 'Rapa': 'rap', 'Venda': 've', 'Tojolabal': 'toj', 'Swahili': 'sw', u'Icelandic': 'is', u'Turkish': 'tr', 'Kalanga': 'kck', 'Twi': 'tw', 'Waray-Waray': 'war', u'Kirghiz': 'ky', 'Guna': 'cuk', u'Gujarati': 'gu', u'Hindi': 'hi', 'Zapotec': 'zpg', u'Korean': 'ko', 'Malagasy': 'mg', 'Hungarian': 'hu', 'Igbo': 'ig', u'Lithuanian': 'lt', 'Greenlandic': 'kl', 'Tzeltal': 'tzh', 'Acholi': 'ach', u'Russian': 'ru', 'Romany': 'rmn', 'Croatian': 'hr', u'Kazakh': 'kk_cyrl', 'Tiv': 'tiv', 'Cebuano': 'ceb', u'Armenian': 'hy_armn', 'Sarnami': 'hns', 'Kikamba': 'kam', 'Toba': 'tob', 'Chol': 'ctu', 'Luvale': 'lue', 'Sepedi': 'nso', 'Mixe': 'mco', u'Greek': 'el', 'Sesotho': 'st', 'Hausa': 'ha', 'Isoko': 'iso', 'Irish': 'ga', 'Seychelles': 'crs', 'German': 'de', 'Runyankore': 'nyn', 'Kwanyama': 'kj', u'Macedonian': 'mk', u'Mongolian': 'mn', 'Aymara': 'ay', u'Mapudungun': 'arn', u'Sinhala': 'si', 'Ndonga': 'ng', u'Vietnamese': 'vi', u'Romanian': 'ro', 'Shona': 'sn', 'Dutch': 'nl', 'Swati': 'ss', 'Somali': 'so', 'Garifuna': 'cab', u'Nepali': 'ne', 'Tokelauan': 'tkl', 'Maya': 'yua', u'Ukrainian': 'uk', 'Welsh': 'cy', u'Mauritian': 'mfe', u'Mayo': 'mfy', 'Kisi': 'kiz', 'Tahitian': 'ty', u'Baoule': 'bci', u'Pilag\xe1': 'plg', 'Rarotongan': 'rar', 'Maltese': 'mt', 'Mam': 'mam', u'Cambodian': 'km', u'Kurdish': 'kmr_cyrl', u'Spanish': 'es', 'Tswana': 'tn', 'Kikaonde': 'kqn', 'Sango': 'sg', 'Oromo': 'om', u'Portuguese': 'pt', u'Huastec': 'hus', u'Myanmar': 'mya', u'Saramaccan': 'srm', 'Sranantongo': 'srn', 'Kiluba': 'lub', u'Japanese': 'ja', 'Kinyarwanda': 'rw', 'Lugbara': 'lgg', 'Ndebele': 'nr', 'Quechua': 'que', 'Kwangali': 'kwn', u'Tajiki': 'tg', u'Ossetian': 'os'}
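A small sketch of how such a mapping might be used (language_codes is the name introduced above; assumes the codes are unique):

print language_codes['German']  # 'de'
codes_to_names = dict((v, k) for k, v in language_codes.items())
print codes_to_names['de']      # 'German'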
# loads the HTML content of a URL, extracts the text and returns it as a unicode string
import urllib2
from bs4 import BeautifulSoup
import nltk

def get_text_from_url(url):
    opener = urllib2.build_opener()
    request = urllib2.Request(url)
    # some sites block the default Python user agent, so pretend to be a browser
    request.add_header('User-Agent', 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:19.0) Gecko/20100101 Firefox/19.0')
    html = opener.open(request).read()
    # pass the raw bytes to BeautifulSoup first: it determines the character encoding of the website and decodes to unicode
    soup = BeautifulSoup(html)
    html_unicode = unicode(soup)
    # note: nltk.clean_html was removed in NLTK 3; this snippet targets the old NLTK 2 API
    text_unicode = nltk.clean_html(html_unicode)
    return text_unicode
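Usage sketch (the URL is arbitrary):

text = get_text_from_url('http://www.example.com/')
print text[:200]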
#!/usr/bin/python
# -*- coding: utf-8 -*-
import sys
import time
import string
import logging
import urllib
import requests
import codecs
from bs4 import BeautifulSoup

# schema of URLs:
# http://www.jw.org/de/publikationen/bibel/nwt/bibelbuecher/prediger/4/#v21004007
# replace this dict and adapt baseurl to receive texts in another language
biblebooks = {"1. Korinther": 46, "Habakuk": 35, "Epheser": 49, "2. Timotheus": 55, "Haggai": 37, "1. Samuel": 9, "Johannes": 43, "Jona": 32, "Daniel": 27, "Zephanja": 36, "1. Petrus": 60, "2. Chronika": 14, "Ruth": 8, "Judas": 65, "1. Mose": 1, "Esther": 17, "Jakobus": 59, "Maleachi": 39, "1. Johannes": 62, "Klagelieder": 25, "2. Mose": 2, "Kolosser": 51, "2. Korinther": 47, "1. Könige": 11, "Prediger": 21, "Micha": 33, "Philipper": 50, "Galater": 48, "Josua": 6, "Markus": 41, "Joel": 29, "Lukas": 42, "Hohes Lied": 22, "Jeremia": 24, "Hosea": 28, "Hiob": 18, "1. Timotheus": 54, "Psalm": 19, "2. Thessalonicher": 53, "Nehemia": 16, "5. Mose": 5, "Amos": 30, "Obadja": 31, "Apostelgeschichte": 44, "1. Chronika": 13, "Richter": 7, "4. Mose": 4, "Nahum": 34, "Matthäus": 40, "Römer": 45, "Sprüche": 20, "3. Johannes": 64, "Jesaja": 23, "Hesekiel": 26, "Hebräer": 58, "Sacharja": 38, "Titus": 56, "Philemon": 57, "Esra": 15, "Offenbarung": 66, "2. Könige": 12, "3. Mose": 3, "2. Johannes": 63, "2. Samuel": 10, "2. Petrus": 61, "1. Thessalonicher": 52}
one_chapter_books = ['Obadja', 'Philemon', 'Judas', '2. Johannes', '3. Johannes']
# make sure all dict keys are unicode strings
for k in biblebooks.keys():
    biblebooks[unicode(k.decode('utf-8'))] = biblebooks.pop(k)

# for German bible texts only
baseurl = 'http://www.jw.org/de/publikationen/bibel/nwt/bibelbuecher/'

def get_bible_text_german(text):
    """Takes as input the text's "address" as string or unicode and returns the text's content as unicode, downloaded from
    the German New World Translation published by JW on jw.org.
    Example call: nwt_textfinder.get_bible_text_german(u'Sprüche 18:10')
    Returns: u'10\xa0\xa0Der Name Jehovas ist ein starker Turm.+ Der Gerechte l\xe4uft hinein und wird besch\xfctzt.*+\n\n'
    Also downloads multiple texts like "1. Mose 1:1, 2" or "1. Mose 1:1-3"
    """
    text = unicode(text.strip())
    text = text.replace(u'\xa0', u' ')
    # strip trailing non-digits (e.g. footnote markers) so the reference ends in a verse number
    while not text[-1].isdigit():
        text = text[:-1]
    logging.debug('Now starting to look up this text: %s' % text)
    if ':' in text:
        book_chapter, verse = text.split(':', 1)
        try:
            book, chapter = book_chapter.rsplit(' ', 1)
        except ValueError:
            logging.error('No valid bible text - there must be a space between book and chapter: %s' % text)
            return ''
    else:
        book_found = False
        for b in one_chapter_books:
            if b in text:
                logging.warning('This is a text from a book with one chapter: %s' % text)
                try:
                    book, verse = text.rsplit(' ', 1)
                except ValueError:
                    logging.error('No valid bible text - there must be a space between book and verse: %s' % text)
                    return ''
                chapter = '1'
                book_found = True
        if not book_found:
            logging.error('No valid text! There must be a colon between the chapter and the verse: %s' % text)
            return ''
    logging.debug('Book: %s' % book)
    logging.debug('Chapter: %s' % chapter)
    logging.debug('Verse: %s' % verse)
    if '-' in verse:
        start_verse, end_verse = verse.split('-')
        verselist = range(int(start_verse.strip()), int(end_verse.strip()) + 1)
        verselist = [str(v) for v in verselist]
    elif ',' in verse:
        a, b = verse.split(',')
        verselist = [a.strip(), b.strip()]
    else:
        verselist = [verse.replace(';', '').replace(',', '')]
    verselist = [v for v in verselist if v]
    logging.debug('Will now look up these verse(s): %s' % str(verselist))
    book = book.replace(u'\xa0', ' ')
    texturl = book.replace('. ', '-') + '/' + chapter + '/'
    r = requests.get(baseurl + texturl)
    soup = BeautifulSoup(r.content)
    try:
        bookid = str(biblebooks[book])
    except KeyError as err:
        logging.error('Book %s was not found! %s' % (book, err))
        return ''
    result = ''
    for v in verselist:
        # element IDs follow the pattern v<book id><chapter, zero-padded to 3><verse, zero-padded to 3>
        textid = 'v' + bookid + (3 - len(chapter)) * '0' + chapter + (3 - len(v)) * '0' + v
        logging.debug('ID of the texts element: %s' % textid)
        souptext = soup.find(id=textid)
        if souptext:
            result += souptext.text + '\n'
        else:
            logging.error('No text found for this verse: %s' % v)
    return result
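A usage sketch based on the docstring example:

print get_bible_text_german(u'Sprüche 18:10')
print get_bible_text_german(u'1. Mose 1:1-3')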
# various Python code snippets
#!/usr/bin/env python
# scrape a Wikipedia table and export it as a .csv file
import urllib
import unicodecsv as csv
from BeautifulSoup import BeautifulSoup

def main(url, table_names):
    all_data = list()
    fh = urllib.urlopen(url)
    content = fh.read()
    soup = BeautifulSoup(content)
    tables = soup.findAll('table', 'wikitable')  # excludes the vertical navigation box with the article summary
    for table_no, table in enumerate(tables[:-1]):
        if table_no == 0:
            heads = [th.text for th in table.findAll('th')]
            all_data.append(heads + [u'Continent'])
        for row in table.findAll('tr'):
            row_data = []
            if row.findAll('th'):  # skip header rows
                continue
            for table_data in row.findAll('td'):
                if table_data.a:
                    row_data.append(table_data.a.text)
                else:
                    row_data.append(table_data.text)
            row_data.append(table_names[table_no])
            all_data.append(row_data)
    # Python 2's stdlib csv module cannot handle unicode, hence unicodecsv, which encodes each row on write
    out = open('all_data.csv', 'w')
    writer = csv.writer(out, dialect='excel', encoding='utf-8')
    for row in all_data:
        print row
        writer.writerow(row)
    out.close()

if __name__ == '__main__':
    url = r"https://en.wikipedia.org/wiki/Jehovah's_Witnesses_by_country"
    # TODO
    table_names = [u'Africa', u'North America', u'Caribbean', u'South America', u'Asia', u'Europe', u'Oceania', u'Other']
    main(url, table_names)
'''Send mobi files to a Kindle email address with python's email package'''
import os
import smtplib
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from email.mime.application import MIMEApplication
from email.utils import formatdate

# smtpserver, port, sender_email, sender_password and kindles_email must be defined before use

def send2kindle(f):
    msg = MIMEMultipart()
    msg['Subject'] = "Neue Zeitschriftenlieferung!"  # "New magazine delivery!"
    # From/To headers reconstructed here to match the envelope in sendmail() below
    msg['From'] = sender_email
    msg['To'] = kindles_email
    msg['Date'] = formatdate(localtime=True)
    message = "Neue Zeitschriften im Anhang"  # "New magazines attached"
    msg.attach(MIMEText(message))
    # MIMEApplication prepends 'application/' itself, so only the subtype is given
    part = MIMEApplication(open(f, 'rb').read(), _subtype='x-mobipocket-ebook')
    part.add_header('Content-Disposition', 'attachment; filename="%s"' % os.path.basename(f))
    msg.attach(part)
    # connect to Google's SMTP server
    mailserver = smtplib.SMTP(smtpserver, port)
    mailserver.ehlo()
    mailserver.starttls()
    mailserver.ehlo()
    mailserver.login(sender_email, sender_password)
    mailserver.sendmail(sender_email, kindles_email, msg.as_string())
    mailserver.close()
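A minimal usage sketch; the addresses are placeholders, and Gmail's SMTP server is smtp.gmail.com on port 587 for STARTTLS:

smtpserver, port = 'smtp.gmail.com', 587
sender_email, sender_password = 'me@gmail.com', 'app-password'
kindles_email = 'me_123@kindle.com'
send2kindle('/home/me/magazine.mobi')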
# choose a key from a dictionary randomly - the probability of a key getting chosen is proportional to its value
# d is a dictionary mapping keys to counts
import random

def simpleProbDist(d):
    i = 1
    d2 = dict()
    # assign each key a half-open interval [i, i + count) on the number line
    for k in d.keys():
        r = (i, i + d[k])
        i += d[k]
        d2[k] = r
    # draw a point and return the key whose interval contains it
    x = random.randint(1, i - 1)
    for k in d2.keys():
        if x in range(*d2[k]):
            chosen = k
            break
    return chosen
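A quick sanity check of the weighting, with hypothetical counts:

from collections import Counter
counts = Counter(simpleProbDist({'a': 1, 'b': 3}) for _ in range(10000))
print counts  # 'b' should come up roughly three times as often as 'a'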
# remove punctuation (Python 2: string.maketrans and the two-argument str.translate)
import string
table = string.maketrans("", "")

def rem_punct(s):
    return s.translate(table, string.punctuation)

# lemmatizing of nouns and verbs with the Nodebox Linguistics library and WordNet
# perform POS tagging before to improve precision
import en  # the Nodebox Linguistics module

def lemmatize(words):
    lemmas = []
    for w in words:
        if en.is_verb(w):
            lemmas.append(en.verb.infinitive(w.lower()))
        elif en.is_noun(w):
            lemmas.append(en.noun.singular(w.lower()))
        else:
            lemmas.append(w.lower())
    return lemmas
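Usage sketch (the exact output depends on the Nodebox word lists):

words = rem_punct("The cats were running, fast!").split()
print lemmatize(words)  # roughly: ['the', 'cat', 'be', 'run', 'fast']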
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# hopefully I won't forget these anymore now that I have written them down here :-)

# current timestamp as a string (needs: import datetime)
str(datetime.datetime.now())

# standard main guard
if __name__ == '__main__':

# print to stderr (Python 2 only, not 3; needs: from sys import stderr)
except IOError as e:
    print >> stderr, "IOError: %s" % e.strerror

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)

# execute a bash command (needs: import subprocess)
subprocess.Popen(['/bin/bash', '-c', cmd])
# get the output of a bash command in python
output = subprocess.check_output(['/bin/bash', '-c', cmd])
# capture stderr as well
output = subprocess.check_output('ls', stderr=subprocess.STDOUT)

# Pylab / NumPy etc.
data = np.genfromtxt('all_absolute.csv', delimiter=';')
data = np.array([[1, 2, 3], [4, 5, 6]])
plt.imshow(data, interpolation='nearest', cmap=plt.cm.bone)
import wikipedia  # https://github.com/goldsmith/Wikipedia
import urllib
import re

msg = 'Error!'

# originally a fragment; wrapped in a function here so the early returns are valid,
# with the reconstructed control flow falling back to scraping the pageid when the title lookup fails
def lookup_page(page_title, page_link, baseurl):
    try:
        p = wikipedia.page(page_title)
    except wikipedia.DisambiguationError as err:
        print err
        # if you know only the page_title and url but not the pageid, you have to
        # find it in the html code of the wikipedia page before you can use
        # the API functions
        fh = urllib.urlopen(baseurl + page_link)
        if fh:
            html = fh.read()
            r = re.search('"wgArticleId":(\d+),', html)
            groups = r.groups() if r else None
            if groups:
                pageid = groups[0]
                p = wikipedia.page(pageid=pageid)
            else:
                print msg
                return
        else:
            print msg
            return
    return p
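Hypothetical call (the function name and parameters come from the wrapping above):

p = lookup_page('Python', '/wiki/Python_(programming_language)', 'https://en.wikipedia.org')
if p:
    print p.title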
#!/usr/bin/python
# Creates and saves a word2vec model from a big (some MB) file of raw text.
# Before starting the script, install Cython with `pip install cython` to use gensim's optimized word2vec training (70x speedup).
import codecs
import string
import re
import logging
import nltk
import gensim
import en  # the Nodebox Linguistics module

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

table = string.maketrans("", "")

def remove_punct(s):
    return s.translate(table, string.punctuation)

def lemmatize(words):
    lemmas = []
    for w in words:
        if en.is_verb(w):
            lemmas.append(en.verb.infinitive(w.lower()))
        elif en.is_noun(w):
            lemmas.append(en.noun.singular(w.lower()))
        else:
            lemmas.append(w.lower())
    return lemmas

class WordPreparer(object):
    """Iterable over the sentences of a text file, yielding lists of lemmatized, punctuation-free tokens."""
    def __init__(self, fname):
        self.fname = fname
        fh = codecs.open(fname, mode='r', encoding='utf-8')
        text = fh.read()
        fh.close()
        self.sents = nltk.sent_tokenize(text)
        # str.translate with a deletion table needs byte strings in Python 2
        self.sents = [remove_punct(s.encode('utf-8')) for s in self.sents]

    def __iter__(self):
        for s in self.sents:
            yield lemmatize(s.split())

if __name__ == '__main__':
    words = WordPreparer('raw_text.txt')
    model = gensim.models.Word2Vec(words, min_count=3, workers=2)
    model.save('word2vec_model')
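Once trained, the saved model can be loaded and queried, e.g. (the query word is arbitrary):

model = gensim.models.Word2Vec.load('word2vec_model')
print model.most_similar('king', topn=5)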