SimonMayerhofer · January 14, 2018 18:01
diff --git a/german_typos_wikipedia_to_alfred_snippets.py b/german_typos_wikipedia_to_alfred_snippets.py
 # !/usr/bin/env python
 # -*- coding: utf-8 -*-

 '''
 Script to parse the Wikipedia pages for common German typos and convert them
 to Alfred snippets.

 See: https://de.wikipedia.org/wiki/Wikipedia:Liste_von_Tippfehlern

 '''

 import re
 import os
 import urllib.request
 import uuid

 entries = 0


 def get_correction_list(page_source):
    """Extract typo corrections from the HTML of a typo list page.

    Every ``<li>typo (correction)`` item found in *page_source* becomes one
    Alfred snippet entry. A ``+`` in the matched typo is replaced by a
    space, and keyword and snippet text both get a trailing space.

    Args:
        page_source: Decoded HTML source of one typo sub-page.

    Returns:
        A list of ``{"alfredsnippet": {...}}`` dicts with the keys
        ``keyword``, ``name``, ``snippet`` and ``uid`` (a fresh upper-case
        UUID4 string per entry). Empty if nothing matches.
    """
    # Raw string: "\s", "\w", "\(" are regex escapes, not string escapes.
    # In a plain literal they are invalid escape sequences and trigger
    # warnings on current Python versions.
    regex = r"<li>\s*([\w+]+)\*?\s?\(([\w\s]+\+?)\*?\)"

    correction_list = []
    for typo, correction in re.findall(regex, page_source, re.UNICODE):
        name = typo.replace("+", " ")
        correction_list.append({"alfredsnippet": {
            "keyword": name + " ",
            "name": name,
            "snippet": correction + " ",
            "uid": str(uuid.uuid4()).upper()
        }})
    return correction_list


 def parse_sub_page(sub_page):
    """Download one typo sub-page from Wikipedia and parse its corrections.

    Args:
        sub_page: Name of the sub-page that is appended to the typo list
            URL (e.g. ``'A'`` or ``'PQ'``).

    Returns:
        The list returned by ``get_correction_list`` for the page's HTML.
    """
    url = ("https://de.wikipedia.org/wiki/Wikipedia:Liste_von_Tippfehlern/" +
           sub_page)
    # Use the response as a context manager so the connection is closed
    # deterministically instead of being leaked.
    with urllib.request.urlopen(url) as response:
        # The Content-Type header may carry no charset, in which case
        # get_content_charset() returns None; fall back to UTF-8 instead of
        # passing None to decode().
        charset = response.headers.get_content_charset() or "utf-8"
        page_source = response.read().decode(charset)
    return get_correction_list(page_source)


 def get_all_typos():
    """Collect the corrections of every typo sub-page into a single list.

    Prints a progress line per sub-page and adds the number of collected
    corrections to the module-level ``entries`` counter.

    Returns:
        All snippet dicts of all sub-pages, in the order the pages are
        fetched.
    """
    global entries

    sub_pages = ('A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L',
                 'M', 'N', 'O', 'PQ', 'R', 'S', 'T', 'U', 'V', 'W', 'XYZ')
    collected = []

    print("Start fetching data...")

    for sub_page in sub_pages:
        print(sub_page + "...")
        page_corrections = parse_sub_page(sub_page)
        collected.extend(page_corrections)
        # One counter tick per correction that was added.
        entries += len(page_corrections)
    return collected


 def write_to_files(typo_list):
    """Write every snippet of *typo_list* to its own JSON file.

    Files are created in the ``german_typos_wikipedia/`` directory (relative
    to the current working directory) and are named ``<name> [<uid>].json``.
    The directory is created up front if it does not exist yet. Afterwards
    a completion mark and the module-level ``entries`` counter are printed.

    Args:
        typo_list: Iterable of ``{"alfredsnippet": {...}}`` dicts whose
            inner dict provides at least the ``name`` and ``uid`` keys.
    """
    # Local import: this function is the only place that needs json.
    import json

    directory = "german_typos_wikipedia/"
    # exist_ok replaces the per-file exists()/makedirs() check and its
    # check-then-create race.
    os.makedirs(directory, exist_ok=True)

    for line in typo_list:
        item = line["alfredsnippet"]
        filename = item["name"] + " [" + item["uid"] + "].json"

        with open(os.path.join(directory, filename), encoding='utf-8',
                  mode='w') as f:
            # Serialize with json instead of patching the quotes of the
            # dict repr: repr() escapes some characters (e.g. "\xa0") in a
            # way that is not valid JSON. ensure_ascii=False keeps umlauts
            # readable, matching the UTF-8 file encoding.
            f.write(json.dumps(line, ensure_ascii=False) + "\n")

    print("finished " + u'\u2713')
    print("Entries: " + str(entries))


 if __name__ == "__main__":
    # Script entry point: fetch all sub-pages first, then write one snippet
    # file per collected typo.
    all_typos = get_all_typos()
    write_to_files(all_typos)
	# !/usr/bin/env python
	# -*- coding: utf-8 -*-

	'''
	Script to parse the Wikipedia pages for common German typos and convert them
	to Alfred snippets.

	See: https://de.wikipedia.org/wiki/Wikipedia:Liste_von_Tippfehlern

	'''

	import re
	import os
	import urllib.request
	import uuid

	entries = 0


	def get_correction_list(page_source):
	    """Extract typo corrections from the HTML of a typo list page.

	    Every ``<li>typo (correction)`` item found in *page_source* becomes
	    one Alfred snippet entry. A ``+`` in the matched typo is replaced by
	    a space, and keyword and snippet text both get a trailing space.

	    Args:
	        page_source: Decoded HTML source of one typo sub-page.

	    Returns:
	        A list of ``{"alfredsnippet": {...}}`` dicts with the keys
	        ``keyword``, ``name``, ``snippet`` and ``uid`` (a fresh
	        upper-case UUID4 string per entry). Empty if nothing matches.
	    """
	    # "\s*" allows any amount of whitespace after "<li>" and "\*?"
	    # tolerates an optional literal asterisk after the typo. Without the
	    # "*" quantifiers the pattern demands exactly one whitespace and a
	    # literal "?" and therefore matches no ordinary list item.
	    # Raw string so the regex escapes are not read as string escapes.
	    regex = r"<li>\s*([\w+]+)\*?\s?\(([\w\s]+\+?)\*?\)"

	    correction_list = []
	    for typo, correction in re.findall(regex, page_source, re.UNICODE):
	        name = typo.replace("+", " ")
	        correction_list.append({"alfredsnippet": {
	            "keyword": name + " ",
	            "name": name,
	            "snippet": correction + " ",
	            "uid": str(uuid.uuid4()).upper()
	        }})
	    return correction_list


	def parse_sub_page(sub_page):
	    """Download one typo sub-page from Wikipedia and parse its corrections.

	    Args:
	        sub_page: Name of the sub-page that is appended to the typo list
	            URL.

	    Returns:
	        The list returned by ``get_correction_list`` for the page's HTML.
	    """
	    url = ("https://de.wikipedia.org/wiki/Wikipedia:Liste_von_Tippfehlern/"
	           + sub_page)
	    # Use the response as a context manager so the connection is closed
	    # deterministically instead of being leaked.
	    with urllib.request.urlopen(url) as response:
	        # The Content-Type header may carry no charset, in which case
	        # get_content_charset() returns None; fall back to UTF-8 instead
	        # of passing None to decode().
	        charset = response.headers.get_content_charset() or "utf-8"
	        page_source = response.read().decode(charset)
	    return get_correction_list(page_source)


	def get_all_typos():
	    """Collect the corrections of every typo sub-page into a single list.

	    Prints a progress line per sub-page and increments the module-level
	    ``entries`` counter once per collected correction.

	    Returns:
	        All snippet dicts of all sub-pages, in the order the pages are
	        fetched.
	    """
	    # Declared once at the top; the counter is updated in the loop below.
	    global entries

	    pages = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L',
	             'M', 'N', 'O', 'PQ', 'R', 'S', 'T', 'U', 'V', 'W', 'XYZ']
	    typo_list = []

	    print("Start fetching data...")

	    for page in pages:
	        print(page + "...")
	        correction_list = parse_sub_page(page)
	        typo_list.extend(correction_list)
	        entries += len(correction_list)
	    return typo_list


	def write_to_files(typo_list):
	    """Write every snippet of *typo_list* to its own JSON file.

	    Files are created in the ``german_typos_wikipedia/`` directory
	    (relative to the current working directory) and are named
	    ``<name> [<uid>].json``. The directory is created up front if it
	    does not exist yet. Afterwards a completion mark and the module-level
	    ``entries`` counter are printed.

	    Args:
	        typo_list: Iterable of ``{"alfredsnippet": {...}}`` dicts whose
	            inner dict provides at least the ``name`` and ``uid`` keys.
	    """
	    # Local import: this function is the only place that needs json.
	    import json

	    directory = "german_typos_wikipedia/"
	    # exist_ok replaces the per-file exists()/makedirs() check and its
	    # check-then-create race.
	    os.makedirs(directory, exist_ok=True)

	    for line in typo_list:
	        item = line["alfredsnippet"]
	        filename = item["name"] + " [" + item["uid"] + "].json"

	        with open(os.path.join(directory, filename), encoding='utf-8',
	                  mode='w') as f:
	            # Serialize with json instead of patching the quotes of the
	            # dict repr: repr() escapes some characters (e.g. "\xa0") in
	            # a way that is not valid JSON. ensure_ascii=False keeps
	            # umlauts readable, matching the UTF-8 file encoding.
	            f.write(json.dumps(line, ensure_ascii=False) + "\n")

	    print("finished " + u'\u2713')
	    print("Entries: " + str(entries))


	if __name__ == "__main__":
	    # Script entry point: fetch all sub-pages, then write one snippet
	    # file per collected typo.
	    write_to_files(get_all_typos())