Created
February 20, 2016 16:16
-
-
Save nudomarinero/eaee524f11de0316f19c to your computer and use it in GitHub Desktop.
Transform MacJournal individual text entries to Zim
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from __future__ import print_function | |
import locale | |
import datetime | |
import re | |
import os | |
from os.path import isfile, join | |
import unicodedata | |
from unidecode import unidecode | |
# Configuration
# Switch the LC_TIME category to Spanish at import time. parse_date() relies
# on datetime.strptime("%B"), whose month names come from the active LC_TIME
# locale. This is a process-wide side effect, and locale.setlocale raises
# locale.Error if the requested locale is not available on the system.
loc = locale.setlocale(locale.LC_TIME, ("es_ES.utf8", "es_ES.utf8"))
# Spanish month names as written by MacJournal, mapped to their month number.
# Having this table means the common case does not depend on an es_ES locale
# being installed and active in the running process.
_SPANISH_MONTHS = {
    "enero": 1,
    "febrero": 2,
    "marzo": 3,
    "abril": 4,
    "mayo": 5,
    "junio": 6,
    "julio": 7,
    "agosto": 8,
    "septiembre": 9,
    "octubre": 10,
    "noviembre": 11,
    "diciembre": 12,
}


def parse_date(input_line):
    """
    Parse the date in the MacJournal format.

    The expected text looks like ``20 de febrero de 2016, 16:16`` and may be
    surrounded by other characters; the first occurrence in *input_line* is
    used.

    Returns a naive ``datetime.datetime``.

    Raises ``ValueError`` if no date is found in the line or if the matched
    fields do not form a valid date.
    """
    match = re.search(
        r"(\d{1,2}) de (.*?) de (\d{2,4}), (\d{1,2}):(\d{1,2})", input_line
    )
    if match is None:
        raise ValueError(
            "No MacJournal date found in {!r}".format(input_line)
        )
    day, month_name, year, hour, minute = match.groups()

    month_number = _SPANISH_MONTHS.get(month_name.strip().lower())
    if month_number is None:
        # Unknown month spelling: fall back to the locale-dependent %B
        # lookup so that any name the active LC_TIME locale knows still works.
        stamp = "{:02d} {} {} {:02d}:{}".format(
            int(day), month_name, year, int(hour), minute
        )
        return datetime.datetime.strptime(stamp, "%d %B %Y %H:%M")

    # strptime is still used (instead of building the datetime by hand) so
    # the field validation, e.g. the four-digit year required by %Y, is kept.
    stamp = "{:02d} {:02d} {} {:02d}:{}".format(
        int(day), month_number, year, int(hour), minute
    )
    return datetime.datetime.strptime(stamp, "%d %m %Y %H:%M")
def parse_file(input_file):
    """
    Parse a MacJournal txt single entry file.

    The first line must be tab-separated, with ``Fecha:`` as its second field
    and the date text as its third field. The second line is always consumed:
    if it contains ``Tema:`` its third tab-separated field becomes the title,
    otherwise the title is taken from the file name without a trailing
    ``.txt``. Every remaining line is returned as content.

    Returns a tuple ``(title, content, date)`` where *content* is a list of
    lines (line endings preserved) and *date* is the value returned by
    ``parse_date``.

    Raises ``ValueError`` if the first line does not have the expected
    ``Fecha:`` layout.
    """
    with open(input_file, "r") as f:
        header_fields = f.readline().split("\t")
        # Explicit check instead of ``assert`` so that the validation is not
        # stripped when Python runs with -O, and so that a short line is
        # reported as a format error rather than an IndexError.
        if len(header_fields) < 3 or header_fields[1] != "Fecha:":
            raise ValueError("Wrong format of line 1")
        date_entry = parse_date(header_fields[2])

        topic_line = f.readline()
        if "Tema:" in topic_line:
            title_entry = topic_line.split("\t")[2].strip()
        else:
            # NOTE(review): the second line is dropped here even though it is
            # not a topic line; presumably it is blank in such exports.
            # Confirm against real files.
            title_entry = os.path.basename(input_file).strip()
            # Remove only a trailing ".txt", never one in the middle of
            # the file name.
            if title_entry.endswith(".txt"):
                title_entry = title_entry[:-len(".txt")]

        content = f.readlines()
    return title_entry, content, date_entry
def format_zim(title, content, date, creation_date=False):
    """
    Build the lines of a Zim wiki page.

    The page starts with the three Zim header lines (content type, wiki
    format and creation date, the latter written as ``date.isoformat()``
    followed by a fixed ``+00:00`` offset), then a top-level heading holding
    *title*. When *creation_date* is true an extra ``Creado ...`` line with
    the date is inserted right after the heading. The items of *content* are
    appended unchanged at the end.

    Returns a new list of strings.
    """
    page = [
        "Content-Type: text/x-zim-wiki\n",
        "Wiki-Format: zim 0.4\n",
        "Creation-Date: "+date.isoformat()+"+00:00\n\n",
        "====== {} ======\n".format(title),
    ]
    if creation_date:
        page += ["Creado "+date.isoformat(" ")+"\n"]
    return page + list(content)
def transform_file(input_file, output_file, creation_date=True):
    """
    Convert one MacJournal entry file into a Zim page file.

    *input_file* is read with ``parse_file``, the result is laid out with
    ``format_zim`` (forwarding *creation_date*), and the resulting lines are
    written to *output_file*, which is opened in "w" mode.
    """
    title, body, stamp = parse_file(input_file)
    zim_lines = format_zim(title, body, stamp, creation_date=creation_date)
    with open(output_file, "w") as handle:
        handle.writelines(zim_lines)
if __name__ == "__main__":
    # Convert every "*txt" entry found in text_1/<journal>/ into a Zim page
    # stored as MacJournal/<journal>/<ascii_name>.
    input_dir = "text_1"
    output_dir = "MacJournal"
    dirs = ["Blog", "LOFAR", "Personal"]
    for d in dirs:
        source_path = join(input_dir, d)
        target_path = join(output_dir, d)
        entries = [
            f for f in os.listdir(source_path)
            if isfile(join(source_path, f)) and f.endswith("txt")
        ]
        # makedirs rather than mkdir: the parent ``output_dir`` may not exist
        # yet, in which case mkdir would fail on the first journal.
        if not os.path.exists(target_path):
            os.makedirs(target_path)
        for entry in entries:
            # Output names are transliterated by unidecode, with spaces
            # replaced by underscores.
            name = unidecode(entry.replace(" ", "_"))
            print(entry, name)
            input_file = join(source_path, entry)
            output_file = join(target_path, name)
            transform_file(input_file, output_file)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment