A Python script to extract information from a directory full of org files.
#!/usr/bin/env nix-shell
#! nix-shell -p python38 graphviz mate.eom -i python
import os
import sys
import glob
import re

# How far to look for tags in a file's header (in lines)
HEADER_N = 5
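
# For reference, a minimal sketch of the kind of org file this script expects
# (the file name and contents below are made up, not from the original notes):
#
#   notes-on-python.org
#   -------------------
#   # Tags python programming
#   Some text with a link to [file:index.org] and to [file:other-note.org].
#   * A top-level header
#
# extract_tag_file() looks for the "# Tags ..." line, extract_links() for the
# [file:...] links, and extract_headers() for the "* " top-level headers.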

def extract_tag_file(files, header=None):
    """
    Return a dictionary mapping each tag to the set of files that contain it.

    If header is given, only the first `header` lines of each file are scanned.
    """
    tag_file = {}
    # go through each file
    for file in files:
        with open(file) as input:
            # find all tags
            line_number = 0
            for line in input:
                line_number += 1
                if header is not None and line_number > header:
                    break
                if line.startswith('# Tags'):
                    for tag in line.split()[2:]:
                        # add the file to the result
                        if tag not in tag_file:
                            tag_file[tag] = set()
                        tag_file[tag].add(file)
    return tag_file
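
# A hedged example of the shape of the result, for two hypothetical files
# a.org and b.org that are both tagged "emacs" while only a.org is tagged
# "python":
#
#   extract_tag_file(['a.org', 'b.org'])
#   => {'emacs': {'a.org', 'b.org'}, 'python': {'a.org'}}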

def extract_links(files):
    """
    Return a set of (file, type, target) tuples, one per org-mode link found.
    """
    link_regex = re.compile(r'\[(file):([^]]+)]', re.MULTILINE)
    links = set()
    # go through each file
    for file in files:
        with open(file) as input:
            # read the whole file at once.
            content = input.read()
            for match in link_regex.finditer(content):
                type = match.group(1)
                target = match.group(2)
                links.add((file, type, target))
    return links
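
# A hedged example: for a hypothetical a.org containing the text
# "see [file:b.org]", the result would include the tuple
# ('a.org', 'file', 'b.org').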

def extract_headers(files):
    """
    Return a dictionary mapping each top-level header to the set of files
    that contain it.
    """
    header_file = {}
    # go through each file
    for file in files:
        with open(file) as input:
            for line in input:
                if line.startswith('* '):
                    header = line[2:].strip()
                    # add the file to the result
                    if header not in header_file:
                        header_file[header] = set()
                    header_file[header].add(file)
    return header_file
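
# This mirrors extract_tag_file but keys the result by header: for a
# hypothetical a.org containing the line "* Ideas", the result maps
# 'Ideas' to a set containing 'a.org'.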

def print_link_edges(links):
    """
    Print one dot edge per link, skipping links to and from index.org,
    links to home-relative paths, and links to PDFs.
    """
    for file, type, target in links:
        # TODO more flexible filtering
        if file != 'index.org' \
           and not target == 'index.org' \
           and not target.startswith('~') \
           and not target.endswith('.pdf'):
            print(f'"{file}" -> "{target}";')
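
# A hedged example of one line of dot output from print_link_edges, for a
# hypothetical link from a.org to b.org:
#   "a.org" -> "b.org";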

def print_tag_edges(tag_file):
    """
    For each tag shared by more than one file, print a chain of dot edges
    linking those files, labelled with the tag.
    """
    for tag in tag_file:
        files = tag_file[tag]
        if len(files) > 1:
            print('->'.join(['"' + file + '"' for file in files]) + f'[label="{tag}"];')
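
# A hedged example from print_tag_edges: files sharing the hypothetical tag
# "emacs" are chained into a single edge statement, e.g.
#   "a.org"->"b.org"[label="emacs"];
# In dot, the attribute list applies to every edge in the chain.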

def print_graph(files, include_tags):
    """
    Print a dot graph of the links between files and, optionally, of the
    tags shared between files.
    """
    links = extract_links(files)
    print('digraph links {')
    print('graph [overlap=false]')
    # print('graph [K=2]')
    print('graph [repulsiveforce=2]')
    print('rankdir=LR')
    print_link_edges(links)
    if include_tags:
        tag_file = extract_tag_file(files)
        print('subgraph tags {')
        print('edge [dir=none, style=dotted]')
        print_tag_edges(tag_file)
        print('}')
    print('}')
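
# Putting it together, print_graph emits something like the following
# (hypothetical file names):
#
#   digraph links {
#   graph [overlap=false]
#   graph [repulsiveforce=2]
#   rankdir=LR
#   "a.org" -> "b.org";
#   subgraph tags {
#   edge [dir=none, style=dotted]
#   "a.org"->"b.org"[label="emacs"];
#   }
#   }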

def print_tags(files):
    """
    Print, in alphabetical order, every tag found in the files' headers.
    """
    tag_file = extract_tag_file(files, HEADER_N)
    for tag in sorted(tag_file.keys()):
        print(tag)

def print_file_that_has_tag(files, tag_pattern):
    """
    Tags are matched if they contain tag_pattern.
    Print each matched tag followed by the set of files that contain it.
    """
    tag_file = extract_tag_file(files, HEADER_N)
    tags = [tag for tag in tag_file.keys() if tag_pattern in tag]
    for tag in tags:
        print(tag)
        print(tag_file[tag])

def show_graph():
    """
    Generate graph.svg with dot and open it with eom (Eye of MATE).
    """
    os.system('./extract.py --graph | dot -Tsvg -o graph.svg && eom graph.svg')

def normalize_header(header):
    """
    Lowercase a header and drop uninteresting words, so that near-identical
    headers compare equal.
    """
    parts = header.lower().split()
    # words to ignore when comparing headers; extend as needed
    excluded = set()
    parts = [p for p in parts if p not in excluded]
    return ' '.join(parts)
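
# A hedged example: with 'the' added to the excluded set,
# normalize_header('The Best Ideas') and normalize_header('best ideas')
# both return 'best ideas', which is the key examine_headers uses to group
# similar headers together.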

def examine_headers(files):
    """
    Print each top-level header along with the number of files it appears in.
    """
    header_file = extract_headers(files)
    similar_headers = {}
    for header in header_file:
        normalized_header = normalize_header(header)
        if normalized_header not in similar_headers:
            similar_headers[normalized_header] = set()
        similar_headers[normalized_header].add(header)
    # print(similar_headers)
    for header, files in header_file.items():
        # for header, files in similar_headers.items():
        count = len(files)
        # if count > 1:
        print(header, count)

def print_help():
    print("""Usage: extract.py [command]
Extract information from org files.
All commands:
  --help           print this help
  --graph          print a graph (dot file) that represents the
                   relations between the *.org files
  --show-graph     generate graph.svg and open it with eom (eye
                   of mate)
  --tags           print all the tags found in the first lines
  --files pattern  list the files that have a tag matching the pattern
  --headers        print all the top-level headers that appear
                   more than once
Example:
  extract.py --graph | dot -Tsvg -o graph.svg && eom graph.svg
""")

if __name__ == '__main__':
    if len(sys.argv) == 1:
        cmd = '--help'
    else:
        cmd = sys.argv[1]
    files = glob.glob('*.org')
    pattern = None
    if len(sys.argv) > 2:
        pattern = sys.argv[2]
    # dispatch on the command, falling back to an error message
    default = lambda: print(f"invalid command '{cmd}'")
    {
        '--files': lambda: print_file_that_has_tag(files, pattern),
        '--tags': lambda: print_tags(files),
        '--graph': lambda: print_graph(files, True),
        '--help': lambda: print_help(),
        '--show-graph': lambda: show_graph(),
        '--headers': lambda: examine_headers(files)
    }.get(cmd, default)()
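
# A few hedged usage examples (assuming the script is saved as extract.py,
# made executable, and run from the directory containing the org files):
#
#   ./extract.py --tags              # list every tag found in file headers
#   ./extract.py --files emacs       # tags containing "emacs" and their files
#   ./extract.py --graph | dot -Tsvg -o graph.svg && eom graph.svg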