Skip to content

Instantly share code, notes, and snippets.

@fstamour
Created June 30, 2020 00:21
Show Gist options
  • Save fstamour/119c4f9b0d956c81a38099d58d3cd492 to your computer and use it in GitHub Desktop.
Save fstamour/119c4f9b0d956c81a38099d58d3cd492 to your computer and use it in GitHub Desktop.
A Python script that extracts information from a directory full of Org files
#!/usr/bin/env nix-shell
#! nix-shell -p python38 graphviz mate.eom -i python
import os
import sys
import glob
import re
import subprocess
# TODO Better variable name?
# How far to look for tags in a file's header (in lines).
# print_tags and print_file_that_has_tag pass this as the `header` limit of
# extract_tag_file; print_graph does not pass it and scans whole files.
HEADER_N = 5
def extract_tag_file(files, header=None):
    """
    Return a dictionary mapping each tag to the set of files declaring it.

    A tag line is any line starting with '# Tags'. Every
    whitespace-separated word after the first two (the '#' and the
    'Tags' marker) is taken as one tag.

    files  -- iterable of paths to read
    header -- when not None, only the first `header` lines of each file
              are examined
    """
    tag_file = {}
    for file in files:
        # Explicit encoding: the result must not depend on the locale.
        with open(file, encoding='utf-8') as lines:
            for line_number, line in enumerate(lines, start=1):
                if header is not None and line_number > header:
                    break
                if line.startswith('# Tags'):
                    for tag in line.split()[2:]:
                        # add the file to the result
                        tag_file.setdefault(tag, set()).add(file)
    return tag_file
def extract_links(files):
    """
    Return the set of 'file:' links found in the given files.

    Each element is a (file, type, target) tuple: the path of the file
    containing the link, the link type (always 'file' with the current
    pattern) and the text between 'file:' and the closing ']'.

    files -- iterable of paths to read
    """
    # No flags needed: the pattern has no '^' or '$' anchors.
    link_regex = re.compile(r'\[(file):([^]]+)]')
    links = set()
    for file in files:
        # Explicit encoding: the result must not depend on the locale.
        with open(file, encoding='utf-8') as handle:
            # read the whole file at once.
            content = handle.read()
        for match in link_regex.finditer(content):
            link_type, target = match.groups()
            links.add((file, link_type, target))
    return links
def extract_headers(files):
    """
    Return a dictionary mapping each top-level header to a set of files.

    A top-level header is a line starting with '* '. The header text is
    the rest of that line with surrounding whitespace stripped.

    files -- iterable of paths to read
    """
    header_file = {}
    for file in files:
        # Explicit encoding: the result must not depend on the locale.
        with open(file, encoding='utf-8') as lines:
            for line in lines:
                if line.startswith('* '):
                    header = line[2:].strip()
                    # add the file to the result
                    header_file.setdefault(header, set()).add(file)
    return header_file
def print_link_edges(links):
    """
    Print one dot edge ('"file" -> "target";') per link.

    Links from or to 'index.org', links whose target starts with '~'
    and links whose target ends with '.pdf' are skipped.

    links -- iterable of (file, type, target) tuples
    """
    # Sorted so that the output is the same on every run; iterating a
    # set directly gives an order that varies between runs.
    for file, _link_type, target in sorted(links):
        # TODO more flexible filtering
        if file == 'index.org' or target == 'index.org':
            continue
        if target.startswith('~') or target.endswith('.pdf'):
            continue
        print(f'"{file}" -> "{target}";')
def print_tag_edges(tag_file):
    """
    Print one labelled dot edge chain per tag shared by several files.

    For each tag with more than one file, the files are chained with
    '->' and the chain is labelled with the tag. Tags with a single
    file print nothing.

    tag_file -- dictionary mapping each tag to a set of files
    """
    # Tags and files are sorted so that the same input always yields the
    # same chain; in set order, which files end up adjacent (and thus
    # which edges exist) would change from run to run.
    for tag in sorted(tag_file):
        files = tag_file[tag]
        if len(files) > 1:
            chain = '->'.join(f'"{file}"' for file in sorted(files))
            print(f'{chain}[label="{tag}"];')
def print_graph(files, include_tags):
    """
    Print a dot 'digraph' describing the links between the given files.

    files        -- paths handed to extract_links (and extract_tag_file)
    include_tags -- when true, also print a 'tags' subgraph with the
                    edges produced by print_tag_edges; tags are looked
                    up without a header limit here
    """
    links = extract_links(files)

    # Graph opening and graph-wide attributes.
    # (a 'graph [K=2]' attribute is currently disabled)
    preamble = (
        'digraph links {',
        'graph [overlap=false]',
        'graph [repulsiveforce=2]',
        'rankdir=LR',
    )
    for statement in preamble:
        print(statement)

    print_link_edges(links)

    if include_tags:
        files_by_tag = extract_tag_file(files)
        print('subgraph tags {')
        print('edge [dir=none, style=dotted]')
        print_tag_edges(files_by_tag)
        print('}')

    print('}')
def print_tags(files):
    """
    Print, one per line and in sorted order, every tag that
    extract_tag_file finds within the first HEADER_N lines of the files.
    """
    found = extract_tag_file(files, HEADER_N)
    for name in sorted(found):
        print(name)
def print_file_that_has_tag(files, tag_pattern):
    """
    Tags are matched if they contain that pattern.
    Print all the files that contain the matched tag.

    For each matching tag, the tag is printed on one line and the set
    of files carrying it on the next. Only the first HEADER_N lines of
    each file are searched for tags.

    files       -- paths of the files to search
    tag_pattern -- substring to look for in tag names; None (no pattern
                   given) matches every tag
    """
    tag_file = extract_tag_file(files, HEADER_N)
    # Use the argument itself: sys.argv[1] is the command name
    # ('--files'), not the pattern, so reading it never matched any tag.
    pattern = '' if tag_pattern is None else tag_pattern
    for tag, tagged_files in tag_file.items():
        if pattern in tag:
            print(tag)
            print(tagged_files)
def show_graph():
    """
    Generate graph.svg and open it with eom.

    Runs a shell pipeline that pipes the output of './extract.py --graph'
    into 'dot', writes graph.svg and, if that succeeded, starts eom on it.
    """
    pipeline = './extract.py --graph | dot -Tsvg -o graph.svg && eom graph.svg'
    os.system(pipeline)
def normalize_header(header, excluded=()):
    """
    Return a normalized form of a header, used to group similar headers.

    The header is lower-cased, split on whitespace, stripped of the
    words listed in `excluded` and re-joined with single spaces.

    header   -- the header text
    excluded -- iterable of lower-case words to drop; empty by default
                because the original exclusion list was never written
    """
    skipped = frozenset(excluded)
    parts = [p for p in header.lower().split() if p not in skipped]
    return ' '.join(parts)
def examine_headers(files):
    """
    Print every top-level header followed by the number of files it
    appears in.

    files -- paths of the files to examine
    """
    header_file = extract_headers(files)

    # Group the files by normalized header. The lookup must use the
    # normalized string (not the normalize_header function), otherwise
    # each group is reset on every header.
    # NOTE(review): the groups are assumed to hold files, as the disabled
    # loop below unpacks them as `files`; confirm the intent. They are
    # built but not printed yet.
    similar_headers = {}
    for header, header_files in header_file.items():
        normalized_header = normalize_header(header)
        similar_headers.setdefault(normalized_header, set()).update(header_files)
    # print(similar_headers)

    for header, header_files in header_file.items():
    # for header, header_files in similar_headers.items():
        count = len(header_files)
        #if count > 1:
        print(header, count)
def print_help():
print("""Usage: extract.py [command]
Extract information from org files.
All commands:
--help print this help
--graph print a graph (dot file) that represents the
relations between the *.org files
--show-graph generate graph.svg and open it with eom (eye
of mate)
--tags print all the tags found in the first lines
--files pattern list the files that has the tags that match
the pattern
--headers print all the top-level headers that appears
more than once
Example:
extract.py --graph | dot -Tsvg -o graph.svg && eom graph.svg
""")
if __name__ == '__main__':
    # Without any argument the script behaves as if --help was given.
    cmd = '--help' if len(sys.argv) == 1 else sys.argv[1]
    # Optional second argument; only the --files command uses it.
    pattern = sys.argv[2] if len(sys.argv) > 2 else None
    # All commands work on the org files of the current directory.
    files = glob.glob('*.org')

    def default():
        # Fallback for a command that is not in the table below.
        print(f"invalid command '{sys.argv[1]}'")

    commands = {
        '--files': lambda: print_file_that_has_tag(files, pattern),
        '--tags': lambda: print_tags(files),
        '--graph': lambda: print_graph(files, True),
        '--help': print_help,
        '--show-graph': show_graph,
        '--headers': lambda: examine_headers(files),
    }
    handler = commands.get(cmd, default)
    handler()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment