A Python script to extract information from a directory full of org files.
#!/usr/bin/env nix-shell
#! nix-shell -p python38 graphviz mate.eom -i python
import os
import sys
import glob
import re

# How far to look for tags in a file's header (in lines)
HEADER_N = 5
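
# For reference, a minimal sketch of the kind of org file this script expects
# (the file name and contents below are made up, not from the original notes):
#
#   notes-on-python.org
#   -------------------
#   # Tags python programming
#   Some text with a link to [file:index.org] and to [file:other-note.org].
#   * A top-level header
#
# extract_tag_file() looks for the "# Tags ..." line, extract_links() for the
# [file:...] links, and extract_headers() for the "* " top-level headers.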

def extract_tag_file(files, header=None):
    """
    Return a dictionary mapping each tag to the set of files that contain it.

    If header is given, only the first `header` lines of each file are scanned.
    """
    tag_file = {}
    # go through each file
    for file in files:
        with open(file) as input:
            # find all tags
            line_number = 0
            for line in input:
                line_number += 1
                if header is not None and line_number > header:
                    break
                if line.startswith('# Tags'):
                    for tag in line.split()[2:]:
                        # add the file to the result
                        if tag not in tag_file:
                            tag_file[tag] = set()
                        tag_file[tag].add(file)
    return tag_file
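
# A hedged example of the shape of the result, for two hypothetical files
# a.org and b.org that are both tagged "emacs" while only a.org is tagged
# "python":
#
#   extract_tag_file(['a.org', 'b.org'])
#   => {'emacs': {'a.org', 'b.org'}, 'python': {'a.org'}}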

def extract_links(files):
    """
    Return a set of (file, type, target) tuples, one per org-mode link found.
    """
    link_regex = re.compile(r'\[(file):([^]]+)]', re.MULTILINE)
    links = set()
    # go through each file
    for file in files:
        with open(file) as input:
            # read the whole file at once.
            content = input.read()
            for match in link_regex.finditer(content):
                type = match.group(1)
                target = match.group(2)
                links.add((file, type, target))
    return links
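
# A hedged example: for a hypothetical a.org containing the text
# "see [file:b.org]", the result would include the tuple
# ('a.org', 'file', 'b.org').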

def extract_headers(files):
    """
    Return a dictionary mapping each top-level header to the set of files
    that contain it.
    """
    header_file = {}
    # go through each file
    for file in files:
        with open(file) as input:
            for line in input:
                if line.startswith('* '):
                    header = line[2:].strip()
                    # add the file to the result
                    if header not in header_file:
                        header_file[header] = set()
                    header_file[header].add(file)
    return header_file
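
# This mirrors extract_tag_file but keys the result by header: for a
# hypothetical a.org containing the line "* Ideas", the result maps
# 'Ideas' to a set containing 'a.org'.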

def print_link_edges(links):
    """
    Print one dot edge per link, skipping links to and from index.org,
    links to home-relative paths, and links to PDFs.
    """
    for file, type, target in links:
        # TODO more flexible filtering
        if file != 'index.org' \
           and not target == 'index.org' \
           and not target.startswith('~') \
           and not target.endswith('.pdf'):
            print(f'"{file}" -> "{target}";')
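
# A hedged example of one line of dot output from print_link_edges, for a
# hypothetical link from a.org to b.org:
#   "a.org" -> "b.org";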

def print_tag_edges(tag_file):
    """
    For each tag shared by more than one file, print a chain of dot edges
    linking those files, labelled with the tag.
    """
    for tag in tag_file:
        files = tag_file[tag]
        if len(files) > 1:
            print('->'.join(['"' + file + '"' for file in files]) + f'[label="{tag}"];')
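
# A hedged example from print_tag_edges: files sharing the hypothetical tag
# "emacs" are chained into a single edge statement, e.g.
#   "a.org"->"b.org"[label="emacs"];
# In dot, the attribute list applies to every edge in the chain.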

def print_graph(files, include_tags):
    """
    Print a dot graph of the links between files and, optionally, of the
    tags shared between files.
    """
    links = extract_links(files)
    print('digraph links {')
    print('graph [overlap=false]')
    # print('graph [K=2]')
    print('graph [repulsiveforce=2]')
    print('rankdir=LR')
    print_link_edges(links)
    if include_tags:
        tag_file = extract_tag_file(files)
        print('subgraph tags {')
        print('edge [dir=none, style=dotted]')
        print_tag_edges(tag_file)
        print('}')
    print('}')
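
# Putting it together, print_graph emits something like the following
# (hypothetical file names):
#
#   digraph links {
#   graph [overlap=false]
#   graph [repulsiveforce=2]
#   rankdir=LR
#   "a.org" -> "b.org";
#   subgraph tags {
#   edge [dir=none, style=dotted]
#   "a.org"->"b.org"[label="emacs"];
#   }
#   }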

def print_tags(files):
    """
    Print, in alphabetical order, every tag found in the files' headers.
    """
    tag_file = extract_tag_file(files, HEADER_N)
    for tag in sorted(tag_file.keys()):
        print(tag)

def print_file_that_has_tag(files, tag_pattern):
    """
    Tags are matched if they contain tag_pattern.
    Print each matched tag followed by the set of files that contain it.
    """
    tag_file = extract_tag_file(files, HEADER_N)
    tags = [tag for tag in tag_file.keys() if tag_pattern in tag]
    for tag in tags:
        print(tag)
        print(tag_file[tag])

def show_graph():
    """
    Generate graph.svg with dot and open it with eom (Eye of MATE).
    """
    os.system('./extract.py --graph | dot -Tsvg -o graph.svg && eom graph.svg')

def normalize_header(header):
    """
    Lowercase a header and drop uninteresting words, so that near-identical
    headers compare equal.
    """
    parts = header.lower().split()
    # words to ignore when comparing headers; extend as needed
    excluded = set()
    parts = [p for p in parts if p not in excluded]
    return ' '.join(parts)
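
# A hedged example: with 'the' added to the excluded set,
# normalize_header('The Best Ideas') and normalize_header('best ideas')
# both return 'best ideas', which is the key examine_headers uses to group
# similar headers together.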

def examine_headers(files):
    """
    Print each top-level header along with the number of files it appears in.
    """
    header_file = extract_headers(files)
    similar_headers = {}
    for header in header_file:
        normalized_header = normalize_header(header)
        if normalized_header not in similar_headers:
            similar_headers[normalized_header] = set()
        similar_headers[normalized_header].add(header)
    # print(similar_headers)
    for header, files in header_file.items():
        # for header, files in similar_headers.items():
        count = len(files)
        # if count > 1:
        print(header, count)

def print_help():
    print("""Usage: extract.py [command]
Extract information from org files.
All commands:
  --help           print this help
  --graph          print a graph (dot file) that represents the
                   relations between the *.org files
  --show-graph     generate graph.svg and open it with eom (eye
                   of mate)
  --tags           print all the tags found in the first lines
  --files pattern  list the files that have a tag matching the pattern
  --headers        print all the top-level headers that appear
                   more than once
Example:
  extract.py --graph | dot -Tsvg -o graph.svg && eom graph.svg
""")

if __name__ == '__main__':
    if len(sys.argv) == 1:
        cmd = '--help'
    else:
        cmd = sys.argv[1]
    files = glob.glob('*.org')
    pattern = None
    if len(sys.argv) > 2:
        pattern = sys.argv[2]
    # dispatch on the command, falling back to an error message
    default = lambda: print(f"invalid command '{cmd}'")
    {
        '--files': lambda: print_file_that_has_tag(files, pattern),
        '--tags': lambda: print_tags(files),
        '--graph': lambda: print_graph(files, True),
        '--help': lambda: print_help(),
        '--show-graph': lambda: show_graph(),
        '--headers': lambda: examine_headers(files)
    }.get(cmd, default)()
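
# A few hedged usage examples (assuming the script is saved as extract.py,
# made executable, and run from the directory containing the org files):
#
#   ./extract.py --tags              # list every tag found in file headers
#   ./extract.py --files emacs       # tags containing "emacs" and their files
#   ./extract.py --graph | dot -Tsvg -o graph.svg && eom graph.svg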