kylebarron · May 11, 2018 18:16 · kylebarron · May 11, 2018
diff --git a/stata_desc.py b/stata_desc.py
 #! /usr/bin/env python3
 """
 ---------------------------------------------------------------------
 Program: stata_desc.py
 Author:  Kyle Barron <[email protected]>
 Created: 5/11/2018, 2:01:13 PM
 Purpose: Generate markdown data codebooks for documentation

 The command line input should be the path to the `.dta` file.

 This program writes a Markdown file next to the `.dta` file, with the same name
 but the extension `.md`. You can then use
 [Pandoc](https://pandoc.org) to convert this Markdown file to PDF, docx, or
 HTML.

 The dependencies are Python 3.6 and pandas.
 """

 import pandas as pd
 from textwrap import dedent
 from pathlib import Path
 from sys import argv

 # Sample dataset name. main() binds its own local `infile` from argv[1], so
 # none of the code shown in this file reads this module-level value.
 # NOTE(review): looks like a leftover from interactive development -- confirm
 # nothing imports it before removing.
 infile = 'auto.dta'
 def main():
    """Write a Markdown codebook next to the `.dta` file named on the command line.

    Reads the path from `argv[1]`, asks `generate_codebook` for the column
    count, observation count and per-variable Markdown, and writes a title,
    a one-line summary and that Markdown to the same path with the suffix
    replaced by `.md`.

    Raises:
        ValueError: if no command line argument was given, or if the path
            does not end with `.dta`.
    """
    if len(argv) < 2:
        raise ValueError('Provide file path as command line argument')

    infile = argv[1]
    if not infile.endswith('.dta'):
        raise ValueError('Path must end with ".dta"')

    path = Path(infile)
    # File name without the trailing ".dta"; used for the document title.
    stub = path.name[:-len('.dta')]

    ncol, nobs, codebook = generate_codebook(str(path))

    # Built line by line rather than by dedenting an interpolated template,
    # so the result does not depend on how this source file is indented.
    header = (
        f'# `{stub}.dta`\n'
        '\n'
        f'This file has {ncol} columns and {nobs} observations.\n'
        '\n'
        '## Codebook\n'
        '\n'
    )

    mdfile = path.with_suffix('.md')
    # Write UTF-8 explicitly: variable labels may contain non-ASCII text and
    # the platform default encoding is not UTF-8 everywhere.
    with open(mdfile, 'w', encoding='utf-8') as f:
        f.write(header + codebook)

 def generate_codebook(dta_path):
    """Build the Markdown codebook for the Stata file at `dta_path`.

    Each variable becomes a `###` subheader followed by a bullet list with
    its label, storage type and display format.

    Args:
        dta_path: path to the `.dta` file, passed to `pandas.read_stata`.

    Returns:
        A tuple `(ncol, nobs, text)`: the number of variables, the number of
        observations, and the Markdown for all variables joined into one
        string.
    """
    # Only header metadata is needed, so ask for an iterator instead of
    # loading the data; the `with` block closes the underlying file.
    with pd.read_stata(dta_path, iterator=True) as reader:
        # Maps variable name -> variable label. The pairing below assumes it
        # lists variables in the same order as the format and type lists.
        labels = reader.variable_labels()
        ncol = _reader_field(reader, 'nvar')
        nobs = _reader_field(reader, 'nobs')
        formats = _reader_field(reader, 'fmtlist')
        type_codes = _reader_field(reader, 'typlist')

    sections = [
        _variable_section(name, label, _stata_type_name(code), fmt)
        for (name, label), code, fmt in zip(labels.items(), type_codes, formats)
    ]
    return (ncol, nobs, ''.join(sections))


 def _reader_field(reader, name):
    """Return header field `name` from the reader, public or underscore-prefixed."""
    try:
        return getattr(reader, name)
    except AttributeError:
        # NOTE(review): newer pandas releases appear to keep these header
        # fields only under a leading underscore -- confirm against the
        # pandas version in use.
        return getattr(reader, '_' + name)


 def _stata_type_name(code):
    """Translate one type code from the reader into a Stata storage type name."""
    if isinstance(code, int):
        # Integer codes are fixed-width strings; the integer is the width.
        return f'str{code}'
    names = {
        'b': 'byte',
        'h': 'int',
        'l': 'long',
        'f': 'float',
        'd': 'double',
        # NOTE(review): 'Q' appears to be the code pandas reports for strL
        # columns -- confirm against the pandas Stata reader.
        'Q': 'strL',
    }
    # Fall back to the raw code so an unknown type is shown as-is instead of
    # raising or repeating the previous variable's type.
    return names.get(code, str(code))


 def _variable_section(name, label, type_name, fmt):
    """Render the Markdown subsection for a single variable."""
    return (
        f'### `{name}`\n'
        '\n'
        f'- Label: {label}\n'
        f'- Type: `{type_name}`\n'
        f'- Format: `{fmt}`\n'
        '\n'
    )

 # Run the command line entry point only when this file is executed as a
 # script, not when it is imported as a module.
 if __name__ == '__main__':
    main()
	#! /usr/bin/env python3
	"""
	---------------------------------------------------------------------
	Program: stata_desc.py
	Author: Kyle Barron <[email protected]>
	Created: 5/11/2018, 2:01:13 PM
	Purpose: Generate markdown data codebooks for documentation

	The command line input should be the path to the `.dta` file.

	This program writes a Markdown file next to the `.dta` file, with the same name
	but the extension `.md`. You can then use
	[Pandoc](https://pandoc.org) to convert this Markdown file to PDF, docx, or
	HTML.

	The dependencies are Python 3.6 and pandas.
	"""

	import pandas as pd
	from textwrap import dedent
	from pathlib import Path
	from sys import argv

	# Sample dataset name. main() binds its own local `infile` from argv[1], so
	# none of the code shown in this file reads this module-level value.
	# NOTE(review): looks like a leftover from interactive development -- confirm
	# nothing imports it before removing.
	infile = 'auto.dta'
	def main():
	    """Write a Markdown codebook next to the `.dta` file named on the command line.

	    Reads the path from `argv[1]`, asks `generate_codebook` for the column
	    count, observation count and per-variable Markdown, and writes a title,
	    a one-line summary and that Markdown to the same path with the suffix
	    replaced by `.md`.

	    Raises:
	        ValueError: if no command line argument was given, or if the path
	            does not end with `.dta`.
	    """
	    if len(argv) < 2:
	        raise ValueError('Provide file path as command line argument')

	    infile = argv[1]
	    if not infile.endswith('.dta'):
	        raise ValueError('Path must end with ".dta"')

	    path = Path(infile)
	    # File name without the trailing ".dta"; used for the document title.
	    stub = path.name[:-len('.dta')]

	    ncol, nobs, codebook = generate_codebook(str(path))

	    # Built line by line rather than by dedenting an interpolated template,
	    # so the result does not depend on how this source file is indented.
	    header = (
	        f'# `{stub}.dta`\n'
	        '\n'
	        f'This file has {ncol} columns and {nobs} observations.\n'
	        '\n'
	        '## Codebook\n'
	        '\n'
	    )

	    mdfile = path.with_suffix('.md')
	    # Write UTF-8 explicitly: variable labels may contain non-ASCII text and
	    # the platform default encoding is not UTF-8 everywhere.
	    with open(mdfile, 'w', encoding='utf-8') as f:
	        f.write(header + codebook)

	def generate_codebook(dta_path):
	    """Build the Markdown codebook for the Stata file at `dta_path`.

	    Each variable becomes a `###` subheader followed by a bullet list with
	    its label, storage type and display format.

	    Args:
	        dta_path: path to the `.dta` file, passed to `pandas.read_stata`.

	    Returns:
	        A tuple `(ncol, nobs, text)`: the number of variables, the number of
	        observations, and the Markdown for all variables joined into one
	        string.
	    """
	    # Only header metadata is needed, so ask for an iterator instead of
	    # loading the data; the `with` block closes the underlying file.
	    with pd.read_stata(dta_path, iterator=True) as reader:
	        # Maps variable name -> variable label. The pairing below assumes it
	        # lists variables in the same order as the format and type lists.
	        labels = reader.variable_labels()
	        ncol = _reader_field(reader, 'nvar')
	        nobs = _reader_field(reader, 'nobs')
	        formats = _reader_field(reader, 'fmtlist')
	        type_codes = _reader_field(reader, 'typlist')

	    sections = [
	        _variable_section(name, label, _stata_type_name(code), fmt)
	        for (name, label), code, fmt in zip(labels.items(), type_codes, formats)
	    ]
	    return (ncol, nobs, ''.join(sections))


	def _reader_field(reader, name):
	    """Return header field `name` from the reader, public or underscore-prefixed."""
	    try:
	        return getattr(reader, name)
	    except AttributeError:
	        # NOTE(review): newer pandas releases appear to keep these header
	        # fields only under a leading underscore -- confirm against the
	        # pandas version in use.
	        return getattr(reader, '_' + name)


	def _stata_type_name(code):
	    """Translate one type code from the reader into a Stata storage type name."""
	    if isinstance(code, int):
	        # Integer codes are fixed-width strings; the integer is the width.
	        return f'str{code}'
	    names = {
	        'b': 'byte',
	        'h': 'int',
	        'l': 'long',
	        'f': 'float',
	        'd': 'double',
	        # NOTE(review): 'Q' appears to be the code pandas reports for strL
	        # columns -- confirm against the pandas Stata reader.
	        'Q': 'strL',
	    }
	    # Fall back to the raw code so an unknown type is shown as-is instead of
	    # raising or repeating the previous variable's type.
	    return names.get(code, str(code))


	def _variable_section(name, label, type_name, fmt):
	    """Render the Markdown subsection for a single variable."""
	    return (
	        f'### `{name}`\n'
	        '\n'
	        f'- Label: {label}\n'
	        f'- Type: `{type_name}`\n'
	        f'- Format: `{fmt}`\n'
	        '\n'
	    )

	# Run the command line entry point only when this file is executed as a
	# script, not when it is imported as a module.
	if __name__ == '__main__':
	    main()