Created
March 23, 2018 14:53
-
-
Save ColinTalbert/5ef8e36b4fdb3be713f649730a4c1858 to your computer and use it in GitHub Desktop.
Convert FGDC XMLs to Formatted Word docx.ipynb
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## An example of how to batch convert FGDC XML documents to a rendered Word format (docx)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from docx import Document\n", | |
" \n", | |
"from pymdwizard.core import review_utils\n", | |
"from docx.shared import Inches\n", | |
"from pymdwizard.core.review_utils import _get_longname, _add_child_content\n", | |
"from pymdwizard.core import xml_utils\n", | |
"\n", | |
"import glob" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def md_to_docx(xml_fname, docx_fname):\n", | |
" document = Document()\n", | |
"\n", | |
" DOCX = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'\n", | |
" element = document.settings.element.find(DOCX + 'proofState')\n", | |
" element.attrib[DOCX + 'grammar'] = 'dirty'\n", | |
" element.attrib[DOCX + 'spelling'] = 'dirty'\n", | |
"\n", | |
" review_utils._load_styles(document)\n", | |
" \n", | |
" md = xml_utils.XMLRecord(xml_fname)\n", | |
" title2 = document.add_heading('Metadata:', level=3)\n", | |
" title2.style = document.styles['fgdc heading 3']\n", | |
"\n", | |
" for child in md.metadata.children:\n", | |
" long_name = _get_longname(child.tag)\n", | |
" link = document.add_paragraph(text=long_name, style='fgdc link')\n", | |
" link.paragraph_format.left_indent = Inches(0.5)\n", | |
" link.paragraph_format.line_spacing=1\n", | |
"\n", | |
" for child in md.metadata.children:\n", | |
" long_name = _get_longname(child.tag)\n", | |
" bar = document.add_paragraph('_'*72, style='fgdc bar')\n", | |
"\n", | |
" section_title = document.add_heading(long_name+ ':', level=3)\n", | |
" section_title.style = document.styles['fgdc heading 3']\n", | |
" section_title.paragraph_format.space_after = Inches(.15)\n", | |
" _add_child_content(document, child)\n", | |
"\n", | |
" document.save(docx_fname)\n", | |
"\n", | |
"# If you would like to test this function on a single file, uncomment and update these lines.\n", | |
"# fname = r\"C:\\Users\\talbertc\\Downloads\\MD_AGES_and FREQUENCIES.csv.xml\"\n", | |
"# md_to_docx(fname, fname+'.docx')" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Set `dname` below to the directory that contains your FGDC XML files" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"dname = r\"C:\\temp\\WizardScratch\"\n", | |
"\n", | |
"md_fnames = list(glob.iglob('{}/**/*.xml'.format(dname), recursive=True))\n", | |
"md_fnames = [f for f in md_fnames if '~' not in f]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"processing: C:\\temp\\WizardScratch\\Bathymetric_and_hydrodynamic_surveys_of_Lake_Michigan_nearshore_near_Ogden_Dunes_Burns_Harbor_Indiana.xml\n", | |
"\n", | |
"processing: C:\\temp\\WizardScratch\\Bathymetric_surveys_of_the_Neosho_River_Spring_River_and_Elk_River_northeastern_Oklahoma_southwestern_Missouri_2016-17.xml\n", | |
"\n", | |
"processing: C:\\temp\\WizardScratch\\Bathymetric_survey_of_Rock_Run_Rookery_Lake_Will_County_Illinois_FINAL.xml\n", | |
"\n", | |
"processing: C:\\temp\\WizardScratch\\Bathymetry_and_Capacity_of_Shawnee_Reservoir_Oklahoma_2016.xml\n", | |
"\n", | |
"processing: C:\\temp\\WizardScratch\\Bathymetry_on_the_East_Fork_White_River_at_Columbus_Indiana_March_29-30_and_April_13_2017.xml\n", | |
"\n", | |
"processing: C:\\temp\\WizardScratch\\Bathymetry_on_the_Rolling_Fork_and_Beech_Fork_near_Boston_Kentucky_April_4-5_2017.xml\n", | |
"\n" | |
] | |
} | |
], | |
"source": [ | |
"for fname in md_fnames:\n", | |
" print(\"processing: {}\\n\".format(fname))\n", | |
" md_to_docx(fname, fname+'.docx')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"anaconda-cloud": {}, | |
"gist": { | |
"data": { | |
"description": "Convert FGDC XMLs to Formatted Word docx.ipynb", | |
"public": true | |
}, | |
"id": "" | |
}, | |
"kernelspec": { | |
"display_name": "Python [default]", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.5.3" | |
}, | |
"toc": { | |
"nav_menu": {}, | |
"number_sections": true, | |
"sideBar": true, | |
"skip_h1_title": false, | |
"title_cell": "Table of Contents", | |
"title_sidebar": "Contents", | |
"toc_cell": false, | |
"toc_position": {}, | |
"toc_section_display": true, | |
"toc_window_display": false | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment