zacharysyoung · June 16, 2021 04:08
diff --git a/commands.sh b/commands.sh
 #!/bin/bash

 # Build INSTANT_CH4.csv with one "<year>,<value>" row per *.PRT file.
 #
 # Pipeline stages:
 #   1. grep - scan all files (*.PRT) and keep the lines containing the text
 #             "INSTANT CH4".  -H forces the "filename:" prefix even when only
 #             one file matches the glob, because stage 3 counts columns from it.
 #   2. awk  - there are two different datasets per file with your variables;
 #             keeping the odd-numbered matches takes the 'INSTANT CH4' line
 #             from the first dataset.
 #   3. cut  - cut out everything *but* the filename/year (first 7 characters)
 #             and the column for the data point you care about (characters
 #             67 to 76).
 #   4. sed  - replace the first run of spaces (between year and value) with a
 #             comma for CSV, and save the output to a CSV file.
 #
 # The notes live up here on purpose: a comment written after a
 # line-continuation backslash ("cmd | \   # note") turns the backslash into an
 # escaped space instead of a continuation, which breaks the pipeline and
 # leaves the final redirection as a separate, empty command.
 extract_instant_ch4() {
   grep -H -- 'INSTANT CH4' *.PRT |
     awk 'NR % 2 == 1 { print; }' |
     cut -c 1-7,67-76 |
     sed -E 's/ +/,/' > INSTANT_CH4.csv
 }

 extract_instant_ch4
diff --git a/count_headers.py b/count_headers.py
 #!/usr/bin/env python

 import pprint

 """
 Interactively find new kinds of frames in one of your PRT files.

 As you run this script it will print out the first few lines of a frame that doesn't match any
 of the column headers in KNOWN_KINDS.  KNOWN_KINDS initially starts out empty, so **all** frames
 will be of an unknown kind and will be printed for you to inspect.

 You should be able to:
  1. spot the column header in the print-out
  2. copy the entire line into KNOWN_KINDS
  3. expect not to see that frame, or any other frame of its kind again

 You should eventually be able to fill out KNOWN_KINDS to the point where there are next to zero
 frames with unknown kinds being printed-out.

 I've included the column headers from ANN2006 as a sample.  Play with un-commenting them and see
 if your "unknown count" decreases.

 Some examples of a print-out with the column-header lines pointed out:

  This first example is highly repetitious, nicely enclosed with all the ------ lines, pretty easy to spot:

    [ '',
        ' From:  2006  JAN  1,  Hr  0      To:  2006  DEC 31, Hr 24  Model-Time:  2750640     Dif: 365.00 Days',
        '0                              Threshold velocity for dust emission (m/s)                      ',
        '  ----------------------------------------------------------------------------------------------------------------------------',
 -->     '  P(MB)   MEAN G      NH      SH    90  81  73  65  57  49  41  33  25  17   9   1  -7 -15 -23 -31 -39 -47 -55 -63 -71 -79 -87',
        '  ----------------------------------------------------------------------------------------------------------------------------'],


  The next two are a little more difficult because they have a unique ID at start of each column header.
  Scrolling through the print-out your eye might catch the repeating flickering lines of the 1 2 3 4 5 6 7...
  Also, look just below the 'From:  2006  JAN  1' line:

    [ '',
        'From:  2006  JAN  1,  Hr  0      To:  2006  DEC 31, Hr 24  Model-Time:  2750640     Dif: 365.00 Days',
 -->     '0AF1774,57  1    2    3    4    5    6    7    8    9   10   11   12   13   14   15   16   17   18   19   20   21   22   23   24  AVE',
        ' ',
        ' INC SW     0    0    0    0    0   41  241  532  793 1000 1139 1200 1180 1079  904  668  387  119    6    0    0    0    0    0  387',
        ' P ALBD   100  100  100   94   75   54   41   37   34   32   30   30   31   31   33   35   39   47   65   86   98  100  100  100   62'],
    [ '',
        'From:  2006  JAN  1,  Hr  0      To:  2006  DEC 31, Hr 24  Model-Time:  2750640     Dif: 365.00 Days',
 -->     '0AF1880,61  1    2    3    4    5    6    7    8    9   10   11   12   13   14   15   16   17   18   19   20   21   22   23   24  AVE',
        ' ',
        ' INC SW     0    0    0    1   60  235  495  738  931 1061 1118 1098 1004  842  622  361  134   16    0    0    0    0    0    0  363',
        ' P ALBD   100   99   90   72   51   36   30   26   24   22   22   21   22   23   27   31   41   60   83   96  100  100  100  100   57'],


  And finally, not really column headers, but a table descriptor that repeats throughout the frame:

    [ '',
        ' From:  2006  JAN  1,  Hr  0      To:  2006  DEC 31, Hr 24  Model-Time:  2750640     Dif: 365.00 Days',
 -->     'ISCCP CLOUD FREQUENCY (NTAU,NPRES) % 30N-60N',
        '------------------------------------------------------------------------',
        'PRESSTAU    0.  1.3  3.6  9.4  23   60   >',
        '90         0.0  0.0  0.0  0.0  0.0  0.0']


 When you copy lines like these from your shell/terminal, copy the entire line--both single-quotes and the comma--just
 to make it easier to copy-paste into KNOWN_KINDS, below.
 """

 # Name of the PRT file to inspect; it is opened and read in full further down.
 SOURCE_FILE = 'ANN2006.E134TcadiRCP30aF40oQ32.PRT'
 # Text that separates one frame from the next.  The file's contents are split
 # on this exact string, so the long run of spaces inside it is significant.
 FRAME_DELIMITER = 'E134TcadiRCP30aF40oQ32 (E134Tcadif9aF40oQ32 + RCP3.0 future scenario;                           AIC=1JAN2006.rsfE134Tcadif9aF40oQ32.'


 # if a column header in this list is found in a frame, that frame is skipped and not printed
 # (the check is a plain substring test against the whole frame text, so an entry
 # has to reproduce the header's spacing exactly)
 KNOWN_KINDS = [
    # '  P(MB)   MEAN G      NH      SH    90  81  73  65  57  49  41  33  25  17   9   1  -7 -15 -23 -31 -39 -47 -55 -63 -71 -79 -87',
    # '  P(MB)   MEAN G      NH      SH    88  80  72  64  56  48  40  32  24  16   8   0  -8 -16 -24 -32 -40 -48 -56 -64 -72 -80 -88',
    # '                    G      NH     SH     90  81  73  65  57  49  41  33  25  17   9   1  -7 -15 -23 -31 -39 -47 -55 -63 -71 -79',
    # '                                   GLOBAL    SH       -1    -9   -17   -25   -33   -41   -49   -57   -65   -73   -81   -90',
    # '                                   GLOBAL    NH       90    81    73    65    57    49    41    33    25    17     9     1',
    # '1    2    3    4    5    6    7    8    9   10   11   12   13   14   15   16   17   18   19   20   21   22   23   24  AVE',
    # ' Strait               Sum/Mean     1     2     3     4     5     6     7',
    # ' Lat   Atlantic   Pacific    Indian     Global',
    # '                  U.S. U.S. U.S. CNDA LAND EUR. RUSS SIBR PLAT CHNA DSRT DSRT DSRT SHRA SHRA SAHL RAIN RAIN ATL. ATL. PAC. PAC. PAC.',
    # 'ISCCP CLOUD FREQUENCY (NTAU,NPRES) % 30N-60N',
 ]


 # Gather the opening lines of every frame that contains none of the KNOWN_KINDS
 # headers, so they can be inspected by hand and new headers copied into KNOWN_KINDS.
 frames_of_unknown_kind = []

 # Load the whole file, then cut it into frames at every delimiter.
 with open(SOURCE_FILE, 'r') as source:
    frames = source.read().split(FRAME_DELIMITER)

 for frame_text in frames:
    # A frame counts as known as soon as any listed header occurs anywhere in it.
    if any(header in frame_text for header in KNOWN_KINDS):
        continue

    # The frame is one long string with newlines; keep its first six lines,
    # which is enough context to spot the column header.
    frames_of_unknown_kind.append(frame_text.splitlines()[:6])

 # Every unknown frame was appended exactly once, so the list length is the count.
 unknown_count = len(frames_of_unknown_kind)

 # print first 20 frames
 pprint.pprint(frames_of_unknown_kind[:20], indent=1)

 print('\nFrames without a known kind: %d  (only showing first 20 frames)\n' % unknown_count)
diff --git a/extract_vars.sh b/extract_vars.sh
 #!/bin/bash

 # Extract every CH4 diagnostic from ./*.PRT into CSV files.
 #
 # Reads:   ./*.PRT; file names are expected to look like
 #          ANN2006.E134TcadiRCP30aF40oQ32.PRT, because the sed expression and
 #          the fixed column positions below depend on that exact name.
 # Writes:  ch4_0_initial.csv    raw "year,variable,value" rows
 #          ch4_1_final.csv      same rows, whitespace-trimmed, units removed
 #          ch4__<variable>.csv  one file per variable name
 # Outputs: progress messages and per-file line counts on stdout.
 process_ch4() {
   echo "Processing CH4"

   # Clean up from the previous run: the last awk appends (>>) to the
   # per-variable files instead of overwriting them.
   rm -f ./ch4_*.csv

   printf '%s' "Initializing master CSV..."

   # For every file, for every "CH4 line", get 3 columns into a master CSV:
   #
   #   grep -A22    "INSTANT CH4" and the following 22 lines (its sub-variables).
   #                -H keeps the "file:" / "file-" prefix even when only one
   #                .PRT file exists; the column positions below rely on it.
   #   grep -v      each file has two versions/measurements of every variable,
   #                and grep separates the match groups with "--" lines; drop
   #                every line containing "--".
   #   cut          drop the leading "./" and everything after the first value
   #                column, which happens to be the same in both versions:
   #                  ANN2006... INSTANT CH4 (10^-6 kg/m^2)       9463.23
   #                  ANN2006... CHANGE OF CH4 BY DYNAMICS           0.00
   #   sort | uniq  the two versions are now identical lines; keep one of each.
   #   sed          replace the rest of the file name and the marker after it
   #                (":0" for INSTANT, "- " for all others) with one space:
   #                  ...F40oQ32.PRT:0INSTANT CH...
   #                  ...F40oQ32.PRT- CHANGE OF ...
   #   awk          fixed-width columns -> CSV:
   #                  ANN2006,INSTANT CH4 (10^-6 kg/m^2),9463.23
   #                  ANN2006,CHANGE OF CH4 BY DYNAMICS ,   0.00
   grep -H -A22 "INSTANT CH4" ./*.PRT |
     grep -v -- '--' |
     cut -c 3-79 |
     sort |
     uniq |
     sed -E 's/\.E134TcadiRCP30aF40oQ32\.PRT(:0|- )/ /' |
     awk '{ print substr($0,1,7) "," substr($0,9,28) "," substr($0,42,7) }' \
       > ch4_0_initial.csv

   echo "done"

   printf '%s' "Finalizing master CSV..."

   # Trim leading and trailing whitespace from the 2nd and 3rd columns, then
   # remove the units in parentheses, which will not work as a filename:
   #
   #    ANN2006,CHANGE OF CH4 BY DYNAMICS ,   0.00
   # >>>
   #    ANN2006,CHANGE OF CH4 BY DYNAMICS,0.00
   awk '
     BEGIN { FS = OFS = "," }
     {
       gsub(/^[ \t]+|[ \t]+$/, "", $2)
       gsub(/^[ \t]+|[ \t]+$/, "", $3)
       print
     }
   ' ch4_0_initial.csv |
     sed 's/ (10^-6 kg\/m^2)//' \
       > ch4_1_final.csv

   echo "done"

   printf '%s' "Breaking-up master CSV into component CSVs by var name..."

   # Use the 2nd column (var name) as the file name and append each row to the
   # file of its variable; close after every row so few files stay open.
   awk -F, '{ out = "ch4__" $2 ".csv"; print >> out; close(out) }' ch4_1_final.csv

   echo "done"

   wc -l ./ch4_*.csv | grep -v total
 }

 process_ch4

 # Extract every PAN diagnostic from ./*.PRT into CSV files.
 #
 # Reads:   ./*.PRT; file names are expected to look like
 #          ANN2006.E134TcadiRCP30aF40oQ32.PRT, because the sed expression and
 #          the fixed column positions below depend on that exact name.
 # Writes:  pan_0_initial.csv    raw "year,variable,value" rows
 #          pan_1_final.csv      same rows, whitespace-trimmed, units removed
 #          pan__<variable>.csv  one file per variable name
 # Outputs: progress messages and per-file line counts on stdout.
 process_pan() {
   echo ""
   echo "Processing PAN"

   # The last awk appends (>>), so start from a clean slate.
   rm -f ./pan_*.csv

   printf '%s' "Initializing master CSV..."

   # PAN only has 9 sub-variables after INSTANT, hence -A9.
   #   -H       keeps the "file:" / "file-" prefix even when only one .PRT file
   #            exists; the column positions below rely on it.
   #   grep -v  drops every line containing "--" (grep's group separators).
   #   cut      drops the leading "./" and everything past column 79.
   #   sort | uniq  collapses lines that are identical after the cut.
   #   sed      replaces the rest of the file name and the ":0" / "- " marker
   #            after it with a single space.
   #   awk      fixed-width columns -> CSV; the value column runs from
   #            character 42 to the end of the line.
   grep -H -A9 "INSTANT PAN" ./*.PRT |
     grep -v -- '--' |
     cut -c 3-79 |
     sort |
     uniq |
     sed -E 's/\.E134TcadiRCP30aF40oQ32\.PRT(:0|- )/ /' |
     awk '{ print substr($0,1,7) "," substr($0,9,28) "," substr($0,42) }' \
       > pan_0_initial.csv

   echo "done"

   printf '%s' "Finalizing master CSV..."

   # Trim leading and trailing whitespace from the 2nd and 3rd columns, then
   # strip the units, which contain a "/" and cannot be part of a file name.
   awk '
     BEGIN { FS = OFS = "," }
     {
       gsub(/^[ \t]+|[ \t]+$/, "", $2)
       gsub(/^[ \t]+|[ \t]+$/, "", $3)
       print
     }
   ' pan_0_initial.csv |
     sed 's/ (10^-9 kg\/m^2)//' \
       > pan_1_final.csv

   echo "done"

   printf '%s' "Breaking-up master CSV into component CSVs by var name..."

   # Append each row to the file named after its 2nd column (var name).
   awk -F, '{ out = "pan__" $2 ".csv"; print >> out; close(out) }' pan_1_final.csv

   echo "done"

   wc -l ./pan_*.csv | grep -v total
 }

 process_pan
	#!/bin/bash

	# Build INSTANT_CH4.csv with one "<year>,<value>" row per *.PRT file.
	#
	# Pipeline stages:
	#   1. grep - scan all files (*.PRT) and keep the lines containing the text
	#             "INSTANT CH4".  -H forces the "filename:" prefix even when only
	#             one file matches the glob, because stage 3 counts columns from it.
	#   2. awk  - there are two different datasets per file with your variables;
	#             keeping the odd-numbered matches takes the 'INSTANT CH4' line
	#             from the first dataset.
	#   3. cut  - cut out everything but the filename/year (first 7 characters)
	#             and the column for the data point you care about (characters
	#             67 to 76).
	#   4. sed  - replace the first run of spaces (between year and value) with a
	#             comma for CSV, and save the output to a CSV file.
	#
	# The stages must be joined by real, unescaped pipes ("\|" is just a literal
	# argument), the glob needs its "*", and no comment may follow a
	# line-continuation backslash, because that turns the backslash into an
	# escaped space instead of a continuation.
	extract_instant_ch4() {
	  grep -H -- 'INSTANT CH4' *.PRT |
	    awk 'NR % 2 == 1 { print; }' |
	    cut -c 1-7,67-76 |
	    sed -E 's/ +/,/' > INSTANT_CH4.csv
	}

	extract_instant_ch4
	#!/usr/bin/env python

	import pprint

	"""
	Interactively find new kinds of frames in one of your PRT files.

	As you run this script it will print out the first few lines of a frame that doesn't match any
	of the column headers in KNOWN_KINDS. KNOWN_KINDS initially starts out empty, so all frames
	will be of an unknown kind and will be printed for you to inspect.

	You should be able to:
	1. spot the column header in the print-out
	2. copy the entire line into KNOWN_KINDS
	3. expect not to see that frame, or any other frame of its kind again

	You should eventually be able to fill out KNOWN_KINDS to the point where there are next to zero
	frames with unknown kinds being printed-out.

	I've included the column headers from ANN2006 as a sample. Play with un-commenting them and see
	if your "unknown count" decreases.

	Some examples of a print-out with the column-header lines pointed out:

	This first example is highly repetitious, nicely enclosed with all the ------ lines, pretty easy to spot:

	[ '',
	' From: 2006 JAN 1, Hr 0 To: 2006 DEC 31, Hr 24 Model-Time: 2750640 Dif: 365.00 Days',
	'0 Threshold velocity for dust emission (m/s) ',
	' ----------------------------------------------------------------------------------------------------------------------------',
	--> ' P(MB) MEAN G NH SH 90 81 73 65 57 49 41 33 25 17 9 1 -7 -15 -23 -31 -39 -47 -55 -63 -71 -79 -87',
	' ----------------------------------------------------------------------------------------------------------------------------'],


	The next two are a little more difficult because they have a unique ID at start of each column header.
	Scrolling through the print-out your eye might catch the repeating flickering lines of the 1 2 3 4 5 6 7...
	Also, look just below the 'From: 2006 JAN 1' line:

	[ '',
	'From: 2006 JAN 1, Hr 0 To: 2006 DEC 31, Hr 24 Model-Time: 2750640 Dif: 365.00 Days',
	--> '0AF1774,57 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 AVE',
	' ',
	' INC SW 0 0 0 0 0 41 241 532 793 1000 1139 1200 1180 1079 904 668 387 119 6 0 0 0 0 0 387',
	' P ALBD 100 100 100 94 75 54 41 37 34 32 30 30 31 31 33 35 39 47 65 86 98 100 100 100 62'],
	[ '',
	'From: 2006 JAN 1, Hr 0 To: 2006 DEC 31, Hr 24 Model-Time: 2750640 Dif: 365.00 Days',
	--> '0AF1880,61 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 AVE',
	' ',
	' INC SW 0 0 0 1 60 235 495 738 931 1061 1118 1098 1004 842 622 361 134 16 0 0 0 0 0 0 363',
	' P ALBD 100 99 90 72 51 36 30 26 24 22 22 21 22 23 27 31 41 60 83 96 100 100 100 100 57'],


	And finally, not really column headers, but a table descriptor that repeats throughout the frame:

	[ '',
	' From: 2006 JAN 1, Hr 0 To: 2006 DEC 31, Hr 24 Model-Time: 2750640 Dif: 365.00 Days',
	--> 'ISCCP CLOUD FREQUENCY (NTAU,NPRES) % 30N-60N',
	'------------------------------------------------------------------------',
	'PRESSTAU 0. 1.3 3.6 9.4 23 60 >',
	'90 0.0 0.0 0.0 0.0 0.0 0.0']


	When you copy lines like these from your shell/terminal, copy the entire line--both single-quotes and the comma--just
	to make it easier to copy-paste into KNOWN_KINDS, below.
	"""

	# Name of the PRT file to inspect; it is opened and read in full further down.
	SOURCE_FILE = 'ANN2006.E134TcadiRCP30aF40oQ32.PRT'
	# Text that separates one frame from the next; the file's contents are split
	# on this exact string.
	# NOTE(review): this literal contains only single spaces, which looks like
	# collapsed whitespace -- confirm the spacing against the header line in the
	# PRT file, otherwise the split will never match.
	FRAME_DELIMITER = 'E134TcadiRCP30aF40oQ32 (E134Tcadif9aF40oQ32 + RCP3.0 future scenario; AIC=1JAN2006.rsfE134Tcadif9aF40oQ32.'


	# if a column header in this list is found in a frame, that frame is skipped and not printed
	# (the check is a plain substring test against the whole frame text, so an entry
	# has to reproduce the header's spacing exactly -- TODO confirm the spacing of
	# the samples below against the PRT file before un-commenting them)
	KNOWN_KINDS = [
	# ' P(MB) MEAN G NH SH 90 81 73 65 57 49 41 33 25 17 9 1 -7 -15 -23 -31 -39 -47 -55 -63 -71 -79 -87',
	# ' P(MB) MEAN G NH SH 88 80 72 64 56 48 40 32 24 16 8 0 -8 -16 -24 -32 -40 -48 -56 -64 -72 -80 -88',
	# ' G NH SH 90 81 73 65 57 49 41 33 25 17 9 1 -7 -15 -23 -31 -39 -47 -55 -63 -71 -79',
	# ' GLOBAL SH -1 -9 -17 -25 -33 -41 -49 -57 -65 -73 -81 -90',
	# ' GLOBAL NH 90 81 73 65 57 49 41 33 25 17 9 1',
	# '1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 AVE',
	# ' Strait Sum/Mean 1 2 3 4 5 6 7',
	# ' Lat Atlantic Pacific Indian Global',
	# ' U.S. U.S. U.S. CNDA LAND EUR. RUSS SIBR PLAT CHNA DSRT DSRT DSRT SHRA SHRA SAHL RAIN RAIN ATL. ATL. PAC. PAC. PAC.',
	# 'ISCCP CLOUD FREQUENCY (NTAU,NPRES) % 30N-60N',
	]


	# collect some lines from a frame to manually examine for new kinds, copy new kinds into KNOWN_KINDS
	frames_of_unknown_kind = []

	# read entire file
	with open(SOURCE_FILE, 'r') as f:
	    all_text = f.read()

	# split into tables
	frames = all_text.split(FRAME_DELIMITER)

	unknown_count = 0
	for frame in frames:
	    # frame has a kind of data that is known as soon as any listed header
	    # occurs anywhere in it; skip to next frame
	    if any(known_kind in frame for known_kind in KNOWN_KINDS):
	        continue

	    # frame is still just one long string of text with newlines
	    lines = frame.splitlines()

	    # add enough lines/context to be able to spot column header
	    frames_of_unknown_kind.append(lines[:6])

	    unknown_count += 1

	# print first 20 frames
	pprint.pprint(frames_of_unknown_kind[:20], indent=1)

	print('\nFrames without a known kind: %d (only showing first 20 frames)\n' % unknown_count)
	#!/bin/bash

	# Extract every CH4 diagnostic from ./*.PRT into CSV files.
	#
	# Reads:   ./*.PRT; file names are expected to look like
	#          ANN2006.E134TcadiRCP30aF40oQ32.PRT, because the sed expression and
	#          the fixed column positions below depend on that exact name.
	# Writes:  ch4_0_initial.csv    raw "year,variable,value" rows
	#          ch4_1_final.csv      same rows, whitespace-trimmed, units removed
	#          ch4__<variable>.csv  one file per variable name
	# Outputs: progress messages and per-file line counts on stdout.
	#
	# The stages are joined by real, unescaped pipes; a "\|" would be passed to
	# the command as a literal "|" argument instead of connecting two commands.
	process_ch4() {
	  echo "Processing CH4"

	  # Clean up from the previous run: the last awk appends (>>) to the
	  # per-variable files instead of overwriting them.
	  rm -f ./ch4_*.csv

	  printf '%s' "Initializing master CSV..."

	  # For every file, for every "CH4 line", get 3 columns into a master CSV:
	  #
	  #   grep -A22    "INSTANT CH4" and the following 22 lines (its sub-variables).
	  #                -H keeps the "file:" / "file-" prefix even when only one
	  #                .PRT file exists; the column positions below rely on it.
	  #   grep -v      each file has two versions/measurements of every variable,
	  #                and grep separates the match groups with "--" lines; drop
	  #                every line containing "--".
	  #   cut          drop the leading "./" and everything after the first value
	  #                column, which happens to be the same in both versions.
	  #   sort | uniq  the two versions are now identical lines; keep one of each.
	  #   sed          replace the rest of the file name and the marker after it
	  #                (":0" for INSTANT, "- " for all others) with one space:
	  #                  ...F40oQ32.PRT:0INSTANT CH...
	  #                  ...F40oQ32.PRT- CHANGE OF ...
	  #   awk          fixed-width columns -> CSV:
	  #                  ANN2006,INSTANT CH4 (10^-6 kg/m^2),9463.23
	  grep -H -A22 "INSTANT CH4" ./*.PRT |
	    grep -v -- '--' |
	    cut -c 3-79 |
	    sort |
	    uniq |
	    sed -E 's/\.E134TcadiRCP30aF40oQ32\.PRT(:0|- )/ /' |
	    awk '{ print substr($0,1,7) "," substr($0,9,28) "," substr($0,42,7) }' \
	      > ch4_0_initial.csv

	  echo "done"

	  printf '%s' "Finalizing master CSV..."

	  # Trim leading and trailing whitespace from the 2nd and 3rd columns, then
	  # remove the units in parentheses, which will not work as a filename.
	  awk '
	    BEGIN { FS = OFS = "," }
	    {
	      gsub(/^[ \t]+|[ \t]+$/, "", $2)
	      gsub(/^[ \t]+|[ \t]+$/, "", $3)
	      print
	    }
	  ' ch4_0_initial.csv |
	    sed 's/ (10^-6 kg\/m^2)//' \
	      > ch4_1_final.csv

	  echo "done"

	  printf '%s' "Breaking-up master CSV into component CSVs by var name..."

	  # Use the 2nd column (var name) as the file name and append each row to the
	  # file of its variable; close after every row so few files stay open.
	  awk -F, '{ out = "ch4__" $2 ".csv"; print >> out; close(out) }' ch4_1_final.csv

	  echo "done"

	  wc -l ./ch4_*.csv | grep -v total
	}

	process_ch4

	# Extract every PAN diagnostic from ./*.PRT into CSV files.
	#
	# Reads:   ./*.PRT; file names are expected to look like
	#          ANN2006.E134TcadiRCP30aF40oQ32.PRT, because the sed expression and
	#          the fixed column positions below depend on that exact name.
	# Writes:  pan_0_initial.csv    raw "year,variable,value" rows
	#          pan_1_final.csv      same rows, whitespace-trimmed, units removed
	#          pan__<variable>.csv  one file per variable name
	# Outputs: progress messages and per-file line counts on stdout.
	#
	# The stages are joined by real, unescaped pipes; a "\|" would be passed to
	# the command as a literal "|" argument, and inside the sed -E expression it
	# would match a literal "|" instead of acting as alternation.
	process_pan() {
	  echo ""
	  echo "Processing PAN"

	  # The last awk appends (>>), so start from a clean slate.
	  rm -f ./pan_*.csv

	  printf '%s' "Initializing master CSV..."

	  # PAN only has 9 sub-variables after INSTANT, hence -A9.
	  #   -H       keeps the "file:" / "file-" prefix even when only one .PRT file
	  #            exists; the column positions below rely on it.
	  #   grep -v  drops every line containing "--" (grep's group separators).
	  #   cut      drops the leading "./" and everything past column 79.
	  #   sort | uniq  collapses lines that are identical after the cut.
	  #   sed      replaces the rest of the file name and the ":0" / "- " marker
	  #            after it with a single space.
	  #   awk      fixed-width columns -> CSV; the value column runs from
	  #            character 42 to the end of the line.
	  grep -H -A9 "INSTANT PAN" ./*.PRT |
	    grep -v -- '--' |
	    cut -c 3-79 |
	    sort |
	    uniq |
	    sed -E 's/\.E134TcadiRCP30aF40oQ32\.PRT(:0|- )/ /' |
	    awk '{ print substr($0,1,7) "," substr($0,9,28) "," substr($0,42) }' \
	      > pan_0_initial.csv

	  echo "done"

	  printf '%s' "Finalizing master CSV..."

	  # Trim leading and trailing whitespace from the 2nd and 3rd columns, then
	  # strip the units, which contain a "/" and cannot be part of a file name.
	  awk '
	    BEGIN { FS = OFS = "," }
	    {
	      gsub(/^[ \t]+|[ \t]+$/, "", $2)
	      gsub(/^[ \t]+|[ \t]+$/, "", $3)
	      print
	    }
	  ' pan_0_initial.csv |
	    sed 's/ (10^-9 kg\/m^2)//' \
	      > pan_1_final.csv

	  echo "done"

	  printf '%s' "Breaking-up master CSV into component CSVs by var name..."

	  # Append each row to the file named after its 2nd column (var name).
	  awk -F, '{ out = "pan__" $2 ".csv"; print >> out; close(out) }' pan_1_final.csv

	  echo "done"

	  wc -l ./pan_*.csv | grep -v total
	}

	process_pan