Skip to content

Instantly share code, notes, and snippets.

@allenanie
Created December 17, 2019 18:02
Show Gist options
  • Save allenanie/66d92e5bf6bf1da89bb50f21f970f4e9 to your computer and use it in GitHub Desktop.
Save allenanie/66d92e5bf6bf1da89bb50f21f970f4e9 to your computer and use it in GitHub Desktop.
Parsing a LaTeX table into CSV
"""
Convert LaTeX tables to CSV files.
"""
import csv
import re
# A '%' preceded by an even number of backslashes starts a LaTeX comment;
# '\%' (odd count) is a literal percent sign and must be kept.
_LATEX_COMMENT = re.compile(r"(?<!\\)((?:\\\\)*)%.*")
# Column separator: an '&' that is not escaped as '\&'.
_COLUMN_SEP = re.compile(r"(?<!\\)(?:\\\\)*&")
# Rule macros that often trail a row on the same line (e.g. "\\ \colrule").
_RULE_MACRO = re.compile(
    r"\\(?:toprule|midrule|bottomrule|colrule|botrule|hline)\b"
)


def _clean_cell(cell):
    """Return the plain-text content of one LaTeX table cell."""
    # Dropping every backslash also turns escapes such as '\%' and '\&' into
    # the bare character and removes '\\' line breaks inside the cell.
    cell = cell.strip().replace("\\", "").replace("~~", " ")
    # Wrappers of a nested single-column tabular (backslashes already gone).
    cell = cell.replace("begin{tabular}{@{}l@{}}", "")
    cell = cell.replace("end{tabular}", "")
    cell = cell.replace(" ~ ", "")
    # Collapse the runs of whitespace left behind by the removals above.
    return " ".join(cell.split())


def to_csv(latex_text, file_name):
    """Extract the rows of a LaTeX table and write them to a CSV file.

    Every input line that contains an unescaped column separator (an
    ampersand) becomes one CSV row; all other lines (preamble, rules,
    captions) are skipped. LaTeX comments and trailing rule macros are
    removed before the line is split into cells.

    Arguments:
        latex_text {str} -- LaTeX source of the table. Raw strings are
            preferred; text whose "begin"/"botrule" macros were corrupted
            into a backspace character by a non-raw Python literal is
            accepted as well.
        file_name {str} -- path of the CSV file to create or overwrite.
    """
    # A non-raw Python literal turns the "\b" of "\begin" / "\botrule" into
    # the backspace character "\x08"; restore it so both spellings parse alike.
    latex_text = latex_text.replace("\x08", "\\b")

    rows = []
    for line in latex_text.split("\n"):
        line = _LATEX_COMMENT.sub(r"\1", line)
        line = _RULE_MACRO.sub("", line)
        if not _COLUMN_SEP.search(line):
            # Not table data (or a row that was entirely commented out).
            continue
        rows.append(
            [_clean_cell(cell) for cell in _COLUMN_SEP.split(line.strip())]
        )

    # newline="" hands line-ending control to the csv module; without it,
    # platforms that translate "\n" would emit "\r\r\n" (blank lines).
    with open(file_name, "w", newline="") as f:
        csv.writer(f).writerows(rows)


if __name__ == "__main__":
    # Raw strings keep every LaTeX backslash literal. In a normal string,
    # "\b" and "\t" silently become backspace/tab characters and sequences
    # such as "\%", "\c" or "\e" are invalid escape sequences.
    latex_text = r"""{\begin{tabular}{@{}llll@{}}
\toprule
& ACSC-related Visits & Non-ACSC-related Visits \\
Predictor Variables & (n=823,759) & (n=1,926,289) \\ \colrule % (n=2,711,839)
Age, mean (SD) & 59.91 (17.99) & 39.66 (23.03) \\
Race, num (\%) & & \\
~~ White & 615237 (74.69\%) & 1319754 (68.51\%) \\
~~ \begin{tabular}{@{}l@{}}Black or \\ ~ African American \end{tabular} & 61785 (7.5\%) & 110755 (5.75\%) & \\
~~ Vietnamese & 2063 (0.25\%) & 2860 (0.15\%) & \\
~~ \begin{tabular}{@{}l@{}}American Indian or \\ ~ Alaska Native \end{tabular} & 2054 (0.25\%) & 6148 (0.32\%) & \\
~~ Filipino & 1001 (0.12\%) & 1249 (0.06\%) & \\
~~ \begin{tabular}{@{}l@{}}Native Hawaiian or \\ ~ Other Pacific Islander \end{tabular} & 949 (0.12\%) & 2804 (0.15\%) & \\
~~ Japanese & 678 (0.08\%) & 719 (0.04\%) & \\
~~ Other Pacific Islander & 543 (0.07\%) & 1537 (0.08\%) & \\
~~ Chinese & 198 (0.02\%) & 780 (0.04\%) & \\
~~ Native Hawaiian & 147 (0.02\%) & 592 (0.03\%) & \\
~~ Korean & 31 (0.0\%) & 80 (0.0\%) & \\
~~ Asian Indian & 24 (0.0\%) & 25 (0.0\%) & \\
~~ Other & $<11$ (0.0\%) & $<11$ (0.0\%) & \\
~~ Missing & 139,045 (16.9\%) & 478,976 (24.9\%) \\
Ethnicity, num (\%) & & \\
~~ Non-Hispanic or Latino & 580099 (70.42\%) & 1218670 (63.27\%) & \\
~~ Hispanic or Latino & 63875 (7.75\%) & 195740 (10.16\%) & \\
~~ Mexican & 2447 (0.3\%) & 3838 (0.2\%) & \\
~~ Central American & 166 (0.02\%) & 242 (0.01\%) & \\
~~ Puerto Rican & 71 (0.01\%) & 159 (0.01\%) & \\
~~ Cuban & 23 (0.0\%) & 52 (0.0\%) & \\
~~ Other & 14 (0.0\%) & 42 (0.0\%) & \\
~~ Missing & 177,064 (21.5\%) & 507,546 (26.3\%) \\
Tobacco, num (\%) & & \\
~~ Never Smoked & 311,761 (70.46\%) & 826,680 (76.36\%) & \\
~~ Prior History & 78,799 (17.81\%) & 146,505 (13.53\%) & \\
~~ Active Smoker & 51,876 (11.73\%) & 109,393 (10.1\%) & \\
~~ Missing & 289,938 (35.2\%) & 925,961 (48.1\%) \\
\botrule
\end{tabular}}
"""
    to_csv(latex_text, "Table1Demographics.csv")

    table2 = r"""\begin{table}[h]
\tbl{Socioeconomics Status (SES) Characteristics of the Patients. We report mean (standard deviation) in the table. SD computed over each individual.}
{\begin{tabular}{@{}lcc@{}}
\toprule
& ACSC-related Visits & Non-ACSC-related Visits \\
Predictor Variables & (n=648,041) & (n=2,063,798) \\ \colrule
\begin{tabular}{@{}l@{}} The median household income \end{tabular} & 54,548.71 (20175.04) & 56,810.59 (21913.58) \\
\begin{tabular}{@{}l@{}} Frac of high needs population \end{tabular} & 0.38 (0.11) & 0.37 (0.12) \\
\begin{tabular}{@{}l@{}} Frac of population living in \\ ~~ renter occupied units \end{tabular} & 0.3 (0.15) & 0.3 (0.16) \\
\begin{tabular}{@{}l@{}} Frac of households with children \\ ~~ and a single parent \end{tabular} & 0.16 (0.08) & 0.16 (0.08) \\
\begin{tabular}{@{}l@{}} Frac of families with incomes $<$ 100\% \\ ~~ of the Federal Poverty Level (FPL) \end{tabular} & 0.15 (0.09) & 0.14 (0.09) \\
\begin{tabular}{@{}l@{}} Frac of African American population \end{tabular} & 0.1 (0.15) & 0.08 (0.14) \\
\begin{tabular}{@{}l@{}} Frac of Hispanic population \end{tabular} & 0.15 (0.21) & 0.16 (0.22) \\
\begin{tabular}{@{}l@{}} Frac of households receiving public assistance \end{tabular} & 0.14 (0.08) & 0.13 (0.08) \\
\begin{tabular}{@{}l@{}} Frac of population with no health \\ ~~ insurance coverage \end{tabular} & 0.14 (0.06) & 0.13 (0.06) \\
\begin{tabular}{@{}l@{}} Frac of people age 25 or older \\ ~~ who have no high school degree \end{tabular} & 0.14 (0.1) & 0.13 (0.1) \\
\begin{tabular}{@{}l@{}} Frac of houses that are vacant \end{tabular} & 0.13 (0.08) & 0.12 (0.08) \\
\begin{tabular}{@{}l@{}} Frac of adults who are unemployed \end{tabular} & 0.08 (0.04) & 0.07 (0.04) \\
\begin{tabular}{@{}l@{}} Frac of population that are foreign born \end{tabular} & 0.08 (0.1) & 0.09 (0.1) \\
\begin{tabular}{@{}l@{}} Frac of household with no car \end{tabular} & 0.06 (0.05) & 0.05 (0.04) \\
\begin{tabular}{@{}l@{}} Frac of population living in \\ ~~ crowded housing units \end{tabular} & 0.03 (0.04) & 0.03 (0.05) \\
\botrule
\end{tabular}}\label{tab:ses-demo}
\end{table}
"""
    to_csv(table2, "Table2SES.csv")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment