Skip to content

Instantly share code, notes, and snippets.

@do-me
Created October 13, 2024 12:12
Show Gist options
  • Save do-me/594d5f140578b40cbf4a375e47ba5888 to your computer and use it in GitHub Desktop.
Save do-me/594d5f140578b40cbf4a375e47ba5888 to your computer and use it in GitHub Desktop.
Extract markdown table from arbitrary markdown text based on regex
import pandas as pd
import re
def extract_markdown_table(text):
    """Extract the first markdown table found in ``text``.

    A table is taken to be the first run of at least two consecutive lines
    that, ignoring surrounding whitespace, start and end with a pipe (``|``).
    Everything else in the input (headings, paragraphs, later tables) is
    discarded.

    Args:
        text: The input string containing markdown. Any non-string value
            (``None``, ``float("nan")``, ``pd.NA``, ...) is treated as a
            missing value.

    Returns:
        The table rows joined with ``"\\n"`` (each row stripped of
        surrounding whitespace), or an empty string if no table is found.
    """
    # Missing values coming from a DataFrame column are never strings, so a
    # single isinstance check covers None / NaN / pd.NA without relying on
    # pd.isna (which returns an array, not a bool, for list-like inputs).
    if not isinstance(text, str):
        return ""

    table_rows = []
    # Split on "\n" only (not str.splitlines) so exotic separators inside a
    # cell do not break a row apart; a trailing "\r" from CRLF input is
    # removed by strip() below.
    for line in text.split("\n"):
        row = line.strip()
        if re.fullmatch(r"\|.*\|", row):
            table_rows.append(row)
            continue
        # A non-table line ends the current run. Stop if that run was a
        # real table; otherwise discard the lone pipe line and keep looking.
        if len(table_rows) >= 2:
            break
        table_rows = []

    # A single pipe-delimited line is not a table.
    if len(table_rows) < 2:
        return ""
    return "\n".join(table_rows)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment