Last active
October 5, 2020 11:14
-
-
Save Ze1598/3ca5d41971ae6b709135abf7bcf527e7 to your computer and use it in GitHub Desktop.
Extract expenses with regex from txt
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re
import datetime
import pandas as pd

# Matches one expense per line, written as "<cost> - <name>".
# Group 1 is the whole cost (integer part, optional dot, optional decimals),
# groups 2-4 are its pieces, group 5 is the " - " separator and group 6 is
# the rest of the line (the expense name).
PATTERN = r'(^([\d]+)([\.]?)([\d]*))( - )(.*)'

# Load the whole text file as a single string
with open("expenses.txt", "r") as f:
    whole_txt = f.read()

# Find all the expense matches; MULTILINE makes "^" anchor at every line start
matches = re.findall(PATTERN, whole_txt, flags=re.MULTILINE)

# Keep only the name (last group) and the full cost (first group) of each match
expenses = [[name, cost] for cost, _, _, _, _, name in matches]

# Create a DF for the expenses. Naming the columns up front (instead of
# renaming after reset_index) keeps this working when there are no matches:
# an empty list would otherwise yield a column-less frame and the rename
# would fail with a length mismatch.
df = pd.DataFrame(data=expenses, columns=["Name", "Cost"])

# Add an explicit ID column as the first column, numbered from 1
df.insert(0, "ExpenseID", range(1, len(df) + 1))

# Export it as a CSV (the DataFrame index is not written)
df.to_csv("expenses.csv", index=False)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment