ryancollingwood · November 11, 2024 02:53
diff --git a/csv_to_pptx.py b/csv_to_pptx.py
 import sys
 import re

 # Dataframe
 import pandas as pd
 import numpy as np

 # Powerpoint
 from pptx import Presentation

 """
 requirements:
 python-pptx
 pandas
 numpy
 """

 def find_placeholders(text):
     """
     Return the name of every `{{Name}}` placeholder found in `text`.

     Only the text between the double curly braces is returned, in order of
     appearance, and repeated placeholders are repeated in the result. An
     empty list means nothing matched.
     """
     return [hit.group(1) for hit in re.finditer(r'\{\{(.*?)\}\}', text)]

 def generate_slide(template_path, replacements, ouput_path, remove_missing = True):
     """
     Fill in the `{{Placeholder}}` text of a template deck and save the result.

     Every run of every paragraph is checked, for each shape in each slide's
     `shapes` that has a text frame. A `{{Key}}` found in a run is swapped for
     `str(replacements[Key])`. Progress and warnings are printed as it goes.

     template_path - Path to the templatised pptx
     replacements - Mapping of placeholder name to the value to substitute
     ouput_path - Path the filled-in presentation is saved to
     remove_missing - If True, placeholders with no entry in `replacements`
       are blanked out; if False they are left in the text untouched

     Matching is done one run at a time. If you see `Malformatted run?` then
     go to the offending text in the Slide and delete the template value -
     e.g. `{{Idea Rank}}` - as a run is determined by formatting, which breaks
     the text into parts.

     `python-pptx` docs:
     https://python-pptx.readthedocs.io/en/latest/user/quickstart.html
     """
     prs = Presentation(template_path)

     for slide in prs.slides:
         # Shapes without a text frame have nothing to substitute
         text_shapes = (shape for shape in slide.shapes if shape.has_text_frame)
         for shape in text_shapes:
             for paragraph in shape.text_frame.paragraphs:
                 for run in paragraph.runs:
                     stripped = str(run.text).strip()
                     found = find_placeholders(stripped)

                     if not found:
                         # An opening `{{` with no complete placeholder
                         # usually means the placeholder continues in
                         # another run
                         if "{{" in stripped:
                             print("failed to find placeholders:", stripped, found)
                             print("Malformatted run?", [x.text for x in paragraph.runs], "\n")
                         continue

                     print("search_text", stripped)

                     for name in found:
                         token = "{{" + name + "}}"
                         if name in replacements:
                             print(f"replacing {name}", "\n", replacements[name], "\n", sep = "")
                             run.text = run.text.replace(token, str(replacements[name]))
                         elif remove_missing:
                             print(f"removing {name}", "\n", sep = "")
                             run.text = run.text.replace(token, "")

                     # Anything still carrying `{{` was not fully substituted
                     if "{{" in run.text:
                         print("Stil got templeted text?", run.text, find_placeholders(run.text))
                         print("Malformatted run?", [x.text for x in paragraph.runs])

     # Save the modified presentation
     prs.save(ouput_path)


 def process_row(template_path, filename_cols, row):
     """
     Generate one presentation from one row of the sheet.

     The output filename is built from the row's `filename_cols` values
     (spaces and path separators become `-`, parts are joined with `_`, and
     `.pptx` is appended). The whole row is handed to `generate_slide` as the
     placeholder data, keyed by column name with all spaces removed.

     template_path - Path to templatised pptx
     filename_cols - What columns (and what order) for the generated filename
     row - One row of the sheet; must support `row[col]` and `to_dict()`
       (e.g. a pandas Series)

     Returns the filename the presentation was saved as.
     """
     filename_parts = []
     for col in filename_cols:
         part = str(row[col]).strip().replace(" ", "-")
         # A path separator coming from the data would send the file to
         # another directory (or make the save fail), so keep it out
         part = part.replace("/", "-").replace("\\", "-")
         filename_parts.append(part)

     # Empty cells arrive as NaN; substitute blank text for them instead of
     # the literal string "nan"
     row_data = {
         str(k).replace(" ", "").strip(): "" if pd.isna(v) else str(v).strip()
         for k, v in row.to_dict().items()
     }

     filename = "_".join(filename_parts)
     filename = f"{filename}.pptx"
     generate_slide(template_path, row_data, filename)

     return filename

 def main(df_path, template_path, mandatory_columns, filename_columns):
     """
     Populate Template with Data

     It is assumed that in your template pptx you have text you want to
     substitute in the form of `{{ColumnNameWithNoSpaces}}`

     So column `Business Value` will be mapped to text `{{BusinessValue}}`

     Rows where any mandatory column is empty (missing, or only whitespace)
     are skipped. One presentation is generated per remaining row.

     df_path - Path to csv with the data
     template_path - Path to templatised pptx
     mandatory_columns - What columns are mandatory for the rows
     filename_columns - What columns (and what order) for the generated filename
     """
     df = pd.read_csv(df_path)

     for column in mandatory_columns:
         values = df[column]
         # Cast to str before stripping: the `.str` accessor raises on columns
         # that pandas parsed as numbers. Missing cells are excluded by
         # `notna()`, so their "nan" text form never counts as a value.
         has_value = values.notna() & (values.astype(str).str.strip() != "")
         df = df[has_value]

     print("Number of rows from sheet:", len(df))

     # Plain loop rather than `apply`: writing files is a side effect and no
     # result frame is needed
     for _, row in df.iterrows():
         process_row(template_path, filename_columns, row)

 if __name__ == "__main__":
     # Expected arguments, in order:
     #   csv path, template pptx path, comma-separated mandatory columns,
     #   comma-separated filename columns
     args = sys.argv[1:]

     if len(args) < 4:
         sys.exit(
             f"usage: {sys.argv[0]} <csv_path> <template_pptx> "
             "<mandatory_columns,comma,separated> <filename_columns,comma,separated>"
         )

     df_path = args[0]
     template_path = args[1]
     mandatory_columns = args[2].split(",")
     filename_columns = args[3].split(",")

     # template_path must be passed through; leaving it out makes main() fail
     # with a TypeError for a missing positional argument
     main(df_path, template_path, mandatory_columns, filename_columns)
	import sys
	import re

	# Dataframe
	import pandas as pd
	import numpy as np

	# Powerpoint
	from pptx import Presentation

	"""
	requirements:
	python-pptx
	pandas
	numpy
	"""

	def find_placeholders(text):
	    """
	    Return the name of every `{{Name}}` placeholder found in `text`.

	    Only the text between the double curly braces is returned, in order of
	    appearance, and repeated placeholders are repeated in the result. An
	    empty list means nothing matched.
	    """
	    # Define the regex pattern to match placeholders like {{PlaceholderName}}
	    pattern = r'\{\{(.*?)\}\}'

	    # Find all matches in the input text
	    return re.findall(pattern, text)

	def generate_slide(template_path, replacements, ouput_path, remove_missing = True):
	    """
	    Fill in the `{{Placeholder}}` text of a template deck and save the result.

	    Every run of every paragraph is checked, for each shape in each slide's
	    `shapes` that has a text frame. A `{{Key}}` found in a run is swapped for
	    `str(replacements[Key])`. Progress and warnings are printed as it goes.

	    template_path - Path to the templatised pptx
	    replacements - Mapping of placeholder name to the value to substitute
	    ouput_path - Path the filled-in presentation is saved to
	    remove_missing - If True, placeholders with no entry in `replacements`
	      are blanked out; if False they are left in the text untouched

	    Matching is done one run at a time. If you see `Malformatted run?` then
	    go to the offending text in the Slide and delete the template value -
	    e.g. `{{Idea Rank}}` - as a run is determined by formatting, which breaks
	    the text into parts.

	    `python-pptx` docs:
	    https://python-pptx.readthedocs.io/en/latest/user/quickstart.html
	    """
	    prs = Presentation(template_path)

	    # Iterate through each slide in the presentation
	    for slide in prs.slides:
	        # Iterate through each shape in the slide
	        for shape in slide.shapes:
	            # Shapes without a text frame have nothing to substitute
	            if not shape.has_text_frame:
	                continue

	            # Iterate through each paragraph in the text frame
	            for paragraph in shape.text_frame.paragraphs:
	                # Iterate through each run in the paragraph
	                for run in paragraph.runs:
	                    search_text = str(run.text).strip()
	                    place_holders = find_placeholders(search_text)

	                    if not place_holders:
	                        # An opening `{{` with no complete placeholder
	                        # usually means the placeholder continues in
	                        # another run
	                        if "{{" in search_text:
	                            print("failed to find placeholders:", search_text, place_holders)
	                            print("Malformatted run?", [x.text for x in paragraph.runs], "\n")
	                        continue

	                    print("search_text", search_text)

	                    for place_holder in place_holders:
	                        token = "{{" + place_holder + "}}"
	                        if place_holder in replacements:
	                            print(f"replacing {place_holder}", "\n", replacements[place_holder], "\n", sep = "")
	                            run.text = run.text.replace(token, str(replacements[place_holder]))
	                        elif remove_missing:
	                            print(f"removing {place_holder}", "\n", sep = "")
	                            run.text = run.text.replace(token, "")

	                    # Anything still carrying `{{` was not fully substituted
	                    if "{{" in run.text:
	                        print("Stil got templeted text?", run.text, find_placeholders(run.text))
	                        print("Malformatted run?", [x.text for x in paragraph.runs])

	    # Save the modified presentation
	    prs.save(ouput_path)


	def process_row(template_path, filename_cols, row):
	    """
	    Generate one presentation from one row of the sheet.

	    The output filename is built from the row's `filename_cols` values
	    (spaces and path separators become `-`, parts are joined with `_`, and
	    `.pptx` is appended). The whole row is handed to `generate_slide` as the
	    placeholder data, keyed by column name with all spaces removed.

	    template_path - Path to templatised pptx
	    filename_cols - What columns (and what order) for the generated filename
	    row - One row of the sheet; must support `row[col]` and `to_dict()`
	      (e.g. a pandas Series)

	    Returns the filename the presentation was saved as.
	    """
	    filename_parts = []
	    for col in filename_cols:
	        part = str(row[col]).strip().replace(" ", "-")
	        # A path separator coming from the data would send the file to
	        # another directory (or make the save fail), so keep it out
	        part = part.replace("/", "-").replace("\\", "-")
	        filename_parts.append(part)

	    # Empty cells arrive as NaN; substitute blank text for them instead of
	    # the literal string "nan"
	    row_data = {
	        str(k).replace(" ", "").strip(): "" if pd.isna(v) else str(v).strip()
	        for k, v in row.to_dict().items()
	    }

	    filename = "_".join(filename_parts)
	    filename = f"{filename}.pptx"
	    generate_slide(template_path, row_data, filename)

	    return filename

	def main(df_path, template_path, mandatory_columns, filename_columns):
	    """
	    Populate Template with Data

	    It is assumed that in your template pptx you have text you want to
	    substitute in the form of `{{ColumnNameWithNoSpaces}}`

	    So column `Business Value` will be mapped to text `{{BusinessValue}}`

	    Rows where any mandatory column is empty (missing, or only whitespace)
	    are skipped. One presentation is generated per remaining row.

	    df_path - Path to csv with the data
	    template_path - Path to templatised pptx
	    mandatory_columns - What columns are mandatory for the rows
	    filename_columns - What columns (and what order) for the generated filename
	    """
	    df = pd.read_csv(df_path)

	    for column in mandatory_columns:
	        values = df[column]
	        # Cast to str before stripping: the `.str` accessor raises on columns
	        # that pandas parsed as numbers. Missing cells are excluded by
	        # `notna()`, so their "nan" text form never counts as a value.
	        has_value = values.notna() & (values.astype(str).str.strip() != "")
	        df = df[has_value]

	    print("Number of rows from sheet:", len(df))

	    # Plain loop rather than `apply`: writing files is a side effect and no
	    # result frame is needed
	    for _, row in df.iterrows():
	        process_row(template_path, filename_columns, row)

	if __name__ == "__main__":
	    # Expected arguments, in order:
	    #   csv path, template pptx path, comma-separated mandatory columns,
	    #   comma-separated filename columns
	    args = sys.argv[1:]

	    if len(args) < 4:
	        sys.exit(
	            f"usage: {sys.argv[0]} <csv_path> <template_pptx> "
	            "<mandatory_columns,comma,separated> <filename_columns,comma,separated>"
	        )

	    df_path = args[0]
	    template_path = args[1]
	    mandatory_columns = args[2].split(",")
	    filename_columns = args[3].split(",")

	    # template_path must be passed through; leaving it out makes main() fail
	    # with a TypeError for a missing positional argument
	    main(df_path, template_path, mandatory_columns, filename_columns)