Skip to content

Instantly share code, notes, and snippets.

@felixvd
Last active February 3, 2016 11:52
Show Gist options
  • Save felixvd/bfd65c4d06e2505f5551 to your computer and use it in GitHub Desktop.
Save felixvd/bfd65c4d06e2505f5551 to your computer and use it in GitHub Desktop.
Excel checker
import pandas as pd
import re
# Translation QA checker.
#
# Reads "inputfile.xlsx" (columns "Japanese" and "German"), treats Japanese as
# the source segment and German as the target segment, and writes two reports:
#   * checkeroutput1.txt - rows whose model numbers differ between source and
#     target;
#   * checkeroutput2.txt - rows whose target repeats a run of three words.

# Candidate model numbers: runs of two or more ASCII capitals and/or digits.
MODEL_NUMBER_PATTERN = re.compile(r"[A-Z0-9][A-Z0-9]+")

# Three consecutive words that occur again later in the same text.  The \b
# anchors make the repeated run line up with whole words, so "in a bar" is
# not reported as repeated by "in a barn".
REPETITION_PATTERN = re.compile(r"\b(\w+\W+\w+\W+\w+)\b.*\b\1\b")


def cell_text(value):
    """Return a spreadsheet cell as text.

    Empty cells are read by pandas as NaN and numeric-only cells as numbers;
    both would break the regex checks and the report formatting, so missing
    values become "" and everything else is passed through ``str``.
    """
    if pd.isna(value):
        return ""
    return str(value)


def model_numbers_differ(source, target):
    """Return True if source and target do not contain the same model numbers.

    Only the set of distinct codes is compared: order and the number of
    occurrences are ignored.
    """
    source_codes = set(MODEL_NUMBER_PATTERN.findall(source))
    target_codes = set(MODEL_NUMBER_PATTERN.findall(target))
    return source_codes != target_codes


def has_repeated_phrase(text):
    """Return True if some run of three words occurs twice in *text*."""
    return REPETITION_PATTERN.search(text) is not None


def format_row(index, source, target):
    """Format one report line: row label, then target, then source."""
    return "Row " + str(index) + "\t" + target + "\t" + source + "\n"


def find_flagged_rows(rows, is_suspicious):
    """Return the formatted report lines for every suspicious row.

    rows          -- iterable of (index, source, target) tuples of text.
    is_suspicious -- callable taking (source, target) and returning a bool.
    """
    return [
        format_row(index, source, target)
        for index, source, target in rows
        if is_suspicious(source, target)
    ]


def write_report(path, header, lines):
    """Write *header* followed by *lines* to *path*.

    The file is written as UTF-8 explicitly: the segments mix Japanese and
    German, which many platform-default encodings cannot represent.
    """
    with open(path, "w", encoding="utf-8") as fout:
        fout.write(header)
        fout.writelines(lines)


def main():
    """Run both checks on inputfile.xlsx and write the two report files."""
    indf = pd.read_excel("inputfile.xlsx")
    # Build a data frame with source and target segments
    df = pd.concat([indf.Japanese, indf.German], axis=1)
    df.columns = ["Source", "Target"]
    rows = [
        (index, cell_text(source), cell_text(target))
        for index, source, target in zip(df.index, df["Source"], df["Target"])
    ]

    # Check if model numbers in source are also in target and vice versa
    write_report(
        'checkeroutput1.txt',
        "The following lines may not have the same model numbers in source and target.\n",
        find_flagged_rows(rows, model_numbers_differ),
    )

    # Check if target contains repetitions of three or more words
    write_report(
        'checkeroutput2.txt',
        "The following lines have repetitions of 3+ words in the target field.\n",
        find_flagged_rows(
            rows, lambda source, target: has_repeated_phrase(target)
        ),
    )


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment