Last active
February 3, 2016 11:52
-
-
Save felixvd/bfd65c4d06e2505f5551 to your computer and use it in GitHub Desktop.
Excel checker
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import re | |
# Load the workbook, then pair its "Japanese" column (relabelled "Source")
# with its "German" column (relabelled "Target") in a two-column data frame.
indf = pd.read_excel("inputfile.xlsx")
df = pd.DataFrame({"Source": indf["Japanese"], "Target": indf["German"]})
# Check that the model numbers found in the source also appear in the target
# and vice versa. A "model number" here is any run of two or more uppercase
# ASCII letters and/or digits.
pattern = re.compile(r"[A-Z0-9][A-Z0-9]+")

# The report echoes the original cell text (Japanese and German columns), so
# the encoding is pinned to UTF-8 rather than left to the platform default.
# The context manager closes the file even if processing a row raises.
with open('checkeroutput1.txt', 'w', encoding='utf-8') as fout:
    fout.write("The following lines may not have the same model numbers in source and target.\n")
    for i, source, target in zip(df.index, df.Source, df.Target):
        # Empty spreadsheet cells are read as NaN and numeric cells as
        # numbers; normalise both to text so neither the regex nor the
        # report line can fail with a TypeError.
        source = "" if pd.isnull(source) else str(source)
        target = "" if pd.isnull(target) else str(target)
        # Order and multiplicity do not matter: the row is flagged as soon as
        # one side contains a code that the other side lacks.
        if set(pattern.findall(source)) != set(pattern.findall(target)):
            fout.write("Row " + str(i) + "\t" + target + "\t" + source + "\n")
# Check whether the target repeats a sequence of three consecutive words
# (longer repetitions contain such a sequence, so they are flagged as well).
# NOTE: the pattern is not anchored on word boundaries, so the repeated
# sequence may begin or end in the middle of a word.
repeated_words = re.compile(r"(\w+\W+\w+\W+\w+).*\1")

# The report echoes the original cell text, so the encoding is pinned to UTF-8
# rather than left to the platform default. The context manager closes the
# file even if processing a row raises.
with open('checkeroutput2.txt', 'w', encoding='utf-8') as fout:
    fout.write("The following lines have repetitions of 3+ words in the target field.\n")
    for i, source, target in zip(df.index, df.Source, df.Target):
        # Empty spreadsheet cells are read as NaN and numeric cells as
        # numbers; normalise both to text so neither the regex nor the
        # report line can fail with a TypeError.
        source = "" if pd.isnull(source) else str(source)
        target = "" if pd.isnull(target) else str(target)
        if repeated_words.search(target):
            fout.write("Row " + str(i) + "\t" + target + "\t" + source + "\n")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment