Created
April 14, 2016 23:20
-
-
Save brydavis/c7ba7c961a2fee78042b7917cfe7be95 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # coding: utf-8 | |
| # In[70]: | |
| import re | |
# Sample SSN pairs for ssn(). The trailing comment on each row is the score
# ssn() returns for that pair (number of matching trailing digits).
ssn_data = [
    ["000115464", "000135464"],      # 4
    ["763415464", "165415464"],      # 6
    ["763-41-5464", "165-41-5464"],  # 6
    ["-01-5464", "2-41-5464"],       # 5
    ["469879875", "469879870"],      # 0
    ["783-65-6374", "783656374"],    # 9
    ["115464", "5464"],              # 4
]
def ssn(a, b):
    """Score how many trailing digits two SSN strings have in common.

    Each input is stripped of every non-digit character, reversed so that
    the comparison starts at the last digit, and right-padded with "0" up to
    nine characters. The score is the number of positions, counted from the
    end of the original values, that agree before the first mismatch.

    Because missing digits are padded with "0", a missing digit compares
    equal to a real "0" (or to another missing digit) in the other value;
    two empty inputs therefore score 9.

    Args:
        a: First SSN as a string; separators such as "-" are ignored.
        b: Second SSN, in the same format as ``a``.

    Returns:
        An int between 0 (the last digits differ) and 9 (all nine compared
        positions agree).
    """
    def normalize(x):
        # Keep digits only, last digit first, padded to the 9 compared slots.
        digits = re.sub(r"[^0-9]", "", x)[::-1]
        return digits.ljust(9, "0")

    score = 0
    # Only the first nine (reversed) positions are ever compared.
    for left, right in zip(normalize(a)[:9], normalize(b)[:9]):
        # Compare by value; identity (`is`) on strings is an implementation
        # detail and must not be relied on for equality.
        if left != right:
            break
        score += 1
    return score
| # In[71]: | |
# Show the ssn() score for every sample pair, followed by the pair itself.
for pair in ssn_data:
    print(ssn(pair[0], pair[1]), pair, sep=" \t ")
| # In[97]: | |
| import time | |
# Sample date-of-birth pairs for dob(). The trailing comment on each row is
# the score dob() returns for that pair.
dob_data = [
    ["1985-06-07", "1985-06-07"],  # 6
    ["1985-06-07", "1985-6-7"],    # 6
    ["1988-06-07", "1985-06-07"],  # 3
    ["1985-01-07", "1985-06-07"],  # 4
]
def dob(a, b):
    """Score how closely two "YYYY-MM-DD" dates of birth agree.

    The year contributes 3 points minus one point per year of difference,
    so that component drops below zero once the years are more than three
    apart. An identical month adds 2 points and an identical day of the
    month adds 1.

    Args:
        a: First date as a string parseable with the format "%Y-%m-%d".
        b: Second date, in the same format as ``a``.

    Returns:
        An int that is at most 6 (identical dates) and may be negative.

    Raises:
        ValueError: If either string does not match "%Y-%m-%d".
    """
    first = time.strptime(a, "%Y-%m-%d")
    second = time.strptime(b, "%Y-%m-%d")

    # Equal years give a difference of 0, i.e. the full 3 points.
    year_points = 3 - abs(first.tm_year - second.tm_year)
    month_points = 2 if first.tm_mon == second.tm_mon else 0
    day_points = 1 if first.tm_mday == second.tm_mday else 0
    return year_points + month_points + day_points
| # In[98]: | |
# Show the dob() score for every sample pair, followed by the pair itself.
for pair in dob_data:
    print(dob(pair[0], pair[1]), pair, sep=" \t ")
| # In[102]: | |
| # a = "783656374" | |
| # # print(a[1::-1]) | |
| # abs(sum([-1*(int(x)+2) for x in a])) | |
| # In[199]: | |
| import re | |
| from collections import defaultdict | |
# Sample name pairs for name(). The trailing comment on each row is the
# score name() returns for that pair (longest shared prefix, in letters,
# over the whole names and over every pair of space-separated words).
name_data = [
    ["Bryan", "BrYaN"],                    # 5
    ["ChristoPHER", "CHRIST opher"],       # 11
    ["lucia", "LUCIA"],                    # 5
    ["O'Brien", "O'Brien"],                # 6
    ["Davis", "Davidson"],                 # 4
    ["Julia", "Julie"],                    # 4
    ["Amy Jo", "Jo"],                      # 2
    ["Amy Jo", "Amy"],                     # 3
    ["Michaels-Smith", "Michaels Smith"],  # 13
    ["Jones-Smith", "JonesSmith"],         # 10
    ["Billy Jo", "Billy Jo"],              # 7
]
# Running tally of how many times each cleaned single-word name has been
# passed through name(); name() updates it as a side effect.
name_counter = defaultdict(int)


def _shared_prefix_len(a, b):
    """Return how many leading characters of ``a`` and ``b`` are equal."""
    count = 0
    for left, right in zip(a, b):
        # Compare by value; identity (`is`) is not a reliable equality test
        # for strings.
        if left != right:
            break
        count += 1
    return count


def name(a, b):
    """Score two names by the length of their longest shared prefix.

    Names are compared case-insensitively with every non-letter removed.
    When neither name contains a space, the score is simply the number of
    leading letters the two cleaned names share. When either name contains
    a space, every space-separated word of ``a`` is scored against every
    word of ``b`` (recursively), the two whole cleaned names are scored as
    well, and the best of those scores is returned.

    Side effects (single-word comparisons only): both cleaned names are
    counted in the module-level ``name_counter``, and the share of all
    counted names taken by the more frequent of the two is printed.

    Args:
        a: First name as a string.
        b: Second name as a string.

    Returns:
        An int >= 0: the number of matching leading letters.
    """
    def clean(x):
        # Spell out both ranges: "A-z" would also keep the punctuation that
        # sits between "Z" and "a" in ASCII ("[", "\", "]", "^", "_", "`").
        return re.sub(r"[^A-Za-z]", "", x).lower()

    a_words = a.split(" ")
    b_words = b.split(" ")
    if len(a_words) > 1 or len(b_words) > 1:
        # Word-by-word scores first, then the score of the full names.
        scores = [name(i, j) for i in a_words for j in b_words]
        scores.append(_shared_prefix_len(clean(a), clean(b)))
        return max(scores)

    a = clean(a)
    b = clean(b)
    for key in (a, b):
        # .get() keeps this working even if name_counter has been replaced
        # by a plain dict, which has no default for unseen names.
        name_counter[key] = name_counter.get(key, 0) + 1
    score = _shared_prefix_len(a, b)
    commonality = (
        max(name_counter.get(a), name_counter.get(b))
        / sum(name_counter.values())
    )
    print(commonality)
    return score
| # In[200]: | |
# Show the name() score for every sample pair, followed by the pair itself.
# Each name() call also updates name_counter.
for pair in name_data:
    print(name(pair[0], pair[1]), pair, sep=" \t ")

# Snapshot the tally as a plain dict and discard the empty-string entry,
# if there is one, before reporting.
name_counter = dict(name_counter)
name_counter.pop("", None)
print(name_counter)

# Convert raw counts into each name's share of all counted names,
# rounded to two decimals.
total_names = sum(name_counter.values())
name_counter = {
    nc: round(count / total_names, 2) for nc, count in name_counter.items()
}
print(name_counter)
# In[192]:
# Notebook scratch cell; the value is discarded when run as a script.
round(45.34545345, 2)
| # In[ ]: | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment