Created
August 26, 2018 12:24
-
-
Save khuangaf/4c40784cc506ae8d43188089be25177c to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def same_subtitle(current_subtitle: str, next_subtitle: str, *, threshold: float = 0.7) -> bool:
    """Return True if the two subtitles are approximately the same.

    Each subtitle is reduced to the set of its distinct characters, e.g.
    '我很乖' -> {'我', '很', '乖'}, so character order and repetition are
    ignored. The subtitles are considered the same when the characters
    they share make up at least ``threshold`` of the distinct characters
    of *either* subtitle.

    Args:
        current_subtitle: Text of the current subtitle.
        next_subtitle: Text of the subtitle to compare against.
        threshold: Fraction of a subtitle's distinct characters that must
            also appear in the other subtitle. Defaults to 0.7.

    Returns:
        True if the subtitles are similar enough, False otherwise. Two
        empty subtitles are the same; an empty subtitle is never the same
        as a non-empty one.
    """
    current_chars = set(current_subtitle)
    next_chars = set(next_subtitle)

    # Without this guard an empty subtitle would trivially satisfy the
    # ratio test (0 >= threshold * 0) and match every other subtitle.
    if not current_chars or not next_chars:
        return current_chars == next_chars

    shared_count = len(current_chars & next_chars)

    # Meeting the threshold relative to either side is enough; this also
    # tolerates one subtitle being a partial capture of the other.
    return (
        shared_count >= threshold * len(current_chars)
        or shared_count >= threshold * len(next_chars)
    )
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment