Last active
March 20, 2024 18:15
-
-
Save rsalaza4/a923e5591401bccb770b244b766efd17 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Regular-expression fragments for institution names.
#
# ORDER MATTERS: the fragments are joined into a single alternation, and the
# regex engine takes the FIRST alternative that matches at a given position.
# Longer, more specific shapes must therefore come before their shorter
# prefixes (e.g. "University of X at Y Z" before "University of X").
#
# '[A-Z][a-z]*' stands for one capitalised word; '[A-Z]+' for an all-caps
# acronym. The acronym fragments use '+' rather than '*': with '*' the acronym
# could be empty, so the pattern matched starting at the preceding space
# (" University"), which shadowed the proper "University of ..." alternatives
# one character later and produced names with a leading space.
sub_patterns = [
    # "<up to three words> University of <one or two words> [at <one or two words>]"
    r'[A-Z][a-z]* [A-Z][a-z]* [A-Z][a-z]* University of [A-Z][a-z]* [A-Z][a-z]* at [A-Z][a-z]* [A-Z][a-z]*',
    r'[A-Z][a-z]* [A-Z][a-z]* [A-Z][a-z]* University of [A-Z][a-z]* [A-Z][a-z]* at [A-Z][a-z]*',
    r'[A-Z][a-z]* [A-Z][a-z]* [A-Z][a-z]* University of [A-Z][a-z]* at [A-Z][a-z]* [A-Z][a-z]*',
    r'[A-Z][a-z]* [A-Z][a-z]* [A-Z][a-z]* University of [A-Z][a-z]* at [A-Z][a-z]*',
    r'[A-Z][a-z]* [A-Z][a-z]* [A-Z][a-z]* University of [A-Z][a-z]* [A-Z][a-z]*',
    r'[A-Z][a-z]* [A-Z][a-z]* [A-Z][a-z]* University of [A-Z][a-z]*',
    r'[A-Z][a-z]* [A-Z][a-z]* University of [A-Z][a-z]* [A-Z][a-z]* at [A-Z][a-z]* [A-Z][a-z]*',
    r'[A-Z][a-z]* [A-Z][a-z]* University of [A-Z][a-z]* [A-Z][a-z]* at [A-Z][a-z]*',
    r'[A-Z][a-z]* [A-Z][a-z]* University of [A-Z][a-z]* at [A-Z][a-z]* [A-Z][a-z]*',
    r'[A-Z][a-z]* [A-Z][a-z]* University of [A-Z][a-z]* at [A-Z][a-z]*',
    r'[A-Z][a-z]* [A-Z][a-z]* University of [A-Z][a-z]* [A-Z][a-z]*',
    r'[A-Z][a-z]* [A-Z][a-z]* University of [A-Z][a-z]*',
    r'[A-Z][a-z]* University of [A-Z][a-z]* [A-Z][a-z]* at [A-Z][a-z]* [A-Z][a-z]*',
    r'[A-Z][a-z]* University of [A-Z][a-z]* [A-Z][a-z]* at [A-Z][a-z]*',
    r'[A-Z][a-z]* University of [A-Z][a-z]* at [A-Z][a-z]* [A-Z][a-z]*',
    r'[A-Z][a-z]* University of [A-Z][a-z]* at [A-Z][a-z]*',
    r'[A-Z][a-z]* University of [A-Z][a-z]* [A-Z][a-z]*',
    r'[A-Z][a-z]* University of [A-Z][a-z]*',
    r'University of [A-Z][a-z]* [A-Z][a-z]* at [A-Z][a-z]* [A-Z][a-z]*',
    r'University of [A-Z][a-z]* [A-Z][a-z]* at [A-Z][a-z]*',
    r'University of [A-Z][a-z]* at [A-Z][a-z]* [A-Z][a-z]*',
    r'University of [A-Z][a-z]* at [A-Z][a-z]*',
    r'University of [A-Z][a-z]* [A-Z][a-z]*',
    r'University of [A-Z][a-z]*',
    # "<one or two words | ACRONYM> University"
    r'[A-Z][a-z]* [A-Z][a-z]* University',
    r'[A-Z][a-z]* University',
    r'[A-Z]+ University',  # '+' so an empty acronym cannot match " University"
    # Institutes
    r'[A-Z][a-z]* Institute of [A-Z][a-z]* & [A-Z][a-z]*',
    r'[A-Z][a-z]* Institute of [A-Z][a-z]* [A-Z][a-z]*',
    r'[A-Z][a-z]* Institute of [A-Z][a-z]*',
    # Colleges
    r'[A-Z][a-z]* [A-Z][a-z]* Community College',
    r'[A-Z][a-z]* Community College',
    r'College of [A-Z][a-z]* & [A-Z][a-z]*',
    r'[A-Z][a-z]* [A-Z][a-z]* College of [A-Z][a-z]*',
    r'[A-Z]+ College of [A-Z][a-z]*',  # '+' for the same reason as above
    r'[A-Z][a-z]* College of [A-Z][a-z]*',
    # Bare "College of X": previously only reachable through the empty-acronym
    # quirk (and then returned with a leading space); now matched explicitly.
    r'College of [A-Z][a-z]*',
    r'[A-Z][a-z]* [A-Z][a-z]* College',
    r'[A-Z][a-z]* College',
    # High schools and military academies
    r'[A-Z][a-z]* [A-Z][a-z]* High School',
    r'[A-Z][a-z]* High School',
    r'[A-Z][a-z]* [A-Z][a-z]* Military Academy',
    r'[A-Z][a-z]* Military Academy',
    # Spanish-language names
    r'Universidad de los [A-Z][a-z]*',
    r'Universidad de las [A-Z][a-z]*',
    r'Universidad de [A-Z][a-z]*',
    r'Universidad [A-Z][a-z]* de [A-Z][a-z]* [A-Z][a-z]*',
    r'Universidad [A-Z][a-z]* de los [A-Z][a-z]*',
    r'Universidad [A-Z][a-z]* de las [A-Z][a-z]*',
    r'Universidad [A-Z][a-z]* de [A-Z][a-z]*',
]

# Join the fragments into one alternation wrapped in a single capturing group,
# so that re.findall() returns the matched names as plain strings.
university_patterns = '({})'.format('|'.join(sub_patterns))
def get_univerisities(self):
    """Collect the institution names found in ``self.raw_text``.

    Every substring of ``self.raw_text`` that matches ``university_patterns``
    is gathered, duplicates are dropped (keeping first-seen order, so the
    result is deterministic), and generic leftovers such as a bare
    "University" or "College" are discarded. The result is stored in
    ``self.universities``; nothing is returned.

    NOTE(review): the method name is misspelled ("univerisities"); it is kept
    as-is because callers outside this view may rely on it.
    """
    # Matches that carry no actual institution name and must not be reported.
    universities_to_remove = {
        "University",
        "University ",
        " University",
        " University ",
        "College",
    }

    # Treat missing text as empty so a blank document yields an empty list
    # rather than a TypeError from re.findall().
    text = self.raw_text or ""

    # Find all strings in the text that follow the university name patterns.
    university_matches = re.findall(university_patterns, text)

    # dict.fromkeys() de-duplicates while preserving order of first appearance.
    self.universities = [
        name
        for name in dict.fromkeys(university_matches)
        if name not in universities_to_remove
    ]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment