Skip to content

Instantly share code, notes, and snippets.

@eightysteele
Created August 6, 2011 23:08
Show Gist options
  • Select an option

  • Save eightysteele/1129865 to your computer and use it in GitHub Desktop.

Select an option

Save eightysteele/1129865 to your computer and use it in GitHub Desktop.
# Common English words that are left out of the index.
# Entries are lowercase, alphabetical, and grouped by initial letter.
STOP_WORDS = [
    'a', 'able', 'about', 'across', 'after', 'all', 'almost', 'also', 'am',
    'among', 'an', 'and', 'any', 'are', 'as', 'at',
    'be', 'because', 'been', 'but', 'by',
    'can', 'cannot', 'could',
    'dear', 'did', 'do', 'does',
    'either', 'else', 'ever', 'every',
    'for', 'from',
    'get', 'got',
    'had', 'has', 'have', 'he', 'her', 'hers', 'him', 'his', 'how', 'however',
    'i', 'if', 'in', 'into', 'is', 'it', 'its',
    'just',
    'least', 'let', 'like', 'likely',
    'may', 'me', 'might', 'most', 'must', 'my',
    'neither', 'no', 'nor', 'not',
    'of', 'off', 'often', 'on', 'only', 'or', 'other', 'our', 'own',
    'rather',
    'said', 'say', 'says', 'she', 'should', 'since', 'so', 'some',
    'than', 'that', 'the', 'their', 'them', 'then', 'there', 'these', 'they',
    'this', 'tis', 'to', 'too', 'twas',
    'us',
    'wants', 'was', 'we', 'were', 'what', 'when', 'where', 'which', 'while',
    'who', 'whom', 'why', 'will', 'with', 'would',
    'yet', 'you', 'your',
]
# Lowercased Darwin Core concept names whose values should be kept out of
# the full text index. Entries are alphabetical, grouped by initial letter.
DO_NOT_FULL_TEXT = [
    'acceptednameusageid', 'accessrights',
    'basisofrecord',
    'collectionid', 'coordinateprecision', 'coordinateuncertaintyinmeters',
    'datasetid', 'dateidentified', 'day', 'decimallatitude',
    'decimallongitude', 'disposition',
    'enddayofyear', 'eventdate', 'eventid', 'eventtime',
    'fieldnotes', 'footprintspatialfit', 'footprintsrs', 'footprintwkt',
    'geologicalcontextid', 'georeferenceremarks',
    'georeferenceverificationstatus',
    'highergeographyid',
    'identificationid', 'individualcount', 'individualid', 'institutionid',
    'language', 'locationid',
    'maximumdepthinmeters', 'maximumdistanceabovesurfaceinmeters',
    'maximumelevationinmeters', 'minimumdepthinmeters',
    'minimumdistanceabovesurfaceinmeters', 'minimumelevationinmeters',
    'modified', 'month',
    'nameaccordingtoid', 'namepublishedinid', 'nomenclaturalcode',
    'occurrencedetails', 'occurrenceid', 'originalnameusageid',
    'parentnameusageid', 'pointradiusspatialfit',
    'rights', 'rightsholder',
    'scientificnameid', 'startdayofyear',
    'taxonconceptid', 'taxonid', 'type',
    'verbatimcoordinates', 'verbatimeventdate', 'verbatimlatitude',
    'verbatimlongitude',
    'year',
]
# Lowercased Darwin Core concept names that should be left out of the index
# entirely. Entries are alphabetical, grouped by initial letter.
DO_NOT_INDEX = [
    'acceptednameusageid', 'accessrights', 'associatedmedia',
    'associatedoccurrences', 'associatedreferences', 'associatedsequences',
    'associatedtaxa',
    'bibliographiccitation',
    'collectionid',
    'datageneralizations', 'datasetid', 'dateidentified', 'disposition',
    'eventdate', 'eventid', 'eventremarks', 'eventtime',
    'fieldnotes', 'footprintspatialfit', 'footprintsrs', 'footprintwkt',
    'geologicalcontextid', 'georeferenceremarks', 'georeferencesources',
    'habitat', 'higherclassification', 'highergeography',
    'highergeographyid',
    'identificationid', 'identificationreferences', 'identificationremarks',
    'individualcount', 'individualid', 'informationwithheld',
    'institutionid',
    'locationid', 'locationremarks',
    'modified',
    'nameaccordingtoid', 'namepublishedin', 'namepublishedinid',
    'occurrencedetails', 'occurrenceid', 'occurrenceremarks',
    'originalnameusageid', 'othercatalognumbers',
    'parentnameusageid', 'pointradiusspatialfit', 'preparations',
    'previousidentifications',
    'rights', 'rightsholder',
    'scientificnameid',
    'taxonconceptid', 'taxonid', 'taxonremarks',
    'verbatimcoordinates', 'verbatimlatitude', 'verbatimlongitude',
]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment