Skip to content

Instantly share code, notes, and snippets.

@robertklep
Created June 14, 2012 05:53
Show Gist options
  • Save robertklep/2928202 to your computer and use it in GitHub Desktop.
Save robertklep/2928202 to your computer and use it in GitHub Desktop.
Regular expression matching most common Dutch, German, English and Spanish stopwords
# -*- coding:utf-8 -*-
import re
# Compiled pattern matching the most common Dutch, German, English and Spanish
# stopwords as whole words.
#
# * ``(?u)`` + ``re.I``: Unicode-aware, case-insensitive matching.
# * The leading and trailing ``\b`` anchor every alternative on word
#   boundaries, so a stopword embedded in a longer word is not matched.
# * The alternation is a prefix-factored trie; it is laid out below as one
#   adjacent raw-string fragment per leading letter.  Python concatenates the
#   fragments at compile time into a single pattern, so no whitespace or
#   newline may appear *inside* any fragment.
STOPWORDS = re.compile(
    r'(?u)\b(?:'
    r'a(?:(?:an|b(?:er|o(?:ut|ve))|fter|gain(?:st)?|l(?:(?:g(?:un(?:as|os)|o)|l(?:e[mnrs]?)?|so?|tijd))?|n(?:(?:d(?:er(?:(?:e[mnrs]?|[mnrs]))?)?|tes?|y))?|re|u(?:ch|[fs])|[mst]))?'
    r'|b(?:e(?:(?:cause|en|fore|i(?:ng)?|low|tween|n))?|i(?:st?|[jn])|oth|ut|y)'
    r'|c(?:o(?:mo|n(?:tra)?)|ua(?:ndo|l))'
    r'|d(?:a(?:(?:ar|mit|nn?|s(?:selbe)?|zu|[tß]))?|e(?:(?:in(?:e[mnrs]?)?|m(?:selben)?|n(?:(?:selben|n))?|r(?:(?:er|selben?))?|s(?:(?:de|se(?:lben|n)))?|ze|l))?|i(?:ch|e(?:s(?:e(?:(?:lben?|[mnrs]))?)?)?|[drt])|o(?:(?:ch|e[ns]|ing|nde|or|rt|wn))?|u(?:(?:r(?:ante|ch|ing)|s))?)'
    r'|e(?:(?:ach|ens?|in(?:(?:e[mnrs]?|ig(?:e[mnrs]?)?|mal))?|l(?:l(?:as?|os))?|n(?:tre)?|r(?:(?:a(?:(?:is|[ns]))?|es))?|s(?:(?:as?|os?|t(?:a(?:(?:ba(?:(?:is|[ns]))?|d(?:(?:as?|os?))?|mos|ndo|r(?:(?:emos|á[ns]?|é(?:is)?|ía(?:(?:is|mos|[ns]))?))?|s))?|e(?:mos)?|o[sy]?|uv(?:i(?:e(?:r(?:a(?:(?:is|[ns]))?|on)|se(?:(?:is|[ns]))?)|mos|ste(?:is)?|é(?:ramos|semos))|[eo])|á(?:(?:bamos|is|[ns]))?|é(?:(?:is|[ns]))?)|e))?|twas|u(?:ch|er|re[mnrs]?)))?'
    r'|f(?:ew|or|rom|u(?:e(?:(?:r(?:a(?:(?:is|[ns]))?|on)|se(?:(?:is|[ns]))?))?|i(?:(?:mos|ste(?:is)?))?|rther|é(?:ramos|semos))|ür)'
    r'|ge(?:(?:en|gen|we(?:est|sen)))?'
    r'|h(?:a(?:(?:ar|b(?:(?:en?|i(?:d(?:as?|os?)|endo)|r(?:emos|á[ns]?|é(?:is)?|ía(?:(?:is|mos|[ns]))?)|éis|ía(?:(?:is|mos|[ns]))?))?|s(?:ta)?|t(?:ten?)?|v(?:ing|e)|y(?:(?:a(?:(?:mos|[ns]))?|áis))?|[dn]))?|e(?:(?:b(?:ben)?|eft|m(?:os)?|r(?:(?:s(?:elf)?|e))?|t))?|i(?:er|m(?:self)?|n(?:ter)?|[js])|o[ew]|u(?:b(?:i(?:e(?:r(?:a(?:(?:is|[ns]))?|on)|se(?:(?:is|[ns]))?)|mos|ste(?:is)?|é(?:ramos|semos))|[eo])|n))'
    r'|i(?:(?:ch|e(?:mand|ts)|h(?:n(?:en)?|r(?:e[mnrs]?)?|m)|n(?:(?:dem|to|s))?|st?|t(?:s(?:elf)?)?|[fkm]))?'
    r'|j(?:e(?:(?:de[mnrs]?|ne[mnrs]?|tzt))?|a)'
    r'|k(?:ann?|ein(?:e[mnrs]?)?|on|unnen|önn(?:en|te))'
    r'|l(?:as?|es?|os?)'
    r'|m(?:a(?:ar|chen|n(?:che[mnrs]?)?)|e(?:(?:er|in(?:e[mnrs]?)?|[nt]))?|i(?:(?:ch|jn?|[rst]))?|o(?:et|re|st)|u(?:chos?|ss(?:te)?|y)|y(?:self)?|ás|í(?:(?:as?|os?))?)'
    r'|n(?:a(?:(?:ar|ch|da))?|i(?:(?:chts?|ets?))?|o(?:(?:ch|s(?:otr(?:as|os))?|[grt]))?|u(?:(?:estr(?:as?|os?)|[nr]))?)'
    r'|o(?:(?:der|ff?|hne|m(?:dat)?|n(?:(?:ce|der|ly|s))?|ok|t(?:her|r(?:as?|os?))|u(?:r(?:s(?:elves)?)?|t)|ver|wn|[bprs]))?'
    r'|p(?:ara|ero|o(?:co|r(?:que)?))'
    r'|qu(?:ien(?:es)?|[eé])'
    r'|reeds'
    r'|s(?:ame|e(?:(?:a(?:(?:mos|[ns]))?|hr|in(?:e[mnrs]?)?|lbst|ntid(?:(?:as?|os?))?|r(?:emos|á[ns]?|é(?:is)?|ía(?:(?:is|mos|[ns]))?)|áis))?|he|i(?:ch|e(?:nte)?|n(?:(?:tiendo|d))?)|o(?:(?:bre|is|l(?:che[mnrs]?|l(?:te)?)|m(?:os|e)|n(?:(?:dern|st))?|y))?|u(?:(?:ch|y(?:as?|os?)|s))?|í)'
    r'|t(?:a(?:mbién|nto)|e(?:(?:gen|n(?:dr(?:emos|á[ns]?|é(?:is)?|ía(?:(?:is|mos|[ns]))?)|e(?:mos|d)|g(?:a(?:(?:mos|[ns]))?|áis|o)|i(?:d(?:as?|os?)|endo)|éis|ía(?:(?:is|mos|[ns]))?)))?|h(?:a[nt]|e(?:(?:irs?|m(?:selves)?|re|se|[ny]))?|is|ose|rough)|i(?:ene[ns]?)?|o(?:(?:ch|dos?|en|[ot]))?|u(?:(?:v(?:i(?:e(?:r(?:a(?:(?:is|[ns]))?|on)|se(?:(?:is|[ns]))?)|mos|ste(?:is)?|é(?:ramos|semos))|[eo])|y(?:as?|os?)|s))?|ú)'
    r'|u(?:(?:it|n(?:(?:d(?:er)?|os?|s(?:e[mnrs]?)?|t(?:er|il)|a))?|[mpw]))?'
    r'|v(?:an|e(?:el|ry)|iel|o(?:or|sostr(?:as|os)|[mnr])|uestr(?:as?|os?))'
    r'|w(?:a(?:nt|r(?:(?:en|st))?|[st])|e(?:(?:i(?:ter|l)|lche[mnrs]?|nn|r(?:d(?:en?)?|e)|zen|g))?|h(?:at|e(?:re|n)|i(?:ch|le)|om?|y)|i(?:e(?:der)?|ll?|r(?:(?:st|d))?|th)|o(?:(?:ll(?:en|te)|rd(?:en|t)))?|ährend|ürden?)'
    r'|y(?:(?:o(?:u(?:r(?:s(?:el(?:ves|f))?)?)?)?|a))?'
    r'|z(?:al|e(?:lf)?|i(?:ch|jn?)|o(?:(?:nder|u))?|u[mr]?|w(?:ar|ischen))'
    r'|é(?:ramos|l)'
    r'|über'
    r')\b',
    re.I,
)
if __name__ == '__main__':
    # Command-line entry point: replace every stopword in the first
    # command-line argument with a single space and print the result.
    import sys

    # Fail with a short usage message instead of a bare IndexError traceback
    # when no text argument was supplied.
    if len(sys.argv) < 2:
        sys.exit('usage: %s TEXT' % sys.argv[0])

    # A parenthesised single-argument print produces the same output whether
    # it is parsed as a Python 2 print statement or a Python 3 function call.
    print(STOPWORDS.sub(' ', sys.argv[1]))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment