Skip to content

Instantly share code, notes, and snippets.

@tslmy
Created March 15, 2019 00:01
Show Gist options
  • Save tslmy/324324c9d52237ff58131d3cd956779e to your computer and use it in GitHub Desktop.
Save tslmy/324324c9d52237ff58131d3cd956779e to your computer and use it in GitHub Desktop.
additional_features.py
# English hedging vocabulary: single words as well as multi-word phrases.
# Entries are grouped by initial letter purely for readability.
set_of_hedges_en = {
    # a
    "almost", "apparent", "apparently", "appear", "appeared", "appears",
    "approximately", "argue", "argued", "argues", "around", "assume",
    "assumed",
    # b - c
    "broadly",
    "certain amount", "certain extent", "certain level",
    "claim", "claimed", "claims",
    # d - e
    "doubt", "doubtful",
    "essentially", "estimate", "estimated",
    # f - g
    "fairly", "feel", "feels", "felt", "frequently",
    "from my perspective", "from our perspective", "from this perspective",
    "generally", "guess",
    # i
    "in general", "in most cases", "in most instances",
    "in my opinion", "in my view", "in our opinion", "in our view",
    "in this view",
    "indicate", "indicated", "indicates",
    # l - m
    "largely", "likely",
    "mainly", "may", "maybe", "might", "mostly",
    # o
    "often", "on the whole", "ought",
    # p
    "perhaps", "plausible", "plausibly", "possible", "possibly",
    "postulate", "postulated", "postulates",
    "presumable", "presumably", "probable", "probably",
    # q - r
    "quite",
    "rather", "relatively", "roughly",
    # s
    "seems", "should", "sometimes", "somewhat",
    "suggest", "suggested", "suggests",
    "suppose", "supposed", "supposes",
    "suspect", "suspects",
    # t
    "tend to", "tended to", "tends to",
    "think", "thinking", "thought",
    "to my knowledge", "typical", "typically",
    # u
    "uncertain", "uncertainly", "unclear", "unclearly", "unlikely",
    "usually",
}
# Chinese hedging vocabulary: modal verbs, approximators, words of
# inference/guessing, and small-quantity expressions.
set_of_hedges_cn = {
    '可', '可以', '能', '不能', '应', '应该', '需', '会', '不会', '将',
    '一些', '几乎', '上下', '左右', '尽可能', '多', '少数', '多数',
    '验证', '按时', '表明', '推测', '判断', '猜测', '猜', '估计',
    '大概', '没准', '也许', '或许', '或者', '可能', '似乎', '说不定',
    '少许', '稍微', '一点儿', '一点', '一丁点', '一丁点儿', '稍稍', '少量',
}
# Chinese name-calling terms; kept as its own set so it can be used separately.
set_of_namecalling_cn = {
    '白目', '白痴', '人渣', '王八蛋', '怪胎', '孬种', '畜生', '淫妇',
    '混蛋', '魂淡', '龟孙', '笨蛋', '智障', '傻瓜', '蠢猪', '蠢狗',
    '傻狗', '窝囊废', '废物', '泼妇', '骚货', '骚逼', '贱人', '贱货',
    '荡妇', '杂种', '坏蛋', '烂货', '傻帽', '250', '贰佰伍', '二货',
    '2B', '二百五', 'SB', '傻逼', '傻B', '煞笔', '沙比', '混账',
    '婊子', '脑残',
}

# Chinese swear fillers, including latin-letter abbreviations.
set_of_swearfiller_cn = {
    '拷', '靠', '操', '艹', '草', 'cao', '我擦', '擦嘞', '干', '呸',
    '夭寿', '他妈', '他妹的', '你妈', '你妹', 'nm', 'tm', '去你的',
    '他奶奶的', 'tnnd', '妈蛋', '妈的', 'md', '该死', '靠背', '靠杯',
}

# Full Chinese taboo vocabulary: a small base list merged with the two
# groups defined above.
set_of_taboo_cn = {'米田共', '屁', '屎', '屌', '粪', '尿', '死'}.union(
    set_of_namecalling_cn,
    set_of_swearfiller_cn,
)
# English taboo / profanity vocabulary, including leetspeak and deliberately
# obfuscated spellings. The word list comes from the gist linked at the end of
# the literal.
# NOTE(review): "God" is capitalized, unlike the surrounding entries; if callers
# lowercase text before lookup it can never match -- confirm intended casing.
# NOTE(review): "masterbat*" contains a literal "*"; set membership does not
# expand wildcards -- confirm whether pattern matching was intended.
# NOTE(review): some entries contain spaces (e.g. "blow job", "f u c k"); a
# per-token membership test would not hit these -- confirm how callers match.
set_of_taboo_en = {"4r5e", "5h1t", "5hit", "a55", "anal", "anus", "ar5e", "arrse", "arse", "ass", "ass-fucker", "asses", "assfucker", "assfukka", "asshole", "assholes", "asswhole", "a_s_s", "b!tch", "b00bs", "b17ch", "b1tch", "ballbag", "balls", "ballsack", "bastard", "beastial", "beastiality", "bellend", "bestial", "bestiality", "bi+ch", "biatch", "bitch", "bitcher", "bitchers", "bitches", "bitchin", "bitching", "bloody", "blow job", "blowjob", "blowjobs", "boiolas", "bollock", "bollok", "boner", "boob", "boobs", "booobs", "boooobs", "booooobs", "booooooobs", "breasts", "buceta", "bugger", "bum", "bunny fucker", "butt", "butthole", "buttmuch", "buttplug", "c0ck", "c0cksucker", "carpet muncher", "cawk", "chink", "cipa", "cl1t", "clit", "clitoris", "clits", "cnut", "cock", "cock-sucker", "cockface", "cockhead", "cockmunch", "cockmuncher", "cocks", "cocksuck", "cocksucked", "cocksucker", "cocksucking", "cocksucks", "cocksuka", "cocksukka", "cok", "cokmuncher", "coksucka", "coon", "cox", "crap", "cum", "cummer", "cumming", "cums", "cumshot", "cunilingus", "cunillingus", "cunnilingus", "cunt", "cuntlick", "cuntlicker", "cuntlicking", "cunts", "cyalis", "cyberfuc", "cyberfuck", "cyberfucked", "cyberfucker", "cyberfuckers", "cyberfucking", "d1ck", "damn", "dick", "dickhead", "dildo", "dildos", "dink", "dinks", "dirsa", "dlck", "dog-fucker", "doggin", "dogging", "donkeyribber", "doosh", "duche", "dyke", "ejaculate", "ejaculated", "ejaculates", "ejaculating", "ejaculatings", "ejaculation", "ejakulate", "f u c k", "f u c k e r", "f4nny", "fag", "fagging", "faggitt", "faggot", "faggs", "fagot", "fagots", "fags", "fanny", "fannyflaps", "fannyfucker", "fanyy", "fatass", "fcuk", "fcuker", "fcuking", "feck", "fecker", "felching", "fellate", "fellatio", "fingerfuck", "fingerfucked", "fingerfucker", "fingerfuckers", "fingerfucking", "fingerfucks", "fistfuck", "fistfucked", "fistfucker", "fistfuckers", "fistfucking", "fistfuckings", "fistfucks", "flange", "fook", "fooker", "fuck", 
"fucka", "fucked", "fucker", "fuckers", "fuckhead", "fuckheads", "fuckin", "fucking", "fuckings", "fuckingshitmotherfucker", "fuckme", "fucks", "fuckwhit", "fuckwit", "fudge packer", "fudgepacker", "fuk", "fuker", "fukker", "fukkin", "fuks", "fukwhit", "fukwit", "fux", "fux0r", "f_u_c_k", "gangbang", "gangbanged", "gangbangs", "gaylord", "gaysex", "goatse", "God", "god-dam", "god-damned", "goddamn", "goddamned", "hardcoresex", "hell", "heshe", "hoar", "hoare", "hoer", "homo", "hore", "horniest", "horny", "hotsex", "jack-off", "jackoff", "jap", "jerk-off", "jism", "jiz", "jizm", "jizz", "kawk", "knob", "knobead", "knobed", "knobend", "knobhead", "knobjocky", "knobjokey", "kock", "kondum", "kondums", "kum", "kummer", "kumming", "kums", "kunilingus", "l3i+ch", "l3itch", "labia", "lmfao", "lust", "lusting", "m0f0", "m0fo", "m45terbate", "ma5terb8", "ma5terbate", "masochist", "master-bate", "masterb8", "masterbat*", "masterbat3", "masterbate", "masterbation", "masterbations", "masturbate", "mo-fo", "mof0", "mofo", "mothafuck", "mothafucka", "mothafuckas", "mothafuckaz", "mothafucked", "mothafucker", "mothafuckers", "mothafuckin", "mothafucking", "mothafuckings", "mothafucks", "mother fucker", "motherfuck", "motherfucked", "motherfucker", "motherfuckers", "motherfuckin", "motherfucking", "motherfuckings", "motherfuckka", "motherfucks", "muff", "mutha", "muthafecker", "muthafuckker", "muther", "mutherfucker", "n1gga", "n1gger", "nazi", "nigg3r", "nigg4h", "nigga", "niggah", "niggas", "niggaz", "nigger", "niggers", "nob", "nob jokey", "nobhead", "nobjocky", "nobjokey", "numbnuts", "nutsack", "orgasim", "orgasims", "orgasm", "orgasms", "p0rn", "pawn", "pecker", "penis", "penisfucker", "phonesex", "phuck", "phuk", "phuked", "phuking", "phukked", "phukking", "phuks", "phuq", "pigfucker", "pimpis", "piss", "pissed", "pisser", "pissers", "pisses", "pissflaps", "pissin", "pissing", "pissoff", "poop", "porn", "porno", "pornography", "pornos", "prick", "pricks", "pron", "pube", 
"pusse", "pussi", "pussies", "pussy", "pussys", "rectum", "retard", "rimjaw", "rimming", "s hit", "s.o.b.", "sadist", "schlong", "screwing", "scroat", "scrote", "scrotum", "semen", "sex", "sh!+", "sh!t", "sh1t", "shag", "shagger", "shaggin", "shagging", "shemale", "shi+", "shit", "shitdick", "shite", "shited", "shitey", "shitfuck", "shitfull", "shithead", "shiting", "shitings", "shits", "shitted", "shitter", "shitters", "shitting", "shittings", "shitty", "skank", "slut", "sluts", "smegma", "smut", "snatch", "son-of-a-bitch", "spac", "spunk", "s_h_i_t", "t1tt1e5", "t1tties", "teets", "teez", "testical", "testicle", "tit", "titfuck", "tits", "titt", "tittie5", "tittiefucker", "titties", "tittyfuck", "tittywank", "titwank", "tosser", "turd", "tw4t", "twat", "twathead", "twatty", "twunt", "twunter", "v14gra", "v1gra", "vagina", "viagra", "vulva", "w00se", "wang", "wank", "wanker", "wanky", "whoar", "whore", "willies", "willy", "xrated", "xxx"} # https://gist.github.com/jamiew/1112488
# `re` is imported right here because no `import re` is visible anywhere above
# this line in the module; without it the assignment below raises NameError.
import re

# Despite the "set_of_" prefix this is a compiled regular expression, not a
# set: use `.search(text)` rather than `in`. It matches "wish"/"hope" followed
# by a single space and one of "you", "y'all" or "everyone"; matching is
# case-sensitive because no flags are passed.
# A broader pattern that was left disabled by the original author:
#   (have a (nice|good|great|excellent|wonderful) (day|night|time|week|year)|(wish|hope) you have|best wishes)
set_of_wishes_en = re.compile(r"(wish|hope) (you|y'all|everyone)")
# English single-word praise vocabulary.
set_of_praise_en = {
    'awesome', 'outstanding', 'excellent', 'great', 'good', 'neat',
    'remarkable', 'fantastic', 'super', 'beautiful', 'bravo', 'incredible',
}
# English urgency expressions: full phrases alongside their common
# abbreviations ("rn", "asap").
set_of_emergency_en = {
    'right now', 'rn',
    'as soon as possible', 'asap',
    'immediately', 'hurry up', 'straightaway', 'at once',
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment