Created
March 15, 2019 00:01
-
-
Save tslmy/324324c9d52237ff58131d3cd956779e to your computer and use it in GitHub Desktop.
additional_features.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
set_of_hedges_en = {"almost", "apparent", "apparently", "appear", "appeared", "appears", "approximately", "argue", "argued", "argues", "around", "assume", "assumed", "broadly", "certain amount", "certain extent", "certain level", "claim", "claimed", "claims", "doubt", "doubtful", "essentially", "estimate", "estimated", "fairly", "feel", "feels", "felt", "frequently", "from my perspective", "from our perspective", "from this perspective", "generally", "guess", "in general", "in most cases", "in most instances", "in my opinion", "in my view", "in our opinion", "in our view", "in this view", "indicate", "indicated", "indicates", "largely", "likely", "mainly", "may", "maybe", "might", "mostly", "often", "on the whole", "ought", "perhaps", "plausible", "plausibly", "possible", "possibly", "postulate", "postulated", "postulates", "presumable", "presumably", "probable", "probably", "quite", "rather", "relatively", "roughly", "seems", "should", "sometimes", "somewhat", "suggest", "suggested", "suggests", "suppose", "supposed", "supposes", "suspect", "suspects", "tend to", "tended to", "tends to", "think", "thinking", "thought", "to my knowledge", "typical", "typically", "uncertain", "uncertainly", "unclear", "unclearly", "unlikely", "usually"} | |
set_of_hedges_cn = {'可', '可以', '能', '不能', '应', '应该', '需', '会', '不回', '将', '一些', '几乎', '上下', '左右', '尽可能', '多', '少数', '多数', '验证', '按时', '表明', '推测', '判断', '猜测', '猜', '估计', '大概', '没准', '也许', '或许', '或者', '可能', '似乎', '说不定', '少许', '稍微', '一点儿', '一点', '一丁点', '一丁点儿', '稍稍', '少量'} | |
set_of_taboo_cn = {'米田共', '屁', '屎', '屌', '粪', '尿', '死'} | |
set_of_namecalling_cn = set('白目,白痴,人渣,王八蛋,怪胎,孬种,畜生,淫妇,混蛋,混蛋,魂淡,龟孙,笨蛋,智障,傻瓜,蠢猪,蠢狗,傻狗,窝囊废,废物,泼妇,骚货,骚逼,贱人,贱货,荡妇,杂种,坏蛋,烂货,傻帽,250,贰佰伍,二货,2B,二百五,SB,傻逼,傻B,煞笔,沙比,混账,婊子,脑残'.split(',')) | |
set_of_swearfiller_cn = set('拷,靠,操,艹,草,cao,我擦,擦嘞,干,呸,夭寿,他妈,他妹的,你妈,你妹,nm,tm,去你的,他奶奶的,tnnd,妈蛋,妈的,md,该死,靠背,靠杯'.split(',')) | |
set_of_taboo_cn = set_of_taboo_cn | set_of_namecalling_cn | set_of_swearfiller_cn | |
set_of_taboo_en = {"4r5e", "5h1t", "5hit", "a55", "anal", "anus", "ar5e", "arrse", "arse", "ass", "ass-fucker", "asses", "assfucker", "assfukka", "asshole", "assholes", "asswhole", "a_s_s", "b!tch", "b00bs", "b17ch", "b1tch", "ballbag", "balls", "ballsack", "bastard", "beastial", "beastiality", "bellend", "bestial", "bestiality", "bi+ch", "biatch", "bitch", "bitcher", "bitchers", "bitches", "bitchin", "bitching", "bloody", "blow job", "blowjob", "blowjobs", "boiolas", "bollock", "bollok", "boner", "boob", "boobs", "booobs", "boooobs", "booooobs", "booooooobs", "breasts", "buceta", "bugger", "bum", "bunny fucker", "butt", "butthole", "buttmuch", "buttplug", "c0ck", "c0cksucker", "carpet muncher", "cawk", "chink", "cipa", "cl1t", "clit", "clitoris", "clits", "cnut", "cock", "cock-sucker", "cockface", "cockhead", "cockmunch", "cockmuncher", "cocks", "cocksuck", "cocksucked", "cocksucker", "cocksucking", "cocksucks", "cocksuka", "cocksukka", "cok", "cokmuncher", "coksucka", "coon", "cox", "crap", "cum", "cummer", "cumming", "cums", "cumshot", "cunilingus", "cunillingus", "cunnilingus", "cunt", "cuntlick", "cuntlicker", "cuntlicking", "cunts", "cyalis", "cyberfuc", "cyberfuck", "cyberfucked", "cyberfucker", "cyberfuckers", "cyberfucking", "d1ck", "damn", "dick", "dickhead", "dildo", "dildos", "dink", "dinks", "dirsa", "dlck", "dog-fucker", "doggin", "dogging", "donkeyribber", "doosh", "duche", "dyke", "ejaculate", "ejaculated", "ejaculates", "ejaculating", "ejaculatings", "ejaculation", "ejakulate", "f u c k", "f u c k e r", "f4nny", "fag", "fagging", "faggitt", "faggot", "faggs", "fagot", "fagots", "fags", "fanny", "fannyflaps", "fannyfucker", "fanyy", "fatass", "fcuk", "fcuker", "fcuking", "feck", "fecker", "felching", "fellate", "fellatio", "fingerfuck", "fingerfucked", "fingerfucker", "fingerfuckers", "fingerfucking", "fingerfucks", "fistfuck", "fistfucked", "fistfucker", "fistfuckers", "fistfucking", "fistfuckings", "fistfucks", "flange", "fook", "fooker", "fuck", "fucka", "fucked", "fucker", "fuckers", "fuckhead", "fuckheads", "fuckin", "fucking", "fuckings", "fuckingshitmotherfucker", "fuckme", "fucks", "fuckwhit", "fuckwit", "fudge packer", "fudgepacker", "fuk", "fuker", "fukker", "fukkin", "fuks", "fukwhit", "fukwit", "fux", "fux0r", "f_u_c_k", "gangbang", "gangbanged", "gangbangs", "gaylord", "gaysex", "goatse", "God", "god-dam", "god-damned", "goddamn", "goddamned", "hardcoresex", "hell", "heshe", "hoar", "hoare", "hoer", "homo", "hore", "horniest", "horny", "hotsex", "jack-off", "jackoff", "jap", "jerk-off", "jism", "jiz", "jizm", "jizz", "kawk", "knob", "knobead", "knobed", "knobend", "knobhead", "knobjocky", "knobjokey", "kock", "kondum", "kondums", "kum", "kummer", "kumming", "kums", "kunilingus", "l3i+ch", "l3itch", "labia", "lmfao", "lust", "lusting", "m0f0", "m0fo", "m45terbate", "ma5terb8", "ma5terbate", "masochist", "master-bate", "masterb8", "masterbat*", "masterbat3", "masterbate", "masterbation", "masterbations", "masturbate", "mo-fo", "mof0", "mofo", "mothafuck", "mothafucka", "mothafuckas", "mothafuckaz", "mothafucked", "mothafucker", "mothafuckers", "mothafuckin", "mothafucking", "mothafuckings", "mothafucks", "mother fucker", "motherfuck", "motherfucked", "motherfucker", "motherfuckers", "motherfuckin", "motherfucking", "motherfuckings", "motherfuckka", "motherfucks", "muff", "mutha", "muthafecker", "muthafuckker", "muther", "mutherfucker", "n1gga", "n1gger", "nazi", "nigg3r", "nigg4h", "nigga", "niggah", "niggas", "niggaz", "nigger", "niggers", "nob", "nob jokey", "nobhead", "nobjocky", "nobjokey", "numbnuts", "nutsack", "orgasim", "orgasims", "orgasm", "orgasms", "p0rn", "pawn", "pecker", "penis", "penisfucker", "phonesex", "phuck", "phuk", "phuked", "phuking", "phukked", "phukking", "phuks", "phuq", "pigfucker", "pimpis", "piss", "pissed", "pisser", "pissers", "pisses", "pissflaps", "pissin", "pissing", "pissoff", "poop", "porn", "porno", "pornography", "pornos", "prick", "pricks", "pron", "pube", "pusse", "pussi", "pussies", "pussy", "pussys", "rectum", "retard", "rimjaw", "rimming", "s hit", "s.o.b.", "sadist", "schlong", "screwing", "scroat", "scrote", "scrotum", "semen", "sex", "sh!+", "sh!t", "sh1t", "shag", "shagger", "shaggin", "shagging", "shemale", "shi+", "shit", "shitdick", "shite", "shited", "shitey", "shitfuck", "shitfull", "shithead", "shiting", "shitings", "shits", "shitted", "shitter", "shitters", "shitting", "shittings", "shitty", "skank", "slut", "sluts", "smegma", "smut", "snatch", "son-of-a-bitch", "spac", "spunk", "s_h_i_t", "t1tt1e5", "t1tties", "teets", "teez", "testical", "testicle", "tit", "titfuck", "tits", "titt", "tittie5", "tittiefucker", "titties", "tittyfuck", "tittywank", "titwank", "tosser", "turd", "tw4t", "twat", "twathead", "twatty", "twunt", "twunter", "v14gra", "v1gra", "vagina", "viagra", "vulva", "w00se", "wang", "wank", "wanker", "wanky", "whoar", "whore", "willies", "willy", "xrated", "xxx"} # https://gist.github.com/jamiew/1112488 | |
set_of_wishes_en = re.compile(r"(wish|hope) (you|y'all|everyone)")#(have a (nice|good|great|excellent|wonderful) (day|night|time|week|year)|(wish|hope) you have|best wishes)') | |
set_of_praise_en = set('awesome,outstanding,excellent,great,good,neat,remarkable,fantastic,super,beautiful,bravo,incredible'.split(',')) | |
set_of_emergency_en = set('right now,rn,as soon as possible,asap,immediately,hurry up,straightaway,at once'.split(',')) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment