Skip to content

Instantly share code, notes, and snippets.

View tslmy's full-sized avatar

Ming tslmy

View GitHub Profile
@tslmy
tslmy / build.gradle
Created September 16, 2022 18:59
Generate code from OpenAPI contracts: Gradle
plugins {
id 'org.hidetake.swagger.generator' version '2.19.2'
}
dependencies {
// https://github.com/int128/gradle-swagger-generator-plugin#code-generation
swaggerCodegen 'io.swagger.codegen.v3:swagger-codegen-cli:3.0.35'
}
swaggerSources {
// We name this API "petstore".
petstore {
@tslmy
tslmy / pom.xml
Created September 16, 2022 18:58
Generate code from OpenAPI contracts: Maven
<plugin>
<groupId>io.swagger</groupId>
<artifactId>swagger-codegen-maven-plugin</artifactId>
<version>2.3.1</version>
<executions>
<execution>
<!-- We name this API "petstore". -->
<id>petstore</id>
<goals>
<goal>generate</goal>
@tslmy
tslmy / getDocVecFromBert.py
Created September 16, 2019 14:32
Get document vector from BERT using Flair
import pandas as pd
from flair.embeddings import BertEmbeddings
from flair.embeddings import Sentence  # NOTE(review): in current flair releases `Sentence` lives in `flair.data` — confirm against the flair version this gist targeted.
# Load the lower-cased base BERT model once, at module import time.
bert = BertEmbeddings('bert-base-uncased')
def getDocVecFromBertForStr(sent_str):
    """Embed the string *sent_str* with BERT via flair.

    NOTE(review): the gist preview appears truncated — no return statement
    is visible, so the document vector is presumably extracted from
    `sentence` in lines not shown here.
    """
    # Create a Sentence object:
    sentence = Sentence(sent_str)
    # Compute its vector form using BERT:
    bert.embed(sentence)
@tslmy
tslmy / Render Emojis and CJK Characters.tex
Created April 4, 2019 01:00
Render Emojis and CJK Characters in XeTeX
% Preamble snippet for XeLaTeX: render CJK characters and emojis.
\usepackage{xeCJK} % CJK (Chinese/Japanese/Korean) font support under XeTeX.
\usepackage{xelatexemoji} % Available at <https://github.com/mreq/xelatex-emoji>
@tslmy
tslmy / main.py
Created March 28, 2019 02:05
Printing Maximum Sum Increasing Subsequence
def find(l):
    '''Function to construct Maximum Sum Increasing Subsequence.
    A rewrite of <https://www.geeksforgeeks.org/printing-maximum-sum-increasing-subsequence/>.

    Invariant: m[i] is the maximum-sum increasing subsequence ending at l[i].
    NOTE(review): the gist preview appears truncated — no return/print of
    the final answer (e.g. max(m, key=sum)) is visible here.
    '''
    m =[[l[0]]]  # Base case: the only subsequence ending at index 0 is [l[0]].
    for i in range(1,len(l)): # start from index 1
        c = [] # "current selection"
        for j in range(i): # for every j less than i
            # Keep the largest-sum predecessor subsequence that l[i] can extend:
            if l[i]>l[j] and sum(c)<sum(m[j]): c = m[j][:] # `[:]` is for copying, not referencing.
        c.append(l[i]) # current selection only valid with selection of the current item
        m.append(c)
@tslmy
tslmy / additional_features.py
Created March 15, 2019 00:01
additional_features.py
set_of_hedges_en = {"almost", "apparent", "apparently", "appear", "appeared", "appears", "approximately", "argue", "argued", "argues", "around", "assume", "assumed", "broadly", "certain amount", "certain extent", "certain level", "claim", "claimed", "claims", "doubt", "doubtful", "essentially", "estimate", "estimated", "fairly", "feel", "feels", "felt", "frequently", "from my perspective", "from our perspective", "from this perspective", "generally", "guess", "in general", "in most cases", "in most instances", "in my opinion", "in my view", "in our opinion", "in our view", "in this view", "indicate", "indicated", "indicates", "largely", "likely", "mainly", "may", "maybe", "might", "mostly", "often", "on the whole", "ought", "perhaps", "plausible", "plausibly", "possible", "possibly", "postulate", "postulated", "postulates", "presumable", "presumably", "probable", "probably", "quite", "rather", "relatively", "roughly", "seems", "should", "sometimes", "somewhat", "suggest", "suggested", "suggests", "suppose", "
@tslmy
tslmy / taboo_cn.py
Created March 11, 2019 20:15
A Set of Taboo Words in Simplified Chinese
# Base set of scatological / death-related taboo words in Simplified Chinese.
set_of_taboo_cn = {'米田共', '屁', '屎', '屌', '粪', '尿', '死'}
# Name-calling / insult terms (includes digit- and letter-based slang like "250", "2B", "SB").
# NOTE(review): '混蛋' appears twice in the source string; sets dedupe, so this is harmless.
set_of_namecalling_cn = set('白目,白痴,人渣,王八蛋,怪胎,孬种,畜生,淫妇,混蛋,混蛋,魂淡,龟孙,笨蛋,智障,傻瓜,蠢猪,蠢狗,傻狗,窝囊废,废物,泼妇,骚货,骚逼,贱人,贱货,荡妇,杂种,坏蛋,烂货,傻帽,250,贰佰伍,二货,2B,二百五,SB,傻逼,傻B,煞笔,沙比,混账,婊子,脑残'.split(','))
# Swear-word fillers / expletives, including romanized variants (cao, nm, tm, tnnd, md, ...).
set_of_swearfiller_cn = set('拷,靠,操,艹,草,cao,我擦,擦嘞,干,呸,夭寿,他妈,他妹的,你妈,你妹,nm,tm,去你的,他奶奶的,tnnd,妈蛋,妈的,md,该死,靠背,靠杯'.split(','))
# Merge all three categories into the exported taboo set.
set_of_taboo_cn = set_of_taboo_cn | set_of_namecalling_cn | set_of_swearfiller_cn
@tslmy
tslmy / geolocList_jp_manual.py
Created January 11, 2019 17:09
List of ~4,700 Geolocation Strings Manually Identified to be in Japan, plus ~600 Not Really Although Seems So
# List of geolocation strings in Japan and not-in-Japan. Manually determined by @tslmy and to their best knowledge.
# All lowercased, whitespace-stripped strings.
in_japan = [
'aichi japan',
'aichi',
'aichi,japan',
'aomori',
'chiba',
'chiba japan',
'chiba,japan',
@tslmy
tslmy / CountBigrams.py
Last active December 7, 2018 18:13
Almost-Best Practice of Counting Bigrams From A Text File To Space-Delimited Table File
from collections import Counter
from tqdm import tqdm
def countLines(fpath):
    """Count the number of lines in the text file at *fpath*.

    Returns 0 for an empty file. (The original bound the loop variable `i`
    only inside the `for`, so an empty file raised NameError on `i + 1`.)
    """
    count = 0
    with open(fpath) as f:
        # enumerate(..., start=1) leaves `count` equal to the last line number.
        for count, _ in enumerate(f, start=1):
            pass
    return count
def countBigrams(fpath='corpora/TwtUk2014_trainingData.txt'):
cnt = Counter()
getBigrams = lambda l: zip(l, l[1:])
with open(fpath, 'r') as f:
@tslmy
tslmy / Always-Linear-Time Select Function For 1D Arrays, but in Numpy.py
Created October 8, 2018 00:29
Always-Linear-Time Select Function For 1D Arrays, but in Numpy
import numpy as np
def ceil(x, base):
    """Round *x* up to the nearest multiple of *base* (unchanged if already a multiple)."""
    return x if x % base == 0 else x + base - x % base

def fill(a, b):
    """Pad 1-D array *a* with NaNs so its length becomes a multiple of *b*."""
    return np.append(a, values=[np.nan] * (ceil(len(a), b) - len(a)))

def median(m):
    """Index of the lower median element of a length-len(m) sequence."""
    return (len(m) - int(len(m) % 2 == 0)) // 2

def dropna(x):
    """Return *x* with NaN entries removed."""
    return x[~np.isnan(x)]
def select(a, i):
if len(a)<5: return sorted(a)[i]
m = dropna(np.median(fill(a, 5).reshape(-1, 5), axis=1))
median_to_get = median(m)
x = select(m, i = median_to_get) # use SELECT to find the median-of-medians.