Skip to content

Instantly share code, notes, and snippets.

@wareya
wareya / actuallycompilemecab.sh
Last active April 16, 2018 22:45
compile mecab without autotools, which makes everything unreasonably elaborate
#!sh
# On MinGW, you MUST edit mecab.h to rename DLL_EXPORT to MECAB_DLL_EXPORT (or similar). On other platforms, edit the first g++ line below to pass -DDLL_EXPORT instead of -DMECAB_DLL_EXPORT (and remove -mconsole while you're at it).
cd src
MECAB_DEFAULT_RC="\"C:/Program Files/mecab/etc/mecabrc\""
DIC_VERSION="102"
g++ -O3 -m64 -mconsole -I.. -DDIC_VERSION=$DIC_VERSION -DMECAB_DEFAULT_RC="$MECAB_DEFAULT_RC" -DHAVE_CONFIG_H -Wfatal-errors -DMECAB_DLL_EXPORT learner.cpp tagger.cpp viterbi.cpp char_property.cpp dictionary_compiler.cpp feature_index.cpp learner_tagger.cpp nbest_generator.cpp tokenizer.cpp connector.cpp dictionary_generator.cpp iconv_utils.cpp param.cpp utils.cpp context_id.cpp dictionary_rewriter.cpp lbfgs.cpp string_buffer.cpp dictionary.cpp eval.cpp writer.cpp libmecab.cpp -shared -static -static-libgcc -static-libstdc++ -lpthread -liconv -o libmecab.dll
g++ -Os -fdata-sections -ffunction-sections -fwhole-program -Wl,--gc-sections -Wl,--strip-all -m64 -mconsole -I.. -DDIC_VERSION=$DIC_VERSION -DMECAB_DEFA
@wareya
wareya / rip.py
Last active March 31, 2018 02:23
narou scraper with persistent knowledge of updates (not fully tested) - use also: https://gist.github.com/wareya/4305e2f971a78c960402ac69f308128c
#!python
from bs4 import BeautifulSoup
import urllib
from urllib.parse import urljoin
import sys
import aiohttp
import asyncio
#!python
from bs4 import BeautifulSoup
import urllib
from urllib.parse import urljoin
import sys
def get_top_300(url):
r = urllib.request.urlopen(url)
data = r.read()
@wareya
wareya / rip.py
Last active March 28, 2018 04:01
scrape web novels from narou
#!python
from bs4 import BeautifulSoup
import urllib
from urllib.parse import urljoin
import sys
import aiohttp
import asyncio
#!/usr/bin/env python
# make directory "glyphs" first
from PIL import Image
import json
stuff = json.load(open("sjr_fonts.json", encoding="utf-8"))
stuff = stuff["subbooks"][0]["fonts"][0]
@wareya
wareya / filter_vocal_sounds.py
Last active March 3, 2018 10:49
execution code for vocalsoundfilter.py
#!python
# -*- coding: utf-8 -*-
import numpy as np
from random import shuffle, seed, randrange, choice, uniform
from math import floor
lines = []
data = []
low = 0x3000
@wareya
wareya / vocalsoundfilter.py
Last active September 6, 2018 06:51
Code to train a text line filter, modelled for detecting lines made up of vocal sounds in Japanese stories
#!python
# -*- coding: utf-8 -*-
import numpy as np
# training data in B.txt
# format: prefix the letter B to each line that contains mostly spoken sound effects and other vocal sounds
# use https://gist.github.com/wareya/ec9f33ce8e5c12f5005b5345ddcd7e6b to "run" model on a text file
np.set_printoptions(suppress=True)
np.set_printoptions(threshold=np.nan)
@wareya
wareya / find.py
Last active March 13, 2018 00:04
search for han characters based on a list of components
#!python
# coding=utf-8
# Get ids.txt from https://github.com/cjkvi/cjkvi-ids/ and place it next to this script
# ~requires python 3.6 or newer on windows~
# note: depends on the accuracy of ids.txt. for some characters, like 祭, it's pretty bad.
# see also: http://www.chise.org/ids-find
contains = {}
@wareya
wareya / main.py
Created February 19, 2018 18:01
""minimal"" pysdl2 test
#!python
class Entity:
def __init__(self):
global id_counter
self.id = id_counter
id_counter += 1
self.stash()
def stash(self):
name = self.__class__.__name__
@wareya
wareya / test.py
Last active February 17, 2018 00:06
broken line ocr
#!python
# -*- coding: utf-8 -*-
import numpy as np
from PIL import Image, ImageFont, ImageDraw
import PIL.ImageOps as ImageOps
np.set_printoptions(suppress=True)
np.set_printoptions(threshold=np.nan)
from random import shuffle, seed, randrange