wareya’s gists

wareya / actuallycompilemecab.sh

Last active April 16, 2018 22:45

compile mecab without autotools making everything unreasonably elaborate

	#!sh
	# On mingw, YOU NEED TO EDIT mecab.h to change DLL_EXPORT to MECAB_DLL_EXPORT or similar. On other platforms, edit the first g++ line below to pass -DDLL_EXPORT instead of -DMECAB_DLL_EXPORT (and remove -mconsole while you're at it).
	cd src
	MECAB_DEFAULT_RC="\"C:/Program Files/mecab/etc/mecabrc\""
	DIC_VERSION="102"
	g++ -O3 -m64 -mconsole -I.. -DDIC_VERSION=$DIC_VERSION -DMECAB_DEFAULT_RC="$MECAB_DEFAULT_RC" -DHAVE_CONFIG_H -Wfatal-errors -DMECAB_DLL_EXPORT learner.cpp tagger.cpp viterbi.cpp char_property.cpp dictionary_compiler.cpp feature_index.cpp learner_tagger.cpp nbest_generator.cpp tokenizer.cpp connector.cpp dictionary_generator.cpp iconv_utils.cpp param.cpp utils.cpp context_id.cpp dictionary_rewriter.cpp lbfgs.cpp string_buffer.cpp dictionary.cpp eval.cpp writer.cpp libmecab.cpp -shared -static -static-libgcc -static-libstdc++ -lpthread -liconv -o libmecab.dll

	g++ -Os -fdata-sections -ffunction-sections -fwhole-program -Wl,--gc-sections -Wl,--strip-all -m64 -mconsole -I.. -DDIC_VERSION=$DIC_VERSION -DMECAB_DEFA

wareya / rip.py

Last active March 31, 2018 02:23

narou scraper with persistent knowledge of updates (not fully tested) - use together with: https://gist.github.com/wareya/4305e2f971a78c960402ac69f308128c

	#!python

	from bs4 import BeautifulSoup
	import urllib
	from urllib.parse import urljoin
	import sys

	import aiohttp
	import asyncio

wareya / yomou.py

Last active March 30, 2018 21:26

	#!python

	from bs4 import BeautifulSoup
	import urllib
	from urllib.parse import urljoin
	import sys

	def get_top_300(url):
	r = urllib.request.urlopen(url)
	data = r.read()

wareya / rip.py

Last active March 28, 2018 04:01

scrape web novels from narou

	#!python

	from bs4 import BeautifulSoup
	import urllib
	from urllib.parse import urljoin
	import sys

	import aiohttp
	import asyncio

wareya / bitmapize.py

Created March 12, 2018 13:22

	#!/usr/bin/env python

	# make directory "glyphs" first

	from PIL import Image
	import json

	stuff = json.load(open("sjr_fonts.json", encoding="utf-8"))
	stuff = stuff["subbooks"][0]["fonts"][0]

wareya / filter_vocal_sounds.py

Last active March 3, 2018 10:49

execution code for vocalsoundfilter.py

	#!python
	# -- coding: utf-8 --
	import numpy as np
	from random import shuffle, seed, randrange, choice, uniform
	from math import floor

	lines = []
	data = []

	low = 0x3000

wareya / vocalsoundfilter.py

Last active September 6, 2018 06:51

Code to train a text line filter, modelled for detecting lines made up of vocal sounds in Japanese stories

	#!python
	# -- coding: utf-8 --
	import numpy as np

	# training data in B.txt
	# format with the letter B prefixed to lines that contain mostly spoken sound effects and other vocal sounds
	# use https://gist.github.com/wareya/ec9f33ce8e5c12f5005b5345ddcd7e6b to "run" model on a text file

	np.set_printoptions(suppress=True)
	np.set_printoptions(threshold=np.nan)

wareya / find.py

Last active March 13, 2018 00:04

search for han characters based on a list of components

	#!python
	# coding=utf-8

	# Get ids.txt from https://github.com/cjkvi/cjkvi-ids/ and place it next to this script
	# ~requires python 3.6 or newer on windows~
	# note: depends on the accuracy of ids.txt. for some characters, like 祭, it's pretty bad.
	# see also: http://www.chise.org/ids-find

	contains = {}

wareya / main.py

Created February 19, 2018 18:01

""minimal"" pysdl2 test

wareya / test.py

Last active February 17, 2018 00:06

broken line ocr

	#!python
	# -- coding: utf-8 --
	import numpy as np
	from PIL import Image, ImageFont, ImageDraw
	import PIL.ImageOps as ImageOps

	np.set_printoptions(suppress=True)
	np.set_printoptions(threshold=np.nan)

	from random import shuffle, seed, randrange

Wareya wareya