halfak’s gists

halfak / revscoring_cjk.errors

Last active November 19, 2020 17:06

	==================================================================================================== FAILURES =====================================================================================================
	_________________________________________________________________________________________________ test_cjk_chars __________________________________________________________________________________________________

	def test_cjk_chars():
	cache = {p_text: "This is 55 {{るは}} a string.",
	r_text: "This is 56 [[壌のは]] a string."}

	assert solve(revision.cjk_chars, cache=cache) == 3
	assert solve(revision.parent.cjk_chars, cache=cache) == 2
	> assert solve(revision.diff.cjk_chars_added, cache=cache) == 2

halfak / nlwiki_template_extractor.py

Created October 22, 2020 15:56

	"""
	Process a collection of XML dumps looking for the introduction and removal of {{Beginnetje}} templates
	and assume the introduction represents a quality label ("E") and the removal represents the quality
	label "D". Note: This script does not yet handle reverts (e.g. vandalism). To do that, look into
	the mwreverts library.

	USAGE:
	nlwiki_template_extractor (-h\|--help)
	nlwiki_template_extractor <xml-dump>...
	[--namespace=<num>...] [--processes=<num>]

halfak / list_set_demo.py

Created September 8, 2020 13:52

	$ python
	Python 3.8.2 (default, Jul 16 2020, 14:00:26)
	[GCC 9.3.0] on linux
	Type "help", "copyright", "credits" or "license" for more information.
	>>> import time
	>>> commons_pids = list(range(1, 50))
	>>> entity_pids = list(range(50, 100))
	>>> def linear_scan():
	... for val in entity_pids:
	... if val in commons_pids:

halfak / line_breaks.py

Last active August 6, 2020 15:29

	self._cjk_processing(tokenized_text, language=max_char_lang_frac, token_class=token_class)

	# TO

	self._cjk_processing(
	tokenized_text, language=max_char_lang_frac, token_class=token_class)

	# OR

	self._cjk_processing(

halfak / 99-local.yaml

Last active July 16, 2020 18:46

	# Score cache options
	score_caches:
	ores_redis:
	class: ores.score_caches.Redis
	host: 127.0.0.1 # Local
	port: 6379 # Default port

	scoring_systems:
	defaults:
	metrics_collector: local_logging # Don't try to connect to Graphite

halfak / enwiki_vectors.bash

Created July 13, 2020 19:25

	$ bzcat datasets/enwiki-20200501-learned_vectors.50_cell.vec.bz2 \| head
	10000 50
	he 0.3081902 -1.7661377 -0.26351795 -2.6554227 0.20365804 -0.2694949 -0.45049766 0.4969274 0.05990017 -0.25923896 0.31140116 -0.5986264 0.8714344 -0.48532763 -0.3693647 -0.32436007 -1.3534849 0.32795456 0.61355996 -0.94715625 -0.4455092 -1.1391499 0.93853545 1.1432649 0.8293254 0.4228589 1.1020386 -1.8064842 -0.82438534 -0.6033067 -0.23347689 -0.70451045 -0.32537228 -0.35027832 0.67294115 1.5023739 0.49681044 -0.87179273 0.3224187 0.33918247 0.67424035 0.73597753 -0.8553163 1.2491947 0.32812893 0.33435673 1.6141726 1.270183 0.67849094 0.27532846
	his 0.013586188 -0.63250244 -0.35859776 -1.0720271 0.17980172 -0.1954321 -0.245025 0.29639333 0.12190101 -0.2575211 0.051075332 -0.53400046 0.4236296 -0.39663923 -0.55470556 -0.14697435 -0.82484066 0.18489014 0.48893666 -0.34694576 -0.21766871 -0.55657053 0.37504694 0.39883402 0.20798574 0.4159887 0.53843856 -0.88261944 -0.32378322 -0.23307447 -0.10691466 -0.21688144 0.09186076 -0.1620926

halfak / demo_tokenize_time.py

Created June 29, 2020 14:13

	import time

	import mwapi
	from deltas.tokenizers import wikitext_split

	'''text = """
	This is a sentence [[derp\|link]].

	Here is another paragraph with the number 10.
	"""'''

halfak / output.json

Created June 11, 2020 16:32

{"transformed_content": ["short", "description", "Scottish", "born", "U", "S", "based", "stage", "film", "and", "television", "actress", "distinguish", "Helen", "Carroll", "Use", "British", "English", "date", "April", "More", "footnotes", "date", "April", "Use", "dmy", "dates", "date", "April", "Infobox", "person", "name", "Helena", "Carroll", "image", "imagesize", "caption", "birthname", "Helena", "Winifred", "Carroll", "birth_date", "Birth_date", "df", "yes", "birth_place", "Glasgow", "Scotland", "UK", "death_date", "death_date", "and", "age", "df", "yes", "death_place", "Los", "Angeles", "California", "U", "S", "occupation", "Actress", "years_active", "Helena", "Winifred", "Carroll", "November", "March", "was", "a", "veteran", "film", "television", "and", "stage", "actress", "Early", "life", "Born", "to", "clothing", "designer", "Helena", "Reilly", "and", "Abbey", "Theatre", "playwright", "Paul", "Vincent", "Carroll", "ref", "Obituary", "Notices", "Carroll", "Helena", "Winifred", "Los", "Angeles", "Times",

halfak / delta_example.py

Last active June 2, 2020 19:45

	>>> from deltas.tokenizers import wikitext_split
	>>>
	>>> text = """
	... I am some Wikipedia content.
	...
	... This is a {{template}}.<ref> foo</ref>
	... """
	>>>
	>>> wikitext_split.tokenize(text)
	[Token('\n', type='whitespace'), Token('I', type='word'), Token(' ', type='whitespace'), Token('am', type='word'), Token(' ', type='whitespace'), Token('some', type='word'), Token(' ', type='whitespace'), Token('Wikipedia', type='word'), Token(' ', type='whitespace'), Token('content', type='word'), Token('.', type='period'), Token('\n\n', type='break'), Token('This', type='word'), Token(' ', type='whitespace'), Token('is', type='word'), Token(' ', type='whitespace'), Token('a', type='word'), Token(' ', type='whitespace'), Token('{{', type='dcurly_open'), Token('template', type='word'), Token('}}', type='dcurly_close'), Token('.', type='period'), Token('<ref>', type='ref_open'), Token(' ', type='whitespace'), Token('foo', type='word'), Token('</ref>', type='ref_close'), Token('\n', type='whitespace')]

halfak / demo_tokenize.py

Created June 1, 2020 16:11

Tokenize stuck on Japanese revision

	import mwapi
	from deltas.tokenizers import wikitext_split

	rev_id = 57246316

	session = mwapi.Session("https://ja.wikipedia.org")
	doc = session.get(action="query", prop="revisions", revids=[rev_id], rvslots="main", rvprop="content", formatversion=2)
	text = doc['query']['pages'][0]['revisions'][0]['slots']['main']['content']

	location = 0

Aaron Halfaker halfak