Dmitry Nikolayev macleginn

University of Manchester
Manchester
https://dnikolaev.com
@macleginn

Recently created

Least recently created

Recently updated

Least recently updated

macleginn / compare_dumps.py

Created February 3, 2019 12:44

	import sqlite3
	import pandas as pd
	import html
	import re

	# Убираем <br>, <br/>, </br>; заменяем любые последовательности
	# whitespace-символов на один пробел.
	def normalise_ws(s):
	s = re.sub(r'</?br/?>', ' ', s)
	s = re.sub(r'[\n\r]+ *', ' ', s)

macleginn / preprocess.py

Created January 26, 2019 21:37

Preprocess Berezkin

	import os
	import os.path
	import sqlite3

	from bs4 import BeautifulSoup
	from sys import exit

	WORKING_DIR = 'XXX'
	INPUT_DIR = 'input_html'

macleginn / PDPROTO_v_PHOIBLE.r

Last active January 14, 2019 15:36

	proto.data <- read.csv('bdproto.csv',
	sep = ',')

	## Clean the data
	p.d <- proto.data[ !is.na(proto.data$LanguageFamilyRoot) &
	proto.data$LanguageFamilyRoot != '' &
	!is.na(proto.data$LanguageName) &
	proto.data$LanguageName != '', ]

	## People mostly reconstruct weird stuff:

macleginn / query_glottolog.py

Created September 5, 2018 17:56

	from pyglottolog.api import Glottolog

	# 'full' is a pandas dataframe with glottocodes

	api = Glottolog('/Users/macbook/tmp/glottolog')

	gltc_temp = {}
	gltc_err = set()

	for i in range(full.shape[0]):

macleginn / cropndisplay.html

Created August 2, 2018 16:14

A piece of JavaScript code for selecting parts of images and displaying them in the original context. Composite shapes consisting of rectangles can be cropped to their bounding box.

	<html>
	<head>
	<meta charset="utf8">
	<title>Crop’n’display demo</title>
	<style>
	.overlay {
	padding: 30px;
	width: 100vw;
	height: 100vh;
	position: absolute;

macleginn / compute_distance.jl

Created July 13, 2018 20:42

	using DataFrames;
	using Feather;

	# Заранее подготовленная таблица расстояний между этносами
	dist_data = Feather.read("geodistances.feather");
	@everywhere dist_array = Array{Int64}(926,926);
	for i = 1:926
	for j = 2:927
	dist_array[i,j-1] = dist_data[i,j]
	end

macleginn / Ciris_word_list.txt

Created March 5, 2018 19:28

Lexical list of Ciris

	a
	ab
	abisses
	abruptas
	absistam
	abstulit
	ac
	accendet
	accepit
	accepta

macleginn / normalise_Kurdistan_data.py

Last active June 25, 2017 09:36

Convert the data table of the phonologies of the languages of Kurdistan into the normalised format

	import numpy as np
	import pandas as pd
	import re
	from functools import reduce

	def process_phoneme(p):
	"""Normalise phonetic symbols and enforce pharyngealised treatment of emphatics."""
	p = p.split('/')[0].replace(':', 'ː').replace('\u0361', '').replace('ˠ', 'ˤ').replace('\u033b', '').replace("'", 'ʰ').replace('\u032a', '')
	if 'l' not in p and '\u0334' in p:
	p = p.replace('\u0334', 'ˤ')

macleginn / rusgram_postprocessor.py

Created January 12, 2017 18:33

Converter and postprocessor for .docx files prepared for rusgram.ru

	import subprocess
	import re

	# Convert to html using pandoc and capture output
	fn = 'sources/re_docx/reflexive_letuchiy_20141102_nst_site.docx'
	txt = subprocess.check_output(['pandoc',
	'-f', 'docx',
	'-t', 'html',
	fn]).decode('utf8')

macleginn / getAveragedFormants.praat

Last active July 16, 2016 10:31

Praat editor script for getting average values of the first three formants for the selected segment

	Extract selected sound (time from 0)
	endeditor
	duration = Get total duration
	writeInfoLine: fixed$ (duration, 3)
	# Replace 5000 with 5500 for female voice.
	# You may need to tweak the number of formants (2nd parameter) based on
	# your data. Sometimes, if two formants are close to each other,
	# it is necessary to ask for 6 formants so that they may be decoupled.
	# In other cases, however, this may introduce spurious formants, and
	# it is always advisable to check both the spectrogram and

Newer Older