This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import sqlite3 | |
| import pandas as pd | |
| import html | |
| import re | |
| # Убираем <br>, <br/>, </br>; заменяем любые последовательности | |
| # whitespace-символов на один пробел. | |
| def normalise_ws(s): | |
| s = re.sub(r'</?br/?>', ' ', s) | |
| s = re.sub(r'[\n\r]+ *', ' ', s) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import os | |
| import os.path | |
| import sqlite3 | |
| from bs4 import BeautifulSoup | |
| from sys import exit | |
| WORKING_DIR = 'XXX' | |
| INPUT_DIR = 'input_html' |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| proto.data <- read.csv('bdproto.csv', | |
| sep = ',') | |
| ## Clean the data | |
| p.d <- proto.data[ !is.na(proto.data$LanguageFamilyRoot) & | |
| proto.data$LanguageFamilyRoot != '' & | |
| !is.na(proto.data$LanguageName) & | |
| proto.data$LanguageName != '', ] | |
| ## People mostly reconstruct weird stuff: |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from pyglottolog.api import Glottolog | |
| # 'full' is a pandas dataframe with glottocodes | |
| api = Glottolog('/Users/macbook/tmp/glottolog') | |
| gltc_temp = {} | |
| gltc_err = set() | |
| for i in range(full.shape[0]): |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| <html> | |
| <head> | |
| <meta charset="utf8"> | |
| <title>Crop’n’display demo</title> | |
| <style> | |
| .overlay { | |
| padding: 30px; | |
| width: 100vw; | |
| height: 100vh; | |
| position: absolute; |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| using DataFrames; | |
| using Feather; | |
| # Заранее подготовленная таблица расстояний между этносами | |
| dist_data = Feather.read("geodistances.feather"); | |
| @everywhere dist_array = Array{Int64}(926,926); | |
| for i = 1:926 | |
| for j = 2:927 | |
| dist_array[i,j-1] = dist_data[i,j] | |
| end |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| a | |
| ab | |
| abisses | |
| abruptas | |
| absistam | |
| abstulit | |
| ac | |
| accendet | |
| accepit | |
| accepta |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import numpy as np | |
| import pandas as pd | |
| import re | |
| from functools import reduce | |
| def process_phoneme(p): | |
| """Normalise phonetic symbols and enforce pharyngealised treatment of emphatics.""" | |
| p = p.split('/')[0].replace(':', 'ː').replace('\u0361', '').replace('ˠ', 'ˤ').replace('\u033b', '').replace("'", 'ʰ').replace('\u032a', '') | |
| if 'l' not in p and '\u0334' in p: | |
| p = p.replace('\u0334', 'ˤ') |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import subprocess | |
| import re | |
| # Convert to html using pandoc and capture output | |
| fn = 'sources/re_docx/reflexive_letuchiy_20141102_nst_site.docx' | |
| txt = subprocess.check_output(['pandoc', | |
| '-f', 'docx', | |
| '-t', 'html', | |
| fn]).decode('utf8') |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Extract selected sound (time from 0) | |
| endeditor | |
| duration = Get total duration | |
| writeInfoLine: fixed$ (duration, 3) | |
| # Replace 5000 with 5500 for female voice. | |
| # You may need to tweak the number of formants (2nd parameter) based on | |
| # your data. Sometimes, if two formants are close to each other, | |
| # it is necessary to ask for 6 formants so that they may be decoupled. | |
| # In other cases, however, this may introduce spurious formants, and | |
| # it is always advisable to check both the spectrogram and |