Skip to content

Instantly share code, notes, and snippets.

@kleem
Last active December 4, 2015 19:33
Show Gist options
  • Save kleem/f86169284cc1a403011e to your computer and use it in GitHub Desktop.
Save kleem/f86169284cc1a403011e to your computer and use it in GitHub Desktop.
langid.py

An example of how to use the langid.py language identification system.

Language codes returned by langid.py (ISO 639-1) are converted to ISO 639-3. Hover over a result to see its confidence score and the human-readable language name.

This is just a test: responses are slow because the script reloads the language models every time a query is issued. The delay can be avoided by running langid as a service.

langid_langs = [{
"part2B": "afr",
"part1": "af",
"part2T": "afr",
"scope": "I",
"type": "L",
"id": "afr",
"ref_name": "Afrikaans"
},
{
"part2B": "amh",
"part1": "am",
"part2T": "amh",
"scope": "I",
"type": "L",
"id": "amh",
"ref_name": "Amharic"
},
{
"part2B": "arg",
"part1": "an",
"part2T": "arg",
"scope": "I",
"type": "L",
"id": "arg",
"ref_name": "Aragonese"
},
{
"part2B": "ara",
"part1": "ar",
"part2T": "ara",
"scope": "M",
"type": "L",
"id": "ara",
"ref_name": "Arabic"
},
{
"part2B": "asm",
"part1": "as",
"part2T": "asm",
"scope": "I",
"type": "L",
"id": "asm",
"ref_name": "Assamese"
},
{
"part2B": "aze",
"part1": "az",
"part2T": "aze",
"scope": "M",
"type": "L",
"id": "aze",
"ref_name": "Azerbaijani"
},
{
"part2B": "bel",
"part1": "be",
"part2T": "bel",
"scope": "I",
"type": "L",
"id": "bel",
"ref_name": "Belarusian"
},
{
"part2B": "bul",
"part1": "bg",
"part2T": "bul",
"scope": "I",
"type": "L",
"id": "bul",
"ref_name": "Bulgarian"
},
{
"part2B": "ben",
"part1": "bn",
"part2T": "ben",
"scope": "I",
"type": "L",
"id": "ben",
"ref_name": "Bengali"
},
{
"part2B": "bre",
"part1": "br",
"part2T": "bre",
"scope": "I",
"type": "L",
"id": "bre",
"ref_name": "Breton"
},
{
"part2B": "bos",
"part1": "bs",
"part2T": "bos",
"scope": "I",
"type": "L",
"id": "bos",
"ref_name": "Bosnian"
},
{
"part2B": "cat",
"part1": "ca",
"part2T": "cat",
"scope": "I",
"type": "L",
"id": "cat",
"ref_name": "Catalan"
},
{
"part2B": "cze",
"part1": "cs",
"part2T": "ces",
"scope": "I",
"type": "L",
"id": "ces",
"ref_name": "Czech"
},
{
"part2B": "wel",
"part1": "cy",
"part2T": "cym",
"scope": "I",
"type": "L",
"id": "cym",
"ref_name": "Welsh"
},
{
"part2B": "dan",
"part1": "da",
"part2T": "dan",
"scope": "I",
"type": "L",
"id": "dan",
"ref_name": "Danish"
},
{
"part2B": "ger",
"part1": "de",
"part2T": "deu",
"scope": "I",
"type": "L",
"id": "deu",
"ref_name": "German"
},
{
"part2B": "dzo",
"part1": "dz",
"part2T": "dzo",
"scope": "I",
"type": "L",
"id": "dzo",
"ref_name": "Dzongkha"
},
{
"part2B": "gre",
"part1": "el",
"part2T": "ell",
"scope": "I",
"type": "L",
"id": "ell",
"ref_name": "Modern Greek (1453-)"
},
{
"part2B": "eng",
"part1": "en",
"part2T": "eng",
"scope": "I",
"type": "L",
"id": "eng",
"ref_name": "English"
},
{
"part2B": "epo",
"part1": "eo",
"part2T": "epo",
"scope": "I",
"type": "C",
"id": "epo",
"ref_name": "Esperanto"
},
{
"part2B": "est",
"part1": "et",
"part2T": "est",
"scope": "M",
"type": "L",
"id": "est",
"ref_name": "Estonian"
},
{
"part2B": "baq",
"part1": "eu",
"part2T": "eus",
"scope": "I",
"type": "L",
"id": "eus",
"ref_name": "Basque"
},
{
"part2B": "fao",
"part1": "fo",
"part2T": "fao",
"scope": "I",
"type": "L",
"id": "fao",
"ref_name": "Faroese"
},
{
"part2B": "per",
"part1": "fa",
"part2T": "fas",
"scope": "M",
"type": "L",
"id": "fas",
"ref_name": "Persian"
},
{
"part2B": "fin",
"part1": "fi",
"part2T": "fin",
"scope": "I",
"type": "L",
"id": "fin",
"ref_name": "Finnish"
},
{
"part2B": "fre",
"part1": "fr",
"part2T": "fra",
"scope": "I",
"type": "L",
"id": "fra",
"ref_name": "French"
},
{
"part2B": "gle",
"part1": "ga",
"part2T": "gle",
"scope": "I",
"type": "L",
"id": "gle",
"ref_name": "Irish"
},
{
"part2B": "glg",
"part1": "gl",
"part2T": "glg",
"scope": "I",
"type": "L",
"id": "glg",
"ref_name": "Galician"
},
{
"part2B": "guj",
"part1": "gu",
"part2T": "guj",
"scope": "I",
"type": "L",
"id": "guj",
"ref_name": "Gujarati"
},
{
"part2B": "hat",
"part1": "ht",
"part2T": "hat",
"scope": "I",
"type": "L",
"id": "hat",
"ref_name": "Haitian"
},
{
"part2B": "heb",
"part1": "he",
"part2T": "heb",
"scope": "I",
"type": "L",
"id": "heb",
"ref_name": "Hebrew"
},
{
"part2B": "hin",
"part1": "hi",
"part2T": "hin",
"scope": "I",
"type": "L",
"id": "hin",
"ref_name": "Hindi"
},
{
"part2B": "hrv",
"part1": "hr",
"part2T": "hrv",
"scope": "I",
"type": "L",
"id": "hrv",
"ref_name": "Croatian"
},
{
"part2B": "hun",
"part1": "hu",
"part2T": "hun",
"scope": "I",
"type": "L",
"id": "hun",
"ref_name": "Hungarian"
},
{
"part2B": "arm",
"part1": "hy",
"part2T": "hye",
"scope": "I",
"type": "L",
"id": "hye",
"ref_name": "Armenian"
},
{
"part2B": "ice",
"part1": "is",
"part2T": "isl",
"scope": "I",
"type": "L",
"id": "isl",
"ref_name": "Icelandic"
},
{
"part2B": "ita",
"part1": "it",
"part2T": "ita",
"scope": "I",
"type": "L",
"id": "ita",
"ref_name": "Italian"
},
{
"part2B": "jav",
"part1": "jv",
"part2T": "jav",
"scope": "I",
"type": "L",
"id": "jav",
"ref_name": "Javanese"
},
{
"part2B": "jpn",
"part1": "ja",
"part2T": "jpn",
"scope": "I",
"type": "L",
"id": "jpn",
"ref_name": "Japanese"
},
{
"part2B": "kan",
"part1": "kn",
"part2T": "kan",
"scope": "I",
"type": "L",
"id": "kan",
"ref_name": "Kannada"
},
{
"part2B": "geo",
"part1": "ka",
"part2T": "kat",
"scope": "I",
"type": "L",
"id": "kat",
"ref_name": "Georgian"
},
{
"part2B": "kaz",
"part1": "kk",
"part2T": "kaz",
"scope": "I",
"type": "L",
"id": "kaz",
"ref_name": "Kazakh"
},
{
"part2B": "khm",
"part1": "km",
"part2T": "khm",
"scope": "I",
"type": "L",
"id": "khm",
"ref_name": "Central Khmer"
},
{
"part2B": "kin",
"part1": "rw",
"part2T": "kin",
"scope": "I",
"type": "L",
"id": "kin",
"ref_name": "Kinyarwanda"
},
{
"part2B": "kir",
"part1": "ky",
"part2T": "kir",
"scope": "I",
"type": "L",
"id": "kir",
"ref_name": "Kirghiz"
},
{
"part2B": "kor",
"part1": "ko",
"part2T": "kor",
"scope": "I",
"type": "L",
"id": "kor",
"ref_name": "Korean"
},
{
"part2B": "kur",
"part1": "ku",
"part2T": "kur",
"scope": "M",
"type": "L",
"id": "kur",
"ref_name": "Kurdish"
},
{
"part2B": "lao",
"part1": "lo",
"part2T": "lao",
"scope": "I",
"type": "L",
"id": "lao",
"ref_name": "Lao"
},
{
"part2B": "lat",
"part1": "la",
"part2T": "lat",
"scope": "I",
"type": "A",
"id": "lat",
"ref_name": "Latin"
},
{
"part2B": "lav",
"part1": "lv",
"part2T": "lav",
"scope": "M",
"type": "L",
"id": "lav",
"ref_name": "Latvian"
},
{
"part2B": "lit",
"part1": "lt",
"part2T": "lit",
"scope": "I",
"type": "L",
"id": "lit",
"ref_name": "Lithuanian"
},
{
"part2B": "ltz",
"part1": "lb",
"part2T": "ltz",
"scope": "I",
"type": "L",
"id": "ltz",
"ref_name": "Luxembourgish"
},
{
"part2B": "mal",
"part1": "ml",
"part2T": "mal",
"scope": "I",
"type": "L",
"id": "mal",
"ref_name": "Malayalam"
},
{
"part2B": "mar",
"part1": "mr",
"part2T": "mar",
"scope": "I",
"type": "L",
"id": "mar",
"ref_name": "Marathi"
},
{
"part2B": "mac",
"part1": "mk",
"part2T": "mkd",
"scope": "I",
"type": "L",
"id": "mkd",
"ref_name": "Macedonian"
},
{
"part2B": "mlg",
"part1": "mg",
"part2T": "mlg",
"scope": "M",
"type": "L",
"id": "mlg",
"ref_name": "Malagasy"
},
{
"part2B": "mlt",
"part1": "mt",
"part2T": "mlt",
"scope": "I",
"type": "L",
"id": "mlt",
"ref_name": "Maltese"
},
{
"part2B": "mon",
"part1": "mn",
"part2T": "mon",
"scope": "M",
"type": "L",
"id": "mon",
"ref_name": "Mongolian"
},
{
"part2B": "may",
"part1": "ms",
"part2T": "msa",
"scope": "M",
"type": "L",
"id": "msa",
"ref_name": "Malay (macrolanguage)"
},
{
"part2B": "nep",
"part1": "ne",
"part2T": "nep",
"scope": "M",
"type": "L",
"id": "nep",
"ref_name": "Nepali macrolanguage whose constituent members are Nepali [npi] and Dotyali [dty]"
},
{
"part2B": "dut",
"part1": "nl",
"part2T": "nld",
"scope": "I",
"type": "L",
"id": "nld",
"ref_name": "Dutch"
},
{
"part2B": "nno",
"part1": "nn",
"part2T": "nno",
"scope": "I",
"type": "L",
"id": "nno",
"ref_name": "Norwegian Nynorsk"
},
{
"part2B": "nob",
"part1": "nb",
"part2T": "nob",
"scope": "I",
"type": "L",
"id": "nob",
"ref_name": "Norwegian Bokmål"
},
{
"part2B": "nor",
"part1": "no",
"part2T": "nor",
"scope": "M",
"type": "L",
"id": "nor",
"ref_name": "Norwegian"
},
{
"part2B": "oci",
"part1": "oc",
"part2T": "oci",
"scope": "I",
"type": "L",
"id": "oci",
"ref_name": "Occitan (post 1500)"
},
{
"part2B": "ori",
"part1": "or",
"part2T": "ori",
"scope": "M",
"type": "L",
"id": "ori",
"ref_name": "Oriya macrolanguage with constituent members Oriya [ory] and Sambalpuri [spv]"
},
{
"part2B": "pan",
"part1": "pa",
"part2T": "pan",
"scope": "I",
"type": "L",
"id": "pan",
"ref_name": "Panjabi"
},
{
"part2B": "pol",
"part1": "pl",
"part2T": "pol",
"scope": "I",
"type": "L",
"id": "pol",
"ref_name": "Polish"
},
{
"part2B": "por",
"part1": "pt",
"part2T": "por",
"scope": "I",
"type": "L",
"id": "por",
"ref_name": "Portuguese"
},
{
"part2B": "pus",
"part1": "ps",
"part2T": "pus",
"scope": "M",
"type": "L",
"id": "pus",
"ref_name": "Pushto"
},
{
"part2B": "que",
"part1": "qu",
"part2T": "que",
"scope": "M",
"type": "L",
"id": "que",
"ref_name": "Quechua"
},
{
"part2B": "rum",
"part1": "ro",
"part2T": "ron",
"scope": "I",
"type": "L",
"id": "ron",
"ref_name": "Romanian"
},
{
"part2B": "rus",
"part1": "ru",
"part2T": "rus",
"scope": "I",
"type": "L",
"id": "rus",
"ref_name": "Russian"
},
{
"part2B": "sin",
"part1": "si",
"part2T": "sin",
"scope": "I",
"type": "L",
"id": "sin",
"ref_name": "Sinhala"
},
{
"part2B": "slo",
"part1": "sk",
"part2T": "slk",
"scope": "I",
"type": "L",
"id": "slk",
"ref_name": "Slovak"
},
{
"part2B": "slv",
"part1": "sl",
"part2T": "slv",
"scope": "I",
"type": "L",
"id": "slv",
"ref_name": "Slovenian"
},
{
"part2B": "sme",
"part1": "se",
"part2T": "sme",
"scope": "I",
"type": "L",
"id": "sme",
"ref_name": "Northern Sami"
},
{
"part2B": "spa",
"part1": "es",
"part2T": "spa",
"scope": "I",
"type": "L",
"id": "spa",
"ref_name": "Spanish"
},
{
"part2B": "alb",
"part1": "sq",
"part2T": "sqi",
"scope": "M",
"type": "L",
"id": "sqi",
"ref_name": "Albanian"
},
{
"part2B": "srp",
"part1": "sr",
"part2T": "srp",
"scope": "I",
"type": "L",
"id": "srp",
"ref_name": "Serbian"
},
{
"part2B": "swa",
"part1": "sw",
"part2T": "swa",
"scope": "M",
"type": "L",
"id": "swa",
"ref_name": "Swahili (macrolanguage)"
},
{
"part2B": "swe",
"part1": "sv",
"part2T": "swe",
"scope": "I",
"type": "L",
"id": "swe",
"ref_name": "Swedish"
},
{
"part2B": "tam",
"part1": "ta",
"part2T": "tam",
"scope": "I",
"type": "L",
"id": "tam",
"ref_name": "Tamil"
},
{
"part2B": "tel",
"part1": "te",
"part2T": "tel",
"scope": "I",
"type": "L",
"id": "tel",
"ref_name": "Telugu"
},
{
"part2B": "tgl",
"part1": "tl",
"part2T": "tgl",
"scope": "I",
"type": "L",
"id": "tgl",
"ref_name": "Tagalog"
},
{
"part2B": "tha",
"part1": "th",
"part2T": "tha",
"scope": "I",
"type": "L",
"id": "tha",
"ref_name": "Thai"
},
{
"part2B": "tur",
"part1": "tr",
"part2T": "tur",
"scope": "I",
"type": "L",
"id": "tur",
"ref_name": "Turkish"
},
{
"part2B": "uig",
"part1": "ug",
"part2T": "uig",
"scope": "I",
"type": "L",
"id": "uig",
"ref_name": "Uighur"
},
{
"part2B": "ukr",
"part1": "uk",
"part2T": "ukr",
"scope": "I",
"type": "L",
"id": "ukr",
"ref_name": "Ukrainian"
},
{
"part2B": "urd",
"part1": "ur",
"part2T": "urd",
"scope": "I",
"type": "L",
"id": "urd",
"ref_name": "Urdu"
},
{
"part2B": "vie",
"part1": "vi",
"part2T": "vie",
"scope": "I",
"type": "L",
"id": "vie",
"ref_name": "Vietnamese"
},
{
"part2B": "vol",
"part1": "vo",
"part2T": "vol",
"scope": "I",
"type": "C",
"id": "vol",
"ref_name": "Volapük"
},
{
"part2B": "wln",
"part1": "wa",
"part2T": "wln",
"scope": "I",
"type": "L",
"id": "wln",
"ref_name": "Walloon"
},
{
"part2B": "xho",
"part1": "xh",
"part2T": "xho",
"scope": "I",
"type": "L",
"id": "xho",
"ref_name": "Xhosa"
},
{
"part2B": "chi",
"part1": "zh",
"part2T": "zho",
"scope": "M",
"type": "L",
"id": "zho",
"ref_name": "Chinese"
},
{
"part2B": "zul",
"part1": "zu",
"part2T": "zul",
"scope": "I",
"type": "L",
"id": "zul",
"ref_name": "Zulu"
},
{
"part2B": "ind",
"part1": "id",
"part2T": "ind",
"scope": "I",
"type": "L",
"id": "ind",
"ref_name": "Indonesian"
}
]
# Lookup table: ISO 639-1 code (the `part1` field) -> full language record.
# The server answers with these two-letter codes, so this is how a result is
# mapped to its ISO 639-3 id and readable name.
iso639_1_index = {}
langid_langs.forEach (d) ->
  iso639_1_index[d.part1] = d

# Layout constants, in pixels.
MARGIN = 40
BAR_HEIGHT = 100

svg = d3.select('svg')

# Measure the drawing surface once; the size is read at load time only.
bbox = svg.node().getBoundingClientRect()
width = bbox.width
height = bbox.height

# Maps a score in [0, 1] to a width in pixels, leaving MARGIN free on each side.
x = d3.scale.linear()
  .domain([0, 1])
  .range([0, width - 2 * MARGIN])

color = d3.scale.category20b()

# translate a group for accommodating the plot
# x scale has to start from zero, without margin
plot = svg.append('g')
  .attr
    transform: "translate(#{MARGIN}, #{2*MARGIN})"

# The handlers wrap `detect` in a function because `detect` is assigned
# further down; it only has to exist by the time an event fires.
d3.select('#detect_btn').on 'click', () -> detect()
d3.select('#text_input').on 'keyup', () -> detect() if d3.event.keyCode is 13 and d3.event.ctrlKey # CTRL+ENTER
# Sorted language codes seen in the first response. Declared outside the
# request callback so that it survives between calls: the color domain is
# fixed once and every language keeps its color across detections.
langs = null

# Sends the textarea content to main.php and redraws the stacked bar, one
# segment per language. `data` is read as a list of [code, score] pairs; the
# width of a segment is x(score).
# NOTE(review): scores are assumed to lie in [0, 1] (the domain of `x`) - confirm.
detect = () ->
  body = d3.select('body')
  body.classed('busy', true)
  text = d3.select('#text_input').node().value

  d3.json('main.php')
    .post text, (error, data) ->
      # Leave the busy state before anything can throw, otherwise a failed
      # request would keep the page dimmed forever.
      body.classed('busy', false)
      throw error if error

      # keep the same color-language pairs between sessions
      if not langs?
        langs = data.map (d) -> d[0]
        langs.sort()
        color.domain(langs)

      # Left edge of each segment = total width of the segments before it.
      # Computed once and shared by the bars and the labels.
      offsets = []
      running = 0
      for pair in data
        offsets.push running
        running += x(pair[1])

      # Fall back to the raw code when it is missing from the index.
      name_of = (d) -> iso639_1_index[d[0]]?.ref_name ? d[0]
      id_of = (d) -> iso639_1_index[d[0]]?.id ? d[0]
      percent = d3.format('%')

      # score rects
      scores = plot.selectAll('.score')
        .data data, (d) -> d[0]

      enter_scores = scores.enter().append('rect')
        .attr
          class: 'score'
          height: BAR_HEIGHT
          fill: (d) -> color(d[0])

      # the <title> child provides the hover tooltip
      enter_scores.append('title')

      scores
        .attr
          x: (d, i) -> offsets[i]
          width: (d) -> x(d[1])

      scores.select('title')
        .text (d) -> "#{name_of(d)} #{percent(d[1])}"

      scores.exit()
        .remove()

      # score labels
      labels = plot.selectAll('.label')
        .data data, (d) -> d[0]

      labels.enter().append('text')
        .text(id_of)
        .attr
          class: 'label'
          dy: '0.35em'

      # labels are centered on their segment and hidden on segments narrower than 30px
      labels
        .classed('hidden', (d) -> x(d[1]) < 30)
        .attr
          x: (d, i) -> offsets[i] + x(d[1]) / 2
          y: BAR_HEIGHT / 2

      labels.exit()
        .remove()
/* Full-viewport page with no scrollbars. */
html, body {
margin: 0;
padding: 0;
width: 100%;
height: 100%;
overflow: hidden;
}
/* Children of body are stacked vertically. */
body {
display: flex;
flex-direction: column;
}
/* Fixed-height input toolbar; its children are laid out in a row. */
.bar {
height: 120px;
background: whitesmoke;
display: flex;
flex-direction: row;
align-items: stretch;
padding: 8px;
box-shadow: 0 -2px 10px 0px black;
}
/* The textarea takes all the horizontal space left in the toolbar. */
#text_input {
flex-grow: 1;
resize: none;
font-family: inherit;
margin-right: 6px;
}
#detect_btn {
padding: 12px;
}
/* The drawing surface fills the vertical space left by the other flex children of its container. */
svg {
flex-grow: 1;
width: 100%;
}
/* One rect per language score; slightly translucent until hovered. */
.score {
stroke: white;
stroke-width: 1px;
shape-rendering: crispEdges;
fill-opacity: 0.9;
}
.score:hover {
fill-opacity: 1;
}
/* Language code label; pointer-events: none makes the label itself ignore the mouse. */
.label {
font-family: sans-serif;
text-anchor: middle;
text-transform: uppercase;
font-size: 12px;
pointer-events: none;
}
/* The script adds the "hidden" class to labels whose segment is too narrow. */
.hidden.label {
display: none;
}
/* The script sets "busy" on body while a detection request is pending. */
.busy svg {
opacity: 0.5;
}
.busy * {
cursor: progress !important;
}
<!DOCTYPE html>
<html>
  <head>
    <meta charset="utf-8">
    <title>langid.py</title>
    <link type="text/css" href="index.css" rel="stylesheet"/>
    <!-- Loaded over HTTPS so that the library is not blocked as mixed content
         when the page itself is served over HTTPS. -->
    <script src="https://d3js.org/d3.v3.min.js"></script>
  </head>
  <body>
    <!-- Input toolbar: the text to analyse and the button that triggers detection. -->
    <div class="bar">
      <textarea id="text_input" spellcheck="false">This is a simple English sentence.</textarea>
      <button id="detect_btn">Detect language<br>(CTRL+ENTER)</button>
    </div>
    <!-- Empty drawing surface; the script appends the plot to it. -->
    <svg></svg>
    <!-- Placed after the elements above because the script selects them as soon as it runs. -->
    <script src="index.js"></script>
  </body>
</html>
// Generated by CoffeeScript 1.10.0
(function() {
var BAR_HEIGHT, MARGIN, color, detect, height, iso639_1_index, langid_langs, plot, svg, width, x;
langid_langs = [
{
"part2B": "afr",
"part1": "af",
"part2T": "afr",
"scope": "I",
"type": "L",
"id": "afr",
"ref_name": "Afrikaans"
}, {
"part2B": "amh",
"part1": "am",
"part2T": "amh",
"scope": "I",
"type": "L",
"id": "amh",
"ref_name": "Amharic"
}, {
"part2B": "arg",
"part1": "an",
"part2T": "arg",
"scope": "I",
"type": "L",
"id": "arg",
"ref_name": "Aragonese"
}, {
"part2B": "ara",
"part1": "ar",
"part2T": "ara",
"scope": "M",
"type": "L",
"id": "ara",
"ref_name": "Arabic"
}, {
"part2B": "asm",
"part1": "as",
"part2T": "asm",
"scope": "I",
"type": "L",
"id": "asm",
"ref_name": "Assamese"
}, {
"part2B": "aze",
"part1": "az",
"part2T": "aze",
"scope": "M",
"type": "L",
"id": "aze",
"ref_name": "Azerbaijani"
}, {
"part2B": "bel",
"part1": "be",
"part2T": "bel",
"scope": "I",
"type": "L",
"id": "bel",
"ref_name": "Belarusian"
}, {
"part2B": "bul",
"part1": "bg",
"part2T": "bul",
"scope": "I",
"type": "L",
"id": "bul",
"ref_name": "Bulgarian"
}, {
"part2B": "ben",
"part1": "bn",
"part2T": "ben",
"scope": "I",
"type": "L",
"id": "ben",
"ref_name": "Bengali"
}, {
"part2B": "bre",
"part1": "br",
"part2T": "bre",
"scope": "I",
"type": "L",
"id": "bre",
"ref_name": "Breton"
}, {
"part2B": "bos",
"part1": "bs",
"part2T": "bos",
"scope": "I",
"type": "L",
"id": "bos",
"ref_name": "Bosnian"
}, {
"part2B": "cat",
"part1": "ca",
"part2T": "cat",
"scope": "I",
"type": "L",
"id": "cat",
"ref_name": "Catalan"
}, {
"part2B": "cze",
"part1": "cs",
"part2T": "ces",
"scope": "I",
"type": "L",
"id": "ces",
"ref_name": "Czech"
}, {
"part2B": "wel",
"part1": "cy",
"part2T": "cym",
"scope": "I",
"type": "L",
"id": "cym",
"ref_name": "Welsh"
}, {
"part2B": "dan",
"part1": "da",
"part2T": "dan",
"scope": "I",
"type": "L",
"id": "dan",
"ref_name": "Danish"
}, {
"part2B": "ger",
"part1": "de",
"part2T": "deu",
"scope": "I",
"type": "L",
"id": "deu",
"ref_name": "German"
}, {
"part2B": "dzo",
"part1": "dz",
"part2T": "dzo",
"scope": "I",
"type": "L",
"id": "dzo",
"ref_name": "Dzongkha"
}, {
"part2B": "gre",
"part1": "el",
"part2T": "ell",
"scope": "I",
"type": "L",
"id": "ell",
"ref_name": "Modern Greek (1453-)"
}, {
"part2B": "eng",
"part1": "en",
"part2T": "eng",
"scope": "I",
"type": "L",
"id": "eng",
"ref_name": "English"
}, {
"part2B": "epo",
"part1": "eo",
"part2T": "epo",
"scope": "I",
"type": "C",
"id": "epo",
"ref_name": "Esperanto"
}, {
"part2B": "est",
"part1": "et",
"part2T": "est",
"scope": "M",
"type": "L",
"id": "est",
"ref_name": "Estonian"
}, {
"part2B": "baq",
"part1": "eu",
"part2T": "eus",
"scope": "I",
"type": "L",
"id": "eus",
"ref_name": "Basque"
}, {
"part2B": "fao",
"part1": "fo",
"part2T": "fao",
"scope": "I",
"type": "L",
"id": "fao",
"ref_name": "Faroese"
}, {
"part2B": "per",
"part1": "fa",
"part2T": "fas",
"scope": "M",
"type": "L",
"id": "fas",
"ref_name": "Persian"
}, {
"part2B": "fin",
"part1": "fi",
"part2T": "fin",
"scope": "I",
"type": "L",
"id": "fin",
"ref_name": "Finnish"
}, {
"part2B": "fre",
"part1": "fr",
"part2T": "fra",
"scope": "I",
"type": "L",
"id": "fra",
"ref_name": "French"
}, {
"part2B": "gle",
"part1": "ga",
"part2T": "gle",
"scope": "I",
"type": "L",
"id": "gle",
"ref_name": "Irish"
}, {
"part2B": "glg",
"part1": "gl",
"part2T": "glg",
"scope": "I",
"type": "L",
"id": "glg",
"ref_name": "Galician"
}, {
"part2B": "guj",
"part1": "gu",
"part2T": "guj",
"scope": "I",
"type": "L",
"id": "guj",
"ref_name": "Gujarati"
}, {
"part2B": "hat",
"part1": "ht",
"part2T": "hat",
"scope": "I",
"type": "L",
"id": "hat",
"ref_name": "Haitian"
}, {
"part2B": "heb",
"part1": "he",
"part2T": "heb",
"scope": "I",
"type": "L",
"id": "heb",
"ref_name": "Hebrew"
}, {
"part2B": "hin",
"part1": "hi",
"part2T": "hin",
"scope": "I",
"type": "L",
"id": "hin",
"ref_name": "Hindi"
}, {
"part2B": "hrv",
"part1": "hr",
"part2T": "hrv",
"scope": "I",
"type": "L",
"id": "hrv",
"ref_name": "Croatian"
}, {
"part2B": "hun",
"part1": "hu",
"part2T": "hun",
"scope": "I",
"type": "L",
"id": "hun",
"ref_name": "Hungarian"
}, {
"part2B": "arm",
"part1": "hy",
"part2T": "hye",
"scope": "I",
"type": "L",
"id": "hye",
"ref_name": "Armenian"
}, {
"part2B": "ice",
"part1": "is",
"part2T": "isl",
"scope": "I",
"type": "L",
"id": "isl",
"ref_name": "Icelandic"
}, {
"part2B": "ita",
"part1": "it",
"part2T": "ita",
"scope": "I",
"type": "L",
"id": "ita",
"ref_name": "Italian"
}, {
"part2B": "jav",
"part1": "jv",
"part2T": "jav",
"scope": "I",
"type": "L",
"id": "jav",
"ref_name": "Javanese"
}, {
"part2B": "jpn",
"part1": "ja",
"part2T": "jpn",
"scope": "I",
"type": "L",
"id": "jpn",
"ref_name": "Japanese"
}, {
"part2B": "kan",
"part1": "kn",
"part2T": "kan",
"scope": "I",
"type": "L",
"id": "kan",
"ref_name": "Kannada"
}, {
"part2B": "geo",
"part1": "ka",
"part2T": "kat",
"scope": "I",
"type": "L",
"id": "kat",
"ref_name": "Georgian"
}, {
"part2B": "kaz",
"part1": "kk",
"part2T": "kaz",
"scope": "I",
"type": "L",
"id": "kaz",
"ref_name": "Kazakh"
}, {
"part2B": "khm",
"part1": "km",
"part2T": "khm",
"scope": "I",
"type": "L",
"id": "khm",
"ref_name": "Central Khmer"
}, {
"part2B": "kin",
"part1": "rw",
"part2T": "kin",
"scope": "I",
"type": "L",
"id": "kin",
"ref_name": "Kinyarwanda"
}, {
"part2B": "kir",
"part1": "ky",
"part2T": "kir",
"scope": "I",
"type": "L",
"id": "kir",
"ref_name": "Kirghiz"
}, {
"part2B": "kor",
"part1": "ko",
"part2T": "kor",
"scope": "I",
"type": "L",
"id": "kor",
"ref_name": "Korean"
}, {
"part2B": "kur",
"part1": "ku",
"part2T": "kur",
"scope": "M",
"type": "L",
"id": "kur",
"ref_name": "Kurdish"
}, {
"part2B": "lao",
"part1": "lo",
"part2T": "lao",
"scope": "I",
"type": "L",
"id": "lao",
"ref_name": "Lao"
}, {
"part2B": "lat",
"part1": "la",
"part2T": "lat",
"scope": "I",
"type": "A",
"id": "lat",
"ref_name": "Latin"
}, {
"part2B": "lav",
"part1": "lv",
"part2T": "lav",
"scope": "M",
"type": "L",
"id": "lav",
"ref_name": "Latvian"
}, {
"part2B": "lit",
"part1": "lt",
"part2T": "lit",
"scope": "I",
"type": "L",
"id": "lit",
"ref_name": "Lithuanian"
}, {
"part2B": "ltz",
"part1": "lb",
"part2T": "ltz",
"scope": "I",
"type": "L",
"id": "ltz",
"ref_name": "Luxembourgish"
}, {
"part2B": "mal",
"part1": "ml",
"part2T": "mal",
"scope": "I",
"type": "L",
"id": "mal",
"ref_name": "Malayalam"
}, {
"part2B": "mar",
"part1": "mr",
"part2T": "mar",
"scope": "I",
"type": "L",
"id": "mar",
"ref_name": "Marathi"
}, {
"part2B": "mac",
"part1": "mk",
"part2T": "mkd",
"scope": "I",
"type": "L",
"id": "mkd",
"ref_name": "Macedonian"
}, {
"part2B": "mlg",
"part1": "mg",
"part2T": "mlg",
"scope": "M",
"type": "L",
"id": "mlg",
"ref_name": "Malagasy"
}, {
"part2B": "mlt",
"part1": "mt",
"part2T": "mlt",
"scope": "I",
"type": "L",
"id": "mlt",
"ref_name": "Maltese"
}, {
"part2B": "mon",
"part1": "mn",
"part2T": "mon",
"scope": "M",
"type": "L",
"id": "mon",
"ref_name": "Mongolian"
}, {
"part2B": "may",
"part1": "ms",
"part2T": "msa",
"scope": "M",
"type": "L",
"id": "msa",
"ref_name": "Malay (macrolanguage)"
}, {
"part2B": "nep",
"part1": "ne",
"part2T": "nep",
"scope": "M",
"type": "L",
"id": "nep",
"ref_name": "Nepali macrolanguage whose constituent members are Nepali [npi] and Dotyali [dty]"
}, {
"part2B": "dut",
"part1": "nl",
"part2T": "nld",
"scope": "I",
"type": "L",
"id": "nld",
"ref_name": "Dutch"
}, {
"part2B": "nno",
"part1": "nn",
"part2T": "nno",
"scope": "I",
"type": "L",
"id": "nno",
"ref_name": "Norwegian Nynorsk"
}, {
"part2B": "nob",
"part1": "nb",
"part2T": "nob",
"scope": "I",
"type": "L",
"id": "nob",
"ref_name": "Norwegian Bokmål"
}, {
"part2B": "nor",
"part1": "no",
"part2T": "nor",
"scope": "M",
"type": "L",
"id": "nor",
"ref_name": "Norwegian"
}, {
"part2B": "oci",
"part1": "oc",
"part2T": "oci",
"scope": "I",
"type": "L",
"id": "oci",
"ref_name": "Occitan (post 1500)"
}, {
"part2B": "ori",
"part1": "or",
"part2T": "ori",
"scope": "M",
"type": "L",
"id": "ori",
"ref_name": "Oriya macrolanguage with constituent members Oriya [ory] and Sambalpuri [spv]"
}, {
"part2B": "pan",
"part1": "pa",
"part2T": "pan",
"scope": "I",
"type": "L",
"id": "pan",
"ref_name": "Panjabi"
}, {
"part2B": "pol",
"part1": "pl",
"part2T": "pol",
"scope": "I",
"type": "L",
"id": "pol",
"ref_name": "Polish"
}, {
"part2B": "por",
"part1": "pt",
"part2T": "por",
"scope": "I",
"type": "L",
"id": "por",
"ref_name": "Portuguese"
}, {
"part2B": "pus",
"part1": "ps",
"part2T": "pus",
"scope": "M",
"type": "L",
"id": "pus",
"ref_name": "Pushto"
}, {
"part2B": "que",
"part1": "qu",
"part2T": "que",
"scope": "M",
"type": "L",
"id": "que",
"ref_name": "Quechua"
}, {
"part2B": "rum",
"part1": "ro",
"part2T": "ron",
"scope": "I",
"type": "L",
"id": "ron",
"ref_name": "Romanian"
}, {
"part2B": "rus",
"part1": "ru",
"part2T": "rus",
"scope": "I",
"type": "L",
"id": "rus",
"ref_name": "Russian"
}, {
"part2B": "sin",
"part1": "si",
"part2T": "sin",
"scope": "I",
"type": "L",
"id": "sin",
"ref_name": "Sinhala"
}, {
"part2B": "slo",
"part1": "sk",
"part2T": "slk",
"scope": "I",
"type": "L",
"id": "slk",
"ref_name": "Slovak"
}, {
"part2B": "slv",
"part1": "sl",
"part2T": "slv",
"scope": "I",
"type": "L",
"id": "slv",
"ref_name": "Slovenian"
}, {
"part2B": "sme",
"part1": "se",
"part2T": "sme",
"scope": "I",
"type": "L",
"id": "sme",
"ref_name": "Northern Sami"
}, {
"part2B": "spa",
"part1": "es",
"part2T": "spa",
"scope": "I",
"type": "L",
"id": "spa",
"ref_name": "Spanish"
}, {
"part2B": "alb",
"part1": "sq",
"part2T": "sqi",
"scope": "M",
"type": "L",
"id": "sqi",
"ref_name": "Albanian"
}, {
"part2B": "srp",
"part1": "sr",
"part2T": "srp",
"scope": "I",
"type": "L",
"id": "srp",
"ref_name": "Serbian"
}, {
"part2B": "swa",
"part1": "sw",
"part2T": "swa",
"scope": "M",
"type": "L",
"id": "swa",
"ref_name": "Swahili (macrolanguage)"
}, {
"part2B": "swe",
"part1": "sv",
"part2T": "swe",
"scope": "I",
"type": "L",
"id": "swe",
"ref_name": "Swedish"
}, {
"part2B": "tam",
"part1": "ta",
"part2T": "tam",
"scope": "I",
"type": "L",
"id": "tam",
"ref_name": "Tamil"
}, {
"part2B": "tel",
"part1": "te",
"part2T": "tel",
"scope": "I",
"type": "L",
"id": "tel",
"ref_name": "Telugu"
}, {
"part2B": "tgl",
"part1": "tl",
"part2T": "tgl",
"scope": "I",
"type": "L",
"id": "tgl",
"ref_name": "Tagalog"
}, {
"part2B": "tha",
"part1": "th",
"part2T": "tha",
"scope": "I",
"type": "L",
"id": "tha",
"ref_name": "Thai"
}, {
"part2B": "tur",
"part1": "tr",
"part2T": "tur",
"scope": "I",
"type": "L",
"id": "tur",
"ref_name": "Turkish"
}, {
"part2B": "uig",
"part1": "ug",
"part2T": "uig",
"scope": "I",
"type": "L",
"id": "uig",
"ref_name": "Uighur"
}, {
"part2B": "ukr",
"part1": "uk",
"part2T": "ukr",
"scope": "I",
"type": "L",
"id": "ukr",
"ref_name": "Ukrainian"
}, {
"part2B": "urd",
"part1": "ur",
"part2T": "urd",
"scope": "I",
"type": "L",
"id": "urd",
"ref_name": "Urdu"
}, {
"part2B": "vie",
"part1": "vi",
"part2T": "vie",
"scope": "I",
"type": "L",
"id": "vie",
"ref_name": "Vietnamese"
}, {
"part2B": "vol",
"part1": "vo",
"part2T": "vol",
"scope": "I",
"type": "C",
"id": "vol",
"ref_name": "Volapük"
}, {
"part2B": "wln",
"part1": "wa",
"part2T": "wln",
"scope": "I",
"type": "L",
"id": "wln",
"ref_name": "Walloon"
}, {
"part2B": "xho",
"part1": "xh",
"part2T": "xho",
"scope": "I",
"type": "L",
"id": "xho",
"ref_name": "Xhosa"
}, {
"part2B": "chi",
"part1": "zh",
"part2T": "zho",
"scope": "M",
"type": "L",
"id": "zho",
"ref_name": "Chinese"
}, {
"part2B": "zul",
"part1": "zu",
"part2T": "zul",
"scope": "I",
"type": "L",
"id": "zul",
"ref_name": "Zulu"
}, {
"part2B": "ind",
"part1": "id",
"part2T": "ind",
"scope": "I",
"type": "L",
"id": "ind",
"ref_name": "Indonesian"
}
];
// Lookup table: ISO 639-1 code (the `part1` field) -> full language record.
iso639_1_index = langid_langs.reduce(function(index, lang) {
  index[lang.part1] = lang;
  return index;
}, {});

// Layout constants, in pixels.
MARGIN = 40;
BAR_HEIGHT = 100;

// Drawing surface and its size as measured at load time.
svg = d3.select('svg');
width = svg.node().getBoundingClientRect().width;
height = svg.node().getBoundingClientRect().height;

// Maps a score in [0, 1] to a width in pixels, leaving MARGIN free on each side.
x = d3.scale.linear();
x.domain([0, 1]);
x.range([0, width - 2 * MARGIN]);

color = d3.scale.category20b();

// Group holding the plot, shifted so that the x scale can start from zero.
plot = svg.append('g');
plot.attr('transform', "translate(" + MARGIN + ", " + (2 * MARGIN) + ")");

// The handlers call `detect` through a wrapper because `detect` is only
// assigned further down; it has to exist by the time an event fires.
d3.select('#detect_btn').on('click', function() {
  return detect();
});

// CTRL+ENTER inside the textarea triggers a detection as well.
d3.select('#text_input').on('keyup', function() {
  var isCtrlEnter = d3.event.keyCode === 13 && d3.event.ctrlKey;
  if (!isCtrlEnter) {
    return;
  }
  return detect();
});
// Sorted language codes seen in the first response. Declared outside the
// request callback so that it survives between calls: the color domain is
// fixed once and every language keeps its color across detections.
var langs = null;

/**
 * Sends the textarea content to main.php and redraws the stacked bar, one
 * segment per language.
 *
 * The response is read as a list of [code, score] pairs; the width of a
 * segment is x(score).
 * NOTE(review): scores are assumed to lie in [0, 1] (the domain of `x`) - confirm.
 *
 * @returns {*} whatever d3's `.post()` call returns for the pending request.
 * @throws rethrows, from the response callback, the error reported by d3.
 */
detect = function() {
  var body = d3.select('body');
  var text;

  body.classed('busy', true);
  text = d3.select('#text_input').node().value;

  return d3.json('main.php').post(text, function(error, data) {
    var offsets = [];
    var running = 0;
    var percent = d3.format('%');
    var scores, enterScores, labels;

    // Fall back to the raw code when it is missing from the index.
    function nameOf(d) {
      var entry = iso639_1_index[d[0]];
      return entry ? entry.ref_name : d[0];
    }

    function idOf(d) {
      var entry = iso639_1_index[d[0]];
      return entry ? entry.id : d[0];
    }

    // Leave the busy state before anything can throw, otherwise a failed
    // request would keep the page dimmed forever.
    body.classed('busy', false);
    if (error) {
      throw error;
    }

    // Keep the same color-language pairs between sessions.
    if (langs === null) {
      langs = data.map(function(d) {
        return d[0];
      });
      langs.sort();
      color.domain(langs);
    }

    // Left edge of each segment = total width of the segments before it.
    // Computed once and shared by the bars and the labels.
    data.forEach(function(d) {
      offsets.push(running);
      running += x(d[1]);
    });

    // Score rects, joined on the language code.
    scores = plot.selectAll('.score').data(data, function(d) {
      return d[0];
    });
    enterScores = scores.enter().append('rect').attr({
      "class": 'score',
      height: BAR_HEIGHT,
      fill: function(d) {
        return color(d[0]);
      }
    });
    // The <title> child provides the hover tooltip.
    enterScores.append('title');
    scores.attr({
      x: function(d, i) {
        return offsets[i];
      },
      width: function(d) {
        return x(d[1]);
      }
    });
    scores.select('title').text(function(d) {
      return nameOf(d) + " " + percent(d[1]);
    });
    scores.exit().remove();

    // Score labels, centered on their segment and hidden on segments
    // narrower than 30px.
    labels = plot.selectAll('.label').data(data, function(d) {
      return d[0];
    });
    labels.enter().append('text').text(idOf).attr({
      "class": 'label',
      dy: '0.35em'
    });
    labels.classed('hidden', function(d) {
      return x(d[1]) < 30;
    }).attr({
      x: function(d, i) {
        return offsets[i] + x(d[1]) / 2;
      },
      y: BAR_HEIGHT / 2
    });
    labels.exit().remove();
  });
};
}).call(this);
<?php
// Forwards the raw request body to main.py as a single command-line argument
// and echoes whatever the script prints.

$text = file_get_contents('php://input');

// NOTE(review): escapeshellarg() is reported to drop non-ASCII bytes when
// LC_CTYPE is not a UTF-8 locale, which would corrupt most non-English input.
// The first locale available on the host is used - confirm one of them exists.
setlocale(LC_CTYPE, 'C.UTF-8', 'en_US.UTF-8');

// The body is untrusted: quote it as one shell argument so that it cannot
// terminate the argument and inject additional commands.
echo shell_exec('python main.py ' . escapeshellarg($text));
?>
import sys
import json
import langid
# The text to classify is expected as the first command-line argument.
text = sys.argv[1]
# presumably a list of (language code, score) pairs - verify against langid's docs
result = langid.rank(text)
# print() called with one parenthesised argument is valid on both Python 2 and
# Python 3; the former `print x` statement is a SyntaxError on Python 3.
print(json.dumps(result))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment