Created
February 14, 2020 14:35
-
-
Save PonteIneptique/e48a0d6cf0299dbc2e6aa0c03b668033 to your computer and use it in GitHub Desktop.
Protogenie Config
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?xml version="1.0" encoding="UTF-8"?>
<?xml-model href="https://hipster-philology.github.io/protogenie/protogenie/schema.rng"
            schematypens="http://relaxng.org/ns/structure/1.0"?>
<!--
  Protogenie configuration.

  Declares the TAB-separated corpus files to read (./output/*.tsv), the
  columns they contain, a list of post-processing steps applied to those
  columns, and the columns written to the output.
-->
<config>
  <!-- Output table: TAB is the column marker; keys are listed in the desired column order. -->
  <output column_marker="TAB">
    <header name="order">
      <key>token</key>
      <key>lemma</key>
      <key>pos</key>
      <key>Dis</key>
      <key>Entity</key>
      <key>Gend</key>
      <key>Numb</key>
      <key>Case</key>
      <key>Deg</key>
      <key>Mood</key>
      <key>Tense</key>
      <key>Voice</key>
      <key>Person</key>
    </header>
  </output>

  <postprocessing>
    <!-- Extract disambiguation: a trailing "_" followed by digits on the lemma goes to the Dis column. -->
    <disambiguation matchPattern="_(\d+)$" new-column="Dis" source-column="lemma" default="_" />
    <!-- Extract entity marker: a trailing "_" followed by one word character on the lemma goes to the Entity column. -->
    <disambiguation matchPattern="_(\w)$" new-column="Entity" source-column="lemma" default="_" />

    <!--
      Split the morph column into one column per feature. Each pattern captures
      the value of a "Name=value" pair, optionally followed by a "|" separator.
      default="_" is presumably the value used when the feature is absent (confirm
      against the Protogenie documentation).
    -->
    <disambiguation matchPattern="Gend\=([\w-]+)\|?" new-column="Gend" source-column="morph" default="_" />
    <disambiguation matchPattern="Numb\=([\w-]+)\|?" new-column="Numb" source-column="morph" default="_" />
    <disambiguation matchPattern="Case\=([\w-]+)\|?" new-column="Case" source-column="morph" default="_" />
    <disambiguation matchPattern="Deg\=([\w-]+)\|?" new-column="Deg" source-column="morph" default="_" />
    <disambiguation matchPattern="Mood\=([\w-]+)\|?" new-column="Mood" source-column="morph" default="_" />
    <disambiguation matchPattern="Tense\=([\w-]+)\|?" new-column="Tense" source-column="morph" default="_" />
    <disambiguation matchPattern="Voice\=([\w-]+)\|?" new-column="Voice" source-column="morph" default="_" />
    <disambiguation matchPattern="Person\=([\w-]+)\|?" new-column="Person" source-column="morph" default="_" />

    <!--
      "$^" can only match an empty string, so this replaces an empty morph value
      with "_" (presumably what is left once the extractions above have consumed
      every feature; confirm that extraction removes the matched text).
    -->
    <replacement matchPattern="$^" replacementPattern="_">
      <applyTo source="morph">
        <target>morph</target>
      </applyTo>
    </replacement>

    <!--
      Roman numerals: the pattern matches a token made up entirely of an
      upper-case Roman numeral. Each of the four alternatives forces one digit
      group (thousands, hundreds, tens, units) to be non-empty, so the empty
      string never matches. The pattern must stay on one line: a line break
      inside an attribute value is turned into a space by the parser.
      NOTE(review): "form" is mapped to "token" by the default header below;
      confirm that a "form" column still exists when this step runs.
    -->
    <toolbox name="RomanNumeral"
             matchPattern="^(M{1,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})|M{0,4}(CM|C?D|D?C{1,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})|M{0,4}(CM|CD|D?C{0,3})(XC|X?L|L?X{1,3})(IX|IV|V?I{0,3})|M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|I?V|V?I{1,3}))$">
      <applyTo source="token">
        <target>token</target>
        <target>form</target>
      </applyTo>
    </toolbox>

    <!-- Remove punctuation lines: skip rows whose token consists only of non-word characters. -->
    <skip matchPattern="^\W+$" source="token" />
  </postprocessing>

  <!-- Default input header: explicit column list; the "form" column is mapped to "token". -->
  <default-header>
    <header type="explicit">
      <key map-to="token">form</key>
      <key>lemma</key>
      <key>morph</key>
      <key>pos</key>
      <key>index</key>
    </header>
  </default-header>

  <!-- Memory file path; "$file$" is presumably a placeholder filled in by Protogenie (confirm). -->
  <memory path="memory_$file$.csv"/>

  <!-- Input corpora: TAB-separated .tsv files, split on empty lines, using the default header. -->
  <corpora>
    <corpus path="./output/*.tsv" column_marker="TAB">
      <splitter name="empty_line" />
      <header type="default"/>
    </corpus>
  </corpora>
</config>
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment