This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json | |
import os | |
import argparse | |
parser = argparse.ArgumentParser() | |
parser.add_argument('input_dir', help='Directory with wiki json files') | |
parser.add_argument('output', help='Txt file output') | |
args = parser.parse_args() | |
for filename in os.listdir(args.input_dir): |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import argparse | |
from collections import defaultdict | |
parser = argparse.ArgumentParser() | |
parser.add_argument('input', | |
help='Input file in CoNLLU format') | |
parser.add_argument('-u', action='store_true', dest='upos', | |
help='Use UPOS to disambiguate') | |
parser.add_argument('-x', action='store_true', dest='xpos', | |
help='Use XPOS to disambiguate') |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
''' | |
Functions to read the OpenWordnetPT from RDF files and provide | |
access to it. | |
''' | |
import rdflib | |
from six.moves import cPickle |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import argparse | |
parser = argparse.ArgumentParser() | |
parser.add_argument('input', help='Single embedding file') | |
parser.add_argument('output', help='Output basename without extension') | |
args = parser.parse_args() | |
embeddings_file = args.output + '.npy' | |
vocabulary_file = args.output + '.txt' |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
from __future__ import unicode_literals | |
from nltk.tokenize import RegexpTokenizer | |
import argparse | |
import os | |
""" | |
Script for tokenizing Portuguese text according to the Universal Dependencies | |
(UD) tokenization standards. This script was not created by the UD team; it was |