This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# For every language code listed in langs.txt (one per line), print
# "<lang> <bytes>", where <bytes> is the Content-Length reported by a HEAD
# request for that language's pages-articles dump of DUMP_DATE.
DUMP_DATE="20201001"
# IFS= and -r keep each line verbatim; the `|| [ -n "$lng" ]` guard still
# processes a final line that has no trailing newline.
while IFS= read -r lng || [ -n "$lng" ]; do
    # Skip blank lines so no malformed URL is requested.
    [ -n "$lng" ] || continue
    URL="https://dumps.wikimedia.org/${lng}wiki/${DUMP_DATE}/${lng}wiki-${DUMP_DATE}-pages-articles.xml.bz2"
    #echo $lng
    # HTTP header lines end in CRLF; drop the CR so the printed size is a
    # clean number. Header names are case-insensitive, hence tolower().
    curl -sI "$URL" | tr -d '\r' | awk -v l="$lng" 'tolower($1) == "content-length:" {print l " " $2}'
done < langs.txt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from typing import Dict, List, Tuple, Any | |
from flair.data import Dictionary | |
from flair.embeddings import FlairEmbeddings | |
from overrides import overrides | |
from allennlp.common.checks import ConfigurationError | |
from allennlp.common.util import pad_sequence_to_length | |
from allennlp.data.tokenizers.token import Token | |
from allennlp.data.token_indexers.token_indexer import TokenIndexer |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
import sys | |
def combine(f1, f2, outf, limit=-1, dim=64): | |
words1 = {} | |
words2 = {} | |
with open(f1) as f: | |
for i,line in enumerate(f): | |
if i > limit > -1: |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import torch | |
import matplotlib.pyplot as plt | |
import numpy as np | |
# Print the name of every entry stored in a saved checkpoint file.
# I learned this one in allennlp, hence the name.
p = "path/to/model/best.th"
# Map all tensors to the CPU while loading: a checkpoint written on a GPU
# machine can then be opened on a CPU-only one.
# NOTE(review): torch.load unpickles the file; only use it on trusted checkpoints.
w = torch.load(p, map_location="cpu")
# Presumably `w` is a state dict (name -> tensor); iterating it yields the keys.
for k in w:
    print(k)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import edu.illinois.cs.cogcomp.core.algorithms.LevensteinDistance; | |
import edu.illinois.cs.cogcomp.core.datastructures.Pair; | |
import edu.illinois.cs.cogcomp.core.datastructures.ViewNames; | |
import edu.illinois.cs.cogcomp.core.datastructures.textannotation.Sentence; | |
import edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation; | |
import edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotationUtilities; | |
import edu.illinois.cs.cogcomp.core.datastructures.textannotation.View; | |
import edu.illinois.cs.cogcomp.core.io.LineIO; | |
import edu.illinois.cs.cogcomp.core.utilities.SerializationHelper; | |
import org.apache.commons.io.FilenameUtils; |
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Number of vector lines you want
N=50000
# Input and output paths are the first two command-line arguments.
IN="$1"
OUT="$2"
# Get the dimension from the header.
# The header's second space-separated field is taken as the dimension.
# "$IN" is quoted so paths containing spaces or glob characters still work.
DIM=$(head -n 1 "$IN" | cut -d' ' -f2)
# Actually take the top... |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from allennlp.predictors.predictor import Predictor | |
import time | |
model = "https://s3-us-west-2.amazonaws.com/allennlp/models/ner-model-2018.04.26.tar.gz" | |
print("Loading model...") |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding: utf-8
from itertools import combinations

from gensim.models import KeyedVectors
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Downloaded from fasttext: https://fasttext.cc/docs/en/english-vectors.html
# Converted to word2vec binary format for faster loading (see convert.py)
vec = KeyedVectors.load_word2vec_format(
    "~/data/wiki-news-300d-1M.vec.bin",
    binary=True,
)

# Module-level NLTK helpers: a Porter stemmer and a WordNet lemmatizer.
stemmer = PorterStemmer()
wnl = WordNetLemmatizer()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
from googleapiclient.discovery import build | |
import codecs | |
import HTMLParser | |
import shelve | |
# As of Aug 1 2016 | |
API_KEY = "YOUR_API_KEY_HERE" | |
NewerOlder