This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Lexical density: the share of content words (nouns, verbs, adjectives, adverbs) among all words in a text.
Reference: http://web.archive.org/web/20110810174351/http://www.unisanet.unisa.edu.au/Resources/la/Readability/Content%20words%20and%20lexical%20density.htm
""" | |
from __future__ import division | |
import MeCab | |
# Part-of-speech labels treated as content words: noun, verb, adjective, adverb.
# NOTE(review): presumably matched against the POS field of MeCab output; the
# code that consumes this tuple is not visible here -- confirm against it.
CONTENT_WORD_POS = ('名詞', '動詞', '形容詞', '副詞')
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"Airline": "Airline Industry Services", | |
"American Restaurant": "New American Restaurant", | |
"Amusement Park Ride": "Roller Coaster", | |
"Amusement": "Arcade", | |
"Amusement": "Bingo Hall", | |
"Amusement": "Go Karting", | |
"Amusement": "Laser Tag", | |
"Antiques & Vintage": "Antique Store", | |
"Antiques & Vintage": "Auction House", |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"401(K)s": ["Finance", "Investing", "Retirement Investments", "401(K)s"], | |
"Accommodations": ["Travel & Tourism", "Accommodations"], | |
"Accounting & Auditing": ["Finance", "Accounting & Auditing"], | |
"Acne": ["Health", "Health Conditions & Concerns", "Skin Conditions & Skin Health", "Acne"], | |
"Air Travel": ["Travel & Tourism", "Air Travel"], | |
"Airline Tickets, Fares & Flights": ["Travel & Tourism", "Air Travel", "Airline Tickets, Fares & Flights"], | |
"Alternative & Natural Medicine": ["Health", "Health Care Services", "Alternative & Natural Medicine"], | |
"Anti-Aging": ["Beauty & Personal Care", "Anti-Aging"], | |
"Anti-Virus Software": ["Computers", "Software", "Internet Software & Web Goodies", "Network Security Software", "Anti-Virus Software"], |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
import os | |
import re | |
from encodings.aliases import aliases | |
import nkf | |
import tornado | |
from tornado import httpclient, gen | |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Install the mecab-ipadic-NEologd dictionary from a throwaway shallow clone.
# Everything runs in a subshell so that `set -e` and the EXIT trap cannot leak
# into the calling shell if this snippet is sourced or pasted into a script.
(
  set -e
  neologd_dir=/tmp/mecab-ipadic-neologd

  # A checkout left behind by an interrupted earlier run would make
  # `git clone` fail, so start from a clean path.
  rm -rf "$neologd_dir"
  # Remove the clone on every exit path, including a failed clone or install.
  trap 'rm -rf "$neologd_dir"' EXIT

  git clone --depth 1 https://github.com/neologd/mecab-ipadic-neologd.git "$neologd_dir"
  # -n and -y are passed to the NEologd installer unchanged; see its --help
  # output for their exact meaning.
  bash "$neologd_dir/bin/install-mecab-ipadic-neologd" -n -y
)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Build word2vec from source and copy the build output into /usr/local/bin.
#
# The subshell replaces the original pushd/popd pair: the caller's working
# directory is untouched, and `set -e` stays local to this snippet.
# `set -e` matters here: without it, a failed `git clone` or `cd word2vec`
# would leave the shell in /tmp, and the `rm` / `cp *` lines below would then
# operate on the whole of /tmp.
(
  set -e
  cd /tmp
  git clone --depth=1 https://github.com/tmikolov/word2vec
  cd word2vec

  # Swap the <malloc.h> include for <stdlib.h> in every C source before building.
  sed -i -e 's/malloc.h/stdlib.h/g' *.c
  make

  # Drop sources, text files, the makefile and the licence so that `cp *`
  # below copies only what is left in the directory.
  # NOTE(review): `*.c*` presumably also catches backup copies that some sed
  # implementations create for `-i -e` (e.g. `foo.c-e`) -- confirm.
  rm -f *.c* *.txt makefile LICENSE
  cp * /usr/local/bin
)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import heapq | |
from collections import deque | |
class TopK(): | |
def __init__(self, k=5): | |
self.k = k | |
self._initialize() |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
def eliminate_zero_raws(x):
    """Return ``x`` restricted to the rows that hold at least one non-zero entry.

    ("raws" in the name is kept as-is so existing callers keep working; it
    reads as "rows".)

    Args:
        x: An array-like object that provides a numpy-style ``nonzero()``
            method and accepts an integer index array, e.g. a 2-D
            ``numpy.ndarray``.

    Returns:
        The selected rows of ``x``, in ascending row order. When every row is
        all-zero the result has zero rows.
    """
    # nonzero()[0] lists the row index of every non-zero entry, once per
    # entry; unique() collapses the repeats and sorts them.
    nonzero_rows = np.unique(x.nonzero()[0])
    return x[nonzero_rows]
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# ASCII digit character -> kanji numeral. There is no entry for '0'.
CHINESE_MAP = {'1': '一', '2': '二', '3': '三', '4': '四', '5': '五', '6': '六', '7': '七', '8': '八', '9': '九'}
# Unit names for successive decimal places: index k names 10 ** (k + 1),
# from 十 (10) up to 千兆 (10 ** 15).
CHINESE_DIGITS = ('十', '百', '千', '万', '十万', '百万', '千万', '億', '十億', '百億', '千億', '兆', '十兆', '百兆', '千兆')
def arabic2chinese(arabic): | |
chinese = [] | |
if len(arabic) == '0': | |
return '〇' | |
arabic = arabic.replace(',', '') | |
for (i, num) in enumerate(arabic[::-1]): | |
if num == '0': |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
from scipy.sparse import csr_matrix | |
def dict2sparse(d):
    """Convert a ``{column_index: value}`` dict into a one-row CSR matrix.

    Args:
        d: Mapping from non-negative integer column index to value. Values
            are stored as ``numpy.uint32``.

    Returns:
        A ``scipy.sparse.csr_matrix`` of shape ``(1, max(d) + 1)`` holding one
        stored entry per key. An empty mapping yields an empty matrix of
        shape ``(1, 0)``.
    """
    if not d:
        # max() of an empty mapping raises ValueError; with no columns to
        # describe, return an empty row instead of failing.
        return csr_matrix((1, 0), dtype=np.uint32)

    data = list(d.values())
    indices = list(d.keys())
    # A single row: its entries span data[0:len(d)].
    indptr = [0, len(d)]
    return csr_matrix((data, indices, indptr), shape=(1, max(d) + 1), dtype=np.uint32)