-
-
Save lotz84/d0437573405ea58a481f to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(import concurrent.futures) | |
(import sys) | |
(import subprocess) | |
(import time) | |
(import [collections [defaultdict]]) | |
(import [data [sentences]]) | |
(defn sent2kgram [sentence &optional [k (int "2")] [delimiter " "]]
  "Tokenize `sentence` with MeCab and return its word k-grams.

  The sentence is echoed through `mecab -O wakati` in a shell, the output
  is split on whitespace, and every run of `k` consecutive words is joined
  with `delimiter`.

  Returns a list of strings, in sentence order.  The list is empty when the
  sentence has fewer than `k` words."
  (import shlex)
  ;; Quote the sentence before it reaches the shell: unquoted text containing
  ;; quotes, `;`, `$`, `*` and the like would be interpreted by the shell
  ;; instead of being passed to mecab verbatim.
  (setv command (.format "echo {0} | mecab -O wakati" (.quote shlex sentence)))
  (setv words (.split (.strip (.getoutput subprocess command))))
  ;; A sequence of n words contains n - k + 1 windows of width k, so the
  ;; range needs the `+ 1`; without it the final k-gram is dropped.
  (list-comp (.join delimiter (slice words i (+ i k)))
             [i (range (+ (- (len words) k) 1))]))
(defn timef [f]
  "Decorator that reports how long a zero-argument function takes.

  Returns a wrapper that calls `f`, prints `<name>: <elapsed>ms` using the
  elapsed wall-clock time in milliseconds, and then returns whatever `f`
  returned."
  (defn wrapper []
    (setv start (.time time))
    ;; Keep f's return value so a decorated function still hands its result
    ;; back to the caller instead of always returning None.
    (setv result (f))
    (setv end (.time time))
    (print (.format "{0}: {1:.2f}ms" f.__name__ (* (int "1000") (- end start))))
    result)
  wrapper)
;; `with-decorator` is Hy's decorator form; a bare `@timef` token is read as
;; an ordinary symbol rather than applied to the following defn.
(with-decorator timef
  (defn single_process []
    "Build the k-gram -> id table for `sentences` in the current process.

    Each sentence is tokenized with `sent2kgram`, and every k-gram that has
    not been seen before is assigned the next integer id (0, 1, 2, ...) in
    order of first appearance.  The function body evaluates to that mapping."
    ;; The default factory runs before the missing key is inserted, so
    ;; `(len kgram2id)` is exactly the next unused id.
    (setv kgram2id (defaultdict (fn [] (len kgram2id))))
    (for [sentence sentences]
      ;; Iterate eagerly: wrapping the lookup in a lazy `map` that is never
      ;; consumed would leave the table empty.
      (for [kgram (sent2kgram sentence)]
        (get kgram2id kgram)))
    kgram2id))
;; `with-decorator` is Hy's decorator form; a bare `@timef` token is read as
;; an ordinary symbol rather than applied to the following defn.
(with-decorator timef
  (defn multi_process []
    "Build the k-gram -> id table, tokenizing sentences in worker processes.

    `sent2kgram` is mapped over `sentences` with a ProcessPoolExecutor; the
    id assignment itself happens in this process as each result arrives.
    Every k-gram not seen before is assigned the next integer id
    (0, 1, 2, ...).  The function body evaluates to that mapping."
    ;; The default factory runs before the missing key is inserted, so
    ;; `(len kgram2id)` is exactly the next unused id.
    (setv kgram2id (defaultdict (fn [] (len kgram2id))))
    (with [[executor (.ProcessPoolExecutor concurrent.futures)]]
      (for [res_kgram (.map executor sent2kgram sentences)]
        ;; Iterate eagerly: wrapping the lookup in a lazy `map` that is never
        ;; consumed would leave the table empty.
        (for [kgram res_kgram]
          (get kgram2id kgram))))
    kgram2id))
(defn main []
  "Run the single-process benchmark, then the multi-process one.

  Evaluates to whatever the `multi_process` call returns."
  (single_process)
  (setv outcome (multi_process))
  outcome)
;; Run the benchmarks only when this file is executed as a script, not when
;; it is imported as a module.
(when (= __name__ "__main__")
  (main))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
You may need to delete the first line of multi_vectorizer.py, i.e. the line that imports the hy module.