Created
March 19, 2014 16:56
-
-
Save glorphindale/9646119 to your computer and use it in GitHub Desktop.
Преобразования данных
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(ns enlive-examples.process | |
(:require [clojure.string :as string] | |
[cheshire.core :as chesh] | |
[clojure.pprint :as pp])) | |
;; Raw attendee records, read once from the JSON dump when the namespace loads.
;; The code below reads each record positionally: first = full name,
;; second = company, third = position.
(def data
  (-> "codefest-2013-raw.json"
      slurp
      (chesh/parse-string true)))
;; Sex (guessed from the first name)
(defn person->name
  "Returns the lower-cased first word of the record's first field.

  The first field is split on a single space into at most two parts and
  only the part before that space is kept."
  [person]
  (let [full-name (first person)
        [given]   (string/split full-name #" " 2)]
    (string/lower-case given)))
;; Distinct lower-cased first names across all records.
(def all-names
  (->> data
       (map person->name)
       set))

;; REPL check: how many distinct first names there are.
(count all-names)
(defn name->sex
  "Guesses a sex marker for a lower-cased first name.

  Returns the pair [name marker] where marker is \"m\", \"f\" or \"?\".
  An explicit list of male names is consulted first (several of them end in
  a letter that would otherwise be classified as female); after that the
  decision is made purely on the last character of the name. An empty name
  has no last character and therefore yields \"?\"."
  [name]
  (let [male-exceptions #{"данила" "тема" "илья" "тёма" "гриша" "никита" "юра" "nikita" "саша" "женя"}
        female-endings  #{\a \а \я}
        male-endings    #{\н \й \р \r \n \л \с \в \п \д \x \w \m \s \y \l \м \т \ь \о \г \б \к \i \d \e \k}
        ending          (last name)
        marker          (cond
                          (contains? male-exceptions name)  "m"
                          (contains? female-endings ending) "f"
                          (contains? male-endings ending)   "m"
                          :else                             "?")]
    [name marker]))
(defn person->sex
  "Returns the [first-name marker] pair for a record by extracting its first
  name with person->name and classifying it with name->sex."
  [person]
  (name->sex (person->name person)))
;; REPL check: how many distinct names fall under each sex marker.
(->> all-names
     (map (comp second name->sex))
     frequencies)
;; Positions (job titles reduced to a few categories)
(defn person->position
  "Returns the lower-cased third field (the position) of a record.

  A nil/false position yields the empty string. A record with fewer than
  three fields is treated the same way instead of throwing
  IndexOutOfBoundsException, so incomplete records do not abort the whole
  run."
  [person]
  ;; The three-argument nth returns the supplied default (nil) when the
  ;; index is out of range; `or` then substitutes the empty string.
  (string/lower-case (or (nth person 2 nil) "")))
;; Distinct lower-cased positions across all records.
(def all-positions
  (->> data
       (map person->position)
       set))

;; REPL check: how many distinct positions there are.
(count all-positions)
(defn string-contains?
  "Returns true when the string s contains at least one of the strings in
  vars as a substring, otherwise nil.

  s    - the string to search in
  vars - a collection of candidate substrings

  Stops at the first match. The type hints let the compiler resolve
  String.contains directly instead of using reflection on every call."
  [^String s vars]
  (some (fn [^String candidate] (.contains s candidate)) vars))
(defn simplify-position
  "Maps a lower-cased position string onto a coarse category.

  Returns the pair [position category]. The rules are tried from top to
  bottom and the first rule with a keyword occurring as a substring of the
  position wins, so the order of the rules matters (e.g. a title matching
  both a management and a developer keyword is classified as \"mgmt\").
  When no rule matches the category is \"na\"."
  [position]
  (let [rules [[#{"hr" "персонал" "алексей сухоруков" "людям"} "hr"]
               [#{"дизайнер" "ui" "интерф" "designer" "ux"} "designer"]
               [#{"director" "директор" "manager" "начальник" "pm" "cio"
                  "leader" "менеджер" "руководит" "lead" "лидер" "cto" "boss"
                  "рук." "владелец" "ceo" "главный" "лид" "chief" "пм" "mgr"
                  "управля" "гендир" "vp" "соучре" "управл" "coo" "партнер" "head"} "mgmt"]
               [#{"qa" "тестирован" "качеств" "test" "тестировщик" "поняша"
                  "тестер" "sdet"} "qa"]
               [#{"аналитик" "архи" "architect"} "analysis"]
               [#{"developer" "разработчик" "программист" "програмист"
                  "engineer" "rnd" "инженер" "program"
                  "android" "java" "scala" "javascript" "sde"} "developer"]
               [#{"админ" "admin" "devops"} "admin"]
               [#{"студент"} "student"]]
        ;; First rule whose keyword set matches, or nil when none does.
        category (some (fn [[keywords label]]
                         (when (string-contains? position keywords)
                           label))
                       rules)]
    [position (or category "na")]))
;; REPL check: number of distinct positions per category.
(->> all-positions
     (map simplify-position)
     (map second)
     frequencies)

;; REPL check: frequencies of the full [position category] pairs.
(->> all-positions
     (map simplify-position)
     frequencies)

;; REPL check: positions that no rule recognised.
(->> all-positions
     (map simplify-position)
     (filter (fn [[_ category]] (= category "na"))))
(defn person->simple-position
  "Returns the [position category] pair for a record by extracting its
  position with person->position and classifying it with simplify-position."
  [person]
  (simplify-position (person->position person)))
;; Companies (binned by number of attendees)
;; Second field of every record (the company), one entry per record,
;; duplicates kept so that they can be counted later.
(def all-companies
  (for [person data]
    (second person)))
(defn simplify-company
  "Collapses known alternative spellings of a company name onto one
  canonical spelling; any other value (including nil) is returned unchanged.

  The lookup is an exact, case-sensitive match."
  [company]
  ;; NOTE(review): the last key is capitalised while the others are
  ;; lower-case, and nothing visible here lower-cases company names -
  ;; confirm the keys match the spelling used in the raw data.
  (let [aliases {"2gis"                   "2гис"
                 "ооо «компания холидей»" "ооо \"компания холидей\""
                 "playtox llc"            "playtox"
                 "новео"                  "noveo"
                 "Кадровое Агентство Алексея Сухорукова" "Alexey Suhorukov's Recruitment Agency"}]
    (get aliases company company)))
(defn bin-company
  "Turns a [company count] pair into a one-entry map {company size-label}.

  Labels: \"1\", \"2-5\", \"6-10\", \"11-15\", \"16-20\", \"21-25\", \"26-50\"
  and \"50+\" (the last one is used for counts of 51 and above). A count
  that is not positive yields nil."
  [[c f]]
  (when (pos? f)
    (let [;; Exclusive upper bound of each bin, in ascending order.
          bins  [[2 "1"] [6 "2-5"] [11 "6-10"] [16 "11-15"]
                 [21 "16-20"] [26 "21-25"] [51 "26-50"]]
          label (or (some (fn [[upper text]] (when (< f upper) text)) bins)
                    "50+")]
      {c label})))
;; Map from canonical company name to its size label, built by counting how
;; many records mention each company and binning the counts.
(def company-freqs
  (let [counts (frequencies (map simplify-company all-companies))]
    (apply merge (map bin-company counts))))
(defn person->company-size
  "Returns the size label of the record's company: the second field is
  canonicalised with simplify-company and looked up in company-freqs."
  [person]
  (let [company (simplify-company (second person))]
    (company-freqs company)))
;; Bring it all together: classify every record and write the aggregated counts
(defn transform-person
  "Reduces a record to the triple [sex-marker company-size position-category]."
  [person]
  (let [[_ sex]      (person->sex person)
        size         (person->company-size person)
        [_ category] (person->simple-position person)]
    [sex size category]))
;; One [sex company-size position-category] triple per record.
(def transformed-data
  (for [person data]
    (transform-person person)))

;; How many records share each distinct triple.
(def freqs
  (frequencies transformed-data))
(defn finalize
  "Converts one entry of the frequency map, [[sex company position] amount],
  into a string-keyed map with the keys \"sex\", \"company\", \"position\"
  and \"amount\"."
  [[[sex company position] v]]
  (array-map "sex" sex
             "company" company
             "position" position
             "amount" v))
;; Final payload: a JavaScript assignment statement whose right-hand side is
;; the JSON array of aggregated rows.
(def result
  (let [rows (map finalize freqs)
        json (chesh/generate-string rows)]
    (str "var raw_data =" json ";")))

;; Write the payload to disk when the namespace is loaded.
(spit "codefest-2013.json" result)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment