Skip to content

Instantly share code, notes, and snippets.

@Kintaro
Created July 8, 2013 05:25
Show Gist options
  • Save Kintaro/5946390 to your computer and use it in GitHub Desktop.
Save Kintaro/5946390 to your computer and use it in GitHub Desktop.
{-# LANGUAGE DeriveDataTypeable #-}
import System.IO
import System.Environment
import Control.Monad
import Control.Monad.Maybe
import Control.Monad.IO.Class
import Control.Applicative
import Text.Regex.Posix
import qualified Data.Text as T
import Data.List (sortBy, sort, intersperse)
import Network.HTTP
import Network.URI
import Text.JSON
import Text.JSON.Generic
import Data.Maybe
import Codec.Binary.UTF8.String
data Sentence = Sentence String [String] deriving Show
data WkJson = WkJson { requested_information :: RequestedInformation } deriving (Eq, Show, Data, Typeable)
data RequestedInformation = RequestedInformation { general :: [Entry] } deriving (Eq, Show, Data, Typeable)
data Entry = Entry { character :: String } deriving (Eq, Show, Data, Typeable)
parseContent :: String -> String
parseContent x = T.unpack $ (T.split (\a -> a == ';') $ T.pack x) !! 0
parseTokenLine :: String -> [String]
parseTokenLine x = filter (\y -> y /= []) $ map filterKanji $ map T.unpack $ T.split (\y -> y == ',') $ (T.split (\a -> a == ';') $ T.pack x) !! 1
parseSentence :: String -> Sentence
parseSentence x = Sentence (parseContent x) (parseTokenLine x)
parseSentences :: [String] -> [Sentence]
parseSentences xs = map parseSentence xs
isHiragana :: Char -> Bool
isHiragana x = x `elem` ['あ'..'ん']
isKatakana :: Char -> Bool
isKatakana x = x `elem` ['ァ'..'ヿ']
filterKanji :: String -> String
filterKanji x = filter (\x -> not $ isHiragana x || isKatakana x || x == ',') x
matchWords' :: [String] -> [String] -> Int -> Bool
matchWords' tokens dictionary i = i == (length $ filter not $ map (flip elem dictionary) tokens)
loadSentences :: IO [Sentence]
loadSentences = do
content <- lines <$> readFile "japanese_processed.dat"
return $ parseSentences content
unpackSentence :: Sentence -> String
unpackSentence (Sentence x _) = x
unpackTokens :: Sentence -> [String]
unpackTokens (Sentence _ x) = x
getList :: WkJson -> [String]
getList (WkJson (RequestedInformation a)) = map (\(Entry x) -> x) a
getVocabulary :: String -> IO ([String])
getVocabulary key = do
putStrLn "Fetching data from API..."
let url = "http://www.wanikani.com/api/user/" ++ key ++ "/vocabulary"
response <- simpleHTTP $ getRequest url
json <- getResponseBody response
putStrLn "Parsing data..."
let parsed = decodeJSON $ decodeString json :: WkJson
putStrLn "Extracting data..."
let list = reverse $ sort $ map filterKanji $ getList parsed
putStrLn $ "Loaded " ++ (show $ length list) ++ " items..."
return list
main = do
hSetEncoding stdout utf8
key <- (!! 0) <$> getArgs
output <- (!! 1) <$> getArgs
i <- (!! 2) <$> getArgs
let amount = read i :: Int
dictionary <- getVocabulary key
sentences <- loadSentences
let jap_sentences = map unpackSentence sentences
let sentence_tokens = map unpackTokens sentences
let sentencesZipped = zip sentence_tokens jap_sentences
let matchings = filter (\(x, y) -> matchWords' x dictionary amount) sentencesZipped
let result = map (\(x, y) -> y) matchings
putStrLn $ unlines result
writeFile output $ unlines result
#include <iostream>
#include <cabocha.h>
#include <fstream>
int main (int argc, char **argv) {
CaboCha::Parser *parser = CaboCha::createParser(argc, argv);
std::ifstream infile("japanese.dat");
std::ofstream outfile("japanese_processed.dat");
std::string line;
while (std::getline(infile, line))
{
const CaboCha::Tree *tree = parser->parse(line.c_str());
outfile << line << ";";
for (size_t i = 0; i < tree->token_size(); ++i) {
const CaboCha::Token *token = tree->token(i);
outfile << token->surface << ",";
}
outfile << std::endl;
}
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment