Created
July 8, 2013 05:25
-
-
Save Kintaro/5946390 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{-# LANGUAGE DeriveDataTypeable #-} | |
import System.IO | |
import System.Environment | |
import Control.Monad | |
import Control.Monad.Maybe | |
import Control.Monad.IO.Class | |
import Control.Applicative | |
import Text.Regex.Posix | |
import qualified Data.Text as T | |
import Data.List (sortBy, sort, intersperse) | |
import Network.HTTP | |
import Network.URI | |
import Text.JSON | |
import Text.JSON.Generic | |
import Data.Maybe | |
import Codec.Binary.UTF8.String | |
data Sentence = Sentence String [String] deriving Show | |
data WkJson = WkJson { requested_information :: RequestedInformation } deriving (Eq, Show, Data, Typeable) | |
data RequestedInformation = RequestedInformation { general :: [Entry] } deriving (Eq, Show, Data, Typeable) | |
data Entry = Entry { character :: String } deriving (Eq, Show, Data, Typeable) | |
parseContent :: String -> String | |
parseContent x = T.unpack $ (T.split (\a -> a == ';') $ T.pack x) !! 0 | |
parseTokenLine :: String -> [String] | |
parseTokenLine x = filter (\y -> y /= []) $ map filterKanji $ map T.unpack $ T.split (\y -> y == ',') $ (T.split (\a -> a == ';') $ T.pack x) !! 1 | |
parseSentence :: String -> Sentence | |
parseSentence x = Sentence (parseContent x) (parseTokenLine x) | |
parseSentences :: [String] -> [Sentence] | |
parseSentences xs = map parseSentence xs | |
isHiragana :: Char -> Bool | |
isHiragana x = x `elem` ['あ'..'ん'] | |
isKatakana :: Char -> Bool | |
isKatakana x = x `elem` ['ァ'..'ヿ'] | |
filterKanji :: String -> String | |
filterKanji x = filter (\x -> not $ isHiragana x || isKatakana x || x == ',') x | |
matchWords' :: [String] -> [String] -> Int -> Bool | |
matchWords' tokens dictionary i = i == (length $ filter not $ map (flip elem dictionary) tokens) | |
loadSentences :: IO [Sentence] | |
loadSentences = do | |
content <- lines <$> readFile "japanese_processed.dat" | |
return $ parseSentences content | |
unpackSentence :: Sentence -> String | |
unpackSentence (Sentence x _) = x | |
unpackTokens :: Sentence -> [String] | |
unpackTokens (Sentence _ x) = x | |
getList :: WkJson -> [String] | |
getList (WkJson (RequestedInformation a)) = map (\(Entry x) -> x) a | |
getVocabulary :: String -> IO ([String]) | |
getVocabulary key = do | |
putStrLn "Fetching data from API..." | |
let url = "http://www.wanikani.com/api/user/" ++ key ++ "/vocabulary" | |
response <- simpleHTTP $ getRequest url | |
json <- getResponseBody response | |
putStrLn "Parsing data..." | |
let parsed = decodeJSON $ decodeString json :: WkJson | |
putStrLn "Extracting data..." | |
let list = reverse $ sort $ map filterKanji $ getList parsed | |
putStrLn $ "Loaded " ++ (show $ length list) ++ " items..." | |
return list | |
main = do | |
hSetEncoding stdout utf8 | |
key <- (!! 0) <$> getArgs | |
output <- (!! 1) <$> getArgs | |
i <- (!! 2) <$> getArgs | |
let amount = read i :: Int | |
dictionary <- getVocabulary key | |
sentences <- loadSentences | |
let jap_sentences = map unpackSentence sentences | |
let sentence_tokens = map unpackTokens sentences | |
let sentencesZipped = zip sentence_tokens jap_sentences | |
let matchings = filter (\(x, y) -> matchWords' x dictionary amount) sentencesZipped | |
let result = map (\(x, y) -> y) matchings | |
putStrLn $ unlines result | |
writeFile output $ unlines result |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <iostream> | |
#include <cabocha.h> | |
#include <fstream> | |
int main (int argc, char **argv) { | |
CaboCha::Parser *parser = CaboCha::createParser(argc, argv); | |
std::ifstream infile("japanese.dat"); | |
std::ofstream outfile("japanese_processed.dat"); | |
std::string line; | |
while (std::getline(infile, line)) | |
{ | |
const CaboCha::Tree *tree = parser->parse(line.c_str()); | |
outfile << line << ";"; | |
for (size_t i = 0; i < tree->token_size(); ++i) { | |
const CaboCha::Token *token = tree->token(i); | |
outfile << token->surface << ","; | |
} | |
outfile << std::endl; | |
} | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment