Kintaro · July 8, 2013 05:25
diff --git a/gistfile1.hs b/gistfile1.hs
 {-# LANGUAGE DeriveDataTypeable #-}

 import System.IO 
 import System.Environment
 import Control.Monad
 import Control.Monad.Maybe
 import Control.Monad.IO.Class
 import Control.Applicative
 import Text.Regex.Posix
 import qualified Data.Text as T
 import Data.List (sortBy, sort, intersperse)
 import Network.HTTP
 import Network.URI
 import Text.JSON
 import Text.JSON.Generic
 import Data.Maybe
 import Codec.Binary.UTF8.String

 data Sentence = Sentence String [String] deriving Show
 data WkJson = WkJson { requested_information :: RequestedInformation } deriving (Eq, Show, Data, Typeable)
 data RequestedInformation = RequestedInformation { general :: [Entry] } deriving (Eq, Show, Data, Typeable)
 data Entry = Entry { character :: String } deriving (Eq, Show, Data, Typeable)

 parseContent :: String -> String
 parseContent x = T.unpack $ (T.split (\a -> a == ';') $ T.pack x) !! 0

 parseTokenLine :: String -> [String]
 parseTokenLine x = filter (\y -> y /= []) $ map filterKanji $ map T.unpack $ T.split (\y -> y == ',') $ (T.split (\a -> a == ';') $ T.pack x) !! 1

 parseSentence :: String -> Sentence
 parseSentence x = Sentence (parseContent x) (parseTokenLine x)

 parseSentences :: [String] -> [Sentence]
 parseSentences xs = map parseSentence xs

 isHiragana :: Char -> Bool
 isHiragana x = x `elem` ['あ'..'ん']

 isKatakana :: Char -> Bool
 isKatakana x = x `elem` ['ァ'..'ヿ']

 filterKanji :: String -> String
 filterKanji x = filter (\x -> not $ isHiragana x || isKatakana x || x == ',') x

 matchWords' :: [String] -> [String] -> Int -> Bool
 matchWords' tokens dictionary i = i == (length $ filter not $ map (flip elem dictionary) tokens)

 loadSentences :: IO [Sentence]
 loadSentences = do
  content <- lines <$> readFile "japanese_processed.dat"
  return $ parseSentences content

 unpackSentence :: Sentence -> String
 unpackSentence (Sentence x _) = x

 unpackTokens :: Sentence -> [String]
 unpackTokens (Sentence _ x) = x

 getList :: WkJson -> [String]
 getList (WkJson (RequestedInformation a)) = map (\(Entry x) -> x) a

 getVocabulary :: String -> IO ([String])
 getVocabulary key = do
  putStrLn "Fetching data from API..."
  let url = "http://www.wanikani.com/api/user/" ++ key ++ "/vocabulary"
  response <- simpleHTTP $ getRequest url
  json <- getResponseBody response
  putStrLn "Parsing data..."
  let parsed = decodeJSON $ decodeString json :: WkJson
  putStrLn "Extracting data..."
  let list = reverse $ sort $ map filterKanji $ getList parsed
  putStrLn $ "Loaded " ++ (show $ length list) ++ " items..."
  return list

 main = do
  hSetEncoding stdout utf8
  key <- (!! 0) <$> getArgs
  output <- (!! 1) <$> getArgs
  i <- (!! 2) <$> getArgs
  let amount = read i :: Int
  dictionary <- getVocabulary key
  sentences <- loadSentences
  let jap_sentences = map unpackSentence sentences
  let sentence_tokens = map unpackTokens sentences
  let sentencesZipped = zip sentence_tokens jap_sentences
  let matchings = filter (\(x, y) -> matchWords' x dictionary amount) sentencesZipped
  let result = map (\(x, y) -> y) matchings
  putStrLn $ unlines result
  writeFile output $ unlines result
diff --git a/gistfile2.cpp b/gistfile2.cpp
 #include <iostream>
 #include <cabocha.h>
 #include <fstream>

 int main (int argc, char **argv) {
  CaboCha::Parser *parser = CaboCha::createParser(argc, argv);

  std::ifstream infile("japanese.dat");
  std::ofstream outfile("japanese_processed.dat");

  std::string line;
  while (std::getline(infile, line))
  {
    const CaboCha::Tree *tree = parser->parse(line.c_str());
    outfile << line << ";";
    for (size_t i = 0; i < tree->token_size(); ++i) {
      const CaboCha::Token *token = tree->token(i);
      outfile << token->surface << ",";
    }
    outfile << std::endl;
  }

  return 0;
 }
	{-# LANGUAGE DeriveDataTypeable #-}

	import System.IO
	import System.Environment
	import Control.Monad
	import Control.Monad.Maybe
	import Control.Monad.IO.Class
	import Control.Applicative
	import Text.Regex.Posix
	import qualified Data.Text as T
	import Data.List (sortBy, sort, intersperse)
	import Network.HTTP
	import Network.URI
	import Text.JSON
	import Text.JSON.Generic
	import Data.Maybe
	import Codec.Binary.UTF8.String

	data Sentence = Sentence String [String] deriving Show
	data WkJson = WkJson { requested_information :: RequestedInformation } deriving (Eq, Show, Data, Typeable)
	data RequestedInformation = RequestedInformation { general :: [Entry] } deriving (Eq, Show, Data, Typeable)
	data Entry = Entry { character :: String } deriving (Eq, Show, Data, Typeable)

	parseContent :: String -> String
	parseContent x = T.unpack $ (T.split (\a -> a == ';') $ T.pack x) !! 0

	parseTokenLine :: String -> [String]
	parseTokenLine x = filter (\y -> y /= []) $ map filterKanji $ map T.unpack $ T.split (\y -> y == ',') $ (T.split (\a -> a == ';') $ T.pack x) !! 1

	parseSentence :: String -> Sentence
	parseSentence x = Sentence (parseContent x) (parseTokenLine x)

	parseSentences :: [String] -> [Sentence]
	parseSentences xs = map parseSentence xs

	isHiragana :: Char -> Bool
	isHiragana x = x `elem` ['あ'..'ん']

	isKatakana :: Char -> Bool
	isKatakana x = x `elem` ['ァ'..'ヿ']

	filterKanji :: String -> String
	filterKanji x = filter (\x -> not $ isHiragana x \|\| isKatakana x \|\| x == ',') x

	matchWords' :: [String] -> [String] -> Int -> Bool
	matchWords' tokens dictionary i = i == (length $ filter not $ map (flip elem dictionary) tokens)

	loadSentences :: IO [Sentence]
	loadSentences = do
	content <- lines <$> readFile "japanese_processed.dat"
	return $ parseSentences content

	unpackSentence :: Sentence -> String
	unpackSentence (Sentence x _) = x

	unpackTokens :: Sentence -> [String]
	unpackTokens (Sentence _ x) = x

	getList :: WkJson -> [String]
	getList (WkJson (RequestedInformation a)) = map (\(Entry x) -> x) a

	getVocabulary :: String -> IO ([String])
	getVocabulary key = do
	putStrLn "Fetching data from API..."
	let url = "http://www.wanikani.com/api/user/" ++ key ++ "/vocabulary"
	response <- simpleHTTP $ getRequest url
	json <- getResponseBody response
	putStrLn "Parsing data..."
	let parsed = decodeJSON $ decodeString json :: WkJson
	putStrLn "Extracting data..."
	let list = reverse $ sort $ map filterKanji $ getList parsed
	putStrLn $ "Loaded " ++ (show $ length list) ++ " items..."
	return list

	main = do
	hSetEncoding stdout utf8
	key <- (!! 0) <$> getArgs
	output <- (!! 1) <$> getArgs
	i <- (!! 2) <$> getArgs
	let amount = read i :: Int
	dictionary <- getVocabulary key
	sentences <- loadSentences
	let jap_sentences = map unpackSentence sentences
	let sentence_tokens = map unpackTokens sentences
	let sentencesZipped = zip sentence_tokens jap_sentences
	let matchings = filter (\(x, y) -> matchWords' x dictionary amount) sentencesZipped
	let result = map (\(x, y) -> y) matchings
	putStrLn $ unlines result
	writeFile output $ unlines result
	#include <iostream>
	#include <cabocha.h>
	#include <fstream>

	int main (int argc, char **argv) {
	CaboCha::Parser *parser = CaboCha::createParser(argc, argv);

	std::ifstream infile("japanese.dat");
	std::ofstream outfile("japanese_processed.dat");

	std::string line;
	while (std::getline(infile, line))
	{
	const CaboCha::Tree *tree = parser->parse(line.c_str());
	outfile << line << ";";
	for (size_t i = 0; i < tree->token_size(); ++i) {
	const CaboCha::Token *token = tree->token(i);
	outfile << token->surface << ",";
	}
	outfile << std::endl;
	}

	return 0;
	}