Skip to content

Instantly share code, notes, and snippets.

<!-- guess what this Mediawiki translates into in HTML -->
Mixed markup redistribution <em>p1
p2</em>
use XML::Simple;
use File::Slurp;

# Slurp the whole document, then parse it into a nested hash structure.
my $foo = read_file "foo.xml";

# XMLin dies on malformed input; the eval traps that so the script can
# carry on, but the failure is reported instead of being silently dropped.
my $xml = eval { XMLin($foo); };
warn "could not parse foo.xml: $@" unless defined $xml;

# Print every key found under the top-level 'patch' entry.
for my $patch_name (keys %{ $xml->{patch} }) {
    print "saw: $patch_name \n";
}
# $xml structure returned looks like this:
-- ----------------------------------------------------------------------
-- Simplify wiki text
-- ----------------------------------------------------------------------
-- | Keep the leading elements of an article, stopping just before the first
-- header whose text is exactly \"See also\", \"External links\" or
-- \"References\".  The header level is ignored.
dropEndOfArticle = takeWhile isBody
  where
    -- True for everything except the headers that open the trailing sections.
    isBody block = case block of
      Header _ [Str "See", Space, Str "also"]       -> False
      Header _ [Str "External", Space, Str "links"] -> False
      Header _ [Str "References"]                   -> False
      _                                             -> True
-- | Re-tokenise a string: run 'parseChunks' over @str@ and join the text of
-- the resulting tokens with single spaces (via 'unwords').
--
-- NOTE(review): the @hill@ argument is not used in the lines visible here, and
-- only the @Word@ and @Desig@ cases of @render@ are shown; the definition may
-- continue beyond this excerpt -- confirm before relying on it.
tokenise :: CleanHillEntry -> String -> String
tokenise hill str = unwords (render toks)
where
-- A parse failure is treated as a programming error and aborts via 'error'.
toks = case parse parseChunks "" str of
Left e -> error $ "bug (chunking should always work): " ++ show e
Right cs -> cs
--
-- Turn the token list into a list of strings; both token kinds shown here
-- contribute their payload unchanged.
render [] = []
render (Word x : ts) = x : render ts
render (Desig x : ts) = x : render ts
diff -ru mediawiki-0.2.4/MediaWiki/Util/Fetch.hs /home/eykk10/tmp/haskell/mediawiki-0.2.4-kowey/MediaWiki/Util/Fetch.hs
--- mediawiki-0.2.4/MediaWiki/Util/Fetch.hs 2009-01-04 04:15:55.000000000 +0000
+++ /home/eykk10/tmp/haskell/mediawiki-0.2.4-kowey/MediaWiki/Util/Fetch.hs 2009-08-25 18:40:09.000000000 +0100
@@ -49,7 +49,7 @@
Just ur -> return (defaultGETRequest ur)
-- don't like doing this, but HTTP is awfully chatty re: cookie handling..
let nullHandler _ = return ()
-- | Process the input directory @d@ at the granularity selected by @unitType@.
--
-- Steps visible in this excerpt: collect the base names of the @.en@ files in
-- @d@, read the input for each of them, then compute frequencies over the
-- result.  Progress messages go to stderr.
--
-- NOTE(review): the do-block appears to continue past this excerpt (@freqs@ is
-- bound but not used in the lines shown).
go :: LlrType -> FilePath -> IO ()
go unitType d =
-- Only directory entries whose extension is ".en" are considered.
do enFiles <- filter (\f -> takeExtension f == ".en") `fmap` getDirectoryContents d
let bnames = map takeBaseName enFiles
hPutStrLn stderr "Reading input files... (step 1)"
-- Sentence-level reading yields a list per base name, flattened with 'concat';
-- document-level reading yields one result per base name.
pairs <- case unitType of
SentenceLevel -> concat `fmap` mapM (readSentencePairs d) bnames
DocumentLevel -> mapM (readDocPair d) bnames
-- 'seq' on @rnf pairs@ forces the input before the step-2 message is printed
-- (presumably Control.DeepSeq's 'rnf', i.e. full evaluation -- confirm).
rnf pairs `seq` hPutStrLn stderr "Computing frequencies... (step 2)"
let freqs = frequencies pairs
c381 6c 76 61 72 6f 20 64 65 Álvaro de
20 4d 65 6e 64 61 c3b1 61 20 Mendaña
64 65 20 4e 65 69 72 61 20 de Neira
6f 72 20 4e 65 79 72 61 20 or Neyra
28 31 35 34 32 20 2d 20 4f (1542 - O
63 74 6f 62 65 72 20 31 35 ctober 15
39 35 29 20 77 61 73 20 61 95) was a
20 53 70 61 6e 69 73 68 20 Spanish
6e 61 76 69 67 61 74 6f 72 navigator
2e 20 42 6f 72 6e 20 69 6e . Born in
#!/bin/bash
# Exactly five positional arguments are required; otherwise print the usage
# text to stderr and stop with a non-zero status.
if (( $# != 5 )); then
  {
    echo "Usage: $0 dir-in dir-out ext-in ext-out cmd"
    echo "Eg: $0 foo-d bar-d .xml .txt foo2text"
  } >&2
  exit 1
fi

# Input and output directories (first two arguments).
DIR_IN=$1
DIR_OUT=$2
# Attribution lines
color quoted green black # quoted text
color body brightred black "(^[> ]*)-([^-])" # what I really want to do is match such lines
# but only colour the -/+ symbol
# was hoping mutt by convention ignores groups 1 & 3
# but I can live with the line being red
color body brightgreen black "(^[> ]*)[+]" # brightgreen seems really garish for the whole
# line, just the plus please
color header white red "^X-GTD:" # for GTD next-actions
set display_filter="list-actions"
# use the /tmp/gtd folder to automatically retrieve messages with an action/WF
macro index \Cr "!email-get-msgids mairix^Mc/tmp/gtd^M"
folder-hook /tmp/gtd 'set index_format="%Z %{%b %d} %-25.25L %-35.35i %s"'