Created
January 25, 2015 10:52
-
-
Save m2ym/a3363f585cd8ea0e33d3 to your computer and use it in GitHub Desktop.
Extract webpage metadata in OCaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
open Core.Std | |
let libguess_determine_encoding = | |
let open Ctypes in | |
Foreign.foreign "libguess_determine_encoding" (string @-> int @-> string @-> returning string) | |
type metadata = { | |
title : string option; | |
type_ : string option; | |
description : string option; | |
image : string option; | |
site_name : string option; | |
favicon : string option; | |
} | |
let extract docs = | |
let open Nethtml in | |
let title = ref None in | |
let type_ = ref None in | |
let description = ref None in | |
let image = ref None in | |
let site_name = ref None in | |
let favicon = ref None in | |
let rec loop = function | |
| Element ("title", _, children) -> | |
if Option.is_none !title then | |
title := Some (String.strip | |
(String.concat | |
(List.filter_map children | |
(function | |
| Data data -> Some data | |
| _ -> None)))) | |
| Element ("link", attrs, _) when | |
[%p? Some ("shortcut icon" | "icon")] <-- List.Assoc.find attrs "rel"; | |
Some href <-- List.Assoc.find attrs "href" -> | |
if Option.is_none !favicon then | |
favicon := Some href | |
| Element ("meta", attrs, _) when | |
(List.mem attrs ("itemprop", "title") || List.mem attrs ("property", "og:title")); | |
Some value <-- List.Assoc.find attrs "content" -> | |
if Option.is_none !title then | |
title := Some value | |
| Element ("meta", attrs, _) when | |
List.mem attrs ("property", "og:type"); | |
Some value <-- List.Assoc.find attrs "content" -> | |
type_ := Some value | |
| Element ("meta", attrs, _) when | |
(List.mem attrs ("itemprop", "description") || List.mem attrs ("property", "og:description")); | |
Some value <-- List.Assoc.find attrs "content" -> | |
description := Some value | |
| Element ("meta", attrs, _) when | |
(List.mem attrs ("itemprop", "image") || List.mem attrs ("property", "og:image")); | |
Some value <-- List.Assoc.find attrs "content" -> | |
image := Some value | |
| Element ("meta", attrs, _) when | |
List.mem attrs ("property", "og:site_name"); | |
Some value <-- List.Assoc.find attrs "content" -> | |
site_name := Some value | |
| Element (_, _, children) -> | |
List.iter children loop | |
| _ -> () | |
in | |
List.iter docs loop; | |
{ title = !title; | |
type_ = !type_; | |
description = !description; | |
image = !image; | |
site_name = !site_name; | |
favicon = !favicon } | |
let () = | |
Lwt_main.run begin Lwt.do_; | |
(resp, body) <-- Cohttp_lwt_unix.Client.get (Uri.of_string Sys.argv.(1)); | |
body <-- Cohttp_lwt_body.to_string body; | |
let charset = libguess_determine_encoding body (String.length body) "Japanese" in | |
let body = Encoding.recode_string ~src:charset ~dst:"UTF-8" body in | |
let docs = Nethtml.parse_document (Lexing.from_string body) in | |
let m = extract docs in | |
Lwt_io.printf "title: %s\n" (Option.value m.title ~default:""); | |
Lwt_io.printf "type: %s\n" (Option.value m.type_ ~default:""); | |
Lwt_io.printf "description: %s\n" (Option.value m.description ~default:""); | |
Lwt_io.printf "image: %s\n" (Option.value m.image ~default:""); | |
Lwt_io.printf "site_name: %s\n" (Option.value m.site_name ~default:""); | |
Lwt_io.printf "favicon: %s\n" (Option.value m.favicon ~default:"") | |
end |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
$ ocamlfind ocamlc -package core,ctypes.foreign,text,netstring,cohttp.lwt,ppx_monadic -linkpkg -thread -custom -cclib -lguess -o extract extract.ml | |
$ ./extract "https://github.com/ocaml/ocaml" | |
title: ocaml/ocaml · GitHub | |
type: object | |
description: ocaml - Read-only mirror of INRIA SVN | |
image: https://avatars3.githubusercontent.com/u/1841483?v=3&s=400 | |
site_name: GitHub | |
favicon: https://assets-cdn.github.com/favicon.ico |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment