Skip to content

Instantly share code, notes, and snippets.

@glena
Created March 16, 2015 17:43
Show Gist options
  • Save glena/e622b30fc3af24ffb3ba to your computer and use it in GitHub Desktop.
Save glena/e622b30fc3af24ffb3ba to your computer and use it in GitHub Desktop.
package lib
import (
"bytes"
"io"
"strings"
"errors"
"syscall"
"unicode/utf8"
)
// CharsetISO88591er decodes an ISO-8859-1 (Latin-1) byte stream into UTF-8.
// Decoded output is consumed one byte at a time through ReadByte.
type CharsetISO88591er struct {
	r   io.ByteReader // source of raw ISO-8859-1 bytes
	buf *bytes.Buffer // UTF-8 bytes of the current non-ASCII character not yet returned
}

// iso88591ByteSource adapts a plain io.Reader to io.ByteReader by reading
// exactly one byte per call. It is unbuffered on purpose: it never consumes
// more of the underlying reader than the caller asked for.
type iso88591ByteSource struct {
	r   io.Reader
	one [1]byte
}

// ReadByte returns the next byte of the wrapped reader, or the reader's error
// (io.EOF at end of input) when no byte could be read.
func (s *iso88591ByteSource) ReadByte() (byte, error) {
	if _, err := io.ReadFull(s.r, s.one[:]); err != nil {
		return 0, err
	}
	return s.one[0], nil
}

// NewCharsetISO88591 returns a decoder that reads ISO-8859-1 bytes from r.
//
// If r already implements io.ByteReader it is used directly; otherwise it is
// wrapped in a one-byte-at-a-time adapter. (Previously a reader without
// ReadByte made the unchecked type assertion panic.)
func NewCharsetISO88591(r io.Reader) *CharsetISO88591er {
	src, ok := r.(io.ByteReader)
	if !ok {
		src = &iso88591ByteSource{r: r}
	}
	return &CharsetISO88591er{
		r: src,
		// A single rune never needs more than utf8.UTFMax bytes.
		buf: bytes.NewBuffer(make([]byte, 0, utf8.UTFMax)),
	}
}

// ReadByte returns the next byte of the UTF-8 encoding of the input.
//
// ISO-8859-1 code points map one-to-one onto Unicode code points U+0000..U+00FF
// (http://unicode.org/Public/MAPPINGS/ISO8859/8859-1.TXT), so bytes below
// utf8.RuneSelf pass through unchanged and every other byte is expanded to its
// multi-byte UTF-8 form, which is handed out across successive calls.
// Errors from the underlying reader are returned unchanged.
func (cs *CharsetISO88591er) ReadByte() (b byte, err error) {
	// Finish handing out a partially returned multi-byte sequence first.
	if cs.buf.Len() > 0 {
		return cs.buf.ReadByte()
	}

	c, err := cs.r.ReadByte()
	if err != nil {
		return 0, err
	}
	if c < utf8.RuneSelf {
		return c, nil
	}

	// Non-ASCII: encode the code point and return its first byte now.
	cs.buf.WriteRune(rune(c))
	return cs.buf.ReadByte()
}

// Read is not supported: it always returns 0 and syscall.EINVAL.
// Callers must consume the decoded stream through ReadByte.
func (cs *CharsetISO88591er) Read(p []byte) (int, error) {
	return 0, syscall.EINVAL
}
// isCharset reports whether charset equals any entry of names, ignoring
// letter case. An empty or nil names list never matches.
func isCharset(charset string, names []string) bool {
	for _, name := range names {
		// EqualFold compares case-insensitively without allocating lowered
		// copies of both strings on every iteration.
		if strings.EqualFold(charset, name) {
			return true
		}
	}
	return false
}
// IsCharsetISO88591 reports whether charset names the ISO-8859-1 (Latin-1)
// character set. The accepted names are the IANA registered name and aliases
// (http://www.iana.org/assignments/character-sets, last updated 2010-11-04);
// matching is delegated to isCharset.
func IsCharsetISO88591(charset string) bool {
	return isCharset(charset, []string{
		"ISO_8859-1:1987", // registered name
		"ISO-8859-1",      // alias, preferred MIME name
		// remaining aliases
		"iso-ir-100",
		"ISO_8859-1",
		"latin1",
		"l1",
		"IBM819",
		"CP819",
		"csISOLatin1",
	})
}
// IsCharsetUTF8 reports whether charset names UTF-8. The empty string is
// accepted as well, so an unspecified charset is treated as UTF-8.
// Matching is delegated to isCharset.
func IsCharsetUTF8(charset string) bool {
	return isCharset(charset, []string{
		"UTF-8",
		"", // default when no charset is given
	})
}
// CharsetReader returns a reader that yields the contents of input as UTF-8.
//
// UTF-8 input (including an empty charset) is returned unchanged, ISO-8859-1
// input is wrapped in a CharsetISO88591er, and any other charset yields a nil
// reader and an error naming the charset.
func CharsetReader(charset string, input io.Reader) (io.Reader, error) {
	if IsCharsetUTF8(charset) {
		// Already UTF-8: no conversion needed.
		return input, nil
	}
	if IsCharsetISO88591(charset) {
		return NewCharsetISO88591(input), nil
	}
	return nil, errors.New("CharsetReader: unexpected charset: " + charset)
}
package main
import (
"fmt"
rss "github.com/jteeuwen/go-pkg-rss"
"os"
"./lib"
)
/*
http://www.infobae.com/rss
http://www.pagina12.com.ar/usuarios/rss.php
http://stackoverflow.com/questions/6002619/unmarshal-an-iso-8859-1-xml-input-in-go
*/
// pendingItems carries feed items from itemHandler to the printing loop in
// main. main allocates it (buffered, capacity 50) before any feed is pulled.
var pendingItems chan *rss.Item
// main starts fetching the configured feeds and prints every item received on
// pendingItems (title, links, categories). The loop runs until pendingItems
// is closed.
func main() {
	fmt.Println("Start")

	// Commented-out entries are additional feeds that are currently disabled.
	feeds := []string{
		//"http://cdn01.ib.infobae.com/adjuntos/162/rss/politica.xml",
		"http://cdn01.ib.infobae.com/adjuntos/162/rss/Infobae.xml",
		//"http://cdn01.ib.infobae.com/adjuntos/162/rss/economia.xml",
		//"http://cdn01.ib.infobae.com/adjuntos/162/rss/sociedad.xml",
		//"http://cdn01.ib.infobae.com/adjuntos/162/rss/finanzas.xml",
		//"http://cdn01.ib.infobae.com/adjuntos/162/rss/policiales.xml",
		"http://www.pagina12.com.ar/diario/rss/principal.xml",
		//"http://www.pagina12.com.ar/diario/rss/ultimas_noticias.xml",
	}

	// The channel must exist before the handlers can start sending into it.
	pendingItems = make(chan *rss.Item, 50)
	PullFeeds(feeds)

	for item := range pendingItems {
		fmt.Println("\t", item.Title)

		fmt.Println("\t", "Links")
		for _, link := range item.Links {
			fmt.Println("\t\t", link.Href)
		}

		fmt.Println("\t", "Categories")
		for _, category := range item.Categories {
			fmt.Println("\t\t", category.Domain, category.Text)
		}
	}
}
// PullFeeds starts one background fetch per URL in feeds and returns
// immediately. Once every fetch has finished it closes pendingItems, so a
// consumer ranging over that channel terminates instead of blocking forever
// (which the runtime reports as a deadlock once all fetch goroutines exit).
//
// pendingItems must have been created before PullFeeds is called, and
// PullFeeds must be called at most once per channel, because closing a
// channel twice panics.
func PullFeeds(feeds []string) {
	total := len(feeds)
	done := make(chan struct{})

	for _, uri := range feeds {
		// One Feed per URL: the fetches run concurrently, and a shared Feed
		// would be mutated from several goroutines at once.
		// NOTE(review): go-pkg-rss appears to keep the URL and the
		// cache-timeout state on the Feed itself, so a shared Feed could also
		// skip every fetch after the first — confirm against the library.
		feed := rss.New(5, true, chanHandler, itemHandler)
		go func(feed *rss.Feed, uri string) {
			Pull(feed, uri)
			done <- struct{}{}
		}(feed, uri)
	}

	// Close the item channel only after all fetches have reported completion.
	// Assumes the handlers run synchronously inside Fetch, so nothing sends
	// on pendingItems after Pull returns — TODO confirm.
	go func() {
		for i := 0; i < total; i++ {
			<-done
		}
		close(pendingItems)
	}()
}
// Pull fetches uri through feed, decoding the document with
// lib.CharsetReader. A fetch error is reported once on standard error as
// "[e] <uri>: <error>" followed by a newline; Pull itself returns nothing.
func Pull(feed *rss.Feed, uri string) {
	if err := feed.Fetch(uri, lib.CharsetReader); err != nil {
		// Previously the error was printed twice: to stderr without a
		// trailing newline and again to stdout.
		fmt.Fprintf(os.Stderr, "[e] %s: %s\n", uri, err)
	}
}
// chanHandler is the channel callback passed to rss.New. It prints how many
// new channels were reported for the feed's URL.
func chanHandler(feed *rss.Feed, newchannels []*rss.Channel) {
	count := len(newchannels)
	fmt.Printf("%d new channel(s) in %s\n", count, feed.Url)
}
// itemHandler is the item callback passed to rss.New. It prints how many new
// items were reported for the feed's URL and forwards each one to
// pendingItems, blocking while the channel's buffer is full.
func itemHandler(feed *rss.Feed, ch *rss.Channel, newitems []*rss.Item) {
	count := len(newitems)
	fmt.Printf("%d new item(s) in %s\n", count, feed.Url)
	for _, item := range newitems {
		pendingItems <- item
	}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment