A gist of a forecast parser for the NOAA Coastal Marine forecast txt files
package main

import (
	"bufio"
	"encoding/json"
	"errors"
	"fmt"
	"io"
	"net/http"
	"regexp"
	"strings"
)

// EOF is a sentinel rune for end-of-input checks (currently unused).
const (
	EOF = rune(0)
)

var (
	// markupBeginFormat matches the opening tags of a bolded section header.
	markupBeginFormat = regexp.MustCompile(`<B><FONT SIZE=\+1 COLOR="#[A-Z0-9]+">`)

	// extraBaseMarkup matches bare markup tags, used to check whether an
	// expected text-only ending has markup after its newline.
	extraBaseMarkup = regexp.MustCompile(`</?[A-Z]+>`)

	// extraLeveledMarkup matches markup tags that carry attributes.
	extraLeveledMarkup = regexp.MustCompile(`</?[A-Z]+ ([\w\W]+)?>+`)

	// timeFormat matches the date/time line format used in the forecasts.
	timeFormat = regexp.MustCompile(`^\d+.*\d+$`)
)
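// Illustrative matches for the patterns above (sample strings for clarity,
// not taken from the NOAA feed):
//
//	markupBeginFormat matches `<B><FONT SIZE=+1 COLOR="#0000FF">`
//	extraBaseMarkup matches `<B>` and `</PRE>`
//	extraLeveledMarkup matches `<FONT SIZE=+1 COLOR="#FF0000">`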
// MetaForcast holds the page-level document of a current forecast page.
type MetaForcast struct {
	Station       string     // station identifier, e.g. GMZ853
	Date          string     // date/time line taken from the page header
	Warning       []string   // warning lines found between sections
	Meta          []string   // station and Region/Area metadata
	Header        []string   // page header lines
	Forecasts     [][]string // each forecast as a list of its lines
	Descriptions  []string   // description lines preceding the station data
	Notifications []string   // single-line notices found between forecasts
}
// parseURL fetches the content at the URL and splits it into pages using
// parsePage.
func parseURL(url string) ([]string, error) {
	res, err := http.Get(url)
	if err != nil {
		return nil, err
	}
	defer res.Body.Close()
	if res.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("unexpected status %q for %s", res.Status, url)
	}
	return parsePage(res.Body)
}
// parsePage reads the input and splits it on blank lines (\n\n), returning
// the list of individual pages.
// Returns a non-nil error if the content reports a missing NOAA document.
func parsePage(r io.Reader) ([]string, error) {
	var pages []string
	var page []string
	reader := bufio.NewReader(r)
	for {
		item, err := reader.ReadString('\n')
		// The NOAA CGI reports a missing file in the body text rather than
		// through the HTTP status code.
		if strings.Contains(item, "does not exists") {
			return nil, errors.New("invalid NOAA document")
		}
		if err != nil {
			// Keep any trailing data that arrived without a final newline.
			if item != "" {
				page = append(page, item)
			}
			if len(page) > 0 {
				pages = append(pages, strings.Join(page, ""))
			}
			break
		}
		// A blank line ends the current page; consecutive blank lines are
		// skipped so they do not produce empty pages.
		if item == "\n" {
			if len(page) > 0 {
				pages = append(pages, strings.Join(page, ""))
				page = nil
			}
			continue
		}
		page = append(page, item)
	}
	return pages, nil
}
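// For example (illustrative input), parsePage on "FIRST\nPAGE\n\nSECOND PAGE\n"
// returns ["FIRST\nPAGE\n", "SECOND PAGE\n"].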
// cleanMarkup strips all HTML markup from the text, leaving only the text
// found between tags.
func cleanMarkup(text string) string {
	return extraLeveledMarkup.ReplaceAllString(extraBaseMarkup.ReplaceAllString(text, ""), "")
}
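// For example (illustrative input),
// cleanMarkup(`<B><FONT SIZE=+1 COLOR="#0000FF">TODAY</FONT></B>`)
// returns "TODAY".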
// cleanAllMarkup strips markup from every item in the slice, dropping items
// that become empty.
func cleanAllMarkup(text []string) []string {
	var clean []string
	for _, m := range text {
		if cm := cleanMarkup(m); cm != "" {
			clean = append(clean, cm)
		}
	}
	return clean
}
// toString returns a JSON string version of the value, optionally indented.
// It returns an empty string if marshalling fails.
func toString(value interface{}, indent bool) string {
	var data []byte
	var err error
	if indent {
		data, err = json.MarshalIndent(value, "", "\t")
	} else {
		data, err = json.Marshal(value)
	}
	if err != nil {
		return ""
	}
	return string(data)
}
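// For example, toString(map[string]int{"a": 1}, false) returns `{"a":1}`.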
// processPages groups the parsed pages for a station into a MetaForcast.
func processPages(station string, pages []string) (*MetaForcast, error) {
	if len(pages) == 0 {
		return nil, errors.New("no page data")
	}

	station = strings.ToUpper(station)

	var head, imeta, notification, warning, desc []string
	var forecasts [][]string
	var meta MetaForcast
	var dateTime string
	var hasMeta bool

	// Get the header of the current page.
	head = cleanAllMarkup(strings.Split(pages[0], "\n"))

	// Get the time and date from the header.
	for n, hl := range head {
		if timeFormat.MatchString(hl) {
			dateTime = hl
			// If content remains after the date line, the description was
			// probably merged into the header; cut it off and add it to
			// the description list.
			if left := n + 1; left < len(head) {
				desc = append(desc, head[left:]...)
				head = head[:left]
			}
			break
		}
	}

	// Collect all description headers before the station information.
	pages = pages[1:]
	for n, p := range pages {
		if strings.HasPrefix(p, station) {
			pages = pages[n:]
			break
		}
		desc = append(desc, p)
	}
	if len(pages) == 0 {
		return nil, errors.New("no station data")
	}

	// Collect meta information for the station and Region/Area.
	undmeta := strings.Split(pages[0], "\n")
	dmeta := cleanAllMarkup(undmeta)

	// Ensure we are still at the metadata and not already in a forecast.
	if !markupBeginFormat.MatchString(undmeta[0]) {
		hasMeta = true
		for n, p := range dmeta {
			if timeFormat.MatchString(p) {
				imeta = append(imeta, p)
				// EDGECASE
				// Anything left after the date line is a warning.
				if left := n + 1; left < len(dmeta) {
					warning = append(warning, dmeta[left:]...)
				}
				break
			}
			imeta = append(imeta, p)
		}
	}

	// If we consumed a metadata page, advance past it.
	if hasMeta {
		pages = pages[1:]
	}

	// EDGECASE
	// Check if we have warnings leaking through as extra lines.
	for n, p := range pages {
		parts := strings.Split(p, "\n")
		if len(parts) == 0 {
			continue
		}
		if len(parts) == 1 {
			if !markupBeginFormat.MatchString(parts[0]) {
				break
			}
			warning = append(warning, cleanAllMarkup(parts)...)
			continue
		}
		p1, p2 := parts[0], parts[1]
		// Two unmarked lines are treated as a notification.
		if !markupBeginFormat.MatchString(p1) && !markupBeginFormat.MatchString(p2) {
			notification = append(notification, cleanAllMarkup(parts)...)
			continue
		}
		// A marked header followed by plain text is the start of the
		// forecasts proper.
		if markupBeginFormat.MatchString(p1) && (!markupBeginFormat.MatchString(p2) && cleanMarkup(p2) != "") {
			pages = pages[n:]
			break
		}
		warning = append(warning, cleanAllMarkup(parts)...)
	}

	// No forecast pages left? Then we are done.
	if len(pages) > 0 {
		// We expect the forecast set to span more than two pages, so the
		// trailing pages, which have the footer details merged in with the
		// way the pages are usually structured, can be skimmed separately.
		expectedPos := len(pages) - 2

		// EDGECASE
		// If the count is negative, run through the forecasts as a single set.
		if expectedPos < 0 {
			for _, p := range pages {
				// Split the current forecast report into its components,
				// keeping only the first bolded header.
				report := strings.Split(p, "\n")
				// EDGECASE
				// A single line is probably a notification.
				if len(report) == 1 {
					notification = append(notification, cleanAllMarkup(report)...)
					continue
				}
				if !markupBeginFormat.MatchString(report[0]) {
					notification = append(notification, cleanAllMarkup(report)...)
					continue
				}
				detail := []string{cleanMarkup(report[0])}
				for _, d := range report[1:] {
					if extraBaseMarkup.MatchString(d) {
						break
					}
					detail = append(detail, d)
				}
				forecasts = append(forecasts, detail)
			}
		} else {
			// Collect all forecast descriptions except the last one, which
			// has the footer merged in.
			for _, p := range pages[:expectedPos] {
				// Split the current forecast report into its components.
				parts := strings.Split(p, "\n")
				// Skip zero-length parts.
				if len(parts) == 0 {
					continue
				}
				// EDGECASE
				// A single line is probably a notification.
				if len(parts) == 1 {
					notification = append(notification, cleanMarkup(p))
					continue
				}
				if !markupBeginFormat.MatchString(parts[0]) {
					notification = append(notification, cleanAllMarkup(parts)...)
					continue
				}
				forecasts = append(forecasts, cleanAllMarkup(parts))
			}

			// Take the last forecast report and split off the footer information.
			last := strings.Split(pages[len(pages)-1], "\n")
			if len(last) > 0 {
				// Keep the last forecast header, then collect text until the
				// first footer markup.
				lastReport := []string{cleanMarkup(last[0])}
				for _, sl := range last[1:] {
					if extraBaseMarkup.MatchString(sl) {
						break
					}
					lastReport = append(lastReport, cleanMarkup(sl))
				}
				forecasts = append(forecasts, lastReport)
			}
		}
	}

	meta.Station = station
	meta.Date = dateTime
	meta.Header = head
	meta.Meta = imeta
	meta.Warning = warning
	meta.Notifications = notification
	meta.Forecasts = forecasts
	meta.Descriptions = desc
	return &meta, nil
}
// GetForecast returns the forecast meta information for a specific page.
func GetForecast(station string, url string) (*MetaForcast, error) {
	content, err := parseURL(url)
	if err != nil {
		return nil, err
	}
	return processPages(station, content)
}
func main() {
	pages := []map[string]string{
		{
			"station": "GMZ853",
			"url":     "http://weather.noaa.gov/cgi-bin/fmtbltn.pl?file=forecasts/marine/coastal/GM/GMZ853.txt",
		},
		{
			"station": "amz115",
			"url":     "http://weather.noaa.gov/cgi-bin/fmtbltn.pl?file=forecasts/marine/offshore/am/amz115.txt",
		},
		{
			"station": "anz898",
			"url":     "http://weather.noaa.gov/cgi-bin/fmtbltn.pl?file=forecasts/marine/offshore/an/anz898.txt",
		},
		{
			"station": "gmz052",
			"url":     "http://weather.noaa.gov/cgi-bin/fmtbltn.pl?file=forecasts/marine/coastal/gm/gmz052.txt",
		},
		{
			"station": "pkz033",
			"url":     "http://weather.noaa.gov/cgi-bin/fmtbltn.pl?file=forecasts/marine/coastal/pk/pkz033.txt",
		},
		{
			"station": "pkz132",
			"url":     "http://weather.noaa.gov/cgi-bin/fmtbltn.pl?file=forecasts/marine/coastal/pk/pkz132.txt",
		},
		{
			"station": "pkz132",
			"url":     "http://weather.noaa.gov/cgi-bin/fmtbltn.pl?file=forecasts/marine/coastal/pk/pkz132.txt",
		},
		{
			"station": "pmz174",
			"url":     "http://weather.noaa.gov/cgi-bin/fmtbltn.pl?file=forecasts/marine/coastal/pm/pmz174.txt",
		},
		{
			"station": "pzz915",
			"url":     "http://weather.noaa.gov/cgi-bin/fmtbltn.pl?file=forecasts/marine/offshore/pz/PZZ915.txt",
		},
	}

	for _, page := range pages {
		fmt.Printf("\n----------------Station: %+q---------------------------\n", page["station"])
		f, err := GetForecast(page["station"], page["url"])
		if err != nil {
			fmt.Printf("%s", err)
		} else {
			fmt.Printf("%s", toString(f, true))
		}
		fmt.Printf("\n")
		fmt.Println("------------------------------------------------------")
	}
}
Expected Output
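The program prints one JSON document per station. The exact content depends on the live NOAA feed at run time; a MetaForcast marshals to JSON of roughly the shape below (all field values here are illustrative placeholders, not real forecast data):

	{
		"Station": "GMZ853",
		"Date": "1000 AM CST MON JAN 4 2016",
		"Warning": null,
		"Meta": ["..."],
		"Header": ["..."],
		"Forecasts": [["TODAY", "..."], ["TONIGHT", "..."]],
		"Descriptions": ["..."],
		"Notifications": null
	}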