A gist of a forecast parser for the NOAA Coastal Marine forecast txt files
package main

import (
	"bufio"
	"encoding/json"
	"errors"
	"fmt"
	"io"
	"net/http"
	"regexp"
	"strings"
)

// EOF is a sentinel rune for end-of-input checks (currently unused).
const (
	EOF = rune(0)
)

var (
	// markupBeginFormat matches the opening tags of a bolded section header.
	markupBeginFormat = regexp.MustCompile(`<B><FONT SIZE=\+1 COLOR="#[A-Z0-9]+">`)

	// extraBaseMarkup matches bare markup tags, used to check whether an
	// expected text-only ending has markup after its newline.
	extraBaseMarkup = regexp.MustCompile(`</?[A-Z]+>`)

	// extraLeveledMarkup matches markup tags that carry attributes.
	extraLeveledMarkup = regexp.MustCompile(`</?[A-Z]+ ([\w\W]+)?>+`)

	// timeFormat matches the date/time line format used in the forecasts.
	timeFormat = regexp.MustCompile(`^\d+.*\d+$`)
)
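// Illustrative matches for the patterns above (sample strings for clarity,
// not taken from the NOAA feed):
//
//	markupBeginFormat matches `<B><FONT SIZE=+1 COLOR="#0000FF">`
//	extraBaseMarkup matches `<B>` and `</PRE>`
//	extraLeveledMarkup matches `<FONT SIZE=+1 COLOR="#FF0000">`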
// MetaForcast holds the page-level document of a current forecast page.
type MetaForcast struct {
	Station       string     // station identifier, e.g. GMZ853
	Date          string     // date/time line taken from the page header
	Warning       []string   // warning lines found between sections
	Meta          []string   // station and Region/Area metadata
	Header        []string   // page header lines
	Forecasts     [][]string // each forecast as a list of its lines
	Descriptions  []string   // description lines preceding the station data
	Notifications []string   // single-line notices found between forecasts
}
// parseURL fetches the content at the URL and splits it into pages using
// parsePage.
func parseURL(url string) ([]string, error) {
	res, err := http.Get(url)
	if err != nil {
		return nil, err
	}
	defer res.Body.Close()
	if res.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("unexpected status %q for %s", res.Status, url)
	}
	return parsePage(res.Body)
}
// parsePage reads the input and splits it on blank lines (\n\n), returning
// the list of individual pages.
// Returns a non-nil error if the content reports a missing NOAA document.
func parsePage(r io.Reader) ([]string, error) {
	var pages []string
	var page []string
	reader := bufio.NewReader(r)
	for {
		item, err := reader.ReadString('\n')
		// The NOAA CGI reports a missing file in the body text rather than
		// through the HTTP status code.
		if strings.Contains(item, "does not exists") {
			return nil, errors.New("invalid NOAA document")
		}
		if err != nil {
			// Keep any trailing data that arrived without a final newline.
			if item != "" {
				page = append(page, item)
			}
			if len(page) > 0 {
				pages = append(pages, strings.Join(page, ""))
			}
			break
		}
		// A blank line ends the current page; consecutive blank lines are
		// skipped so they do not produce empty pages.
		if item == "\n" {
			if len(page) > 0 {
				pages = append(pages, strings.Join(page, ""))
				page = nil
			}
			continue
		}
		page = append(page, item)
	}
	return pages, nil
}
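// For example (illustrative input), parsePage on "FIRST\nPAGE\n\nSECOND PAGE\n"
// returns ["FIRST\nPAGE\n", "SECOND PAGE\n"].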
// cleanMarkup strips all HTML markup from the text, leaving only the text
// found between tags.
func cleanMarkup(text string) string {
	return extraLeveledMarkup.ReplaceAllString(extraBaseMarkup.ReplaceAllString(text, ""), "")
}
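// For example (illustrative input),
// cleanMarkup(`<B><FONT SIZE=+1 COLOR="#0000FF">TODAY</FONT></B>`)
// returns "TODAY".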
// cleanAllMarkup strips markup from every item in the slice, dropping items
// that become empty.
func cleanAllMarkup(text []string) []string {
	var clean []string
	for _, m := range text {
		if cm := cleanMarkup(m); cm != "" {
			clean = append(clean, cm)
		}
	}
	return clean
}
// toString returns a JSON string version of the value, optionally indented.
// It returns an empty string if marshalling fails.
func toString(value interface{}, indent bool) string {
	var data []byte
	var err error
	if indent {
		data, err = json.MarshalIndent(value, "", "\t")
	} else {
		data, err = json.Marshal(value)
	}
	if err != nil {
		return ""
	}
	return string(data)
}
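// For example, toString(map[string]int{"a": 1}, false) returns `{"a":1}`.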
// processPages groups the parsed pages for a station into a MetaForcast.
func processPages(station string, pages []string) (*MetaForcast, error) {
	if len(pages) == 0 {
		return nil, errors.New("no page data")
	}

	station = strings.ToUpper(station)

	var head, imeta, notification, warning, desc []string
	var forecasts [][]string
	var meta MetaForcast
	var dateTime string
	var hasMeta bool

	// Get the header of the current page.
	head = cleanAllMarkup(strings.Split(pages[0], "\n"))

	// Get the time and date from the header.
	for n, hl := range head {
		if timeFormat.MatchString(hl) {
			dateTime = hl
			// If content remains after the date line, the description was
			// probably merged into the header; cut it off and add it to
			// the description list.
			if left := n + 1; left < len(head) {
				desc = append(desc, head[left:]...)
				head = head[:left]
			}
			break
		}
	}

	// Collect all description headers before the station information.
	pages = pages[1:]
	for n, p := range pages {
		if strings.HasPrefix(p, station) {
			pages = pages[n:]
			break
		}
		desc = append(desc, p)
	}
	if len(pages) == 0 {
		return nil, errors.New("no station data")
	}

	// Collect meta information for the station and Region/Area.
	undmeta := strings.Split(pages[0], "\n")
	dmeta := cleanAllMarkup(undmeta)

	// Ensure we are still at the metadata and not already in a forecast.
	if !markupBeginFormat.MatchString(undmeta[0]) {
		hasMeta = true
		for n, p := range dmeta {
			if timeFormat.MatchString(p) {
				imeta = append(imeta, p)
				// EDGECASE
				// Anything left after the date line is a warning.
				if left := n + 1; left < len(dmeta) {
					warning = append(warning, dmeta[left:]...)
				}
				break
			}
			imeta = append(imeta, p)
		}
	}

	// If we consumed a metadata page, advance past it.
	if hasMeta {
		pages = pages[1:]
	}

	// EDGECASE
	// Check if we have warnings leaking through as extra lines.
	for n, p := range pages {
		parts := strings.Split(p, "\n")
		if len(parts) == 0 {
			continue
		}
		if len(parts) == 1 {
			if !markupBeginFormat.MatchString(parts[0]) {
				break
			}
			warning = append(warning, cleanAllMarkup(parts)...)
			continue
		}
		p1, p2 := parts[0], parts[1]
		// Two unmarked lines are treated as a notification.
		if !markupBeginFormat.MatchString(p1) && !markupBeginFormat.MatchString(p2) {
			notification = append(notification, cleanAllMarkup(parts)...)
			continue
		}
		// A marked header followed by plain text is the start of the
		// forecasts proper.
		if markupBeginFormat.MatchString(p1) && (!markupBeginFormat.MatchString(p2) && cleanMarkup(p2) != "") {
			pages = pages[n:]
			break
		}
		warning = append(warning, cleanAllMarkup(parts)...)
	}

	// No forecast pages left? Then we are done.
	if len(pages) > 0 {
		// We expect the forecast set to span more than two pages, so the
		// trailing pages, which have the footer details merged in with the
		// way the pages are usually structured, can be skimmed separately.
		expectedPos := len(pages) - 2

		// EDGECASE
		// If the count is negative, run through the forecasts as a single set.
		if expectedPos < 0 {
			for _, p := range pages {
				// Split the current forecast report into its components,
				// keeping only the first bolded header.
				report := strings.Split(p, "\n")
				// EDGECASE
				// A single line is probably a notification.
				if len(report) == 1 {
					notification = append(notification, cleanAllMarkup(report)...)
					continue
				}
				if !markupBeginFormat.MatchString(report[0]) {
					notification = append(notification, cleanAllMarkup(report)...)
					continue
				}
				detail := []string{cleanMarkup(report[0])}
				for _, d := range report[1:] {
					if extraBaseMarkup.MatchString(d) {
						break
					}
					detail = append(detail, d)
				}
				forecasts = append(forecasts, detail)
			}
		} else {
			// Collect all forecast descriptions except the last one, which
			// has the footer merged in.
			for _, p := range pages[:expectedPos] {
				// Split the current forecast report into its components.
				parts := strings.Split(p, "\n")
				// Skip zero-length parts.
				if len(parts) == 0 {
					continue
				}
				// EDGECASE
				// A single line is probably a notification.
				if len(parts) == 1 {
					notification = append(notification, cleanMarkup(p))
					continue
				}
				if !markupBeginFormat.MatchString(parts[0]) {
					notification = append(notification, cleanAllMarkup(parts)...)
					continue
				}
				forecasts = append(forecasts, cleanAllMarkup(parts))
			}

			// Take the last forecast report and split off the footer information.
			last := strings.Split(pages[len(pages)-1], "\n")
			if len(last) > 0 {
				// Keep the last forecast header, then collect text until the
				// first footer markup.
				lastReport := []string{cleanMarkup(last[0])}
				for _, sl := range last[1:] {
					if extraBaseMarkup.MatchString(sl) {
						break
					}
					lastReport = append(lastReport, cleanMarkup(sl))
				}
				forecasts = append(forecasts, lastReport)
			}
		}
	}

	meta.Station = station
	meta.Date = dateTime
	meta.Header = head
	meta.Meta = imeta
	meta.Warning = warning
	meta.Notifications = notification
	meta.Forecasts = forecasts
	meta.Descriptions = desc
	return &meta, nil
}
// GetForecast returns the forecast meta information for a specific page.
func GetForecast(station string, url string) (*MetaForcast, error) {
	content, err := parseURL(url)
	if err != nil {
		return nil, err
	}
	return processPages(station, content)
}
func main() {
	pages := []map[string]string{
		{
			"station": "GMZ853",
			"url":     "http://weather.noaa.gov/cgi-bin/fmtbltn.pl?file=forecasts/marine/coastal/GM/GMZ853.txt",
		},
		{
			"station": "amz115",
			"url":     "http://weather.noaa.gov/cgi-bin/fmtbltn.pl?file=forecasts/marine/offshore/am/amz115.txt",
		},
		{
			"station": "anz898",
			"url":     "http://weather.noaa.gov/cgi-bin/fmtbltn.pl?file=forecasts/marine/offshore/an/anz898.txt",
		},
		{
			"station": "gmz052",
			"url":     "http://weather.noaa.gov/cgi-bin/fmtbltn.pl?file=forecasts/marine/coastal/gm/gmz052.txt",
		},
		{
			"station": "pkz033",
			"url":     "http://weather.noaa.gov/cgi-bin/fmtbltn.pl?file=forecasts/marine/coastal/pk/pkz033.txt",
		},
		{
			"station": "pkz132",
			"url":     "http://weather.noaa.gov/cgi-bin/fmtbltn.pl?file=forecasts/marine/coastal/pk/pkz132.txt",
		},
		{
			"station": "pkz132",
			"url":     "http://weather.noaa.gov/cgi-bin/fmtbltn.pl?file=forecasts/marine/coastal/pk/pkz132.txt",
		},
		{
			"station": "pmz174",
			"url":     "http://weather.noaa.gov/cgi-bin/fmtbltn.pl?file=forecasts/marine/coastal/pm/pmz174.txt",
		},
		{
			"station": "pzz915",
			"url":     "http://weather.noaa.gov/cgi-bin/fmtbltn.pl?file=forecasts/marine/offshore/pz/PZZ915.txt",
		},
	}

	for _, page := range pages {
		fmt.Printf("\n----------------Station: %+q---------------------------\n", page["station"])
		f, err := GetForecast(page["station"], page["url"])
		if err != nil {
			fmt.Printf("%s", err)
		} else {
			fmt.Printf("%s", toString(f, true))
		}
		fmt.Printf("\n")
		fmt.Println("------------------------------------------------------")
	}
}
Expected Output
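The program prints one JSON document per station. The exact content depends on the live NOAA feed at run time; a MetaForcast marshals to JSON of roughly the shape below (all field values here are illustrative placeholders, not real forecast data):

	{
		"Station": "GMZ853",
		"Date": "1000 AM CST MON JAN 4 2016",
		"Warning": null,
		"Meta": ["..."],
		"Header": ["..."],
		"Forecasts": [["TODAY", "..."], ["TONIGHT", "..."]],
		"Descriptions": ["..."],
		"Notifications": null
	}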