Created
April 28, 2017 19:43
-
-
Save valarauca/3bfe94ae3cca2c14ee2b82cdf70f26d2 to your computer and use it in GitHub Desktop.
Testing parsing Java definitions from Javadocs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| package main | |
| import ( | |
| "fmt" | |
| html "golang.org/x/net/html" | |
| atom "golang.org/x/net/html/atom" | |
| "net/http" | |
| "os" | |
| "regexp" | |
| "strings" | |
| ) | |
| // TableSearch is a parallel graph search algorithm | |
| // this (ab)uses the linked list structure of the | |
| // HTML Node to do a depth AND breadthfirst search | |
| // in parallel because go gives us parallel go readers for free | |
| func TableSearch(n *html.Node, item chan<- Defination) { | |
| if nil == n { | |
| return | |
| } | |
| if n.DataAtom == atom.Table { | |
| SplitSearch(n.FirstChild, item) | |
| } else { | |
| TableSearch(n.FirstChild, item) | |
| } | |
| TableSearch(n.NextSibling, item) | |
| } | |
| // Look for a constructor | |
| func SplitSearch(n *html.Node, item chan<- Defination) { | |
| if nil == n { | |
| return | |
| } | |
| if n.DataAtom == atom.Tr && len(n.Attr) == 1 && n.Attr[0].Key == "class" { | |
| if n.Attr[0].Val == "altColor" || n.Attr[0].Val == "rowColor" { | |
| LookForTD(n.FirstChild, item) | |
| } | |
| } | |
| SplitSearch(n.NextSibling, item) | |
| SplitSearch(n.FirstChild, item) | |
| } | |
| //MethoRetValue takes an HTML that contains the return type and gets that | |
| //return type | |
| func MethodRetValue(n *html.Node) (retval string, err bool) { | |
| if n.FirstChild != nil && n.FirstChild.DataAtom == atom.Code && len(n.FirstChild.Attr) > 0 { | |
| aRef := n.FirstChild | |
| a := aRef.Attr[0].Val | |
| b := strings.Replace(a, "../", "", -1) | |
| c := strings.Replace(b, "/", ".", -1) | |
| d := strings.Replace(c, ".html", "", -1) | |
| retval = d | |
| err = false | |
| } else { | |
| retval = n.Data | |
| err = false | |
| } | |
| return retval, err | |
| } | |
| //LookForTD searches for a constructor Table Divide value | |
| func LookForTD(n *html.Node, item chan<- Defination) { | |
| if nil == n { | |
| return | |
| } | |
| // General validate | |
| if n.DataAtom == atom.Td && len(n.Attr) == 1 && n.Attr[0].Key == "class" { | |
| // constructor specific path | |
| if n.Attr[0].Val == "colOne" { | |
| codeChild := n.FirstChild | |
| if codeChild == nil || codeChild.DataAtom != atom.Code { | |
| goto NOTCONSTRUCTOR | |
| } | |
| strongChild := codeChild.FirstChild | |
| if strongChild == nil || strongChild.DataAtom != atom.Strong { | |
| goto NOTCONSTRUCTOR | |
| } | |
| aChild := strongChild.FirstChild | |
| if aChild == nil || aChild.DataAtom != atom.A || len(aChild.Attr) == 0 { | |
| goto NOTCONSTRUCTOR | |
| } | |
| item <- CleanConstructor(aChild.Attr[0].Val) | |
| } | |
| NOTCONSTRUCTOR: | |
| // method specific path | |
| if n.Attr[0].Val == "colFirst" && n.NextSibling != nil { | |
| /* | |
| s := n.NextSibling | |
| if s == nil { | |
| fmt.Printf("I hate go\n") | |
| goto TERMINATION | |
| } | |
| if len(s.Attr) == 0 || s.Attr[0].Val != "colLast" { | |
| fmt.Printf("Bad colname\n") | |
| goto TERMINATION | |
| } | |
| */ | |
| childCode := n.FirstChild | |
| if nil == childCode { | |
| goto TERMINATION | |
| } | |
| retVal, err0 := MethodRetValue(childCode) | |
| if err0 { | |
| goto TERMINATION | |
| } | |
| /* | |
| thing, err1 := CleanMethod(childCode, retVal) | |
| if err1 { | |
| goto TERMINATION | |
| } | |
| */ | |
| fmt.Printf("RetVal %s\n", retVal) | |
| } | |
| } | |
| TERMINATION: | |
| LookForTD(n.NextSibling, item) | |
| } | |
// Defination describes a single constructor or method definition scraped
// from an online Javadoc page.
type Defination struct {
	// InteriorMut is set by CleanMethod when the text following the method's
	// code element contains "this". Constructor is true for values built by
	// CleanConstructor and false for values built by CleanMethod.
	InteriorMut, Constructor bool

	// Ret is the return type; for constructors it is the part of the link
	// target that precedes the member name. Symbol is the member name as
	// extracted from the link target.
	Ret, Symbol string

	// Args lists the parameter types taken from the link target.
	Args []string
}
| // CleanConstructor takes the raw HTML attribute strings and formats | |
| // it into a well fromed Defination type | |
| func CleanConstructor(arg string) Defination { | |
| var argbase string | |
| ret_type := regexp.MustCompile(`([^:]+):`) | |
| symbol := regexp.MustCompile(`:([^\(]+)\(`) | |
| get_args := regexp.MustCompile(`:[^\(]+\(([^\)]+)\)`) | |
| a := strings.Replace(arg, "%20", " ", -1) | |
| b := strings.Replace(a, "../", "", -1) | |
| c := strings.Replace(b, "/", ".", -1) | |
| d := strings.Replace(c, ".html#", ":", -1) | |
| arg0 := get_args.FindAllStringSubmatch(d, -1) | |
| if len(arg0) > 0 { | |
| if len(arg0[0]) == 2 { | |
| argbase = arg0[0][1] | |
| } else { | |
| argbase = "" | |
| } | |
| } else { | |
| argbase = "" | |
| } | |
| args := strings.Split(argbase, `,`) | |
| r := ret_type.FindAllStringSubmatch(d, -1)[0][1] | |
| s := symbol.FindAllStringSubmatch(d, -1)[0][1] | |
| return Defination{ | |
| InteriorMut: false, | |
| Constructor: true, | |
| Ret: r, | |
| Symbol: s, | |
| Args: args, | |
| } | |
| } | |
| //CleanMethod takes an HTML node and returns the defination of that node | |
| //This needs to have its return value overriden | |
| func CleanMethod(n *html.Node, ret string) (def Defination, err bool) { | |
| def = Defination{ | |
| InteriorMut: false, | |
| Constructor: false, | |
| Ret: ret, | |
| Symbol: "", | |
| Args: make([]string, 0), | |
| } | |
| codeNode := n.FirstChild | |
| if codeNode == nil || codeNode.DataAtom != atom.Code { | |
| return def, true | |
| } | |
| strongNode := codeNode.FirstChild | |
| if strongNode == nil || strongNode.DataAtom != atom.Strong { | |
| return def, true | |
| } | |
| aNode := strongNode.FirstChild | |
| if aNode == nil || aNode.DataAtom != atom.A || len(aNode.Attr) == 0 { | |
| return def, true | |
| } | |
| div := codeNode.NextSibling | |
| if div == nil { | |
| return def, true | |
| } | |
| a := strings.Replace(aNode.Attr[0].Val, "../", "", -1) | |
| b := strings.Replace(a, "%20", " ", -1) | |
| c := strings.Replace(b, ".html#", ":", -1) | |
| symbol := regexp.MustCompile(`([^\(]+)\(`) | |
| get_args := regexp.MustCompile(`:[^\(]+\(([^\)]+)\)`) | |
| argbase := get_args.FindAllStringSubmatch(c, -1)[0][1] | |
| def.Args = strings.Split(argbase, `,`) | |
| def.Symbol = symbol.FindAllStringSubmatch(c, -1)[0][1] | |
| def.InteriorMut = strings.Contains(div.Data, "this") | |
| return def, false | |
| } | |
| func main() { | |
| resp, err := http.Get("https://docs.oracle.com/javase/7/docs/api/java/util/Date.html") | |
| if nil != err { | |
| fmt.Printf("Error on fetch %s\n", err) | |
| os.Exit(1) | |
| } | |
| node, err := html.Parse(resp.Body) | |
| if nil != err { | |
| fmt.Printf("Could not parse document %s\n", err) | |
| os.Exit(1) | |
| } | |
| outchan := make(chan Defination) | |
| TableSearch(node, outchan) | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment