Skip to content

Instantly share code, notes, and snippets.

@valarauca
Created April 28, 2017 19:43
Show Gist options
  • Select an option

  • Save valarauca/3bfe94ae3cca2c14ee2b82cdf70f26d2 to your computer and use it in GitHub Desktop.

Select an option

Save valarauca/3bfe94ae3cca2c14ee2b82cdf70f26d2 to your computer and use it in GitHub Desktop.
Testing parsing Java definitions from Javadocs
package main
import (
"fmt"
html "golang.org/x/net/html"
atom "golang.org/x/net/html/atom"
"net/http"
"os"
"regexp"
"strings"
)
// TableSearch walks n and each of its following siblings looking for
// <table> elements. The children of every table found are handed to
// SplitSearch; any other node is descended into with a recursive call.
// The walk itself is sequential: no goroutines are started here, and item
// is only passed along to the helpers that eventually send on it.
func TableSearch(n *html.Node, item chan<- Defination) {
	for cur := n; cur != nil; cur = cur.NextSibling {
		if cur.DataAtom != atom.Table {
			// Not a table: keep looking further down the tree.
			TableSearch(cur.FirstChild, item)
			continue
		}
		SplitSearch(cur.FirstChild, item)
	}
}
// SplitSearch visits n and every node reachable from it through
// NextSibling and FirstChild links. Each <tr> whose single attribute is
// class="altColor" or class="rowColor" has its children passed to
// LookForTD. For any given node the sibling chain is explored before the
// node's own children.
func SplitSearch(n *html.Node, item chan<- Defination) {
	if n == nil {
		return
	}
	classedRow := n.DataAtom == atom.Tr && len(n.Attr) == 1 && n.Attr[0].Key == "class"
	if classedRow {
		switch n.Attr[0].Val {
		case "altColor", "rowColor":
			LookForTD(n.FirstChild, item)
		}
	}
	SplitSearch(n.NextSibling, item)
	SplitSearch(n.FirstChild, item)
}
// MethodRetValue derives a return-type string from the HTML node n.
//
// When n's first child is a <code> element carrying at least one
// attribute, the value of that first attribute is rewritten by removing
// every "../", turning each "/" into "." and dropping every ".html".
// Otherwise n.Data is returned untouched.
//
// The err result is always false; it is kept so callers can treat the
// function like the other (value, err bool) helpers in this file.
func MethodRetValue(n *html.Node) (retval string, err bool) {
	first := n.FirstChild
	if first == nil || first.DataAtom != atom.Code || len(first.Attr) == 0 {
		return n.Data, false
	}
	retval = first.Attr[0].Val
	// The substitutions are applied one after another, in this order.
	for _, sub := range [][2]string{
		{"../", ""},
		{"/", "."},
		{".html", ""},
	} {
		retval = strings.Replace(retval, sub[0], sub[1], -1)
	}
	return retval, false
}
// LookForTD scans n and all of its following siblings for <td> cells
// whose single attribute is a class.
//
//   - class="colOne": if the cell contains <code><strong><a ...> as a chain
//     of first children and the anchor has at least one attribute, the
//     anchor's first attribute value is run through CleanConstructor and the
//     resulting Defination is sent on item.
//   - class="colFirst" with a following sibling: the cell's first child is
//     passed to MethodRetValue and the result is printed. Nothing is sent on
//     item for this path.
//
// Cells that match neither shape are skipped.
func LookForTD(n *html.Node, item chan<- Defination) {
	for td := n; td != nil; td = td.NextSibling {
		if td.DataAtom != atom.Td || len(td.Attr) != 1 || td.Attr[0].Key != "class" {
			continue
		}
		switch td.Attr[0].Val {
		case "colOne":
			// Constructor path: td > code > strong > a.
			code := td.FirstChild
			if code == nil || code.DataAtom != atom.Code {
				continue
			}
			strong := code.FirstChild
			if strong == nil || strong.DataAtom != atom.Strong {
				continue
			}
			anchor := strong.FirstChild
			if anchor == nil || anchor.DataAtom != atom.A || len(anchor.Attr) == 0 {
				continue
			}
			item <- CleanConstructor(anchor.Attr[0].Val)
		case "colFirst":
			// Method path: only considered when another cell follows.
			if td.NextSibling == nil {
				continue
			}
			inner := td.FirstChild
			if inner == nil {
				continue
			}
			ret, failed := MethodRetValue(inner)
			if failed {
				continue
			}
			fmt.Printf("RetVal %s\n", ret)
		}
	}
}
// Defination describes a single function definition scraped from an
// online Javadoc page.
type Defination struct {
	// InteriorMut is left false for constructors; CleanMethod sets it when
	// the text next to the method link contains "this".
	// Constructor is true for values produced by CleanConstructor.
	InteriorMut, Constructor bool
	// Ret is the return type (for constructors, the text before the ':'
	// in the cleaned link). Symbol is the name of the function.
	Ret, Symbol string
	// Args holds the comma-separated argument types taken from the link.
	Args []string
}
// Patterns applied by CleanConstructor to a link that has been normalised
// to the form "pkg.Class:Name(args)". They are compiled once at package
// scope instead of on every call.
var (
	ctorRetTypeRe = regexp.MustCompile(`([^:]+):`)
	ctorSymbolRe  = regexp.MustCompile(`:([^\(]+)\(`)
	ctorArgsRe    = regexp.MustCompile(`:[^\(]+\(([^\)]+)\)`)
)

// CleanConstructor takes the raw HTML attribute string of a constructor
// link and formats it into a well-formed Defination.
//
// The link is normalised by turning "%20" into a space, removing "../",
// replacing "/" with "." and replacing ".html#" with ":". From the result:
//
//   - Ret is the text before the first ':',
//   - Symbol is the text between that ':' and the following '(',
//   - Args are the comma-separated entries between the parentheses, each
//     trimmed of surrounding whitespace.
//
// A part that cannot be found is left at its zero value rather than
// causing a panic; in particular a constructor without arguments yields an
// empty Args slice. Constructor is always true and InteriorMut always
// false in the returned value.
func CleanConstructor(arg string) Defination {
	link := strings.Replace(arg, "%20", " ", -1)
	link = strings.Replace(link, "../", "", -1)
	link = strings.Replace(link, "/", ".", -1)
	link = strings.Replace(link, ".html#", ":", -1)

	def := Defination{Constructor: true}
	if m := ctorRetTypeRe.FindStringSubmatch(link); m != nil {
		def.Ret = m[1]
	}
	if m := ctorSymbolRe.FindStringSubmatch(link); m != nil {
		def.Symbol = m[1]
	}
	if m := ctorArgsRe.FindStringSubmatch(link); m != nil {
		parts := strings.Split(m[1], ",")
		def.Args = make([]string, 0, len(parts))
		for _, p := range parts {
			// "%20" became a space above, so entries after the first
			// would otherwise carry a leading blank.
			def.Args = append(def.Args, strings.TrimSpace(p))
		}
	}
	return def
}
// Patterns applied by CleanMethod to a cleaned method link. They are
// compiled once at package scope instead of on every call.
var (
	methodSymbolRe = regexp.MustCompile(`([^\(]+)\(`)
	methodArgsRe   = regexp.MustCompile(`:[^\(]+\(([^\)]+)\)`)
)

// CleanMethod takes an HTML node and returns the Defination described by
// the method link inside it. The return type is not read from the node;
// the caller supplies it through ret.
//
// n is expected to contain <code><strong><a ...> as a chain of first
// children, with the anchor carrying at least one attribute and the <code>
// element having a following sibling. If any of that is missing, or no
// symbol can be extracted from the link, err is true and def holds only
// the defaults (Ret set, empty Args).
//
// On success Symbol is the text before the first '(' of the cleaned link,
// Args are the whitespace-trimmed, comma-separated entries between the
// parentheses (empty for a method without arguments), and InteriorMut
// reports whether the Data of the node following <code> contains "this".
func CleanMethod(n *html.Node, ret string) (def Defination, err bool) {
	def = Defination{
		Ret:  ret,
		Args: make([]string, 0),
	}
	codeNode := n.FirstChild
	if codeNode == nil || codeNode.DataAtom != atom.Code {
		return def, true
	}
	strongNode := codeNode.FirstChild
	if strongNode == nil || strongNode.DataAtom != atom.Strong {
		return def, true
	}
	aNode := strongNode.FirstChild
	if aNode == nil || aNode.DataAtom != atom.A || len(aNode.Attr) == 0 {
		return def, true
	}
	div := codeNode.NextSibling
	if div == nil {
		return def, true
	}

	link := strings.Replace(aNode.Attr[0].Val, "../", "", -1)
	link = strings.Replace(link, "%20", " ", -1)
	link = strings.Replace(link, ".html#", ":", -1)

	// NOTE(review): the symbol pattern is not anchored on ':' and "/" is
	// not rewritten here, so Symbol keeps everything in front of the '('
	// (path and class included) — confirm whether that is intended.
	sym := methodSymbolRe.FindStringSubmatch(link)
	if sym == nil {
		return def, true
	}
	def.Symbol = sym[1]

	// The argument pattern needs at least one character between the
	// parentheses, so a method without arguments simply does not match
	// and Args stays empty.
	if m := methodArgsRe.FindStringSubmatch(link); m != nil {
		for _, p := range strings.Split(m[1], ",") {
			def.Args = append(def.Args, strings.TrimSpace(p))
		}
	}
	def.InteriorMut = strings.Contains(div.Data, "this")
	return def, false
}
// main downloads the Javadoc page for java.util.Date, parses it and prints
// every Defination the table walk produces.
func main() {
	resp, err := http.Get("https://docs.oracle.com/javase/7/docs/api/java/util/Date.html")
	if nil != err {
		fmt.Printf("Error on fetch %s\n", err)
		os.Exit(1)
	}
	if resp.StatusCode != http.StatusOK {
		resp.Body.Close()
		fmt.Printf("Unexpected HTTP status %s\n", resp.Status)
		os.Exit(1)
	}
	node, err := html.Parse(resp.Body)
	// Closed explicitly rather than deferred: os.Exit below would skip a
	// deferred call.
	resp.Body.Close()
	if nil != err {
		fmt.Printf("Could not parse document %s\n", err)
		os.Exit(1)
	}

	// outchan is unbuffered, so the walk has to run in its own goroutine
	// while main receives; otherwise the first send would block forever.
	outchan := make(chan Defination)
	go func() {
		defer close(outchan)
		TableSearch(node, outchan)
	}()
	for def := range outchan {
		fmt.Printf("%+v\n", def)
	}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment