Created
September 13, 2017 11:49
-
-
Save jbaiter/0978b391f4a2254e676ee70fd98e3057 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"archive/tar" | |
"bytes" | |
"compress/gzip" | |
"encoding/json" | |
"errors" | |
"flag" | |
"fmt" | |
"github.com/beevik/etree" | |
"github.com/vanng822/go-solr/solr" | |
"gopkg.in/cheggaaa/pb.v1" | |
"io" | |
"io/ioutil" | |
"math" | |
"os" | |
"regexp" | |
"runtime" | |
"strconv" | |
"strings" | |
"sync" | |
) | |
// ParseTask is one unit of work handed from the archive reader to the
// indexer: the markup of a single page together with the identifiers taken
// from the page's file name.
type ParseTask struct {
	// zendId is the volume identifier captured by the first group of PAGE_PAT.
	zendId string
	// pageNo is the page number captured by the second group of PAGE_PAT.
	pageNo int
	// xmlData is the page markup as read from the archive entry.
	xmlData string
}
// OcrBox describes a single recognized word and where it sits on its page.
type OcrBox struct {
	// zendId identifies the volume the word belongs to.
	zendId string
	// pageNo is the page the word was found on; wordIdx is the position of
	// the word among the words kept for that page.
	pageNo, wordIdx int
	// text is the word itself.
	text string
	// offsetX and offsetY locate the upper left corner of the word, expressed
	// as a fraction of the page width and height respectively.
	offsetX, offsetY float32
	// width and height give the size of the word, again as a fraction of the
	// page width and height.
	width, height float32
}
// Patterns used to pick identifiers and hOCR properties out of file names and
// "title" attributes. They are compiled once at start-up.
var (
	// PAGE_PAT captures the volume identifier ("bsb" plus eight digits) and
	// the five-digit page number.
	PAGE_PAT = regexp.MustCompile(`(bsb\d{8})_(\d{5})`)
	// BBOX_PAT captures the four (possibly negative) integers of a "bbox"
	// property.
	BBOX_PAT = regexp.MustCompile(`bbox (-?\d+) (-?\d+) (-?\d+) (-?\d+)`)
	// LANG_PAT captures the lower-case value of a "lang" property.
	LANG_PAT = regexp.MustCompile(`lang ([a-z]+)`)
	// CONF_PAT captures the numeric value of an "x_wconf" property.
	CONF_PAT = regexp.MustCompile(`x_wconf ([0-9.]+)`)
)
func parseBoxes(xml string, zendId string, pageNo int) ([]OcrBox, error) { | |
doc := etree.NewDocument() | |
if err := doc.ReadFromString(xml); err != nil { | |
return nil, errors.New("Could not parse XML") | |
} | |
page := doc.FindElement(".//div[@class='ocr_page']") | |
if page == nil { | |
return nil, errors.New("Could not find page element") | |
} | |
pageCoords := BBOX_PAT.FindStringSubmatch(page.SelectAttrValue("title", "")) | |
if len(pageCoords) == 0 { | |
return nil, errors.New("Page has no bbox!") | |
} | |
pageWidth, _ := strconv.Atoi(pageCoords[3]) | |
pageHeight, _ := strconv.Atoi(pageCoords[4]) | |
words := doc.FindElements("//span[@class='ocrx_word']") | |
boxes := make([]OcrBox, 0, len(words)) | |
i := 0 | |
for _, word := range words { | |
title := word.SelectAttrValue("title", "") | |
coords := BBOX_PAT.FindStringSubmatch(title) | |
if len(coords) == 0 { | |
fmt.Println("WARNING:", zendId, pageNo, i, "Word has no bbox, skipping") | |
fmt.Println("DEBUG: Title was ", title) | |
continue | |
} | |
if word.Text() == "" { | |
continue | |
} | |
ulx, _ := strconv.Atoi(coords[1]) | |
uly, _ := strconv.Atoi(coords[2]) | |
lrx, _ := strconv.Atoi(coords[3]) | |
lry, _ := strconv.Atoi(coords[4]) | |
boxes = append(boxes, OcrBox{ | |
zendId: zendId, pageNo: pageNo, | |
offsetX: float32(math.Min(float64(ulx)/float64(pageWidth), 0.99)), | |
offsetY: float32(math.Min(float64(uly)/float64(pageHeight), 0.99)), | |
width: float32(math.Min(float64(lrx-ulx)/float64(pageWidth), 0.99)), | |
height: float32(math.Min(float64(lry-uly)/float64(pageHeight), 0.99)), | |
wordIdx: i, | |
text: strings.Replace(word.Text(), "|", "", -1), | |
}) | |
i += 1 | |
} | |
return boxes, nil | |
} | |
func readXmls(srcFile string, taskChan chan ParseTask) { | |
f, err := os.Open(srcFile) | |
if err != nil { | |
fmt.Println(srcFile) | |
panic(err) | |
} | |
defer f.Close() | |
gzf, err := gzip.NewReader(f) | |
if err != nil { | |
fmt.Println(srcFile) | |
panic(err) | |
} | |
tarReader := tar.NewReader(gzf) | |
for { | |
header, err := tarReader.Next() | |
if err == io.EOF { | |
break | |
} | |
if err != nil { | |
panic(err) | |
} | |
name := header.Name | |
if strings.HasSuffix(name, ".html") && !strings.HasSuffix(name, "index.html") { | |
parts := PAGE_PAT.FindStringSubmatch(name) | |
zendId := parts[1] | |
pageNo, _ := strconv.Atoi(parts[2]) | |
xmlBytes, _ := ioutil.ReadAll(tarReader) | |
xml := string(xmlBytes) | |
xml = strings.Replace(xml, "­", "-", -1) | |
taskChan <- ParseTask{xmlData: xml, zendId: zendId, pageNo: pageNo} | |
} | |
} | |
close(taskChan) | |
} | |
func makeSolrToken(box OcrBox) string { | |
if len(box.text) > 224 { | |
box.text = box.text[:224] | |
} | |
return fmt.Sprintf( | |
"%s|%05d%05d%05d%05d%05d%05d", | |
box.text, box.pageNo, box.wordIdx, | |
int(100000*box.offsetX), int(100000*box.offsetY), | |
int(100000*box.width), int(100000*box.height)) | |
} | |
func xmlReader(srcFile string) chan ParseTask { | |
c := make(chan ParseTask, 128) | |
go readXmls(srcFile, c) | |
return c | |
} | |
func indexBundle(srcFile string, solrUrl string, solrCollection string) { | |
si, _ := solr.NewSolrInterface(solrUrl, solrCollection) | |
currentBoxes := make([]OcrBox, 0) | |
for task := range xmlReader(srcFile) { | |
if len(currentBoxes) > 0 && task.zendId != currentBoxes[0].zendId { | |
var buffer bytes.Buffer | |
for _, box := range currentBoxes { | |
token := makeSolrToken(box) | |
buffer.WriteString(token) | |
buffer.WriteString(" ") | |
} | |
doc := solr.Document{ | |
"id": currentBoxes[0].zendId, | |
"ocr_text": buffer.String(), | |
} | |
res, err := si.Add([]solr.Document{doc}, 1, nil) | |
if err != nil { | |
panic(err) | |
} | |
if !res.Success { | |
fmt.Println("Could not index ", currentBoxes[0].zendId) | |
res.Result["doc"] = doc | |
marshalled, _ := json.MarshalIndent(res.Result, "", " ") | |
ioutil.WriteFile( | |
"errors/"+currentBoxes[0].zendId+".json", marshalled, 0644) | |
} | |
currentBoxes = make([]OcrBox, 0) | |
} | |
boxes, err := parseBoxes(task.xmlData, task.zendId, task.pageNo) | |
if err != nil { | |
fmt.Println("ERROR parseBoxes(", task.zendId, ".", task.pageNo, "): ", err) | |
} else { | |
currentBoxes = append(currentBoxes, boxes...) | |
} | |
} | |
si.Commit() | |
} | |
func main() { | |
flag.Parse() | |
bundles := flag.Args() | |
solrUrl := "http://localhost:8983/solr" | |
solrCollection := "fulltext" | |
inChan := make(chan string, len(bundles)) | |
var wg sync.WaitGroup | |
numWorkers := runtime.NumCPU() | |
progressChan := make(chan string, numWorkers) | |
wg.Add(numWorkers) | |
for i := 1; i <= numWorkers; i++ { | |
go func() { | |
for bundlePath := range inChan { | |
indexBundle(bundlePath, solrUrl, solrCollection) | |
progressChan <- bundlePath | |
} | |
wg.Done() | |
}() | |
} | |
for _, bundle := range bundles { | |
inChan <- bundle | |
} | |
close(inChan) | |
// Reporter goroutine | |
wg.Add(1) | |
go func() { | |
bar := pb.StartNew(len(bundles)) | |
bar.ShowTimeLeft = true | |
for i := 0; i < len(bundles); i++ { | |
<-progressChan | |
bar.Increment() | |
} | |
wg.Done() | |
}() | |
wg.Wait() | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment