Skip to content

Instantly share code, notes, and snippets.

@t-kashima
Created September 11, 2016 15:27
Show Gist options
  • Save t-kashima/38d09b1851461a283b344827c55efd25 to your computer and use it in GitHub Desktop.
Save t-kashima/38d09b1851461a283b344827c55efd25 to your computer and use it in GitHub Desktop.
package main
import (
	"fmt"
	"log"
	"regexp"
	"strings"

	"github.com/PuerkitoBio/goquery"
)
// url is the entry page passed to Pref.Get from main: the Japanese Wikipedia
// article whose percent-encoded path decodes to "日本の高等学校一覧".
const url = "https://ja.wikipedia.org/wiki/%E6%97%A5%E6%9C%AC%E3%81%AE%E9%AB%98%E7%AD%89%E5%AD%A6%E6%A0%A1%E4%B8%80%E8%A6%A7"
// Pref describes one prefecture-level list page.
type Pref struct {
	// name is the prefecture name captured from the link title.
	name string
	// link is the absolute URL of the prefecture's school-list page.
	link string
}

// School describes a single school found on a prefecture list page.
type School struct {
	// name is the title attribute of the school's link.
	name string
	// yomigana is the reading of the name; it stays empty until getDetail
	// fills it in.
	yomigana string
	// link is the absolute URL of the school's own page.
	link string
	// pref points at the prefecture the school was listed under; may be nil
	// for a School that was not produced by Pref.getSchools.
	pref *Pref
}

// toString renders the school as a single "key=value, ..." line in the order
// name, yomigana, pref, link.
//
// A School with a nil pref is rendered with an empty pref value rather than
// panicking on the nil dereference.
func (s School) toString() string {
	prefName := ""
	if s.pref != nil {
		prefName = s.pref.name
	}
	return "name=" + s.name + ", yomigana=" + s.yomigana + ", pref=" + prefName + ", link=" + s.link
}
// Get downloads the page at url and returns one Pref for every link inside a
// ".navbox" element whose title has the form "<prefecture>高等学校一覧".
//
// The receiver is not read; main calls this as Pref{}.Get(url).
//
// If the page cannot be fetched or parsed the error is logged and nil is
// returned, so callers can range over the result unconditionally.
func (pref Pref) Get(url string) []*Pref {
	doc, err := goquery.NewDocument(url)
	if err != nil {
		// Without this check doc is nil and doc.Find panics.
		log.Printf("fetching prefecture index %q: %v", url, err)
		return nil
	}

	// Group 1 is the prefecture name. All four suffixes (都, 道, 府, 県) are
	// accepted; leaving out 府 would silently drop 大阪府 and 京都府.
	prefRegexp := regexp.MustCompile(`^(.+[都道府県])高等学校一覧$`)

	prefs := []*Pref{}
	doc.Find(".navbox a[title$='高等学校一覧']").Each(func(_ int, s *goquery.Selection) {
		// The selector only matches anchors that carry a title attribute.
		title, _ := s.Attr("title")
		m := prefRegexp.FindStringSubmatch(title)
		if m == nil {
			return
		}
		href, ok := s.Attr("href")
		if !ok {
			// An anchor without href would yield a link to the bare host.
			return
		}
		prefs = append(prefs, &Pref{name: m[1], link: "https://ja.wikipedia.org" + href})
	})
	return prefs
}
// getSchools downloads the page at pref.link and returns one School for every
// anchor whose title ends in "高等学校", excluding titles that contain a colon
// with text on both sides and the bare title "高等学校".
//
// Every returned School points at the same copy of pref (the value receiver).
//
// If the page cannot be fetched or parsed the error is logged and nil is
// returned, so callers can range over the result unconditionally.
func (pref Pref) getSchools() []*School {
	doc, err := goquery.NewDocument(pref.link)
	if err != nil {
		// Without this check doc is nil and doc.Find panics.
		log.Printf("fetching school list for %q (%s): %v", pref.name, pref.link, err)
		return nil
	}

	// Titles that end in 高等学校 but should not be reported as schools.
	notSchoolRegexp := regexp.MustCompile(`^.+?:.+?$|^高等学校$`)

	schools := []*School{}
	doc.Find("a[title$='高等学校']").Each(func(_ int, s *goquery.Selection) {
		// The selector only matches anchors that carry a title attribute.
		title, _ := s.Attr("title")
		if notSchoolRegexp.MatchString(title) {
			return
		}
		href, ok := s.Attr("href")
		if !ok {
			// An anchor without href would yield a link to the bare host.
			return
		}
		schools = append(schools, &School{
			name: title,
			link: "https://ja.wikipedia.org" + href,
			pref: &pref,
		})
	})
	return schools
}
// yomiganaPattern captures the text between the first pair of parentheses.
// Both full-width （…） and ASCII (…) brackets are accepted. A pattern made
// only of nested capture groups around ".+?" would match a single leading
// character instead of the bracketed text.
var yomiganaPattern = regexp.MustCompile(`[（(](.+?)[）)]`)

// yomiganaSpaces removes ASCII and ideographic spaces from a reading.
var yomiganaSpaces = strings.NewReplacer(" ", "", "　", "")

// parseYomigana extracts the reading from a lead paragraph: the text inside
// the first parentheses, cut at the first "、" and with spaces removed.
// ok is false when the paragraph contains no parenthesised text.
func parseYomigana(paragraph string) (yomigana string, ok bool) {
	m := yomiganaPattern.FindStringSubmatch(paragraph)
	if m == nil {
		return "", false
	}
	reading := strings.Split(m[1], "、")[0]
	return yomiganaSpaces.Replace(reading), true
}

// getDetail downloads the page at school.link and fills yomigana from the
// first "#mw-content-text > p" paragraph, when that paragraph contains a
// parenthesised reading.
//
// It works on a copy (value receiver) and returns a pointer to that copy; the
// School it was called on is not modified. If the page cannot be fetched or
// parsed the error is logged and the copy is returned unchanged.
func (school School) getDetail() *School {
	doc, err := goquery.NewDocument(school.link)
	if err != nil {
		// Without this check doc is nil and doc.Find panics.
		log.Printf("fetching detail for %q (%s): %v", school.name, school.link, err)
		return &school
	}

	lead := doc.Find("#mw-content-text > p").First()
	if lead.Length() == 0 {
		return &school
	}
	if yomigana, ok := parseYomigana(lead.Text()); ok {
		school.yomigana = yomigana
	}
	return &school
}
// main walks every prefecture list linked from url and prints one line per
// school to standard output.
//
// getDetail is not called here, so the yomigana column is always empty.
func main() {
	prefs := Pref{}.Get(url)
	for _, pref := range prefs {
		for _, school := range pref.getSchools() {
			// Println, not Printf: the line contains percent-encoded links
			// ("%E6%97..."), which Printf would interpret as format verbs and
			// print as "%!E(MISSING)...".
			fmt.Println(school.toString())
		}
	}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment