Skip to content

Instantly share code, notes, and snippets.

@rusco
Created December 10, 2019 14:51
Show Gist options
  • Save rusco/ee56b6d0734a0d823fc61fe792b52fa9 to your computer and use it in GitHub Desktop.
Save rusco/ee56b6d0734a0d823fc61fe792b52fa9 to your computer and use it in GitHub Desktop.
parsing of lawyers from https://portal.oa.pt/ website
//
// date 11.11.2019
// author jr
// purpose scrape lawyer search results from the https://portal.oa.pt/ website
//
package main
import (
"fmt"
"io/ioutil"
"log"
"net/http"
"strings"
"github.com/PuerkitoBio/goquery"
)
// main reads the list of cp values via readCP and runs parseOA once for each
// of them, in file order.
func main() {
	codes := readCP()
	for i := range codes {
		parseOA(codes[i])
	}
}
// readCP loads the cp values to query from the file cp.txt in the current
// working directory, one value per line.
//
// Every line is trimmed of surrounding whitespace (which also removes the
// "\r" left behind by CRLF line endings) and blank lines are dropped.
// Without this, the empty string that strings.Split produces after a
// trailing newline would be returned and later queried as an empty cp.
//
// The program is terminated through log.Fatal when the file cannot be read.
func readCP() []string {
	content, err := ioutil.ReadFile("cp.txt") //test: cp-test.txt
	if err != nil {
		log.Fatal(err)
	}

	var lines []string
	for _, line := range strings.Split(string(content), "\n") {
		if line = strings.TrimSpace(line); line != "" {
			lines = append(lines, line)
		}
	}
	return lines
}
// parseOA handles a single cp value: it strips surrounding whitespace, builds
// the URL of result page 1, prints that page through getDataByURL and then
// hands the same URL to parseLinks so the pagination links are followed.
func parseOA(cp string) {
	firstPage := getURL(strings.TrimSpace(cp), 1)
	getDataByURL(firstPage)
	parseLinks(firstPage)
}
// getURL returns the lawyer-search URL on portal.oa.pt for the given cp value
// and result page number. cp is inserted into the query string verbatim.
func getURL(cp string, page int32) string {
	return fmt.Sprintf(
		"https://portal.oa.pt/advogados/pesquisa-de-advogados/?cp=%s&a=on&o=1&page=%v",
		cp,
		page,
	)
}
// getDataByURL downloads the page at url, parses it as HTML and writes one
// line per matched person article to standard output. Each line holds the
// text of the person's title element followed by the description text of
// every details list item, each field terminated by '#'.
//
// Any request error, non-200 status or parse error ends the program through
// log.Fatal.
func getDataByURL(url string) {
	resp, err := http.Get(url)
	if err != nil {
		log.Fatal(err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		log.Fatalf("status code error: %d %s", resp.StatusCode, resp.Status)
	}

	page, err := goquery.NewDocumentFromReader(resp.Body)
	if err != nil {
		log.Fatal(err)
	}

	// CSS selectors for the pieces of a search result.
	const (
		personSel = "article.search-results__article-person"
		titleSel  = "h4.search-results__article-person-title"
		itemSel   = "li.search-results__details-list-item"
		valueSel  = "span.search-results__details-list-item-description"
	)

	// printItem emits the description text of one details list item.
	printItem := func(_ int, item *goquery.Selection) {
		fmt.Printf("%s#", item.Find(valueSel).Text())
	}

	// printPerson emits the title, then every detail, then ends the line.
	printPerson := func(_ int, entry *goquery.Selection) {
		fmt.Printf("%s#", entry.Find(titleSel).Text())
		entry.Find(itemSel).Each(printItem)
		fmt.Print("\n")
	}

	page.Find(personSel).Each(printPerson)
}
// parseLinks downloads the page at url, looks for anchors inside its
// "div.ws-pagination" elements and passes every further result page to
// getDataByURL.
//
// Anchors without an href, with empty text, or whose text is "1" are
// skipped; parseOA prints the first page itself before calling this function.
// Only links present in the pagination of the page at url are followed.
//
// Each href is resolved against the URL the response was actually served
// from, so relative links work as well as absolute ones. Every distinct
// target is fetched once, even when it is linked several times (for example
// from more than one pagination block).
//
// Any request error, non-200 status, parse error or malformed href ends the
// program through log.Fatal.
func parseLinks(url string) {
	res, err := http.Get(url)
	if err != nil {
		log.Fatal(err)
	}
	defer res.Body.Close()
	if res.StatusCode != http.StatusOK {
		log.Fatalf("status code error: %d %s", res.StatusCode, res.Status)
	}
	doc, err := goquery.NewDocumentFromReader(res.Body)
	if err != nil {
		log.Fatal(err)
	}

	// res.Request is populated for client requests and, after redirects,
	// holds the final URL, which is the right base for relative hrefs.
	base := res.Request.URL
	visited := make(map[string]bool)

	navigation := "div.ws-pagination"
	doc.Find(navigation).Find("a").Each(func(_ int, a *goquery.Selection) {
		href, exists := a.Attr("href")
		txt := strings.TrimSpace(a.Text())
		if !exists || txt == "" || txt == "1" {
			return
		}

		target, err := base.Parse(href)
		if err != nil {
			log.Fatal(err)
		}

		link := target.String()
		if visited[link] {
			return // already printed this page
		}
		visited[link] = true
		getDataByURL(link)
	})
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment