Created
March 5, 2021 11:44
-
-
Save deliro/3c09f0b7b0e6c82fd29ddc7d619ce9b6 to your computer and use it in GitHub Desktop.
Walks through the Wikipedia link graph (close to BFS) and finds a path from the main page to a page containing the search term
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"crypto/tls" | |
"fmt" | |
"io/ioutil" | |
"log" | |
"net/http" | |
"regexp" | |
"strings" | |
"sync" | |
) | |
// Package-level crawl state shared by all worker goroutines.
var (
	// pattern matches anchor opening tags; a raw string literal avoids the escaped quotes.
	pattern = regexp.MustCompile(`<a href=".*?">`)
	// titlePn extracts the <title> element used for log output.
	titlePn = regexp.MustCompile(`<title>.*?</title>`)
	// seen records every URL that has already been fetched or queued.
	seen = &sync.Map{}
	// sem caps the number of in-flight HTTP requests at 30.
	sem = make(chan struct{}, 30)
	// searchTerm is the case-insensitive phrase whose discovery ends the crawl.
	searchTerm = regexp.MustCompile("(?i)dota 2")
	// requests counts issued HTTP requests.
	// NOTE(review): incremented from many goroutines without synchronization —
	// this is a data race; treat the counter as approximate.
	requests = 0
)
// formatURL converts a raw `<a href="...">` regex match into an absolute
// ru.wikipedia.org URL. It returns "" for links that should be skipped:
// protocol-relative URLs ("//...") and links to any other host.
func formatURL(x string) string {
	href := strings.ReplaceAll(x, "<a href=\"", "")
	href = strings.ReplaceAll(href, "\">", "")
	// Drop anything after the first space (trailing tag attributes).
	if space := strings.Index(href, " "); space != -1 {
		href = href[:space]
	}
	href = strings.ReplaceAll(href, "\"", "")

	switch {
	case strings.HasPrefix(href, "//"):
		// Protocol-relative link — could point anywhere; skip it.
		return ""
	case strings.HasPrefix(href, "/"):
		// Site-relative link — resolve against the Russian Wikipedia host.
		return "https://ru.wikipedia.org" + href
	case strings.HasPrefix(href, "https://ru.wikipedia.org"):
		return href
	default:
		// External host — stay inside Wikipedia.
		return ""
	}
}
func request(req *DownReq) []string { | |
result := make([]string, 0) | |
_, ok := seen.Load(req.url) | |
if ok { | |
return result | |
} | |
seen.Store(req.url, true) | |
resp, err := http.Get(req.url) | |
requests++ | |
if err != nil { | |
log.Println("http error \n", " ", req.url, "\n ", err, "\n...") | |
return result | |
} | |
defer resp.Body.Close() | |
if !strings.HasPrefix(resp.Header.Get("Content-Type"), "text/html") { | |
return result | |
} | |
body, err := ioutil.ReadAll(resp.Body) | |
if err != nil { | |
log.Println("parse err ", err) | |
return result | |
} | |
titleRaw := titlePn.Find(body) | |
title := strings.ReplaceAll(string(titleRaw), "<title>", "") | |
title = strings.ReplaceAll(title, "</title>", "") | |
title = strings.ReplaceAll(title, " — Википедия", "") | |
title = strings.ReplaceAll(title, "Википедия — свободная энциклопедия", "*") | |
req.Title = title | |
log.Println(req.Pretty()) | |
foundTerm := searchTerm.Find(body) | |
if len(foundTerm) > 0 { | |
fmt.Println(req.Pretty()) | |
log.Fatalf("FOUND IN %d reqs", requests) | |
} | |
matches := pattern.FindAll(body, -1) | |
for _, match := range matches { | |
strMatch := string(match) | |
cleanURL := formatURL(strMatch) | |
if cleanURL == "" { | |
continue | |
} | |
_, ok := seen.Load(cleanURL) | |
if !ok { | |
result = append(result, cleanURL) | |
} | |
} | |
return result | |
} | |
// DownReq is one node in the crawl: a URL to download plus a link back to
// the page it was discovered on, forming a chain to the start page.
type DownReq struct {
	parent *DownReq // page this link was found on; nil for the start page
	url    string   // absolute URL to fetch
	Title  string   // page title, filled in after download
}

// Pretty renders the discovery path from the start page down to this page,
// root first, e.g. "Main > Games > Dota 2".
func (dr *DownReq) Pretty() string {
	// Walk the parent chain; this collects titles leaf-to-root.
	var chain []string
	for node := dr; node != nil; node = node.parent {
		chain = append(chain, node.Title)
	}
	// Reverse in place so the root comes first.
	for l, r := 0, len(chain)-1; l < r; l, r = l+1, r-1 {
		chain[l], chain[r] = chain[r], chain[l]
	}
	return strings.Join(chain, " > ")
}
// worker drains the queue: for each request it acquires a slot from the
// global semaphore (capping concurrent downloads at 30), fetches the page in
// a goroutine, and re-enqueues newly discovered links from a second goroutine
// so a full queue cannot block the semaphore release.
// NOTE(review): q is never closed, so the range loop — and therefore
// wg.Wait() — only ends when request() calls log.Fatalf and kills the
// process. Reads of `requests` here race with the writers in request().
func worker(q chan *DownReq) {
	wg := &sync.WaitGroup{}
	for req := range q {
		// Acquire a download slot before spawning; this throttles the loop.
		sem <- struct{}{}
		wg.Add(1)
		// Progress log roughly every 1000 requests (racy read — approximate).
		if (requests % 1000) == 0 {
			log.Println(requests, " requests made")
		}
		go func(req *DownReq) {
			newUrls := request(req)
			// Enqueue from a separate goroutine so that a full q doesn't
			// prevent releasing the semaphore slot below.
			go func() {
				for _, url := range newUrls {
					// Best-effort re-check; request() marks URLs as seen too.
					_, ok := seen.Load(url)
					if !ok {
						newReq := &DownReq{parent: req, url: url, Title: ""}
						q <- newReq
					}
				}
			}()
			// Release the download slot and mark this request done.
			<-sem
			wg.Done()
		}(req)
	}
	wg.Wait()
}
func main() { | |
http.DefaultTransport.(*http.Transport).TLSClientConfig = &tls.Config{InsecureSkipVerify: true} | |
download := make(chan *DownReq, 1_000_000) | |
startFrom := "https://ru.wikipedia.org/wiki/Заглавная_страница" | |
download <- &DownReq{parent: nil, url: startFrom, Title: ""} | |
worker(download) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment