Skip to content

Instantly share code, notes, and snippets.

@computerphysicslab
Created July 30, 2020 07:51
Show Gist options
  • Save computerphysicslab/3d2b911ad967d5ec05a94dafff88e2e3 to your computer and use it in GitHub Desktop.
Save computerphysicslab/3d2b911ad967d5ec05a94dafff88e2e3 to your computer and use it in GitHub Desktop.
// https://jdanger.com/build-a-web-crawler-in-go.html
package main
import (
	"bytes"
	"fmt"
	"io/ioutil"
	"net/http"
	"time"

	// "github.com/k3a/html2text" // Poor functionality
	"github.com/jackdanger/collectlinks"
	"jaytaylor.com/html2text"
)
// main downloads one hard-coded article page and then:
//  1. prints every link found in the page, one per line;
//  2. saves the raw HTML to ./output.html;
//  3. prints a plain-text rendering of the page.
//
// Any failure (transport error, non-2xx status, read/write error, or a
// conversion error) aborts the program with a panic carrying a wrapped error.
func main() {
	// http.Get goes through http.DefaultClient, which has no timeout and can
	// block forever on a stalled server, so use a client with a deadline.
	client := &http.Client{Timeout: 30 * time.Second}

	// Alternative test page:
	// "https://www.nytimes.com/2020/07/27/us/lamborghini-ppp-covid-19.html"
	resp, err := client.Get("https://www.nature.com/articles/s41564-020-0771-4")
	if err != nil {
		panic(fmt.Errorf("Http transport error: %w", err))
	}
	// Deferred so the body is closed on every exit path, including the
	// panics below (deferred calls still run while a panic unwinds).
	defer resp.Body.Close()

	// Without this check an error page (403, 404, 500, ...) would be parsed,
	// saved and printed as if it were the requested article.
	if resp.StatusCode < 200 || resp.StatusCode > 299 {
		panic(fmt.Errorf("unexpected HTTP status: %s", resp.Status))
	}

	// Read the whole body once; the bytes are reused by every step below.
	bodyBytes, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		panic(fmt.Errorf("Read error: %w", err))
	}

	// Get links. A fresh reader over the buffered bytes is handed to
	// collectlinks, so the response object itself is left untouched.
	links := collectlinks.All(bytes.NewReader(bodyBytes))
	for _, link := range links {
		fmt.Println(link)
	}

	// Save html content to disk.
	if err := ioutil.WriteFile("./output.html", bodyBytes, 0644); err != nil {
		panic(fmt.Errorf("Saving html to disk: %w", err))
	}

	// Get plain text content.
	plain, err := html2text.FromString(string(bodyBytes), html2text.Options{PrettyTables: true})
	if err != nil {
		panic(fmt.Errorf("html2text.FromString fails: %w", err))
	}
	fmt.Println(plain)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment