Created
July 30, 2020 07:51
-
-
Save computerphysicslab/3d2b911ad967d5ec05a94dafff88e2e3 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// https://jdanger.com/build-a-web-crawler-in-go.html
package main | |
import ( | |
"bytes" | |
"fmt" | |
"io/ioutil" | |
"net/http" | |
// "github.com/k3a/html2text" // Poor functionality | |
"github.com/jackdanger/collectlinks" | |
"jaytaylor.com/html2text" | |
) | |
func main() { | |
// resp, err := http.Get("https://www.nytimes.com/2020/07/27/us/lamborghini-ppp-covid-19.html") | |
resp, err := http.Get("https://www.nature.com/articles/s41564-020-0771-4") | |
if err != nil { | |
panic(fmt.Errorf("Http transport error: %s", err)) | |
} | |
// Get content | |
bodyBytes, err := ioutil.ReadAll(resp.Body) | |
if err != nil { | |
panic(fmt.Errorf("Read error: %s", err)) | |
} | |
resp.Body.Close() // must close | |
resp.Body = ioutil.NopCloser(bytes.NewBuffer(bodyBytes)) | |
// Get links | |
links := collectlinks.All(resp.Body) // Here we use the collectlinks package | |
for _, link := range links { | |
fmt.Println(link) | |
} | |
// Save html content to disk | |
err = ioutil.WriteFile("./output.html", bodyBytes, 0644) | |
if err != nil { | |
panic(fmt.Errorf("Saving html to disk: %s", err)) | |
} | |
// Get plain text content | |
plain, err := html2text.FromString(string(bodyBytes), html2text.Options{PrettyTables: true}) | |
if err != nil { | |
panic(fmt.Errorf("html2text.FromString fails: %s", err)) | |
} | |
fmt.Println(plain) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment