Last active
February 11, 2021 05:27
-
-
Save Miqueas/b92073803b658c207fbdfaebc10a1512 to your computer and use it in GitHub Desktop.
[Go] Basic scraper example
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
// For the exit status | |
OS "os" | |
// For printing | |
Fmt "fmt" | |
// For commandline arguments | |
Flag "flag" | |
// For requests | |
HTTP "net/http" | |
// For some regular expressions | |
Regex "regexp" | |
// For finding the required HTML elements
// Install with 'go get github.com/PuerkitoBio/goquery' if needed | |
GoQuery "github.com/PuerkitoBio/goquery" | |
) | |
// check is the program's fail-fast error helper. When e is nil it does
// nothing. Otherwise it prints msg (a Printf-style format string filled in
// with args) to standard output and then panics with e.
func check(e error, msg string, args ...interface{}) {
	// Happy path: nothing to report.
	if e == nil {
		return
	}

	Fmt.Printf(msg, args...)
	panic(e)
}
func main() { | |
// Initializes the argument parsing | |
Flag.Parse() | |
// Gets all the arguments | |
var args = Flag.Args() | |
// Length of arguments | |
var argc = len(args) | |
// Format string for print gists | |
var fmt = "\x1b[2m%02d. \x1b[0;1;32mFile: \x1b[0m%s. \x1b[1;32mDescription: \x1b[0m%s.\n" | |
switch argc { | |
// If no arguments, then just exit with status code 1 | |
case 0: | |
Fmt.Println("No arguments, exiting.") | |
OS.Exit(1) | |
// If only 1 argument, then prints all gists for the user name given | |
case 1: | |
var user = args[0] | |
var gists = GetGists(user) | |
if len(gists) == 0 { | |
Fmt.Printf("User '%s' has no gists.\n", user) | |
OS.Exit(0) | |
} else { | |
Fmt.Printf("User '%s' has the following gists:\n", user) | |
for i, v := range gists { | |
Fmt.Printf(fmt, i + 1, v["File"], v["Desc"]) | |
} | |
} | |
// More than 1 argument, then do the same for all the user names given | |
default: | |
for _, user := range args { | |
var gists = GetGists(user) | |
if len(gists) == 0 { | |
Fmt.Printf("User '%s' has no gists.\n", user) | |
} else { | |
Fmt.Printf("User '%s' has the following gists:\n", user) | |
for i, v := range gists { | |
Fmt.Printf(fmt, i + 1, v["File"], v["Desc"]) | |
} | |
} | |
} | |
} | |
} | |
// The main function that fetch user gists | |
func GetGists(user string) []map[string]string { | |
// The return value | |
var arr []map[string]string | |
// Holds errors | |
var err error | |
// The url to fetch | |
var url string = "https://gist.github.com/" + user | |
// Makes a 'GET' request to 'url' | |
res, err := HTTP.Get(url) | |
check(err, "Error fetching url: %s.\n", url) | |
// Creates a new goquery document from the response | |
doc, err := GoQuery.NewDocumentFromReader(res.Body) | |
check(err, "Error reading the response content.\n") | |
// Close the response content after this function end | |
defer res.Body.Close() | |
// For some reason, the description text of gists has spaces at the start | |
// and the end, so... These RegExp pattern is for remove them | |
var start = Regex.MustCompile(`^\s+`) | |
var end = Regex.MustCompile(`\s+$`) | |
// The HTML element with CSS class 'gist-snippet' has al the info that we need | |
var elems = doc.Find(".gist-snippet") | |
// For each element found, we use a function to find the info | |
elems.Each(func(idx int, sel *GoQuery.Selection) { | |
// Top div, with the info: | |
// UserName / FileName | |
// Date Time Created | |
// Description | |
var divtag = sel.Find(".d-inline-block.px-lg-2.px-0") | |
// The gist file name (element) | |
var filetag = divtag.Find("span a + a") | |
// The gist description (element) | |
var desctag = divtag.Find("span.f6.text-gray") | |
// We remove the mentioned spaces in description | |
var descstr = start.ReplaceAllString(end.ReplaceAllString(desctag.Text(), ""), "") | |
// File name (text) | |
var filestr = filetag.Text() | |
// Append the data in the return value | |
arr = append(arr, map[string]string { "File": filestr, "Desc": descstr }) | |
}) | |
return arr | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment