@josuebrunel
Created October 17, 2024 18:35

package main

import (
	"log/slog"
	"sync"

	"github.com/PuerkitoBio/goquery"
	"github.com/geziyor/geziyor"
	"github.com/geziyor/geziyor/client"
	"github.com/geziyor/geziyor/export"
)
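
// Two external modules are required (fetch with `go get`):
// github.com/geziyor/geziyor, the crawling framework, and
// github.com/PuerkitoBio/goquery, which provides jQuery-like HTML selection.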

const (
	URLQuotes     = "quotes"
	URLTurtle     = "turtle"
	URLAliExpress = "aliexpress"
)

// AutherNText represents a quote with its author and text.
type AutherNText struct {
	Author string `json:"author"`
	Text   string `json:"text"`
}

// AliExpressProduct represents a product from AliExpress with its name and price.
type AliExpressProduct struct {
	Name  string `json:"name"`
	Price string `json:"price"`
}

// Turtle represents a turtle species with its name, description, and image URL.
type Turtle struct {
	Name        string `json:"name"`
	Description string `json:"description"`
	Image       string `json:"image"`
}

// DataExporter is a generic exporter that collects scraped data of any type.
type DataExporter[T any] struct {
	Data []T
}

// Export receives scraped items through a channel and appends them to the
// Data slice. Items of an unexpected type are skipped with the comma-ok
// form instead of panicking on a failed type assertion; appending to a nil
// slice is safe, so no explicit initialization is needed.
func (de *DataExporter[T]) Export(scraped chan any) error {
	for pd := range scraped {
		d, ok := pd.(T)
		if !ok {
			continue
		}
		de.Data = append(de.Data, d)
	}
	return nil
}
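
// Compile-time assertion that DataExporter implements geziyor's
// export.Exporter interface, the type expected by the Exporters option
// used in main below.
var _ export.Exporter = (*DataExporter[AutherNText])(nil)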

func main() {
	var (
		exporter = DataExporter[AutherNText]{}
		urls     = map[string][]string{
			URLQuotes: {
				"http://quotes.toscrape.com/",
			},
			URLTurtle: {
				"https://www.scrapethissite.com/pages/frames/?frame=i&family=Cheloniidae",
				"https://www.scrapethissite.com/pages/frames/?frame=i&family=Chelydridae",
				"https://www.scrapethissite.com/pages/frames/?frame=i&family=Carettochelyidae",
			},
			URLAliExpress: {
				"https://www.aliexpress.com/item/1005006959851087.html",
				"https://www.aliexpress.com/item/1005007265735821.html",
			},
		}
	)

	// Scrape quotes
	slog.Info("Processing quotes")
	g := geziyor.NewGeziyor(&geziyor.Options{
		StartURLs: urls[URLQuotes],
		ParseFunc: quotesParse,
		// Built-in exporters work here as well, e.g.:
		// Exporters: []export.Exporter{&export.JSON{}, &export.CSV{}},
		Exporters: []export.Exporter{&exporter},
	})
	g.Start()
	slog.Info("quotes exporter content", "data", exporter.Data)

	// Scrape turtles
	slog.Info("Processing turtles concurrently")
	turtleExporter := DataExporter[Turtle]{}
	g = geziyor.NewGeziyor(&geziyor.Options{
		StartURLs: urls[URLTurtle],
		ParseFunc: turtleScraper,
		Exporters: []export.Exporter{&turtleExporter},
	})
	g.Start()
	slog.Info("turtle exporter content", "data", turtleExporter.Data)

	// Scrape AliExpress products
	slog.Info("Processing aliexpress concurrently but manually")
	aliExport := DataExporter[AliExpressProduct]{}
	g = geziyor.NewGeziyor(&geziyor.Options{
		StartRequestsFunc: func(g *geziyor.Geziyor) {
			var wg sync.WaitGroup
			for _, url := range urls[URLAliExpress] {
				wg.Add(1)
				// Pass url as an argument so each goroutine gets its own
				// copy; before Go 1.22 the loop variable is shared across
				// iterations, and the closure would otherwise race on it.
				go func(url string) {
					defer wg.Done()
					slog.Info("processing product", "url", url)
					g.GetRendered(url, aliexpressProduct)
				}(url)
			}
			wg.Wait()
		},
		Exporters: []export.Exporter{&aliExport},
		// GetRendered renders JavaScript through a browser reachable at
		// this DevTools endpoint.
		BrowserEndpoint: "ws://127.0.0.1:9222",
	})
	g.Start()
	slog.Info("aliexpress exporter content", "data", aliExport.Data)
}

// quotesParse is a parsing function for quote pages.
// It extracts author and text information from each quote on the page,
// exports the data as AutherNText structs, and follows pagination links.
func quotesParse(g *geziyor.Geziyor, r *client.Response) {
	// Find all quote elements and process each one
	r.HTMLDoc.Find("div.quote").Each(func(i int, s *goquery.Selection) {
		// Extract author and text from the quote
		g.Exports <- AutherNText{
			Author: s.Find("small.author").Text(),
			Text:   s.Find("span.text").Text(),
		}
	})
	// Check for a "next" pagination link
	if href, ok := r.HTMLDoc.Find("li.next > a").Attr("href"); ok {
		// If found, follow the link and continue parsing
		g.Get(r.JoinURL(href), quotesParse)
	}
}

// turtleScraper is a parsing function for turtle family pages.
// It extracts the turtle's name, description, and image URL from the HTML
// and exports the data as a Turtle struct.
func turtleScraper(g *geziyor.Geziyor, r *client.Response) {
	g.Exports <- Turtle{
		Name:        r.HTMLDoc.Find(".turtle-family-detail > h3").Text(),
		Description: r.HTMLDoc.Find(".turtle-family-detail > p").Text(),
		Image:       r.HTMLDoc.Find(".turtle-family-detail > img").AttrOr("src", ""),
	}
}

// aliexpressProduct is a parsing function for AliExpress product pages.
// It extracts the product title and price from the rendered HTML and exports
// the data as an AliExpressProduct struct.
// Note: this function uses JavaScript rendering to access dynamic content.
func aliexpressProduct(g *geziyor.Geziyor, r *client.Response) {
	// Extract the product title from the h1 element with data-pl="product-title"
	title := r.HTMLDoc.Find("h1[data-pl=product-title]").Text()
	// Extract the product price from the span element with class "product-price-value"
	price := r.HTMLDoc.Find("span.product-price-value").Text()
	// Export the extracted data as an AliExpressProduct struct
	g.Exports <- AliExpressProduct{Name: title, Price: price}
}
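
// Running note: the AliExpress section drives a real browser through the
// BrowserEndpoint option above. One way to expose a matching DevTools
// endpoint, assuming a local Chrome install, is:
//
//	chrome --headless --remote-debugging-port=9222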