@josuebrunel
Created October 17, 2024 18:35

package main

import (
	"log/slog"
	"sync"

	"github.com/PuerkitoBio/goquery"
	"github.com/geziyor/geziyor"
	"github.com/geziyor/geziyor/client"
	"github.com/geziyor/geziyor/export"
)
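
// Two external modules are required (fetch with `go get`):
// github.com/geziyor/geziyor, the crawling framework, and
// github.com/PuerkitoBio/goquery, which provides jQuery-like HTML selection.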

const (
	URLQuotes     = "quotes"
	URLTurtle     = "turtle"
	URLAliExpress = "aliexpress"
)

// AutherNText represents a quote with its author and text.
type AutherNText struct {
	Author string `json:"author"`
	Text   string `json:"text"`
}

// AliExpressProduct represents a product from AliExpress with its name and price.
type AliExpressProduct struct {
	Name  string `json:"name"`
	Price string `json:"price"`
}

// Turtle represents a turtle species with its name, description, and image URL.
type Turtle struct {
	Name        string `json:"name"`
	Description string `json:"description"`
	Image       string `json:"image"`
}

// DataExporter is a generic exporter that collects scraped data of any type.
type DataExporter[T any] struct {
	Data []T
}

// Export receives scraped items through a channel and appends them to the
// Data slice. Items of an unexpected type are skipped with the comma-ok
// form instead of panicking on a failed type assertion; appending to a nil
// slice is safe, so no explicit initialization is needed.
func (de *DataExporter[T]) Export(scraped chan any) error {
	for pd := range scraped {
		d, ok := pd.(T)
		if !ok {
			continue
		}
		de.Data = append(de.Data, d)
	}
	return nil
}
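
// Compile-time assertion that DataExporter implements geziyor's
// export.Exporter interface, the type expected by the Exporters option
// used in main below.
var _ export.Exporter = (*DataExporter[AutherNText])(nil)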

func main() {
	var (
		exporter = DataExporter[AutherNText]{}
		urls     = map[string][]string{
			URLQuotes: {
				"http://quotes.toscrape.com/",
			},
			URLTurtle: {
				"https://www.scrapethissite.com/pages/frames/?frame=i&family=Cheloniidae",
				"https://www.scrapethissite.com/pages/frames/?frame=i&family=Chelydridae",
				"https://www.scrapethissite.com/pages/frames/?frame=i&family=Carettochelyidae",
			},
			URLAliExpress: {
				"https://www.aliexpress.com/item/1005006959851087.html",
				"https://www.aliexpress.com/item/1005007265735821.html",
			},
		}
	)

	// Scrape quotes
	slog.Info("Processing quotes")
	g := geziyor.NewGeziyor(&geziyor.Options{
		StartURLs: urls[URLQuotes],
		ParseFunc: quotesParse,
		// Built-in exporters work here as well, e.g.:
		// Exporters: []export.Exporter{&export.JSON{}, &export.CSV{}},
		Exporters: []export.Exporter{&exporter},
	})
	g.Start()
	slog.Info("quotes exporter content", "data", exporter.Data)

	// Scrape turtles
	slog.Info("Processing turtles concurrently")
	turtleExporter := DataExporter[Turtle]{}
	g = geziyor.NewGeziyor(&geziyor.Options{
		StartURLs: urls[URLTurtle],
		ParseFunc: turtleScraper,
		Exporters: []export.Exporter{&turtleExporter},
	})
	g.Start()
	slog.Info("turtle exporter content", "data", turtleExporter.Data)

	// Scrape AliExpress products
	slog.Info("Processing aliexpress concurrently but manually")
	aliExport := DataExporter[AliExpressProduct]{}
	g = geziyor.NewGeziyor(&geziyor.Options{
		StartRequestsFunc: func(g *geziyor.Geziyor) {
			var wg sync.WaitGroup
			for _, url := range urls[URLAliExpress] {
				wg.Add(1)
				// Pass url as an argument so each goroutine gets its own
				// copy; before Go 1.22 the loop variable is shared across
				// iterations, and the closure would otherwise race on it.
				go func(url string) {
					defer wg.Done()
					slog.Info("processing product", "url", url)
					g.GetRendered(url, aliexpressProduct)
				}(url)
			}
			wg.Wait()
		},
		Exporters: []export.Exporter{&aliExport},
		// GetRendered renders JavaScript through a browser reachable at
		// this DevTools endpoint.
		BrowserEndpoint: "ws://127.0.0.1:9222",
	})
	g.Start()
	slog.Info("aliexpress exporter content", "data", aliExport.Data)
}

// quotesParse is a parsing function for quote pages.
// It extracts author and text information from each quote on the page,
// exports the data as AutherNText structs, and follows pagination links.
func quotesParse(g *geziyor.Geziyor, r *client.Response) {
	// Find all quote elements and process each one
	r.HTMLDoc.Find("div.quote").Each(func(i int, s *goquery.Selection) {
		// Extract author and text from the quote
		g.Exports <- AutherNText{
			Author: s.Find("small.author").Text(),
			Text:   s.Find("span.text").Text(),
		}
	})
	// Check for a "next" pagination link
	if href, ok := r.HTMLDoc.Find("li.next > a").Attr("href"); ok {
		// If found, follow the link and continue parsing
		g.Get(r.JoinURL(href), quotesParse)
	}
}

// turtleScraper is a parsing function for turtle family pages.
// It extracts the turtle's name, description, and image URL from the HTML
// and exports the data as a Turtle struct.
func turtleScraper(g *geziyor.Geziyor, r *client.Response) {
	g.Exports <- Turtle{
		Name:        r.HTMLDoc.Find(".turtle-family-detail > h3").Text(),
		Description: r.HTMLDoc.Find(".turtle-family-detail > p").Text(),
		Image:       r.HTMLDoc.Find(".turtle-family-detail > img").AttrOr("src", ""),
	}
}

// aliexpressProduct is a parsing function for AliExpress product pages.
// It extracts the product title and price from the rendered HTML and exports
// the data as an AliExpressProduct struct.
// Note: this function uses JavaScript rendering to access dynamic content.
func aliexpressProduct(g *geziyor.Geziyor, r *client.Response) {
	// Extract the product title from the h1 element with data-pl="product-title"
	title := r.HTMLDoc.Find("h1[data-pl=product-title]").Text()
	// Extract the product price from the span element with class "product-price-value"
	price := r.HTMLDoc.Find("span.product-price-value").Text()
	// Export the extracted data as an AliExpressProduct struct
	g.Exports <- AliExpressProduct{Name: title, Price: price}
}
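
// Running note: the AliExpress section drives a real browser through the
// BrowserEndpoint option above. One way to expose a matching DevTools
// endpoint, assuming a local Chrome install, is:
//
//	chrome --headless --remote-debugging-port=9222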